MentorSEAS Matching Notebook

I hope this notebook makes the matching process a lot less daunting! To start, run the import cell below, then run the upload cell and upload two CSV files: the mentee responses exported from the Google Form, and the master list of students that OASA sent.

In [None]:
import io
import pandas as pd

In [None]:
from google.colab import files
# `uploaded` is indexed by uploaded file name in the next cell.
uploaded = files.upload()
# Click "Browse" under this cell once you've run it to upload the 2 CSVs.

Let's call the OASA list `oasa_df` and the mentee form responses `form_df`. To load them, replace the two file names in the next cell with the exact names of the files you uploaded in the cell above.

In [None]:
# The keys must match the uploaded file names exactly; a mismatch raises KeyError.
oasa_df = pd.read_csv(io.BytesIO(uploaded['CleanOASA List.csv']))
form_df = pd.read_csv(io.BytesIO(uploaded['9_10_22_Cleaned_MenteeResponses 1.csv']))

In [None]:
# Trim leading/trailing spaces from the text columns that are compared later on.
form_text_cols = ['Full Legal Name (First and Last)', 'Preferred Email Address']
form_df[form_text_cols] = form_df[form_text_cols].apply(lambda col: col.str.strip())
oasa_df['Admit Status'] = oasa_df['Admit Status'].str.strip()

In [None]:
# Quick sanity check: preview the first 15 rows of the OASA list.
oasa_df.head(15)

In [None]:
# Split the OASA list into a freshman list and a transfer list.
# .copy() makes each one an independent frame: generate_names() later writes new
# columns into them, which pandas flags with SettingWithCopyWarning on a filtered slice.
freshman_oasa_df = oasa_df[oasa_df['Admit Status'] == 'Freshman'].copy()
transfer_oasa_df = oasa_df[oasa_df['Admit Status'] == 'Transfer'].copy()

In [None]:
# Preview only the first rows: the full table holds every respondent's name and email.
form_df.head()

In [None]:
# Split the mentee form responses the same way, using the transfer question.
# Only exact 'No' / 'Yes' answers are kept; any other value lands in neither frame.
transfer_answer = form_df['Are you entering UCLA as a transfer student?']
freshman_form_df = form_df[transfer_answer == 'No']
transfer_form_df = form_df[transfer_answer == 'Yes']

In [None]:
# (rows, columns) of the freshman form responses
freshman_form_df.shape

In [None]:
# (rows, columns) of the transfer form responses
transfer_form_df.shape

In [None]:
# (rows, columns) of the freshman OASA list
freshman_oasa_df.shape

In [None]:
# (rows, columns) of the transfer OASA list
transfer_oasa_df.shape

In [None]:
## this function takes a dataframe and adds a column for every candidate name (first, first + middle, first + 2 middles, ...)
def generate_names(df):
  """Add candidate 'FIRST [MIDDLE ...] LAST' columns built from 'Full Name'.

  'Full Name' is read as 'LAST, FIRST MIDDLE ..., optional suffix'. For a row
  whose given-name part has k+1 words, the columns '0 middle names' through
  'k middle names' are filled with the first word, the first two words, ...
  each followed by the last name. Anything after a second comma (the suffix)
  is ignored.

  Rows whose 'Full Name' is missing or contains no comma get no candidate
  names (their cells stay NaN) instead of raising.

  The work is done on a copy, so the frame passed in is not modified.

  Returns:
    (df_with_name_columns, column_labels) where column_labels lists the added
    columns ordered by number of middle names.
  """
  df = df.copy()
  # every name column that gets added to the dataframe
  column_labels = set()
  for index, name in df['Full Name'].items():
    # the 'LAST, FIRST ...' layout is required to tell the parts apart
    if not isinstance(name, str) or ',' not in name:
      continue
    name_arr = name.split(',')
    # strip so 'DOE , JOHN' does not produce 'JOHN DOE ' (which would never match)
    last_name = name_arr[0].strip()
    given_so_far = ""
    # name_arr[1] holds the given names; name_arr[2:] would be a suffix
    for middle_count, given_name in enumerate(name_arr[1].split()):
      column_label = str(middle_count) + " middle names"
      column_labels.add(column_label)
      given_so_far += given_name + ' '
      df.loc[index, column_label] = given_so_far + last_name

  # sort numerically so '10 middle names' would come after '9 middle names'
  return df, sorted(column_labels, key=lambda label: int(label.split()[0]))

In [None]:
freshman_oasa_df, freshman_names_col = generate_names(freshman_oasa_df)
# non-null values per column, i.e. how many students have each candidate-name column filled
freshman_oasa_df.count()

In [None]:
transfer_oasa_df, transfer_names_col = generate_names(transfer_oasa_df)
# non-null values per column, i.e. how many students have each candidate-name column filled
transfer_oasa_df.count()

In [None]:
## this function takes the oasa list and the form responses, finds which form responses have the same email as the oasa list, and merges them
def match_email(oasa_df, form_df):
  """Left-join the form's 'Preferred Email Address' onto the OASA list by email.

  Emails are compared case-insensitively with surrounding spaces ignored.
  The two frames name their email columns differently: 'Email' in the OASA
  list and 'Preferred Email Address' on the form.

  Missing form emails are dropped before merging (pandas would otherwise match
  a missing OASA email to a missing form email), and only the first response
  per email is used so a double submission does not duplicate OASA rows.

  Prints the per-column non-null counts and the number of matched emails.

  Returns:
    A new frame with every OASA row plus a 'Preferred Email Address' column
    (NaN where no form response shares the email).
  """
  form_emails = form_df['Preferred Email Address'].dropna()
  form_keys = form_emails.str.strip().str.lower()
  first_response = ~form_keys.duplicated()
  form_emails = form_emails[first_response]
  form_keys = form_keys[first_response]

  oasa_keys = oasa_df['Email'].str.strip().str.lower()
  # array keys (rather than column names) make pandas add a helper 'key_0' column
  oasa_df = oasa_df.merge(form_emails.to_frame(), left_on=oasa_keys, right_on=form_keys, how='left')
  # count how many non null values are in each column
  counts = oasa_df.count()
  print(counts)
  num_found = counts['Preferred Email Address']
  # print out how many emails match between OASA and google form
  print(str(num_found) + ' emails are matched from form to oasa list')
  # keyword form: the positional axis argument of drop() was removed in pandas 2.0
  return oasa_df.drop(columns=['key_0'])

In [None]:
# attach each freshman's form email to the OASA list by matching on email address
freshman_oasa_df = match_email(freshman_oasa_df, freshman_form_df)

In [None]:
## this function takes the oasa list and the form responses, along with the labels of the name columns in the oasa list used to merge
def match_names(oasa_df, form_df, column_labels):
  """Fill in still-missing preferred emails by matching on names.

  For each candidate-name column in column_labels ('0 middle names',
  '1 middle names', ...), the OASA rows are left-merged with the form on the
  lower-cased name, and the form's email fills any 'Preferred Email Address'
  that is still empty. Emails already present are never overwritten.

  oasa_df must already carry a 'Preferred Email Address' column (match_email
  adds one). Form rows without a name are ignored so that an empty candidate
  name cannot match an empty form name.

  Prints the frame shape and the number of new / total matches per column.

  Returns:
    A new frame, de-duplicated on ('Full Name', 'Email').
  """
  name_col = 'Full Legal Name (First and Last)'
  email_col = 'Preferred Email Address'
  form_lookup = form_df[[name_col, email_col]].dropna(subset=[name_col])
  # lower case on both sides so the comparison ignores capitalisation
  form_keys = form_lookup[name_col].str.lower()

  print(oasa_df.shape)
  # iterate through all columns ('0 middle names', '1 middle names' , etc)
  for label in column_labels:
    # left merge: every oasa row is kept, and gets the email of the form row with the same name (if any)
    oasa_df = pd.merge(oasa_df, form_lookup, left_on=oasa_df[label].str.lower(), right_on=form_keys, how='left')
    # the email now exists twice (_x from oasa, _y from the form); keep _x and fall back to _y
    oasa_df[email_col] = oasa_df[email_col + '_x'].fillna(oasa_df[email_col + '_y'])
    # two form rows with the same name would duplicate the oasa row
    oasa_df = oasa_df.drop_duplicates(subset=['Full Name', 'Email'])
    print(oasa_df.shape)
    counts = oasa_df.count()
    total_matched = counts[email_col]
    # newly matched = matched now minus matched before this column was tried
    new_matched = total_matched - counts[email_col + '_x']
    print('Matched ' + str(new_matched) + ' mentees using ' + label)
    print('In total ' + str(total_matched) + ' mentees matched')
    # keyword form: the positional axis argument of drop() was removed in pandas 2.0
    oasa_df = oasa_df.drop(columns=['key_0', name_col, email_col + '_x', email_col + '_y'])
  return oasa_df

In [None]:
# for freshmen not matched by email, try matching on each candidate name column
freshman_oasa_df = match_names(freshman_oasa_df, freshman_form_df, freshman_names_col)

In [None]:
# attach each transfer's form email to the OASA list by matching on email address
transfer_oasa_df = match_email(transfer_oasa_df, transfer_form_df)

In [None]:
# for transfers not matched by email, try matching on each candidate name column
transfer_oasa_df = match_names(transfer_oasa_df, transfer_form_df, transfer_names_col)

In [None]:
def finalize_oasa_list(oasa_df, form_df, names_col, unmatched_path='unmatched.csv'):
  """Check that every form response was matched, then produce the final list.

  The form is left-merged with the OASA list on 'Preferred Email Address';
  form rows that find no OASA row are written to unmatched_path (the file is
  written even when it is empty) and their shape is printed.

  Args:
    oasa_df: OASA list carrying 'Preferred Email Address', 'Full Name',
      'Admit Status', 'Email' and the columns in names_col.
    form_df: form responses with 'Preferred Email Address'.
    names_col: candidate-name columns to remove from the result.
    unmatched_path: where the unmatched form rows are saved. Pass a different
      path per call so the transfer run does not overwrite the freshman file.

  Returns:
    0 if any form response is unmatched. Otherwise a new frame in which
    'Email' holds the preferred email (falling back to the OASA email) and the
    name columns are gone. The frame passed in is not modified, so it is still
    usable when this function returns 0 or when the cell is re-run.
  """
  merged = pd.merge(form_df, oasa_df[['Preferred Email Address', 'Full Name', 'Admit Status']], on='Preferred Email Address', how='left')
  # a form row with no 'Full Name' after the merge found no student in the OASA list
  unmatched_from_form = merged[merged['Full Name'].isna()]
  print(unmatched_from_form.shape)
  unmatched_from_form.to_csv(unmatched_path)
  if unmatched_from_form.shape[0] != 0:
    print("NOT ALL FORM ENTRIES MATCHED")
    return 0

  preferred = oasa_df['Preferred Email Address'].fillna(oasa_df['Email'])
  return (
      oasa_df
      .assign(**{'Preferred Email Address': preferred})
      .drop(columns=list(names_col) + ['Email'])
      .rename(columns={'Preferred Email Address': 'Email'})
  )


In [None]:
# NOTE: finalize_oasa_list returns 0 (not a DataFrame) when any form response is
# unmatched; check unmatched.csv and fix the inputs before running later cells.
freshman_oasa_df = finalize_oasa_list(freshman_oasa_df, freshman_form_df, freshman_names_col)

In [None]:
# NOTE: this call rewrites the same 'unmatched.csv' written by the freshman call above.
transfer_oasa_df = finalize_oasa_list(transfer_oasa_df, transfer_form_df, transfer_names_col)

In [None]:
from google.colab import files
# Save the finalized lists as CSV and download them.
# These calls fail if finalize_oasa_list returned 0 above (0 has no to_csv).
# to_csv is called with its defaults, so the row index is written as the first column.
freshman_oasa_df.to_csv('Clean-Freshmen.csv')
files.download('Clean-Freshmen.csv')
transfer_oasa_df.to_csv('Clean-Transfers.csv')
files.download('Clean-Transfers.csv')

The rest of this notebook is adapted from the matching code in the GitHub repo:

In [None]:
#imports
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict

In [None]:
from google.colab import files
# Upload the CSVs named in the constants cell below (pd.read_csv opens them by file name).
uploaded = files.upload()

In [None]:
#constants
# Input file names. Exactly one line per variable should be left uncommented;
# the commented-out lines are alternatives used for other runs (freshman vs transfer).
# mentors_filename is opened with pd.read_csv in later cells.
# NOTE(review): the readers of mentee_responses_filename / mentees_master_filename are
# not in this part of the notebook — presumably the mentee preprocessing cells.
# mentors_filename="MentorSEAS Mentor Form.csv"
# mentors_filename="MentorSEAS Mentor Form 19 (Freshman) 3.csv"
# mentors_filename="MentorSEAS Mentor Form 19 (Transfer).csv"
mentors_filename="MentorSEAS Mentor Form (Transfer) adjusted.csv"
# mentee_responses_filename="2022-23 MentorSEAS Mentee Form.csv"
# mentee_responses_filename="9_10_22_CleanedMenteeResponses 3.csv"
# mentee_responses_filename="Unlinked9_10_22_MentorSEAS Mentee Form (Freshmen).csv"
mentee_responses_filename="Unlinked9_10_22_MentorSEAS Mentee Form (Transfer).csv"
# mentees_master_filename="New-Freshmen.csv"
# mentees_master_filename="Clean-Freshmen.csv"
# mentees_master_filename="trialplzwork.csv"
# mentees_master_filename="Trial-Zero-Misses (1).csv"
mentees_master_filename="Clean-Transfers.csv"

# Form question texts (used as DataFrame column names) and their allowed answers.
# The strings must match the CSV headers / answer texts exactly, since the
# preprocessing functions compare against them with ==.
categoryChoices = {
    # the five ranking columns, best first; rank j (0-based) is scored 5-j
    'ActivitiesRank': [
        'Select your 5 favorite activities out of 10 listed below! [1st]',
        'Select your 5 favorite activities out of 10 listed below! [2nd]',
        'Select your 5 favorite activities out of 10 listed below! [3rd]',
        'Select your 5 favorite activities out of 10 listed below! [4th]',
        'Select your 5 favorite activities out of 10 listed below! [5th]'],
    # each activity becomes its own score column
    # NOTE(review): the question text says 10 activities but only 9 are listed here — confirm none is missing
    'Activities': [
        "Art/Theater",
        "Hiking/Outdoors",
        "Community Service",
        "Gym",
        "Greek Life",
        "Sports",
        "Video Games",
        "Watching TV/Movies",
        "Music" ],
    'FridaysQuestion': 'Which of the following best describes an ideal Friday night for you?',
    # order matters: an answer is encoded as its position in this list + 1
    'FridaysResponse': [
        'Catch up on sleep/finish up homework',
        'Get ahead in class/Finish up homework',
        'Hang out with your best friend',
        'Hang out in a big group of friends',
        'Party' ],
    'MentorTypeQuestion': 'What kind of mentor would you be?',
    'MenteeTypeQuestion': 'What kind of mentor would you prefer?',
    # encoded as 0 (first entry) and 1 (second entry)
    'MentorTypeResponse': [
        'One that primarily provides emotional support',
        'One that primarily provides academic support'],
    'MajorsQuestion': 'What is your major?',
    # every major listed here needs an entry in majorToNums
    'Majors': [
        'Aerospace Engineering',
        'Bioengineering',
        'Chemical Engineering',
        'Civil & Environmental Engineering',
        'Computer Engineering',
        'Computer Science',
        'Computer Science and Engineering',
        'Electrical Engineering',
        'Material Science and Engineering',
        'Mechanical Engineering',
        'Undeclared Engineering' ],
    'TransferQuestion': 'Did you enter UCLA as a transfer student?',
    'MenteeTransferQuestion': 'Are you entering UCLA as a transfer student?',
    # preprocess_mentors encodes 'Yes' as 0 and 'No' as 1
    'TransferResponse': ['Yes', 'No'],
    'MentorshipQuestion': 'Could you provide mentorship in any of the following categories?',
    'MenteeshipQuestion': 'Would you like a mentor in any of the following categories?',
    # each category becomes its own 0/1 column
    'Mentorship': [
        "Changing Majors",
        "First Generation",
        "International",
        "LGBTQ",
        "Out of State",
        "Racial Minority in Engineering",
        "Transfer Student",
        "Women in Engineering"],
    'MenteeTimeZoneQuestion': 'What time zone will you be living in this year?',
    # only referenced by commented-out code in the visible part of this notebook
    'TimeZones' : [
        'GMT +0:00 (Greenwich Mean Time)',
        'GMT +1:00 (European Central Time)',
        'GMT +2:00 (Eastern European Time)',
        'GMT +3:00 (Middle East Time)',
        'GMT +4:00 (Near East Time)',
        'GMT +5:00 (Pakistan Lahore Time)',
        'GMT +5:30 (India Standard Time)',
        'GMT +6:00 (Bangladesh Standard Time)',
        'GMT +7:00 (Vietnam Standard Time)',
        'GMT +8:00 (China Taiwan Time)',
        'GMT +9:00 (Japan Standard Time)',
        'GMT +10:00 (Australia Eastern Time)',
        'GMT +12:00 (New Zealand Standard Time)',
        'GMT -10:00 (Hawaii Standard Time)',
        'GMT -9:00 (Alaska Standard Time)',
        'GMT -8:00 (Pacific Standard Time)',
        'GMT -7:00 (Mountain Standard Time)',
        'GMT -6:00 (Central Standard Time)',
        'GMT -5:00 (Eastern Standard Time)',
        'GMT -4:00 (Puerto Rico and US Virgin Islands Time)',
        'GMT -3:30 (Canada Newfoundland Time)',
        'GMT -3:00 (Argentina Eastern Time)',
        'GMT -1:00 (Central African Time)'
    ]
}


# Major -> major-group number. Majors sharing a number are treated as one group.
# The group numbers must be exactly 1 through n with no gaps, because
# splitByMajor walks range(1, num_major_categories + 1).
majorToNums = {
    'Aerospace Engineering': 2,
    'Bioengineering': 4,
    'Chemical Engineering': 3,
    'Civil & Environmental Engineering': 3,
    'Computer Engineering': 1,
    'Computer Science': 1,
    'Computer Science and Engineering': 1,
    'Electrical Engineering': 1,
    'Material Science and Engineering': 3,
    'Mechanical Engineering': 2,
    'Undeclared Engineering': 4
}

# Derived from majorToNums (instead of a hand-maintained 4) so the two cannot drift apart.
num_major_categories = max(majorToNums.values())
assert set(majorToNums.values()) == set(range(1, num_major_categories + 1)), \
    'majorToNums must use every group number from 1 to n'

# Feature columns used when clustering mentors: one score column per activity,
# the Friday-night answer, one flag column per mentorship category, and the major group.
mentorMatchCols = [
  *categoryChoices['Activities'],
  categoryChoices['FridaysQuestion'],
  *categoryChoices['Mentorship'],
  categoryChoices['MajorsQuestion'],
]

# Feature columns used for mentees: the same layout as the mentor columns.
menteeMatchCols = [
  *categoryChoices['Activities'],
  categoryChoices['FridaysQuestion'],
  *categoryChoices['Mentorship'],
  categoryChoices['MajorsQuestion'],
#   categoryChoices.get('MenteeTimeZoneQuestion')
]

# Abbreviated major code -> full major name as spelled in categoryChoices['Majors'].
# NOTE(review): presumably these are the codes used in the OASA master list — the
# code that reads this map is not in the visible part of the notebook; confirm.
master_major_map={
  'AEROSPCE':'Aerospace Engineering',
  'BIOENGR':'Bioengineering',
  'CHM ENGR':'Chemical Engineering',
  'CIV ENGR':'Civil & Environmental Engineering',
  'COM ENGR':'Computer Engineering',
  'COM SCI':'Computer Science',
  'C S&ENGR':'Computer Science and Engineering',
  'ELE ENGR':'Electrical Engineering',
  'MAT ENGR':'Material Science and Engineering',
  'MECHANIC':'Mechanical Engineering',
  'UN-E&AS':'Undeclared Engineering'
}



In [None]:
def preprocess_mentors(filename):
  '''
  Read the mentor form responses and turn the answers into numbers, so that
  mentors can later be compared by distance.

  Encoding (question texts and answer lists come from categoryChoices):
    - one column per activity: 5 for the 1st choice down to 1 for the 5th,
      0 if the activity was not ranked
    - Friday-night answer: position in 'FridaysResponse' + 1
    - mentor type: 0 for the first 'MentorTypeResponse' entry, 1 for the second
    - major: group number from majorToNums
    - transfer question: 0 for 'Yes', 1 for 'No'
    - one column per mentorship category: 1 if the mentor selected it, else 0

  Answers that are not in the corresponding list are left unchanged. Rows are
  addressed by their own position, so two submissions sharing an email do not
  overwrite each other.

  Parameters
  --------------------
      filename     -- string, path to the mentor response csv

  Returns
  --------------------
      mentorData   -- pandas df, the responses plus the encoded columns
  '''
  mentorData = pd.read_csv(filename)
  n_mentorEntries = mentorData.shape[0]

  def encode(question, answerToNumber):
      # replace known answers by their number; leave anything else (incl. NaN) as is
      mentorData[question] = mentorData[question].map(
          lambda answer: answerToNumber.get(answer, answer))

  # every activity starts at 0 and is overwritten below if the mentor ranked it
  for activity in categoryChoices['Activities']:
      mentorData[activity] = np.zeros(n_mentorEntries)

  rankQuestions = categoryChoices['ActivitiesRank']
  for row_label, rankedActivities in mentorData[rankQuestions].iterrows():
      for j, activity in enumerate(rankedActivities):
          # a skipped rank is NaN; without this check it would create a column named NaN
          if pd.isna(activity):
              continue
          mentorData.loc[row_label, activity] = len(rankQuestions) - j

  encode(categoryChoices['FridaysQuestion'],
         {response: i + 1 for i, response in enumerate(categoryChoices['FridaysResponse'])})
  encode(categoryChoices['MentorTypeQuestion'],
         {response: i for i, response in enumerate(categoryChoices['MentorTypeResponse'][:2])})
  encode(categoryChoices['MajorsQuestion'],
         {major: majorToNums[major] for major in categoryChoices['Majors']})
  encode(categoryChoices['TransferQuestion'],
         {response: i for i, response in enumerate(categoryChoices['TransferResponse'][:2])})

  for mentorship in categoryChoices['Mentorship']:
      mentorData[mentorship] = np.zeros(n_mentorEntries)

  mentorshipQuestion = categoryChoices['MentorshipQuestion']
  mentorData[mentorshipQuestion] = mentorData[mentorshipQuestion].fillna("")

  for row_label, answer in mentorData[mentorshipQuestion].items():
      # Flag exactly the categories this mentor selected. (Flagging by position
      # instead would mark the first k categories for anyone who picked k options,
      # and 'Changing Majors' for mentors who picked none.)
      # Answers are ';'-separated; ',' is accepted too since no category contains one.
      selected = {choice.strip() for choice in answer.replace(',', ';').split(';')}
      for mentorship in categoryChoices['Mentorship']:
          if mentorship in selected:
              mentorData.loc[row_label, mentorship] = 1

  return mentorData

In [None]:
# show every column when a frame is displayed (the mentor form is wide)
pd.set_option('display.max_columns', None)
# raw, un-encoded mentor responses, kept for inspection
originalMentorData = pd.read_csv(mentors_filename)

In [None]:
# Preview only the first rows: the full table holds every mentor's name and email.
display(originalMentorData.head())

In [None]:
# (rows, columns) of the raw mentor responses
originalMentorData.shape

In [None]:
# Encodes the mentor answers as numbers: activity rankings become per-activity
# score columns, mentorship categories become flag columns, and the categorical
# answers are replaced by integers.
mentorData=preprocess_mentors(mentors_filename)

In [None]:
# Preview only the first rows of the encoded mentor data.
mentorData.head()

In [None]:
# wider than the raw data: preprocessing added the activity and mentorship columns
mentorData.shape #adds tons of new columns lol

In [None]:
def constrained_cluster_mentors(mentorData, mentorMatchCols, cluster_size, familyToMentors, random_state=None):
  """Group mentors into families of cluster_size using repeated KMeans.

  Each round runs KMeans with num_mentors // cluster_size clusters, then:
    1. clusters whose size is a multiple of cluster_size become families
       (larger multiples are chopped into consecutive groups of cluster_size);
    2. from every other cluster that is larger than cluster_size, the first
       cluster_size mentors are taken out as one family;
    3. everyone not placed yet goes into the next round.
  When fewer than 2 * cluster_size mentors remain they are split in order
  into groups of at most cluster_size, so the last family can be smaller.

  Args:
    mentorData: frame with 'Preferred Email Address' and the mentorMatchCols.
    mentorMatchCols: numeric feature columns KMeans is run on.
    cluster_size: target family size.
    familyToMentors: dict family number -> list of emails; updated in place.
      New families are numbered from len(familyToMentors), so existing keys
      are expected to be 0..len-1.
    random_state: forwarded to KMeans; pass an int for reproducible families.
      The default (None) keeps KMeans' own random initialisation.

  Returns:
    None. The result is the updated familyToMentors.
  """
  emails = mentorData['Preferred Email Address'].tolist()
  num_mentors = len(emails)
  start_family_num = len(familyToMentors)

  # base case: too few mentors left to cluster, split them in order
  if num_mentors < 2 * cluster_size:
    for position, email in enumerate(emails):
      family_num = position // cluster_size + start_family_num
      familyToMentors.setdefault(family_num, []).append(email)
    return

  # KMeans only sees the columns we match on (interests and such)
  features = mentorData[mentorMatchCols].copy()
  kmeans = KMeans(n_clusters=num_mentors // cluster_size, n_init=500, random_state=random_state)
  labels = kmeans.fit(features).predict(features)

  # cluster label -> number of mentors in that cluster
  label_count = {}
  for label in labels:
    label_count[label] = label_count.get(label, 0) + 1

  label_to_family_num = {}
  # row positions (not emails) of mentors placed this round, so that two rows
  # sharing an email are still removed individually
  assigned_positions = []

  # 1. perfect clusters: size is a multiple of cluster_size
  for position, label in enumerate(labels):
    if label_count[label] % cluster_size != 0:
      continue
    if label not in label_to_family_num:
      label_to_family_num[label] = len(familyToMentors)
      familyToMentors[label_to_family_num[label]] = []
    familyToMentors[label_to_family_num[label]].append(emails[position])
    assigned_positions.append(position)

  # chop up families which are bigger multiples of cluster size
  # (the range is fixed before new families are appended, so only this round's
  # perfect clusters are visited)
  for family_num in range(start_family_num, len(familyToMentors)):
    family = familyToMentors[family_num]
    if len(family) > cluster_size:
      for chunk in range(1, len(family) // cluster_size):
        familyToMentors[len(familyToMentors)] = family[chunk * cluster_size:(chunk + 1) * cluster_size]
      familyToMentors[family_num] = family[:cluster_size]

  # 2. scrape one family from each imperfect cluster that is big enough
  for position, label in enumerate(labels):
    size = label_count[label]
    if size % cluster_size == 0 or size <= cluster_size:
      continue
    family_num = label_to_family_num.get(label)
    if family_num is None:
      family_num = len(familyToMentors)
      label_to_family_num[label] = family_num
      familyToMentors[family_num] = []
    if len(familyToMentors[family_num]) < cluster_size:
      familyToMentors[family_num].append(emails[position])
      assigned_positions.append(position)

  # 3. recurse on the mentors who have not been assigned a family yet
  assigned = set(assigned_positions)
  remaining = mentorData.iloc[[p for p in range(num_mentors) if p not in assigned]]
  constrained_cluster_mentors(remaining, mentorMatchCols, cluster_size, familyToMentors, random_state)

In [None]:
import re
# function to split, alphabetize, and rejoin dash-separated groupname
def standardize_family_string(str):
  """Normalise a family string so every member's submission yields the same key.

  Lower-cases the text, removes everything except letters and dashes (spaces
  included), sorts the dash-separated names alphabetically and joins them
  with '-'. Empty segments produced by leading, trailing or doubled dashes
  are dropped so they are not counted as family members.

  The parameter is named `str` for compatibility with existing callers; it
  shadows the builtin only inside this function.
  """
  # raw string: '\-' in a normal string literal is an invalid escape sequence
  cleaned = re.sub(r'[^a-zA-Z\-]', '', str.lower().strip())
  return '-'.join(sorted(part for part in cleaned.split('-') if part))

In [None]:
import csv
def get_inconsistencies(write_to_file=False, preferred_email=False):
  """Find mentors whose family string disagrees with who actually submitted it.

  Reads mentors_filename and groups the submitters by their standardised
  family string. A family is inconsistent when the number of names in the
  string differs from the number of mentors who submitted that string.

  Columns are read by position: name = column 2, family string = column 21,
  email = column 3 if preferred_email else column 1.
  NOTE(review): these positions assume a fixed form export layout — confirm
  them against the CSV whenever the form changes.

  Args:
    write_to_file: also write 'inconsistent_families.csv' (family string plus
      member names) and 'consistent_families.csv' (member names).
    preferred_email: selects which email column is reported (see above).

  Returns:
    (inconsistent_emails, consistent_emails), two lists of emails.
  """
  # this path name should be CHANGED depending on where the csv is stored #
  data = pd.read_csv(mentors_filename, quotechar='"', skipinitialspace=True)
  email_column_no = 3 if preferred_email else 1
  family_column_no = 21
  name_column_no = 2

  emails = data.iloc[:, email_column_no].tolist()
  names = data.iloc[:, name_column_no].tolist()
  families = data.iloc[:, family_column_no].tolist()

  # email to name mapping (a later row wins if an email appears twice)
  email_to_name = dict(zip(emails, names))

  # standardised groupname -> emails of everyone who submitted it
  group_names = {}
  for email, family in zip(emails, families):
    # mentors who left the family question empty belong to no group
    if pd.notna(family):
      group_names.setdefault(standardize_family_string(family), []).append(email)

  inconsistent_families = []
  inconsistent_emails = []
  consistent_emails = []
  for family, members in group_names.items():
    if len(family.split('-')) != len(members):
      inconsistent_families.append(family)
      inconsistent_emails.extend(members)
    else:
      consistent_emails.extend(members)

  if write_to_file:
    inconsistent = set(inconsistent_families)
    # newline='' is what the csv module expects; without it rows are
    # separated by blank lines on Windows
    with open('inconsistent_families.csv', 'w', newline='') as csvfile:
      csvwriter = csv.writer(csvfile)
      csvwriter.writerow(['Submitted Family String', 'Member 1', 'Member 2', 'Member 3'])
      for family in inconsistent_families:
        csvwriter.writerow([family] + [email_to_name[email] for email in group_names[family]])

    with open('consistent_families.csv', 'w', newline='') as csvfile:
      csvwriter = csv.writer(csvfile)
      csvwriter.writerow(['Member 1', 'Member 2', 'Member 3'])
      for family, members in group_names.items():
        if family not in inconsistent:
          csvwriter.writerow([email_to_name[email] for email in members])

  return inconsistent_emails, consistent_emails

In [None]:
def splitByMajor(data):
    """
    Split df by major group.

    Parameters
    --------------------
        data           -- pandas df, mentor/mentee data whose major column
                          already holds the numeric major group

    Returns
    --------------------
        majorMap       -- dict, major_group_number->corresponding_df
                          (one entry per group 1..num_major_categories,
                          each with a fresh 0-based index)
    """
    majorColumn = data[categoryChoices['MajorsQuestion']]
    return {
        group: data.loc[majorColumn == group].reset_index(drop=True)
        for group in range(1, num_major_categories + 1)
    }

In [None]:
def cluster_mentors(mentorData):
  """Build mentor families from requested groups plus KMeans clustering.

  - Mentors flagged by get_inconsistencies are treated as if they had
    requested no family.
  - A consistent 3-person family string is kept as a family as is.
  - For a consistent 2-person family string, the second mentor's matching
    columns are overwritten with the first mentor's, so the two sit at the
    same point when KMeans runs.
  - Everyone except the 3-person families is clustered into families of 3 by
    constrained_cluster_mentors.

  Parameters
  --------------------
      mentorData       -- pandas df, output of preprocess_mentors

  Returns
  --------------------
      familyToMentors  -- dict, family number -> list of mentor emails
      mentor_tables    -- dict, major group number -> df (see splitByMajor)
  """
  mentorFamilyQuestion = 'Follow the instructions below to join a family'

  # work on a copy; mentors without a family string get ""
  mentorKM = mentorData.copy()
  mentorKM[mentorFamilyQuestion] = mentorKM[mentorFamilyQuestion].fillna("")

  # Emails of mentors in inconsistent families, to be ignored below. The loop
  # below compares against 'Preferred Email Address', while
  # get_inconsistencies reports column 1 or column 3 depending on
  # preferred_email; collect both so the filter works whichever of the two
  # holds the preferred address.
  inconsistent_emails = set()
  for use_preferred in (False, True):
    flagged, _ = get_inconsistencies(write_to_file=False, preferred_email=use_preferred)
    inconsistent_emails.update(flagged)

  # family string -> members' emails, for 3-person and 2-person families
  preferredFamilyToMentors = dict()
  twoMentorFams = dict()
  # row labels of 3-person-family mentors, removed before KMeans
  removeMentors = []

  rows = zip(mentorKM.index, mentorKM['Preferred Email Address'], mentorKM[mentorFamilyQuestion])
  for row_label, email, rawFamilyString in rows:
      if email in inconsistent_emails:
        continue
      familyString = standardize_family_string(rawFamilyString)
      if familyString == "":
          continue
      familySize = len(familyString.split('-'))
      if familySize == 3:
          preferredFamilyToMentors.setdefault(familyString, []).append(email)
          removeMentors.append(row_label)
      elif familySize == 2:
          twoMentorFams.setdefault(familyString, []).append(email)

  # give both members of a 2-person family identical matching data
  for members in twoMentorFams.values():
      # only one of the two submitted: nothing to pair up
      if len(members) < 2:
          continue
      mentor1, mentor2 = members[0], members[1]
      mentorKM.loc[mentorKM['Preferred Email Address']==mentor2, mentorMatchCols] = mentorKM.loc[mentorKM['Preferred Email Address']==mentor1, mentorMatchCols].values

  # delete all the 3-family mentors so KMeans is only applied for 2-fam or 0-fam mentors
  mentorKM = mentorKM.drop(removeMentors)

  familyToMentors = dict()
  constrained_cluster_mentors(mentorKM, mentorMatchCols, 3, familyToMentors)

  # the requested 3-person families are numbered after the clustered ones
  for members in preferredFamilyToMentors.values():
      familyToMentors[len(familyToMentors)] = members

  return familyToMentors, splitByMajor(mentorData)

In [None]:
# familyToMentors is a dict: family number -> list of emails of the mentors in that family
# mentor_tables is a dict: major group number -> DataFrame of the mentors in that group (from splitByMajor)
familyToMentors, mentor_tables = cluster_mentors(mentorData)

In [None]:
# number of mentors in each major group (1 is CS/CE/CSE/EE, 2 is aero/mech, 3 is chem/civ/mate, 4 is bioe + undeclared)
num_mentors_in_category = {
  category: table.shape[0] for category, table in mentor_tables.items()
}

In [None]:
# major group number -> mentor count
num_mentors_in_category

In [None]:
"""
Preprocess mentee data.
Read into dataframe.
One-hot encode activity preferences.
Convert other categorical reponses to numerical.

Parameters
--------------------
    filename          -- string, path to mentee response csv

Returns
--------------------
    mentee_tables     -- dict, major_category_# -> corresponding split df
"""
def preprocess_mentee_responses(filename):
  originalMenteeData = pd.read_csv(filename)
  menteeData = originalMenteeData.copy()
  # print(menteeData)
  n_menteeEntries = menteeData.shape[0]

# seems like: adds in zeroes for one hot encoding
  menteeEmails = "Preferred Email Address"
  zeroArray = np.zeros(n_menteeEntries)
  for i in range (0, len(categoryChoices['Activities'])):
     menteeData[(categoryChoices.get('Activities'))[i]] = zeroArray
  # print(menteeData)

  for i in range (0, n_menteeEntries):
     for j in range (0, len(categoryChoices['ActivitiesRank'])):
         activity = menteeData.iloc[i][(categoryChoices.get('ActivitiesRank'))[j]]
         email = menteeData.iloc[i]['Preferred Email Address']
         menteeData.loc[menteeData['Preferred Email Address']==email, activity]=5-j
  # print("now it's like this: ")
  # print(menteeData)

  for i in range (1, len(categoryChoices['FridaysResponse']) + 1):
     menteeData.loc[menteeData[categoryChoices['FridaysQuestion']].astype('str')==categoryChoices.get('FridaysResponse')[i-1], categoryChoices['FridaysQuestion']] = i

  menteeData.loc[menteeData[categoryChoices['MenteeTypeQuestion']].astype('str') ==categoryChoices.get('MentorTypeResponse')[0], categoryChoices['MenteeTypeQuestion']] = 0
  menteeData.loc[menteeData[categoryChoices['MenteeTypeQuestion']].astype('str') ==categoryChoices.get('MentorTypeResponse')[1], categoryChoices['MenteeTypeQuestion']] = 1

  for i in range (1, len(categoryChoices['Majors']) + 1):
      menteeData.loc[menteeData[categoryChoices['MajorsQuestion']].astype('str')==categoryChoices.get('Majors')[i-1], categoryChoices['MajorsQuestion']] = majorToNums[categoryChoices.get('Majors')[i-1]]

#   for i in range(0, len(categoryChoices['TimeZones'])):
#       if abs(i-15)<=abs(len(categoryChoices['TimeZones'])+i-15):
#           timeZoneAsNum = i-15
#       else:
#           timeZoneAsNum = len(categoryChoices['TimeZones'])+i-15
#       menteeData.loc[menteeData[categoryChoices['MenteeTimeZoneQuestion']].astype('str')==categoryChoices.get('TimeZones')[i], categoryChoices['MenteeTimeZoneQuestion']] = timeZoneAsNum

  menteeData.loc[menteeData[categoryChoices['MenteeTransferQuestion']].astype('str')==categoryChoices.get('TransferResponse')[0], categoryChoices['TransferQuestion']] = 0
  menteeData.loc[menteeData[categoryChoices['MenteeTransferQuestion']].astype('str')==categoryChoices.get('TransferResponse')[1], categoryChoices['TransferQuestion']] = 1

  for i in range (0, len(categoryChoices['Mentorship'])):
     menteeData[categoryChoices['Mentorship'][i]] = zeroArray

  for i in range (0, n_menteeEntries):
     for j in range (0, len(categoryChoices['Mentorship'])):
         menteeships = menteeData.iloc[i][categoryChoices['MenteeshipQuestion']]
         email = menteeData.iloc[i]['Preferred Email Address']
         menteeData.loc[(menteeData['Preferred Email Address']==email) & (menteeships.find(categoryChoices['Mentorship'][j])) != -1, categoryChoices['Mentorship'][j]] = 1
  # print("Turned into this:")
  # print(menteeData)

  return splitByMajor(menteeData) #returns a dictionary of the four major groups, like 1: 2: 3: 4:

In [None]:
# mentee_tables holds the encoded mentee form responses split up by major group
# (the value returned by splitByMajor inside preprocess_mentee_responses).
mentee_tables=preprocess_mentee_responses(mentee_responses_filename)

In [None]:
"""
Preprocess mentee master data.
Read into dataframe.
Encode majors.

Parameters
--------------------
    filename          -- string, path to mentee master csv

Returns
--------------------
    mentee_master_tables     -- dict, major_category_# -> list of emails
"""
def preprocess_mentee_masterlist(filename):
  menteeData = pd.read_csv(filename)
  n_menteeEntries = menteeData.shape[0]

  for major in master_major_map:
    menteeData.loc[menteeData['Major'].astype('str')==major, 'Major'] = majorToNums[master_major_map[major]]

  mentee_master_tables=dict()
  for i in range(1,num_major_categories+1):
    mentee_master_tables[i]=menteeData.loc[menteeData['Major']==i,'Email'].to_list()
#   print("here's the mentees but with the majors as numbers")
#   print(mentee_master_tables)
  return mentee_master_tables #returns a dict with 1: emails of major 1, 2: emails of mentees with major 2,etc. for 3 and 4

In [None]:
# Emails of every student on the master list, grouped by major;
# dict: major group number -> list of emails
mentee_master_tables=preprocess_mentee_masterlist(mentees_master_filename)

In [None]:
# Print how many master-list mentees fall in each major category.
for category in mentee_master_tables.keys():
  print(len(mentee_master_tables[category]))
# NOTE(review): hard-coded sum (217), presumably the per-category counts from one
# particular year's list kept as a manual sanity check -- confirm or replace with
# a total computed from mentee_master_tables.
print(113+56+36+12)

In [None]:
! pip install numpy==1.23

In [None]:
! pip install k-means-constrained>=0.7.2

In [None]:
! pip install ortools==9.3.10497
from ortools.graph.pywrapgraph import SimpleMinCostFlow

In [None]:
"""
Cluster mentees into groups to establish representative mentees.
Parameters
--------------------
    menteeData             -- pandas df, parsed mentee data
    num_clusters           -- int, number of clusters (should equal number of mentors)
Returns
--------------------
    familyToMentees       -- dict, representative_mentee# -> [mentee emails]
    menteeClusterData     -- dict, representative_mentee# -> [arrays of numerical mentee data] (these correspond to the data used to cluster)
"""
def cluster_mentees(menteeData, num_clusters):
  # menteeKM is a copy of menteeData
  menteeKM = menteeData.copy()

  # menteeNewData is what KMeans will be applied to
  menteeNewData = menteeKM[menteeMatchCols]

  menteeKmeans = KMeans(n_clusters=num_clusters, n_init=500)
  menteeKmeans = menteeKmeans.fit(menteeNewData)
  menteeLabels = menteeKmeans.predict(menteeNewData)

  familyToMentees = dict()
  menteeClusterData = dict()

  for i in range(0, len(menteeLabels)):
      if menteeLabels[i] in familyToMentees:
          familyToMentees[menteeLabels[i]].append(menteeKM.iloc[i]['Preferred Email Address'])
          menteeClusterData[menteeLabels[i]].append(menteeKM.iloc[i][menteeMatchCols])
      else:
          newList = [menteeKM.iloc[i]['Preferred Email Address']]
          newData = [menteeKM.iloc[i][menteeMatchCols]]
          familyToMentees.update( { menteeLabels[i] : newList })
          menteeClusterData.update( { menteeLabels[i] : newData})

  return familyToMentees, menteeClusterData

In [None]:
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import numpy as np
from k_means_constrained import KMeansConstrained
"""
Cluster mentees into groups to establish representative mentees.

Parameters
--------------------
    menteeData             -- pandas df, parsed mentee data
    num_clusters           -- int, number of clusters (should equal number of mentors)

Returns
--------------------
    familyToMentees       -- dict, representative_mentee# -> [mentee emails]
    menteeClusterData     -- dict, representative_mentee# -> [arrays of numerical mentee data] (these correspond to the data used to cluster)
"""
def cluster_mentees(menteeData, num_clusters):
  #adding this to try to get equal size clusters
  cluster_size = int(np.ceil(len(menteeData) / num_clusters))
  print(cluster_size)
  # menteeKM is a copy of menteeData
  menteeKM = menteeData.copy()

  # menteeNewData is what KMeans will be applied to
  menteeNewData = menteeKM[menteeMatchCols]

  menteeKmeans = KMeansConstrained(size_min=cluster_size - 1, size_max=cluster_size, n_clusters=num_clusters, n_init=500)


  # menteeKmeans = KMeans(n_clusters=num_clusters, n_init=500)
  menteeKmeans = menteeKmeans.fit(menteeNewData)

  # centers = menteeKmeans.cluster_centers_
  # print(centers)
  # centers = centers.reshape(-1, 1, menteeNewData.shape[-1]).repeat(cluster_size, 1).reshape(-1, menteeNewData.shape[-1])
  # distance_matrix = cdist(menteeNewData, centers)
  # clusters = linear_sum_assignment(distance_matrix)[1]//cluster_size
  menteeLabels = menteeKmeans.predict(menteeNewData)


  familyToMentees = dict()
  menteeClusterData = dict()

  for i in range(0, len(menteeLabels)):
      if menteeLabels[i] in familyToMentees:
          familyToMentees[menteeLabels[i]].append(menteeKM.iloc[i]['Preferred Email Address'])
          menteeClusterData[menteeLabels[i]].append(menteeKM.iloc[i][menteeMatchCols])
      else:
          newList = [menteeKM.iloc[i]['Preferred Email Address']]
          newData = [menteeKM.iloc[i][menteeMatchCols]]
          familyToMentees.update( { menteeLabels[i] : newList })
          menteeClusterData.update( { menteeLabels[i] : newData})
  # print("here is familytomentees")
  # print(familyToMentees)
  # print('here is menteeClusterData')
  # print(menteeClusterData)

  return familyToMentees, menteeClusterData

In [None]:
import numpy as np

"""
Return a stable matching as obtained from Gale Shapley

Parameters
--------------------
    n                       -- int, number of mentors/mentees (must be equal)
    mentor_preferences      -- ndarray of shape (n,n), mentor_preferences[i] is the all the mentee indices in order of preference of mentor i
    mentee_preferences      -- ndarray of shape (n,n), mentee_preferences[i] is the all the mentor indices in order of preference of mentee i

Returns
--------------------
    mentor_matches          -- array of length n, mentor_matches[i] corresponds to index of the mentor matched to mentee i
"""

def stable_match(n, mentor_preferences, mentee_preferences):

    # mentor_matches stores mentee/mentor pairs
    mentor_matches = [-1 for i in range(n)]

    # last_match[i] stores the index of the last checked mentee for mentor[i]
    last_match = [-1 for i in range(n)]

    # stores availability of mentor
    mentorFree = [True for i in range(n)]

    freeCount = n
    m=0 # current_mentor

    # while there are free mentors
    while (freeCount > 0):

        if(mentorFree[m]):

          # Go through mentees according to m's preferences
          while last_match[m] < n-1:
              last_match[m]+=1
              mentee = mentor_preferences[m][last_match[m]]

              # the preferred mentee is free, so mentee and mentor become matched
              if (mentor_matches[mentee] == -1):
                  mentor_matches[mentee] = m
                  mentorFree[m] = False
                  freeCount -= 1
                  # print(str(m) + "is engaged to " + str(mentee))
                  # print("so mentor #" + str(m) + " is no longer free")
                  # print(mentorFree)
                  break

              else:
                  # preferred mentee is not free, check current match
                  existing_m = mentor_matches[mentee]

                  # if new match is better, create it
                  if (mentee_preferences[mentee].tolist().index(m) < mentee_preferences[mentee].tolist().index(existing_m)):
                      mentor_matches[mentee] = m
                      mentorFree[m] = False
                      mentorFree[existing_m] = True
                      # print(str(mentee) + " broke off their engagement to " + str(existing_m) + " so " + str(m) + " is now engaged to " + str(mentee))
                      # print("so mentor #" + str(m) + "is no longer free, but mentor #" + str(existing_m) + " is free now")
                      # print(mentorFree)
                      break

                  # otherwise, do nothing
                  pass

        # else:
          # print(str(m) + " is not free, so we'll go to the next mentor")
        # cycle through the mentors
        m=(m+1)%n

    return mentor_matches

In [None]:
# Build mentor email -> list of assigned mentee emails, one major category at a time.
# Within a category: mentees who answered the form are clustered and matched to the
# first num_match_mentors mentors via stable matching; mentees who are only on the
# master list are dealt round-robin to the remaining mentors.
mentorEmailToMenteesEmails = defaultdict(list)
# noMisses = pd.read_csv(io.BytesIO(uploaded[mentees_master_filename]))
# display(noMisses)
# goodOnes = set()
for category in mentor_tables.keys():
  # print(category)
  # print(mentee_master_tables)
  # print(mentee_master_tables[category])
  master_mentee_emails=set(mentee_master_tables[category])
  # filter out unmatchable responses (form rows whose email is not on the master list);
  # note this overwrites mentee_tables[category] with the filtered frame
  mentee_tables[category]=mentee_tables[category][mentee_tables[category]['Preferred Email Address'].isin(master_mentee_emails)]
  matched_mentee_emails=set(mentee_tables[category]['Preferred Email Address'].to_list())
  # from here on master_mentee_emails holds only the master-list mentees with no form response
  master_mentee_emails=master_mentee_emails.difference(matched_mentee_emails)

  # goodOnes |= matched_mentee_emails
  # print("this is who was not matched: ")
  # display(master_mentee_emails)
  print("this is number of people not matched in this category" + str(len(master_mentee_emails)))
  print("number of people matched: " + str(len(matched_mentee_emails)))

  # p is the fraction of this category's master-list mentees who have a form response;
  # the same fraction of the category's mentors (rounded down) is used for matching
  # print(matched_mentee_emails)
  # print(master_mentee_emails)
  # NOTE(review): p divides by zero if the category has no master-list mentees, and
  # num_match_mentors can round down to 0, which cluster_mentees cannot handle -- confirm
  # neither case can occur with real data.
  p=len(matched_mentee_emails)/(len(matched_mentee_emails)+len(master_mentee_emails))
  num_match_mentors=int(p * num_mentors_in_category[category])
  print(str(num_match_mentors) + " mentors chosen ")
  print("out of ")
  print(num_mentors_in_category[category])
  # display(mentee_tables[category])
  familyToMentees, menteeClusterData=cluster_mentees(mentee_tables[category], num_match_mentors)
  print("result of clustering mentees")
  # sanity check: count how many mentees ended up in some cluster
  tot = 0
  for item in familyToMentees:
    tot += len(familyToMentees[item])
  print("total found with familyToMentees: " + str(tot))


  # -------------------------------------------
  # -Assign representative mentees to mentors-
  # -------------------------------------------

  # mentorNums keeps track of only the numerical values for each mentor in each column;
  # only the first num_match_mentors rows of the category's mentor table take part
  mentorNums = mentor_tables[category][mentorMatchCols].to_numpy()
  mentorNums = mentorNums[:num_match_mentors]

  # representativeMentees holds one row per mentee cluster: the mean of the cluster
  # members' values over mentorMatchCols (so mentee rows must carry those column names)
  representativeMentees = []

  # cluster labels are assumed to be exactly 0..len(menteeClusterData)-1
  for key in range (0, len(menteeClusterData)):
      representative = np.zeros(len(mentorMatchCols))
      for mentee in menteeClusterData[key]:
          representative = representative + mentee[mentorMatchCols]
      representative /= len(menteeClusterData[key])
      representativeMentees.append(representative)
  representativeMentees = np.array(representativeMentees)
  print("len of representative mentees: ")
  print(len(representativeMentees))

  # Calculate pairwise distance for every pair of mentor and representative mentee

  # NOTE(review): mentorToPref and menteeToPref are never used below
  mentorToPref = dict()
  menteeToPref = dict()

  mentorPrefs = euclidean_distances(mentorNums, representativeMentees)
  menteePrefs = euclidean_distances(representativeMentees, mentorNums)

  # Create the mentor preferences matrix by sorting each row of the pairwise distance matrix
  # Create the mentee preferences matrix in a similar fashion
  # (each distance row is overwritten in place by the column indices ordered nearest-first)

  for i,mentor in enumerate(mentorPrefs):
    mentorPrefs[i]=sorted(range(mentorPrefs.shape[1]), key=lambda x: mentor[x])
  for i,mentee in enumerate(menteePrefs):
    menteePrefs[i]=sorted(range(menteePrefs.shape[1]), key=lambda x: mentee[x])

  # the rows now hold indices stored as floats; cast them to int for stable_match
  mentorPrefs=np.array(mentorPrefs).astype(int)
  menteePrefs=np.array(menteePrefs).astype(int)
  print("mentorPrefs then menteePrefs")
  print(mentorPrefs.shape)
  print(menteePrefs.shape)
  # print(mentorPrefs)
  # print(menteePrefs)

  # stableMatchResult[i] is the mentor row index matched to mentee cluster i
  stableMatchResult = stable_match(num_match_mentors, mentorPrefs, menteePrefs)
  print("this is length of stable match result" + str(len(stableMatchResult)))
  print(stableMatchResult)

  # give every mentee in cluster i to the mentor that cluster was matched with
  for i in range(0, len(stableMatchResult)):
      menteeEmails = familyToMentees[i]
      mentor = mentor_tables[category].iloc[stableMatchResult[i]]['Preferred Email Address']
      # print(mentor)
      mentorEmailToMenteesEmails[mentor].extend(menteeEmails)
  print("this is mentorEmailToMenteesEmails right after adding stuff")
  # debugging tally of mentees assigned so far; the values are not used afterwards
  total = 0
  i = 0
  for j in mentorEmailToMenteesEmails:
    total += len(mentorEmailToMenteesEmails[j])
    # print(str(i) + " : " + str(total))
    i+=1

  # Mentees without a form response are dealt round-robin to the mentors after the
  # first num_match_mentors rows of the category's mentor table.
  # NOTE(review): master_mentee_emails is a set, so which mentee lands with which
  # mentor can differ between runs -- confirm that is acceptable.
  mentor_index=0
  for i, mentee in enumerate(master_mentee_emails):
    # print(str(i) + "person being added at the end")
    mentor = mentor_tables[category].iloc[num_match_mentors+mentor_index]['Preferred Email Address']
    mentorEmailToMenteesEmails[mentor].append(mentee)
    mentor_index=(mentor_index+1)%(num_mentors_in_category[category]-num_match_mentors)

  print("updated length of mentorEmailToMenteesEmails: " + str(len(mentorEmailToMenteesEmails)))
  # total = 0
  # i = 0
  # for j in mentorEmailToMenteesEmails:
  #   total += len(mentorEmailToMenteesEmails[j])
  #   print(str(i) + " : " + str(total))
  #   i+=1

# noMisses = noMisses[noMisses['Email'].isin(goodOnes)]
# display(noMisses)
# noMisses.to_csv('Trial-Zero-Misses.csv')
# files.download('Trial-Zero-Misses.csv')

In [None]:
familyToMentees

In [None]:
"""
Process mentor data to get relevant contact details

Parameters
--------------------
    filename          -- string, path to mentor csv

Returns
--------------------
    mentor_contact     -- dict, ucla email -> (name,preferred email,phone)
"""
def get_mentor_contact(filename):
  mentor_data = pd.read_csv(filename)
  contact_columns=["Preferred Email Address", "Full Legal Name (First and Last)", "Preferred Email Address", "Phone Number (##########)"]
  mentor_data=mentor_data[contact_columns]

  mentor_contact=dict()

  for i in range(len(mentor_data)):
    mentor=mentor_data.iloc[i]
    if np.isnan(mentor[3]):
      contact_info=(mentor[1], mentor[2], "")
    else:
      contact_info=(mentor[1], mentor[2], int(mentor[3]))
    mentor_contact[mentor[0]]=contact_info

  return mentor_contact

In [None]:
"""
Process mentee data to get relevant contact details

Parameters
--------------------
    filename          -- string, path to mentee csv

Returns
--------------------
    mentee_contact     -- dict, email -> (name)
"""
def get_mentee_contact(filename):
  mentee_data = pd.read_csv(filename)
  contact_columns=["Full Name", "Email"]
  mentee_data=mentee_data[contact_columns]

  mentee_contact=dict()

  for i in range(len(mentee_data)):
    mentee=mentee_data.iloc[i]
    contact_info=(mentee[0])
    mentee_contact[mentee[1]]=contact_info

  return mentee_contact

In [None]:
"""
Export data to csvs.

Parameters
--------------------
    familyToMentors              -- dict, family# -> [mentor emails]
    mentorEmailToMenteesEmails   -- dict, mentor_email -> [mentee_emails]

Returns
--------------------
    42
"""
def generate_csvs(familyToMentors, mentorEmailToMenteesEmails, mentors_filename, mentees_filename):

  mentor_contact=get_mentor_contact(mentors_filename)
  mentee_contact=get_mentee_contact(mentees_filename)


  mentor_email_list=[]
  with open('mentor_families.csv', 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    headers=['Family Number', 'Mentor 1', 'Mentor 2', 'Mentor 3']
    csvwriter.writerow(headers)
    for family in sorted(familyToMentors.keys()):
      # print("here's the data about the families")
      # print(familyToMentors[family])
      # print("here's what's being put in for names")
      # for email in familyToMentors[family]:
            # print(mentor_contact[email])
      names = [mentor_contact[email][0] for email in familyToMentors[family]]
      preferred_emails=[mentor_contact[email][1] for email in familyToMentors[family]]
      family_data=[family] +names
      csvwriter.writerow(family_data)
      mentor_email_list+=preferred_emails

  with open('mentee_to_mentor.csv', 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    headers=['Mentee', 'Family Number', 'Mentor']
    csvwriter.writerow(headers)
    for family in sorted(familyToMentors.keys()):
        for mentor in familyToMentors[family]:
          # print("here is mentees assigned to " + str(mentor) + " : " + str(mentorEmailToMenteesEmails[mentor]))
          for mentee in mentorEmailToMenteesEmails[mentor]:
            mentee_data=[mentee_contact[mentee],family, mentor_contact[mentor][0]]
            csvwriter.writerow(mentee_data)

  with open('mentor_to_mentees.csv', 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    headers=['Mentor', 'Family Number', 'Mentees']
    csvwriter.writerow(headers)
    for family in sorted(familyToMentors.keys()):
        for mentor in familyToMentors[family]:
          mentees=[mentee for mentee in mentorEmailToMenteesEmails[mentor]]
          mentor_data=[mentor_contact[mentor][0],family, mentees]
          csvwriter.writerow(mentor_data)

  return 42

In [None]:
# Write mentor_families.csv, mentee_to_mentor.csv and mentor_to_mentees.csv.
# The call returns 42, which the notebook shows as this cell's output.
generate_csvs(familyToMentors, mentorEmailToMenteesEmails, mentors_filename, mentees_master_filename)