In [1]:
!pip install matching

from matching.games import StableMarriage

from sklearn.metrics import jaccard_score

#csv IOs and dataframes
import pandas as pd

import random

def getKeyList(dict):
    """Return the keys of a mapping as a list, in the mapping's iteration order.

    Args:
        dict: Mapping whose keys are collected. The parameter name shadows the
            builtin ``dict`` type; it is kept unchanged so existing keyword
            callers continue to work.

    Returns:
        A new list holding every key of ``dict``.
    """
    # Build the result without a local named `list`, which would shadow the builtin.
    return [key for key in dict.keys()]

Collecting matching
  Downloading matching-1.4.3-py3-none-any.whl (30 kB)
Installing collected packages: matching
Successfully installed matching-1.4.3


Data Cleaning

In [2]:
# Path to the survey export; it may be relative or absolute and will vary with your environment.
df = pd.read_csv('new_member_matching.csv')

# Sets of column names to exclude or drop later.
drop_cols, id_cols = set(), set()

# (rows, columns) of the survey data
df.shape

(40, 47)

We don't need the timestamps or the personal identifiers (username, first name, last name), so we drop those columns.

In [3]:
# Timestamp and identifier columns that are not survey answers.
drop_cols.update(['Timestamp', 'Username', 'First Name', 'Last Name'])

# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError. Be aware that it also
# silently skips a column that is missing from the CSV altogether.
df.drop(columns=list(drop_cols), inplace=True, errors='ignore')

#### Display the cleaned survey data

In [4]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0,Family,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,Do you like playing sports?,Do your hobbies fall within the arts?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,Prim,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,...,Yes,No,No,Yes,No,No,Yes,Yes,No,No
1,Euclid,No,Yes,Yes,Yes,No,No,No,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
2,Hopper,Yes,Yes,Yes,Yes,Yes,No,No,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
3,Prim,Yes,Yes,No,Yes,Yes,Yes,Yes,No,No,...,Yes,No,No,No,No,Yes,Yes,No,Yes,No
4,Prim,Yes,No,Yes,Yes,Yes,No,Yes,Yes,Yes,...,No,No,No,No,No,No,Yes,No,No,Yes
5,Prim,Yes,Yes,No,Yes,No,Yes,Yes,No,No,...,No,No,Yes,No,No,No,Yes,No,Yes,Yes
6,Prim,Yes,No,No,Yes,Yes,Yes,Yes,Yes,No,...,Yes,No,Yes,No,No,No,Yes,No,No,No
7,Prim,Yes,Yes,No,Yes,Yes,No,Yes,Yes,No,...,No,No,No,No,No,Yes,Yes,Yes,No,No
8,Prim,Yes,No,No,Yes,Yes,No,Yes,No,No,...,Yes,No,Yes,No,No,No,Yes,No,Yes,No
9,Pascal,Yes,No,Yes,Yes,Yes,No,No,Yes,No,...,Yes,Yes,Yes,No,No,No,No,Yes,No,Yes


To summarize each family from the survey responses of its existing members, we take the majority answer to each question and use it as that family's "response". If "Yes" and "No" are tied for a question, the answer is picked at random.

In [9]:
def getFamilyData(df, family_name, rng=None):
  """Collapse one family's survey answers into a majority-vote 0/1 vector.

  Args:
    df: DataFrame with a 'Family' column; the cells of every column are
      compared against the strings 'Yes' and 'No'.
    family_name: Value of the 'Family' column that selects the rows to use.
    rng: Optional object with a ``randrange`` method (for example
      ``random.Random(seed)``) used to break ties reproducibly. When omitted,
      the module-level ``random`` generator is used.

  Returns:
    A list with one entry per column that contains at least one 'Yes' or 'No'
    among the family's rows, in column order: 1 when 'Yes' is the majority,
    0 when 'No' is, and a random 0 or 1 on a tie. Columns with neither answer
    (typically including 'Family' itself) are skipped, so a family name that
    matches no rows yields an empty list.
  """
  tie_breaker = random if rng is None else rng
  data_out = []
  df_family = df[df['Family'] == family_name]
  for _, col_vals in df_family.items():
    answers = list(col_vals)
    num_yes = sum(1 for val in answers if val == 'Yes')
    num_no = sum(1 for val in answers if val == 'No')
    # Not a yes/no question for this family: contribute nothing to the vector.
    if num_yes == 0 and num_no == 0:
      continue
    if num_yes > num_no:
      data_out.append(1)
    elif num_no > num_yes:
      data_out.append(0)
    else:
      # Tie: pick 0 or 1 at random (pass `rng` to make this reproducible).
      data_out.append(tie_breaker.randrange(0, 2))
  return data_out

# Seed the module-level RNG so that getFamilyData's random tie-breaks, and
# therefore everything computed from them, are the same on every full re-run.
TIE_BREAK_SEED = 42
random.seed(TIE_BREAK_SEED)

# Family labels as passed to getFamilyData (matched against the 'Family' column).
family_labels = ['Djikstra', 'Hopper', 'Prim', 'Boole', 'Pascal', 'Euclid']

# One majority-vote answer vector per family, in the order of family_labels.
families_data = [getFamilyData(df, label) for label in family_labels]
(djikstra_data, hopper_data, prim_data,
 boole_data, pascal_data, euclid_data) = families_data

# Hard-coded 0/1 answer vectors of the new members.
# NOTE(review): presumably in the same question order as the survey columns -- confirm.
new_mem_1_data = [1,0,1,1,0,1,1,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1]
new_mem_2_data = [1,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1]
new_mem_3_data = [1,1,0,1,1,0,1,1,1,1,0,1,1,0,0,0,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0]
new_mem_4_data = [1,1,1,1,1,0,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,1,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1]
new_mem_5_data = [1,1,1,1,1,1,0,1,1,1,1,0,1,0,0,1,1,0,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,0,0,0,0,0,0,0,0,1]
new_mem_6_data = [1,1,1,1,1,1,0,1,0,1,1,0,0,1,1,1,1,1,0,1,0,1,0,0,1,1,1,0,1,1,0,1,1,1,1,0,0,0,1,1,0,1]

# Lower-cased labels name the families from here on; deriving them from
# family_labels keeps them aligned with families_data.
family_names = [label.lower() for label in family_labels]

mems_data = [new_mem_1_data, new_mem_2_data, new_mem_3_data, new_mem_4_data, new_mem_5_data, new_mem_6_data]
mem_names = [f"New Member {number}" for number in range(1, len(mems_data) + 1)]

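Compute the Jaccard similarity between each existing family's responses and each new member's responses (a higher score means more shared "Yes" answers), and rank the candidates from most to least similar. Note that this is a similarity score, not a distance, even though the variables are named `*_dists`.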

In [10]:
# For every family: score each new member with jaccard_score, highest score first.
fam_dists = {}
for fam_idx, fam_vector in enumerate(families_data):
    fam_scores = {
        mem_names[mem_idx]: jaccard_score(fam_vector, mem_vector)
        for mem_idx, mem_vector in enumerate(mems_data)
    }
    ranked = sorted(fam_scores.items(), key=lambda pair: pair[1], reverse=True)
    fam_dists[family_names[fam_idx]] = dict(ranked)

print(fam_dists)

# For every new member: score each family with jaccard_score, highest score first.
mem_dists = {}
for mem_idx, mem_vector in enumerate(mems_data):
    mem_scores = {
        family_names[fam_idx]: jaccard_score(mem_vector, fam_vector)
        for fam_idx, fam_vector in enumerate(families_data)
    }
    ranked = sorted(mem_scores.items(), key=lambda pair: pair[1], reverse=True)
    mem_dists[mem_names[mem_idx]] = dict(ranked)

print(mem_dists)

{'djikstra': {'New Member 6': 0.6333333333333333, 'New Member 5': 0.6206896551724138, 'New Member 4': 0.5769230769230769, 'New Member 2': 0.5555555555555556, 'New Member 1': 0.4583333333333333, 'New Member 3': 0.36666666666666664}, 'hopper': {'New Member 4': 0.7727272727272727, 'New Member 6': 0.5666666666666667, 'New Member 5': 0.5517241379310345, 'New Member 2': 0.5384615384615384, 'New Member 3': 0.4444444444444444, 'New Member 1': 0.43478260869565216}, 'prim': {'New Member 6': 0.5161290322580645, 'New Member 5': 0.45161290322580644, 'New Member 3': 0.4444444444444444, 'New Member 1': 0.43478260869565216, 'New Member 2': 0.42857142857142855, 'New Member 4': 0.3448275862068966}, 'boole': {'New Member 1': 0.55, 'New Member 3': 0.48, 'New Member 4': 0.48, 'New Member 6': 0.45161290322580644, 'New Member 5': 0.43333333333333335, 'New Member 2': 0.4074074074074074}, 'pascal': {'New Member 6': 0.5357142857142857, 'New Member 4': 0.4583333333333333, 'New Member 5': 0.36666666666666664, 'Ne

Turn the ranked scores into ordered preference lists (most preferred first) for the families and for the members.

In [11]:
# Build the preference lists as NEW dictionaries. Binding fam_prefs / mem_prefs
# directly to fam_dists / mem_dists would make both names refer to the same
# dict, so filling in the key lists would overwrite the score tables and a
# re-run of this cell would fail inside getKeyList (a list has no .keys()).
fam_prefs = {family: getKeyList(ranking) for family, ranking in fam_dists.items()}

print(fam_prefs)

mem_prefs = {member: getKeyList(ranking) for member, ranking in mem_dists.items()}

print(mem_prefs)

{'djikstra': ['New Member 6', 'New Member 5', 'New Member 4', 'New Member 2', 'New Member 1', 'New Member 3'], 'hopper': ['New Member 4', 'New Member 6', 'New Member 5', 'New Member 2', 'New Member 3', 'New Member 1'], 'prim': ['New Member 6', 'New Member 5', 'New Member 3', 'New Member 1', 'New Member 2', 'New Member 4'], 'boole': ['New Member 1', 'New Member 3', 'New Member 4', 'New Member 6', 'New Member 5', 'New Member 2'], 'pascal': ['New Member 6', 'New Member 4', 'New Member 5', 'New Member 2', 'New Member 1', 'New Member 3'], 'euclid': ['New Member 3', 'New Member 4', 'New Member 5', 'New Member 6', 'New Member 1', 'New Member 2']}
{'New Member 1': ['boole', 'djikstra', 'hopper', 'prim', 'euclid', 'pascal'], 'New Member 2': ['djikstra', 'hopper', 'prim', 'boole', 'pascal', 'euclid'], 'New Member 3': ['euclid', 'boole', 'hopper', 'prim', 'djikstra', 'pascal'], 'New Member 4': ['hopper', 'djikstra', 'euclid', 'boole', 'pascal', 'prim'], 'New Member 5': ['djikstra', 'hopper', 'pri

Now we're left with an instance of the Stable Marriage Problem, which we can solve with the Gale-Shapley algorithm.

In [12]:
# mem_prefs is passed as the first preference dictionary, fam_prefs as the second.
game = StableMarriage.create_from_dictionaries(
    mem_prefs,
    fam_prefs,
)
print("matches")
stable_matching = game.solve()
print(stable_matching)

matches
{New Member 1: boole, New Member 2: pascal, New Member 3: euclid, New Member 4: hopper, New Member 5: prim, New Member 6: djikstra}
