In [42]:
#lin alg
import numpy as np

#csv IOs and dataframes
import pandas as pd

#clustering
from sklearn.cluster import KMeans
from FixedKmeans import same_size_kmeans

import matplotlib.pyplot as plt

%matplotlib notebook

## Same-Sized K-means or Classic K-means

When performing K-means, we will use the same-size version, which gives equal family sizes at the cost of fit.

Adjust the number of families using `num_families`.

In [43]:
# Number of families (clusters) the respondents should be split into.
num_families = 6

## Data Cleaning

In [44]:
# Survey responses exported as CSV. The path may be relative or absolute;
# adjust it to match your environment.
df = pd.read_csv('family_matching.csv')

# Column-name sets that later cells fill in and consume:
#   drop_cols - columns to delete from the frame entirely
#   id_cols   - identifier columns kept out of the clustering input
drop_cols, id_cols = set(), set()

In [45]:
#get shape of survey data
df.shape

(40, 46)

#### Print out the full csv

In [46]:
df

Unnamed: 0,Timestamp,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,2No2Yes/No2/No2 7:49:YesNo PM EST,REDACTED,A,A,Yes,Yes,Yes,No,No,No,...,No,Yes,Yes,No,No,Yes,Yes,Yes,No,Yes
1,2No2Yes/No2/No2 7:49:Yes2 PM EST,REDACTED,B,A,Yes,No,Yes,No,No,Yes,...,No,Yes,No,Yes,No,No,No,Yes,Yes,No
2,2No2Yes/No2/No2 7:49:Yes4 PM EST,REDACTED,C,A,No,Yes,No,No,No,No,...,No,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes
3,2No2Yes/No2/No2 7:49:Yes9 PM EST,REDACTED,D,A,No,No,Yes,Yes,No,Yes,...,Yes,Yes,No,Yes,No,Yes,No,No,Yes,Yes
4,2No2Yes/No2/No2 7:49:22 PM EST,REDACTED,E,A,Yes,No,Yes,Yes,Yes,No,...,No,No,Yes,No,No,Yes,No,No,Yes,Yes
5,2No2Yes/No2/No2 7:49:3No PM EST,REDACTED,F,A,No,No,No,Yes,No,Yes,...,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes
6,2No2Yes/No2/No2 7:49:45 PM EST,REDACTED,G,A,No,No,Yes,Yes,Yes,Yes,...,No,No,Yes,No,No,Yes,Yes,No,No,Yes
7,2No2Yes/No2/No2 7:5No:No6 PM EST,REDACTED,H,A,Yes,No,Yes,No,No,Yes,...,No,No,No,No,No,No,Yes,No,Yes,Yes
8,2No2Yes/No2/No2 7:5No:No8 PM EST,REDACTED,I,A,Yes,Yes,No,Yes,No,Yes,...,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,No
9,2No2Yes/No2/No2 7:5No:YesYes PM EST,REDACTED,J,A,Yes,Yes,Yes,Yes,No,Yes,...,Yes,No,No,No,Yes,Yes,Yes,Yes,Yes,Yes


In [47]:
df.head()

Unnamed: 0,Timestamp,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,2No2Yes/No2/No2 7:49:YesNo PM EST,REDACTED,A,A,Yes,Yes,Yes,No,No,No,...,No,Yes,Yes,No,No,Yes,Yes,Yes,No,Yes
1,2No2Yes/No2/No2 7:49:Yes2 PM EST,REDACTED,B,A,Yes,No,Yes,No,No,Yes,...,No,Yes,No,Yes,No,No,No,Yes,Yes,No
2,2No2Yes/No2/No2 7:49:Yes4 PM EST,REDACTED,C,A,No,Yes,No,No,No,No,...,No,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes
3,2No2Yes/No2/No2 7:49:Yes9 PM EST,REDACTED,D,A,No,No,Yes,Yes,No,Yes,...,Yes,Yes,No,Yes,No,Yes,No,No,Yes,Yes
4,2No2Yes/No2/No2 7:49:22 PM EST,REDACTED,E,A,Yes,No,Yes,Yes,Yes,No,...,No,No,Yes,No,No,Yes,No,No,Yes,Yes


Timestamps are not relevant to our family matching so we will just drop this column.

In [48]:
# Queue the 'Timestamp' column for removal.
drop_cols.update(['Timestamp'])

#### Our first three columns are all identifier data

In [49]:
df[['Username', 'First Name', 'Last Name']]

Unnamed: 0,Username,First Name,Last Name
0,REDACTED,A,A
1,REDACTED,B,A
2,REDACTED,C,A
3,REDACTED,D,A
4,REDACTED,E,A
5,REDACTED,F,A
6,REDACTED,G,A
7,REDACTED,H,A
8,REDACTED,I,A
9,REDACTED,J,A


We will store this data separately for use in identifying our matching results later, but it will not be run through k-means.

In [50]:
# Identifier columns: kept for reporting the results, excluded from clustering.
id_cols = {'Username', 'First Name', 'Last Name'}

In [51]:
# Encode every survey answer column as integers: 'No' -> 0, 'Yes' -> 1.
# Identifier columns and columns queued for dropping are left untouched.
for question in df.columns:
    if question in id_cols or question in drop_cols:
        continue

    answers = df[question]

    # Already 0/1 (this cell has been run before): skip the column so that
    # re-running the cell does not turn every 0 into a 1.
    if answers.isin([0, 1]).all():
        continue

    # Fail loudly on blanks or unexpected spellings instead of silently
    # counting everything that is not exactly 'No' as a 'Yes'.
    unexpected = answers[~answers.isin(['No', 'Yes'])]
    if not unexpected.empty:
        raise ValueError(
            "Column {!r} has answers other than 'Yes'/'No': {}".format(
                question, unexpected.unique().tolist()
            )
        )

    df[question] = answers.eq('Yes').astype(int)

In [52]:
df

Unnamed: 0,Timestamp,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,2No2Yes/No2/No2 7:49:YesNo PM EST,REDACTED,A,A,1,1,1,0,0,0,...,0,1,1,0,0,1,1,1,0,1
1,2No2Yes/No2/No2 7:49:Yes2 PM EST,REDACTED,B,A,1,0,1,0,0,1,...,0,1,0,1,0,0,0,1,1,0
2,2No2Yes/No2/No2 7:49:Yes4 PM EST,REDACTED,C,A,0,1,0,0,0,0,...,0,1,1,1,1,1,0,0,0,1
3,2No2Yes/No2/No2 7:49:Yes9 PM EST,REDACTED,D,A,0,0,1,1,0,1,...,1,1,0,1,0,1,0,0,1,1
4,2No2Yes/No2/No2 7:49:22 PM EST,REDACTED,E,A,1,0,1,1,1,0,...,0,0,1,0,0,1,0,0,1,1
5,2No2Yes/No2/No2 7:49:3No PM EST,REDACTED,F,A,0,0,0,1,0,1,...,1,1,0,1,1,0,1,0,1,1
6,2No2Yes/No2/No2 7:49:45 PM EST,REDACTED,G,A,0,0,1,1,1,1,...,0,0,1,0,0,1,1,0,0,1
7,2No2Yes/No2/No2 7:5No:No6 PM EST,REDACTED,H,A,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,1,1
8,2No2Yes/No2/No2 7:5No:No8 PM EST,REDACTED,I,A,1,1,0,1,0,1,...,1,1,1,0,1,1,1,1,1,0
9,2No2Yes/No2/No2 7:5No:YesYes PM EST,REDACTED,J,A,1,1,1,1,0,1,...,1,0,0,0,1,1,1,1,1,1


### Delete drop columns

In [53]:
# Remove the columns queued in drop_cols. Rebinding df (instead of dropping in
# place) together with errors='ignore' keeps this cell safe to re-run: a column
# that is already gone is skipped rather than raising KeyError.
df = df.drop(columns=list(drop_cols), errors='ignore')

In [54]:
df

Unnamed: 0,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,REDACTED,A,A,1,1,1,0,0,0,0,...,0,1,1,0,0,1,1,1,0,1
1,REDACTED,B,A,1,0,1,0,0,1,0,...,0,1,0,1,0,0,0,1,1,0
2,REDACTED,C,A,0,1,0,0,0,0,0,...,0,1,1,1,1,1,0,0,0,1
3,REDACTED,D,A,0,0,1,1,0,1,1,...,1,1,0,1,0,1,0,0,1,1
4,REDACTED,E,A,1,0,1,1,1,0,0,...,0,0,1,0,0,1,0,0,1,1
5,REDACTED,F,A,0,0,0,1,0,1,1,...,1,1,0,1,1,0,1,0,1,1
6,REDACTED,G,A,0,0,1,1,1,1,0,...,0,0,1,0,0,1,1,0,0,1
7,REDACTED,H,A,1,0,1,0,0,1,1,...,0,0,0,0,0,0,1,0,1,1
8,REDACTED,I,A,1,1,0,1,0,1,1,...,1,1,1,0,1,1,1,1,1,0
9,REDACTED,J,A,1,1,1,1,0,1,1,...,1,0,0,0,1,1,1,1,1,1


# Inferencing

#### Separate the data from the ids for now

In [55]:
# X keeps the identifier columns and later receives the cluster labels. It is a
# copy so that columns added to it do not leak back into df (and, on a re-run
# of this cell, into the clustering input below).
X = df.copy()

# X_data is the clustering input: the answer columns only, no identifiers.
X_data = df.drop(columns=list(id_cols))

In [56]:
X

Unnamed: 0,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,REDACTED,A,A,1,1,1,0,0,0,0,...,0,1,1,0,0,1,1,1,0,1
1,REDACTED,B,A,1,0,1,0,0,1,0,...,0,1,0,1,0,0,0,1,1,0
2,REDACTED,C,A,0,1,0,0,0,0,0,...,0,1,1,1,1,1,0,0,0,1
3,REDACTED,D,A,0,0,1,1,0,1,1,...,1,1,0,1,0,1,0,0,1,1
4,REDACTED,E,A,1,0,1,1,1,0,0,...,0,0,1,0,0,1,0,0,1,1
5,REDACTED,F,A,0,0,0,1,0,1,1,...,1,1,0,1,1,0,1,0,1,1
6,REDACTED,G,A,0,0,1,1,1,1,0,...,0,0,1,0,0,1,1,0,0,1
7,REDACTED,H,A,1,0,1,0,0,1,1,...,0,0,0,0,0,0,1,0,1,1
8,REDACTED,I,A,1,1,0,1,0,1,1,...,1,1,1,0,1,1,1,1,1,0
9,REDACTED,J,A,1,1,1,1,0,1,1,...,1,0,0,0,1,1,1,1,1,1


In [57]:
X_data

Unnamed: 0,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,Do you like playing sports?,Do your hobbies fall within the arts?,Do you do a lot of outdoor activities?,...,Do you like Indie music?,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.
0,1,1,1,0,0,0,0,0,1,0,...,0,1,1,0,0,1,1,1,0,1
1,1,0,1,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,1,1,0
2,0,1,0,0,0,0,0,0,1,1,...,0,1,1,1,1,1,0,0,0,1
3,0,0,1,1,0,1,1,0,0,1,...,1,1,0,1,0,1,0,0,1,1
4,1,0,1,1,1,0,0,0,1,1,...,0,0,1,0,0,1,0,0,1,1
5,0,0,0,1,0,1,1,0,0,1,...,1,1,0,1,1,0,1,0,1,1
6,0,0,1,1,1,1,0,0,0,0,...,0,0,1,0,0,1,1,0,0,1
7,1,0,1,0,0,1,1,1,0,1,...,0,0,0,0,0,0,1,0,1,1
8,1,1,0,1,0,1,1,1,1,0,...,1,1,1,0,1,1,1,1,1,0
9,1,1,1,1,0,1,1,0,0,1,...,1,0,0,0,1,1,1,1,1,1


### K-means performed using the following [API](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)

In [58]:
# Cluster the respondents into num_families groups; the cluster count follows
# the num_families setting instead of a hard-coded 6.
# NOTE(review): size_flexibility=0 presumably requests (near-)equal cluster
# sizes and max_iter caps the refinement passes - confirm against FixedKmeans.
model = same_size_kmeans(X_data, k=num_families, size_flexibility=0, max_iter=10)

In [59]:
 # Store the cluster ids produced by the clustering step in a new 'Labels' column.
 # NOTE(review): assumes model.cluster_id has one entry per row of X_data, in row order - confirm against FixedKmeans.
 X['Labels'] = model.cluster_id

In [60]:
X

Unnamed: 0,Username,First Name,Last Name,Do you like vanilla ice cream?,Do you like chocolate ice cream?,Do you like strawberry ice cream?,Do you like Python?,Do you like Java?,Do you like C/C++?,Do you like Javascript?,...,What about K-Pop?,Do you live on campus?,Do you live in Allston?,Do you live in Cambridge?,Do you live in Brookline?,"You want to enter the tech industry (Google, Facebook, Microsoft, Startups...).",You want to pursue a career in Finance.,You want to work in a role related to Game Design.,You want to eventually work in a role that is algorithm/math heavy.,Labels
0,REDACTED,A,A,1,1,1,0,0,0,0,...,1,1,0,0,1,1,1,0,1,5
1,REDACTED,B,A,1,0,1,0,0,1,0,...,1,0,1,0,0,0,1,1,0,5
2,REDACTED,C,A,0,1,0,0,0,0,0,...,1,1,1,1,1,0,0,0,1,5
3,REDACTED,D,A,0,0,1,1,0,1,1,...,1,0,1,0,1,0,0,1,1,5
4,REDACTED,E,A,1,0,1,1,1,0,0,...,0,1,0,0,1,0,0,1,1,3
5,REDACTED,F,A,0,0,0,1,0,1,1,...,1,0,1,1,0,1,0,1,1,3
6,REDACTED,G,A,0,0,1,1,1,1,0,...,0,1,0,0,1,1,0,0,1,3
7,REDACTED,H,A,1,0,1,0,0,1,1,...,0,0,0,0,0,1,0,1,1,3
8,REDACTED,I,A,1,1,0,1,0,1,1,...,1,1,0,1,1,1,1,1,0,3
9,REDACTED,J,A,1,1,1,1,0,1,1,...,0,0,0,1,1,1,1,1,1,3


Now we add the family labels and return the final families

In [61]:
# Family names in cluster-label order: label 0 -> 'Euclid', 1 -> 'Prim', ...
family_names = ['Euclid', 'Prim', 'Hopper', 'Pascal', 'Boole', 'Djikstra']
label_to_family = dict(enumerate(family_names))

# Map each family name to the list of "First Last" names of its members.
families = {name: [] for name in family_names}

# Iterate over the rows themselves rather than looking rows up with a
# positional counter, so the result does not depend on X having a 0..n-1 index.
full_names = X['First Name'] + ' ' + X['Last Name']
for label, full_name in zip(X['Labels'], full_names):
    family = label_to_family.get(label)
    if family is None:
        # A label without a name would otherwise leave this person in no
        # family at all; stop and ask for more names instead.
        raise ValueError(
            "Cluster label {!r} has no family name; add more entries to "
            "family_names.".format(label)
        )
    families[family].append(full_name)

# Print each family name followed by its member list and a blank line.
for family, members in families.items():
    print(family)
    print(members)
    print()

Euclid
['L A', 'M A', 'N A', 'O A', 'S A', 'T A', 'C B']

Prim
['P A', 'I B', 'J B', 'K B', 'L B', 'M B', 'N B']

Hopper
['U A', 'D B', 'E B', 'F B', 'G B', 'H B']

Pascal
['E A', 'F A', 'G A', 'H A', 'I A', 'J A', 'A B']

Boole
['K A', 'V A', 'W A', 'X A', 'Y A', 'Z A', 'B B']

Djikstra
['A A', 'B A', 'C A', 'D A', 'Q A', 'R A']

