In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
def encoding(enc_string, df):
  """One-hot encode a single column of ``df``.

  Parameters
  ----------
  enc_string : str
      Name of the column to encode.
  df : pandas.DataFrame
      Frame that contains ``enc_string``.

  Returns
  -------
  pandas.DataFrame
      One indicator column per category found in ``df[enc_string]``.
      The frame carries the same index as ``df`` so that a later
      ``pd.concat(..., axis=1)`` lines the rows up even when ``df`` does
      not have a default 0..n-1 index.
  """
  #Call encoder
  enc = OneHotEncoder(handle_unknown='ignore')
  X = df[[enc_string]].to_numpy()
  #Fit and transform data
  g = enc.fit_transform(X).toarray()
  # The feature name is passed as an empty string, so the generated column
  # names are built from the category values alone.
  # NOTE(review): two encoded columns that share a category value would
  # produce identical column names - confirm this cannot happen in the data.
  names = enc.get_feature_names_out([''])
  #Put it back into dataframe mode, keeping the caller's row labels
  return pd.DataFrame(g, columns=names, index=df.index)

In [None]:
def kmeans(df, k):
  """Cluster the rows of ``df`` into ``k`` groups with k-means.

  The profile and identity columns listed below are left out of the
  feature matrix; every remaining column is standardized and fed to
  ``KMeans`` (``random_state=0``).

  Note: the cluster labels are written into ``df['group']`` in place, and
  the same frame object is returned.
  """
  # Columns that are not used as clustering features
  profile_cols = ['Age', 'Preferred Age','Height', 'Preferred Height','Race', 'Preferred Race', 'Personality']
  identity_cols = ['Name', 'Gender', 'Email']
  features = df.drop(columns = profile_cols + identity_cols)

  # Standardize the remaining columns
  scaled = StandardScaler().fit_transform(features)

  # Fit the model and record each row's cluster label
  model = KMeans(n_clusters=k, random_state=0)
  model.fit(scaled)
  df['group'] = model.labels_
  return df



In [None]:
def sort_into_groups(df, k):
  """Split ``df`` by its ``group`` column.

  Returns a dict with keys ``group_0`` .. ``group_{k-1}``; each value holds
  the rows of ``df`` whose ``group`` equals that number.
  """
  return {
      "group_{0}".format(label): df[df['group'] == label]
      for label in range(k)
  }

In [None]:
def compatible_height_calc (group):
  """Pairwise height compatibility inside one group.

  Entry ``[i][j]`` compares row ``i``'s ``Height`` with row ``j``'s
  ``Preferred Height``. The absolute difference is divided by the largest
  difference in the group and inverted, so 1 is the closest match and 0
  the farthest.

  Returns a float array of shape ``(len(group), len(group))``.
  """
  heights = group['Height'].to_numpy()
  preferred = group['Preferred Height'].to_numpy()
  gap = np.absolute(np.subtract.outer(heights, preferred))

  # An empty group has nothing to normalise (np.amax would raise).
  if gap.size == 0:
    return gap.astype(float)

  largest_gap = np.amax(gap)
  # Every height equals every preference: dividing would give 0/0 = NaN,
  # so report a perfect score instead.
  if largest_gap == 0:
    return np.ones(gap.shape, dtype=float)

  #Invert it to have higher compat score for more compat
  return 1 - gap / largest_gap

In [None]:
def compatible_race_calc (group):
  """Pairwise race match inside one group.

  Entry ``[i][j]`` is True when row ``i``'s ``Race`` equals row ``j``'s
  ``Preferred Race``, otherwise False.
  """
  own_race = group['Race'].to_numpy()
  wanted_race = group['Preferred Race'].to_numpy()
  return np.equal.outer(own_race, wanted_race)

In [None]:
def compatible_age_calc (group):
  """Pairwise age compatibility inside one group.

  Entry ``[i][j]`` compares row ``i``'s ``Age`` with row ``j``'s
  ``Preferred Age``. The absolute difference is divided by the largest
  difference in the group and inverted, so 1 is the closest match and 0
  the farthest.

  Returns a float array of shape ``(len(group), len(group))``.
  """
  ages = group['Age'].to_numpy()
  preferred = group['Preferred Age'].to_numpy()
  gap = np.absolute(np.subtract.outer(ages, preferred))

  # An empty group has nothing to normalise (np.amax would raise).
  if gap.size == 0:
    return gap.astype(float)

  largest_gap = np.amax(gap)
  # Every age equals every preference: dividing would give 0/0 = NaN,
  # so report a perfect score instead.
  if largest_gap == 0:
    return np.ones(gap.shape, dtype=float)

  #Invert it so that a smaller gap means a higher score
  return 1 - gap / largest_gap

In [None]:
def compatible_personality_chart():
  """Build the personality-type compatibility lookup table.

  Returns a 16 x 16 DataFrame whose row and column labels are the same
  sixteen four-letter type codes. Each cell is one of 0, 0.25, 0.5, 0.75
  or 1.
  """
  # Label order below is also the row/column order of the score matrix
  type_codes = ['INFP', 'ENFP', 'INFJ', 'ENFJ', 'INTJ', 'ENTJ', 'INTP', 'ENTP',
                'ISFP', 'ESFP', 'ISTP', 'ESTP', 'ISFJ', 'ESFJ', 'ISTJ', 'ESTJ']

  scores = np.array([
      [.75, .75, .75, 1, .75, 1, .75, .75, 0, 0, 0, 0, 0, 0, 0, 0],
      [.75, .75, 1, .75, 1, .75, .75, .75, 0, 0, 0, 0, 0, 0, 0, 0],
      [.75, 1, .75, .75, .75, .75, .75, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      [1, .75, .75, .75, .75, .75, .75, .75, 1, 0, 0, 0, 0, 0, 0, 0],
      [.75, 1, .75, .75, .75, .75, .75, 1, .5, .5, .5, .5, .25, .25, .25, .25],
      [1, .75, .75, .75, .75, .75, 1, .75, .5, .5, .5, .5, .5, .5, .5, .5],
      [.75, .75, .75, .75, .75, 1, .75, .75, .5, .5, .5, .5, .25, .25, .25, 1],
      [.75, .75, 1, .75, 1, .75, .75, .75, .5, .5, .5, .5, .25, .25, .25, .25],
      [0, 0, 0, 1, .5, .5, .5, .5, .25, .25, .25, .25, .5, 1, .5, 1],
      [0, 0, 0, 0, .5, .5, .5, .5, .25, .25, .25, .25, 1, .5, 1, .5],
      [0, 0, 0, 0, .5, .5, .5, .5, .25, .25, .25, .25, .5, 1, .5, 1],
      [0, 0, 0, 0, .5, .5, .5, .5, .25, .25, .25, .25, 1, .5, 1, .5],
      [0, 0, 0, 0, .25, .5, .25, .25, .5, 1, .5, 1, .75, .75, .75, .75],
      [0, 0, 0, 0, .25, .5, .25, .25, 1, .5, 1, .5, .75, .75, .75, .75],
      [0, 0, 0, 0, .25, .5, .25, .25, .5, 1, .5, 1, .75, .75, .75, .75],
      [0, 0, 0, 0, .25, .5, 1, .25, 1, .5, 1, .5, .75, .75, .75, .75],
  ])

  return pd.DataFrame(scores, index = type_codes, columns = type_codes)

In [None]:
def compatible_personality_calc (group, personality_chart):
  """Pairwise personality compatibility inside one group.

  Parameters
  ----------
  group : pandas.DataFrame
      Must contain a ``Personality`` column whose values are labels of
      ``personality_chart``.
  personality_chart : pandas.DataFrame
      Lookup table indexed (rows and columns) by personality label.

  Returns
  -------
  numpy.ndarray
      Float array of shape ``(len(group), len(group))`` where entry
      ``[i][j]`` is the chart value at row ``Personality[i]``, column
      ``Personality[j]``.

  Raises
  ------
  KeyError
      If a personality value is not a label of ``personality_chart``.
  """
  per = group['Personality'].tolist()
  # One label-based lookup for the whole grid instead of a Python double
  # loop of chained .loc[...][...] calls.
  return personality_chart.loc[per, per].to_numpy(dtype=float)

In [None]:
def array_of_tuple_matches (group, final_compatible_chart):
  """Pick up to three best opposite-gender matches for every row of ``group``.

  Rows whose ``Gender`` is ``'M'`` are matched against rows whose gender is
  ``'F'``; every other row is matched against the ``'M'`` rows.

  Returns a list with one entry per row of ``group``. Each entry is a list
  of at most three ``(score, name, email)`` tuples sorted in descending
  order, where ``score`` is read from ``final_compatible_chart[row][candidate]``.
  """
  names = group['Name'].tolist()
  emails = group['Email'].tolist()
  gender = group['Gender'].tolist()

  # Positions of the candidates for each side
  female_positions = [pos for pos, value in enumerate(gender) if value == "F"]
  male_positions = [pos for pos, value in enumerate(gender) if value == "M"]

  def top_three(row, candidate_positions):
    # Score every candidate for this row, then keep the three highest tuples
    scored = [(final_compatible_chart[row][cand], names[cand], emails[cand])
              for cand in candidate_positions]
    scored.sort(reverse=True)
    return scored[:3]

  matches = []
  for row, value in enumerate(gender):
    candidates = female_positions if value == 'M' else male_positions
    matches.append(top_three(row, candidates))
  return matches

In [None]:
def df_with_best_matches (arr, group):
  """Combine each person's name/email with their top three matches.

  Parameters
  ----------
  arr : list
      One entry per row of ``group``; each entry is a list of up to three
      ``(score, name, email)`` tuples (entries may also be ``None``).
  group : pandas.DataFrame
      Must contain ``Name`` and ``Email`` columns. It is not modified.

  Returns
  -------
  pandas.DataFrame
      Columns ``Name``, ``Email`` and, for each of the three match slots,
      ``Best Match N Name``, ``Best Match N Email``, ``Best Match N Score``.
      Slots with no match hold a missing value.
  """
  n_matches = 3
  # (output column suffix, position inside the (score, name, email) tuple)
  fields = (('Name', 1), ('Email', 2), ('Score', 0))

  # Build every output column directly. Rows with fewer than three matches
  # are padded with None here, so the column count no longer depends on how
  # many matches the longest row happens to have.
  match_columns = {}
  for slot in range(n_matches):
    for suffix, position in fields:
      header = 'Best Match {0} {1}'.format(slot + 1, suffix)
      match_columns[header] = [
          row[slot][position] if slot < len(row) and row[slot] is not None else None
          for row in arr
      ]
  match_df = pd.DataFrame(match_columns)

  # Work on a re-indexed copy so the caller's frame keeps its own index
  people = group[['Name', 'Email']].reset_index(drop=True)
  final_table = pd.concat([people, match_df], axis =1)
  return final_table

In [None]:
# Load the test spreadsheet and keep only its first 100 rows
df = pd.read_excel("/content/Test Dataset for Algorithm.xlsx").head(100)
def main_algo (df, number_of_cluster=3):
  """Cluster the participants and compute best matches inside each cluster.

  Parameters
  ----------
  df : pandas.DataFrame
      Input table. The code reads the ``Aesthetic`` and ``Religion`` columns
      here; the helper functions it calls read further columns.
  number_of_cluster : int, optional
      Number of k-means clusters (default 3).

  Returns
  -------
  dict
      Maps ``group_<n>`` to the best-match table produced by
      ``df_with_best_matches`` for that cluster.
  """
  # Weights of each component in the final compatibility score
  age_weight = 15
  height_weight = 25
  race_weight = 35
  personality_weight = 25

  # Use a plain 0..n-1 index so the encoded columns line up row-for-row
  # when concatenated side by side.
  df = df.reset_index(drop=True)

  #Call encoding on necessary columns
  aes_arr = encoding('Aesthetic', df)
  rel_arr = encoding('Religion', df)
  df = pd.concat([df, aes_arr, rel_arr], axis=1).drop(columns = ['Aesthetic', 'Religion'])

  #K means, then split the rows by cluster label
  df = kmeans(df, number_of_cluster)
  groups = sort_into_groups(df, number_of_cluster)

  # The chart is the same for every cluster, so build it once
  personality_chart = compatible_personality_chart()

  #For each cluster, run compatibility scores
  results = {}
  for gp, members in groups.items():
    compat_height = compatible_height_calc(members)
    compat_age = compatible_age_calc(members)
    compat_race = compatible_race_calc(members)
    compat_per = compatible_personality_calc(members, personality_chart)
    final_compatibility = (age_weight * compat_age
                           + height_weight * compat_height
                           + race_weight * compat_race
                           + personality_weight * compat_per)
    # A person is never their own match
    np.fill_diagonal(final_compatibility, 0)
    arr = array_of_tuple_matches(members, final_compatibility)
    results[gp] = df_with_best_matches(arr, members)

  return results

#Merge all the groups together
results = main_algo(df)
# Stack every group's table whatever the number of clusters; pd.concat keeps
# the column names and dtypes, so no numpy round-trip is needed.
final_df = pd.concat(list(results.values()), ignore_index=True)
# Raw array form, kept in case a later cell still reads `final`
final = final_df.to_numpy()
final_df


Unnamed: 0,Name,Email,Best Match 1 Name,Best Match 1 Email,Best Match 1 Score,Best Match 2 Name,Best Match 2 Email,Best Match 2 Score,Best Match 3 Name,Best Match 3 Email,Best Match 3 Score
0,Sam,hachi@msn.com,Sam,danzigism@icloud.com,68.03529,George,bartak@att.net,60.065551,George,scarlet@gmail.com,44.753572
1,Ann,hachi@msn.com,Ann,scarlet@gmail.com,54.684396,Matt,thassine@outlook.com,53.682749,Ann,mcrawfor@live.com,49.443277
2,George,mcrawfor@live.com,Jennifer,thassine@outlook.com,82.400132,Ann,mcrawfor@live.com,77.432492,Jennifer,scarlet@gmail.com,67.289201
3,Ann,bartak@att.net,Jennifer,hachi@msn.com,78.578034,Matt,thassine@outlook.com,70.916374,Matt,thassine@outlook.com,59.681352
4,Jennifer,scarlet@gmail.com,Ann,bartak@att.net,88.927473,Ann,hachi@msn.com,65.568604,Matt,hachi@msn.com,49.024012
...,...,...,...,...,...,...,...,...,...,...,...
95,Matt,mcrawfor@live.com,Ann,froodian@me.com,77.430512,Matt,mcrawfor@live.com,72.034848,Sam,bartak@att.net,61.477796
96,Sam,danzigism@icloud.com,Sam,froodian@me.com,76.973193,Ann,mcrawfor@live.com,57.981918,Ann,mcrawfor@live.com,54.003374
97,Sam,froodian@me.com,Ann,kewley@optonline.net,91.967946,Sam,kewley@optonline.net,78.619237,Jennifer,danzigism@icloud.com,69.5375
98,Matt,bartak@att.net,Sam,bartak@att.net,78.016653,George,kewley@optonline.net,77.576895,Sam,kewley@optonline.net,75.860823
