## 1. Dataset Creation

Converting `speed_dating_raw.csv` to `speed_dating.csv`. This section of code takes about 10 minutes to run, and can be skipped if you want, as we've included the `speed_dating.csv` file.

In [157]:
import pandas as pd
# from geopy.geocoders import Nominatim
# import math
import numpy as np

cols = ['match', 'exphappy', 'samerace', 'hobby_diff_phys', 'hobby_diff_out', 'hobby_diff_in', 'same_goal', 'attr_diff', 'sinc_diff', 'intel_diff', 'fun_diff', 'amb_diff', 'income_diff', 'age_diff', 'same_career', 'confidence', 'imprace', 'date_freq', 'out_freq']
raw_df = pd.read_csv('data/speed_dating_raw.csv', encoding='latin-1')
df = pd.DataFrame(columns=cols)

hobbies = ['sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga']

hobbies_phys = ['sports', 'exercise', 'hiking', 'yoga']
hobbies_out = ['dining', 'museums', 'concerts', 'clubbing', 'theater', 'movies', 'shopping']
hobbies_in = ['tvsports', 'art', 'gaming', 'reading', 'tv', 'music']

atts = ['attr', 'sinc', 'intel', 'fun', 'amb']
lookingfor = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1']
selfrate_a = ['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']
selfrate_b = ['attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1']

# Difference between totals for hobby attribute groups
def sum_diff(attrs,p_1,p_2):
    hobby_diff = 0
    for h in attrs:
        hobby_diff += abs(p_1[h] - p_2[h])
    return hobby_diff

# Difference between how much each person values a particular quality vs. the other's self-rating of that quality
def att_diff(p_1, p_2):
    all_diff = {}
    for i in range(5):
        diff = 0
        selfrate_a1 = p_1[selfrate_a[i]]
        selfrate_b1 = p_1[selfrate_b[i]]
        selfrate_a2 = p_2[selfrate_a[i]]
        selfrate_b2 = p_2[selfrate_b[i]]

        # if pd.isna(p_1[selfrate_a[i]]) and pd.isna(p_1[selfrate_b[i]]):
        #     selfrate_a1 = raw_df[selfrate_a[i]].mean()
        #     selfrate_b1 = raw_df[selfrate_b[i]].mean()
        if pd.isna(p_1[selfrate_a[i]]):
            selfrate_a1 = p_1[selfrate_b[i]]
        elif pd.isna(p_1[selfrate_b[i]]):
            selfrate_b1 = p_1[selfrate_a[i]]
        # if pd.isna(p_2[selfrate_a[i]]) and pd.isna(p_2[selfrate_b[i]]):
        #     selfrate_a2 = raw_df[selfrate_a[i]].mean()
        #     selfrate_b2 = raw_df[selfrate_b[i]].mean()
        if pd.isna(p_2[selfrate_a[i]]):
            selfrate_a2 = p_2[selfrate_b[i]]
        elif pd.isna(p_2[selfrate_b[i]]):
            selfrate_b2 = p_2[selfrate_a[i]]

        want_1 = p_1[lookingfor[i]]
        want_2 = p_2[lookingfor[i]]
        selfrate_1 = (selfrate_a1 + selfrate_b1) / 2
        selfrate_2 = (selfrate_a2 + selfrate_b2) / 2
        diff += abs(selfrate_1 - want_2)
        diff += abs(selfrate_2 - want_1)
        if pd.isna(p_1[lookingfor[i]]) and pd.isna(p_2[lookingfor[i]]): print('W')
        all_diff[atts[i]] = diff
    return all_diff

# Flat difference (for income and age)
def plain_diff(key, p_1, p_2):
    item_1 = p_1[key]
    item_2 = p_2[key]
    if isinstance(item_1,str):
        item_1 = float(item_1.replace(',',''))
    if isinstance(item_2,str):
        item_2 = float(item_2.replace(',',''))
    if pd.isna(p_1[key]) or pd.isna(p_2[key]):
        return np.nan
    return abs(item_1 - item_2)

# Make sure there exists a date instance between a given pair of IDs
def dated(iid, pid):
    return len(raw_df.loc[(raw_df['iid'] == iid) & (raw_df['pid'] == pid)]) != 0


## We were going to include euclidean distance between where each person grew up, but the required APIs kept giving rate limits and we did not have time to rectify this.
# def get_dist(p_1,p_2):
#     if isinstance(p_1['zipcode'], str):
#         zip_1 = int(p_1['zipcode'].replace(',',''))
#     else:
#         zip_1 = p_1['zipcode']
#     if isinstance(p_2['zipcode'], str):
#         zip_2 = int(p_2['zipcode'].replace(',',''))
#     else:
#         zip_2 = p_2['zipcode']
#     if pd.isna(zip_1) or pd.isna(zip_2) or (zip_1 == 0) or (zip_2 == 0):
#         return np.nan
#     geolocator = Nominatim(user_agent="geoapiExercises")
#     l_1 = geolocator.geocode(zip_1)
#     l_2 = geolocator.geocode(zip_2)
#     d = math.sqrt(((l_1.latitude - l_2.latitude) ** 2) + ((l_1.longitude - l_2.longitude) ** 2))
#     return d

# 
def create_df():
    for iid in range(1,len(raw_df['iid'].unique())+1):
        for pid in raw_df.loc[raw_df['iid'] == iid]['pid'].unique():
            if dated(iid,pid):
                date = raw_df.loc[(raw_df['iid'] == iid) & (raw_df['pid'] == pid)].to_dict('records')[0]
                p_1 = raw_df.loc[raw_df['iid'] == iid].to_dict('records')[0]
                p_2 = raw_df.loc[raw_df['iid'] == pid].to_dict('records')[0]
                att_diffs = att_diff(p_1, p_2)
                # print(date['wave'],att_diffs)

                new_row = {
                    'match': date['match'],
                    'exphappy': date['exphappy'],
                    'samerace': date['samerace'],
                    'hobby_diff_phys': sum_diff(hobbies_phys,p_1,p_2),
                    'hobby_diff_out': sum_diff(hobbies_out, p_1, p_2),
                    'hobby_diff_in': sum_diff(hobbies_in, p_1, p_2),
                    'same_goal': int(p_1['goal'] == p_2['goal']),
                    'attr_diff': att_diffs['attr'],
                    'sinc_diff': att_diffs['sinc'],
                    'intel_diff': att_diffs['intel'],
                    'fun_diff': att_diffs['fun'],
                    'amb_diff': att_diffs['amb'],
                    'income_diff': plain_diff('income', p_1, p_2),
                    'age_diff': plain_diff('age', p_1, p_2),
                    'same_career': int(p_1['career_c'] == p_2['career_c']),
                    'confidence': ((((int(p_1['expnum']) / 20) + (int(p_2['expnum']) / 20)) / 2) if not (pd.isna(p_1['expnum']) or pd.isna(p_2['expnum'])) else p_1['expnum']),
                    'imprace': (((p_1['imprace'] + p_2['imprace']) / 2) if not (pd.isna(p_1['imprace']) or pd.isna(p_2['imprace'])) else p_1['imprace']),
                    'date_freq': p_1['date'],
                    'out_freq': p_1['go_out']
                }
                # print(new_row)
                df.loc[len(df)] = new_row
    return df

df = create_df()
df.to_csv('data/speed_dating.csv')




## 2.1 Data Quality Report

### DQR: Continuous Features

In [169]:
import pandas as pd
import statistics

df = pd.read_csv('data/speed_dating.csv')

con_headers = ["Feature", "Desc.", "Count", "% of Missing", "Card.", "Min.", "Q1", "Median", "Q3", "Max.", "Mean", "Std. Dev.", "Notes"]

con_features = ['hobby_diff_phys', 'hobby_diff_out', 'hobby_diff_in', 'attr_diff', 'sinc_diff', 'intel_diff', 'amb_diff', 'fun_diff', 'income_diff', 'age_diff', 'confidence', 'exphappy', 'out_freq', 'date_freq', 'imprace'] 
con_df = pd.DataFrame(columns=con_headers)
features_df = pd.read_csv('data/features.csv', index_col='feature')

for col in con_features:
    con_df.loc[len(con_df.index)] = {
        "Feature": col,
        "Desc.": features_df.at[col, "desc"],
        "Count": len(df) - df[col].isnull().sum(),
        "% of Missing": round(df[col].isnull().sum() / len(df), 2),
        "Card.": len(df[col].unique()),
        "Min.": df[col].min(),
        "Q1": round(pd.qcut(df[col], [.25], retbins=True)[1][0], 2),
        "Median": round(df[col].median(),2),
        "Q3": round(pd.qcut(df[col], [.75], retbins=True)[1][0], 2),
        "Max.": df[col].max(),
        "Mean": round(df[col].mean(),2),
        "Std. Dev.": round(statistics.stdev([i for i in df[col] if not pd.isna(i)], df[col].mean()),2),
        "Notes": ""
    }

con_df

Unnamed: 0,Feature,Desc.,Count,% of Missing,Card.,Min.,Q1,Median,Q3,Max.,Mean,Std. Dev.,Notes
0,hobby_diff_phys,sum of difference between hobby/interest value...,8188,0.02,36,0.0,8.0,12.0,15.0,35.0,12.01,4.89,
1,hobby_diff_out,sum of difference between hobby/interest value...,8188,0.02,52,3.0,12.0,16.0,21.0,58.0,17.23,6.77,
2,hobby_diff_in,sum of difference between hobby/interest value...,8188,0.02,38,2.0,12.0,16.0,19.0,41.0,15.95,5.27,
3,attr_diff,difference in self-rated amount vs partner's p...,8136,0.03,758,0.67,19.0,26.5,37.0,131.0,30.56,16.15,
4,sinc_diff,difference in self-rated amount vs partner's p...,8136,0.03,726,1.0,15.0,19.48,24.0,62.0,20.0,8.02,
5,intel_diff,difference in self-rated amount vs partner's p...,8136,0.03,633,0.0,18.98,23.0,29.0,69.0,24.39,8.93,
6,amb_diff,difference in self-rated amount vs partner's p...,8100,0.03,702,0.0,7.5,11.0,15.0,57.0,11.55,5.57,
7,fun_diff,difference in self-rated amount vs partner's p...,8118,0.03,640,0.0,15.5,20.0,24.0,61.5,20.24,7.65,
8,income_diff,difference between incomes,2178,0.74,1061,8.0,6591.0,14997.0,26150.0,85670.0,18447.4,15078.11,
9,age_diff,difference between ages,8159,0.02,25,0.0,1.0,3.0,5.0,32.0,3.66,3.06,


### DQR: Categorical Features

In [170]:
cat_features = ['samerace', 'same_goal', 'same_career', 'match']

cat_headers = ["Feature", "Desc.", "Count", "% of Missing", "Card.", "Mode", "Mode Freq.", "Mode %", "2nd Mode", "2nd Mode Freq.", "2nd Mode %", "Notes"]

cat_df = pd.DataFrame(columns=cat_headers)

for col in cat_features:
    multi_mode = True if len(df[col].mode()) > 1 else False
    mode = df[col].mode()[0]
    mode_2 = list(df[col].value_counts().keys())[1]
    cat_df.loc[len(cat_df.index)] = {
        "Feature": col,
        "Desc.": features_df.at[col, "desc"],
        "Count": len(df) - df[col].isnull().sum(),
        "% of Missing": round(100 * df[col].isnull().sum() / len(df), 2),
        "Card.": len(df[col].unique()),
        "Mode": mode,
        "Mode Freq.": df[col].value_counts()[mode],
        "Mode %": round(df[col].value_counts()[mode] * 100 / (len(df) - df[col].isnull().sum()),2),
        "2nd Mode": mode_2,
        "2nd Mode Freq.": df[col].value_counts()[mode_2],
        "2nd Mode %": round(df[col].value_counts()[mode_2] * 100 / (len(df) - df[col].isnull().sum()),2),
        "Notes": ""
    }

cat_df

Unnamed: 0,Feature,Desc.,Count,% of Missing,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %,Notes
0,samerace,are the two participants the same race,8346,0.0,2,0,5039,60.38,1,3307,39.62,
1,same_goal,whether both people have the same goal in part...,8346,0.0,2,0,5805,69.55,1,2541,30.45,
2,same_career,whether both intended career paths fall into t...,8346,0.0,2,0,6852,82.1,1,1494,17.9,
3,match,target: did they end up matching,8346,0.0,2,0,6972,83.54,1,1374,16.46,


### Visualizing Continuous Features

In [171]:
import matplotlib.pyplot as plt
import seaborn as sns
for col in con_features:
    bins=10
    if col in ['date_freq', 'out_freq']: bins=7
    plt.title(col)
    sns.histplot(df[col], bins=bins)
    plt.savefig(f'figs/figs/hist_{col}')
    plt.close()


### Visualizing Categorical Features

In [172]:
for col in cat_features:
    vc = df[col].value_counts()
    plt.title(col)
    sns.barplot(x=list(vc.index), y=vc.iloc[:], color='#3274a1')
    plt.savefig(f'figs/figs/bar_{col}')
    plt.close()

## 2.2 Missing Values and Outliers

### Missing Values

In [173]:
from sklearn.impute import KNNImputer
import numpy as np

# Verify there are no missing values among categorical features
for col in cat_features:
    indeces = df[df[col].isna()].index
    print(len(indeces))

# Mean imputation for continuous features
for col in con_features:
    if col != 'income_diff':
        mean = df[col].mean()
        df[col].fillna(value=mean, inplace=True)


# KNN imputation for income_diff
imputer = KNNImputer()
index = list(df.columns).index('income_diff')
imputed_col = imputer.fit_transform(df)[:, 13]
df['income_diff'] = imputed_col

display(df)

0
0
0
0


Unnamed: 0.1,Unnamed: 0,match,exphappy,samerace,hobby_diff_phys,hobby_diff_out,hobby_diff_in,same_goal,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,income_diff,age_diff,same_career,confidence,imprace,date_freq,out_freq
0,0,0,3.0,0,7.0,21.0,24.0,0,36.0,23.0,24.0,20.0,17.0,26855.6,6.0,0,0.125000,4.5,7.0,1.0
1,1,0,3.0,0,7.0,19.0,15.0,0,60.0,19.0,18.0,38.0,13.0,40250.0,1.0,0,0.550000,1.5,7.0,1.0
2,2,1,3.0,1,9.0,18.0,18.0,1,24.0,23.0,23.0,17.0,19.0,19000.0,1.0,0,0.150000,2.5,7.0,1.0
3,3,1,3.0,0,6.0,14.0,16.0,1,30.0,14.0,18.0,38.0,8.0,12907.0,2.0,0,0.425000,1.5,7.0,1.0
4,4,1,3.0,0,2.0,28.0,20.0,0,32.0,15.0,23.0,10.0,9.0,32705.0,3.0,0,0.275000,2.5,7.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8341,8341,0,3.0,0,21.0,21.0,13.0,1,40.0,15.5,34.0,28.5,15.0,16892.8,1.0,0,0.386816,3.5,6.0,3.0
8342,8342,0,3.0,0,16.0,7.0,18.0,1,74.0,23.0,12.0,14.0,14.0,22752.0,3.0,0,0.386816,3.5,6.0,3.0
8343,8343,0,3.0,0,11.0,26.0,25.0,0,64.0,16.0,33.5,19.5,14.0,39275.0,2.0,0,0.386816,5.0,6.0,3.0
8344,8344,0,3.0,0,11.0,16.0,22.0,0,37.0,26.5,29.5,18.5,12.0,15071.2,5.0,0,0.386816,3.0,6.0,3.0


### Outlier Handling

In [174]:
clamped_df = df.copy()

for col in con_features:
    if col not in ['confidence', 'out_freq', 'date_freq', 'imprace', 'exphappy']:
        q1 = pd.qcut(df[col], [.25], retbins=True)[1][0] if col != 'income_diff' else pd.qcut(df[col], [.05], retbins=True)[1][0]
        q3 = pd.qcut(df[col], [.75], retbins=True)[1][0] if col != 'income_diff' else pd.qcut(df[col], [.95], retbins=True)[1][0]
        iqr = q3 - q1
        upper_bound = q3 + (iqr * 1.5) if col != 'income_diff' else q3
        lower_bound = q1 - (iqr * 1.5)  if col != 'income_diff' else q1
        outlier_count = len(df[df[col] > upper_bound])
        outlier_count += len(df[df[col] < lower_bound])
        clamped_df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
        print(col, f"({outlier_count} features updated with bounds {lower_bound} to {upper_bound})")

df = clamped_df

df

hobby_diff_phys (108 features updated with bounds 0.0 to 24.0)
hobby_diff_out (183 features updated with bounds 1.0 to 33.0)
hobby_diff_in (87 features updated with bounds 1.5 to 29.5)
attr_diff (444 features updated with bounds -6.899999999999999 to 62.54)
sinc_diff (258 features updated with bounds 1.5 to 37.5)
intel_diff (376 features updated with bounds 4.75 to 42.75)
amb_diff (52 features updated with bounds -3.4437500000000005 to 26.06625)
fun_diff (290 features updated with bounds 3.1750000000000043 to 36.495)
income_diff (833 features updated with bounds 4579.0 to 39660.35)
age_diff (158 features updated with bounds -5.0 to 11.0)


Unnamed: 0.1,Unnamed: 0,match,exphappy,samerace,hobby_diff_phys,hobby_diff_out,hobby_diff_in,same_goal,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,income_diff,age_diff,same_career,confidence,imprace,date_freq,out_freq
0,0,0,3.0,0,7.0,21.0,24.0,0,36.00,23.0,24.0,20.000,17.0,26855.60,6.0,0,0.125000,4.5,7.0,1.0
1,1,0,3.0,0,7.0,19.0,15.0,0,60.00,19.0,18.0,36.495,13.0,39660.35,1.0,0,0.550000,1.5,7.0,1.0
2,2,1,3.0,1,9.0,18.0,18.0,1,24.00,23.0,23.0,17.000,19.0,19000.00,1.0,0,0.150000,2.5,7.0,1.0
3,3,1,3.0,0,6.0,14.0,16.0,1,30.00,14.0,18.0,36.495,8.0,12907.00,2.0,0,0.425000,1.5,7.0,1.0
4,4,1,3.0,0,2.0,28.0,20.0,0,32.00,15.0,23.0,10.000,9.0,32705.00,3.0,0,0.275000,2.5,7.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8341,8341,0,3.0,0,21.0,21.0,13.0,1,40.00,15.5,34.0,28.500,15.0,16892.80,1.0,0,0.386816,3.5,6.0,3.0
8342,8342,0,3.0,0,16.0,7.0,18.0,1,62.54,23.0,12.0,14.000,14.0,22752.00,3.0,0,0.386816,3.5,6.0,3.0
8343,8343,0,3.0,0,11.0,26.0,25.0,0,62.54,16.0,33.5,19.500,14.0,39275.00,2.0,0,0.386816,5.0,6.0,3.0
8344,8344,0,3.0,0,11.0,16.0,22.0,0,37.00,26.5,29.5,18.500,12.0,15071.20,5.0,0,0.386816,3.0,6.0,3.0


## 2.3 Normalization

In [176]:
norm_df = df.copy()

# range normalization ## ai' = (ai - min(a)) / (max(a) - min(a))) * (high - low) + low
for col in con_features:
    high = 1
    low = 0
    min = norm_df[col].min()
    max = norm_df[col].max()
    norm_df[col] = norm_df[col].apply(lambda a: (((a - min) / (max - min)) * (high - low)) + low)

df = norm_df

df.to_csv('data/speed_dating_transformed.csv')

df

Unnamed: 0.1,Unnamed: 0,match,exphappy,samerace,hobby_diff_phys,hobby_diff_out,hobby_diff_in,same_goal,attr_diff,sinc_diff,intel_diff,fun_diff,amb_diff,income_diff,age_diff,same_career,confidence,imprace,date_freq,out_freq
0,0,0,0.222222,0,0.291667,0.600000,0.800000,0,0.571036,0.597222,0.506579,0.504952,0.652184,0.634998,0.545455,0,0.006250,0.421053,1.000000,0.000000
1,1,0,0.222222,0,0.291667,0.533333,0.472727,0,0.958946,0.486111,0.348684,1.000000,0.498729,1.000000,0.090909,0,0.027500,0.105263,1.000000,0.000000
2,2,1,0.222222,1,0.375000,0.500000,0.581818,1,0.377081,0.597222,0.480263,0.414916,0.728912,0.411073,0.090909,0,0.007500,0.210526,1.000000,0.000000
3,3,1,0.222222,0,0.250000,0.366667,0.509091,1,0.474059,0.347222,0.348684,1.000000,0.306910,0.237391,0.181818,0,0.021250,0.105263,1.000000,0.000000
4,4,1,0.222222,0,0.083333,0.833333,0.654545,0,0.506384,0.375000,0.480263,0.204832,0.345274,0.801737,0.272727,0,0.013750,0.210526,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8341,8341,0,0.222222,0,0.875000,0.600000,0.400000,1,0.635688,0.388889,0.769737,0.760054,0.575457,0.351007,0.090909,0,0.019341,0.315789,0.833333,0.333333
8342,8342,0,0.222222,0,0.666667,0.133333,0.581818,1,1.000000,0.597222,0.190789,0.324880,0.537093,0.518025,0.272727,0,0.019341,0.315789,0.833333,0.333333
8343,8343,0,0.222222,0,0.458333,0.766667,0.836364,0,1.000000,0.402778,0.756579,0.489946,0.537093,0.989016,0.181818,0,0.019341,0.473684,0.833333,0.333333
8344,8344,0,0.222222,0,0.458333,0.433333,0.727273,0,0.587199,0.694444,0.651316,0.459934,0.460365,0.299082,0.454545,0,0.019341,0.263158,0.833333,0.333333


## 2.4 Transformations

### Visualizing continuous features post-transformations

In [177]:
import matplotlib.pyplot as plt
import seaborn as sns
for col in con_features:
    bins=10
    if col in ['date_freq', 'out_freq']: bins=7
    # plt.title(col)
    sns.histplot(df[col], bins=bins)
    plt.tight_layout()
    plt.savefig(f'figs/figs_transformed/hist_{col}')
    plt.close()

### Comparing continuous distributions to the target feature

In [178]:
for col in con_features:
    df.pivot(columns='match', values=col).plot.hist()
    plt.xlabel(col)
    plt.tight_layout()
    plt.savefig(f'figs/figs_compare/hist_{col}')
    plt.close()