#1 - Collaborative Filtering

In [None]:
# Project-local helpers, imported one per line.
import importlib
import get_data
import process_matches_and_members

# Reload the processing module so edits to it are picked up without restarting the kernel.
importlib.reload(process_matches_and_members)

# Match records from the "matches" table, passed through process_match_rating
# (presumably what derives `is_good_match` -- TODO confirm in the module).
df_matches = process_matches_and_members.process_match_rating(
    get_data.get_dataframe_from_postgres("matches")
)
# Member records from the "members" table; used below for the site filter and per-member features.
df_members = get_data.get_dataframe_from_postgres("members")

In [None]:
import pandas as pd

# Keep only the matches whose female member is registered on YUConnects.
# The members table is joined in solely to look up each woman's `site`;
# every other match column is left untouched.
matches_with_site = pd.merge(
    df_matches,
    df_members[['id', 'site']],
    how='left',
    left_on='female_id',
    right_on='id',
)
on_yuconnects = matches_with_site['site'] == 'YUConnects'
# `id_x` / `id_y` are pandas' default merge suffixes: evidently both inputs carry
# a column named `id`, so the merge renames the two copies. Neither is needed downstream.
yuconnects = matches_with_site[on_yuconnects].drop(columns=['id_x', 'id_y'])

In [None]:
from fastai.tabular.all import *

# The model sees nothing but the two member ids, both treated as categorical variables.
procs = [Categorify]
cat_names = ['male_id', 'female_id']
dep_var = ['is_good_match']
y_block = CategoryBlock()
# Seed the splitter so the 80/20 train/validation partition is the same on every run;
# without a seed the reported accuracy changes from run to run.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(yuconnects))

dls = TabularDataLoaders.from_df(
    yuconnects,
    path='.',
    procs=procs,
    cat_names=cat_names,
    cont_names=[],  # no continuous features in this model
    y_names=dep_var,
    y_block=y_block,
    splits=splits,
    bs=64
)

In [None]:
# fastai tabular network over the two id columns, reporting accuracy on the validation set.
learn = tabular_learner(dls, metrics=accuracy)
# Five epochs of one-cycle training with a peak learning rate of 0.01.
learn.fit_one_cycle(n_epoch=5, lr_max=.01)
# ~74% accuracy

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Per-class probabilities and true labels from the learner (get_preds' default dataset).
preds, targs = learn.get_preds()
# Hard prediction = index of the most probable class in each row.
predicted_classes = preds.argmax(dim=1)

# Display names, listed in the order of the sorted class labels
# (assumes the lower-valued label means "bad match" -- TODO confirm).
class_names = ['Bad Match', 'Good Match']

disp = ConfusionMatrixDisplay.from_predictions(
    targs,
    predicted_classes,
    display_labels=class_names,
    cmap=plt.cm.Blues,
)
disp.ax_.set_title("Confusion Matrix")
plt.show()

#2 - Random Forest Model

In [None]:
# Attach the member table to every match twice: once for the woman (columns
# prefixed `female_`) and once for the man (columns prefixed `male_`).
female_profiles = df_members.add_prefix('female_')
male_profiles = df_members.add_prefix('male_')

# Step 1: left-join the woman's attributes on `female_id`.
matches_with_female = yuconnects.merge(
    female_profiles,
    how='left',
    left_on='female_id',
    right_on='female_id',
    suffixes=('', '_female_y'),
)
# Step 2: left-join the man's attributes on `male_id`.
yuconnects_with_member_data = matches_with_female.merge(
    male_profiles,
    how='left',
    left_on='male_id',
    right_on='male_id',
    suffixes=('_female', '_male'),
)


In [None]:
column_names = yuconnects_with_member_data.columns

# Continuous features: the same six per-member fields for the woman and then the man
# (other seemingly continuous columns have bad data and are left out).
forest_cont_names = [
    f'{sex}_{field}'
    for sex in ('female', 'male')
    for field in ('age', 'height_inches', 'num_matches',
                  'acceptance_rate', 'min_height_inches', 'max_height_inches')
]

# Columns that 'is_good_match' is calculated from; the model must never see them.
indicators_of_dep_var = [
    'ms', 'male_pr', 'male_s', 'female_s',
    'female_pr', 'matchmaker_pr', 'match_quality',
    'decline_reason', 'overall_pr', 'male_s_rating',
    'female_s_rating', 'quality_rating', 'overall_rating',
]

# Everything that is not continuous, not a label indicator and not the label itself
# is treated as categorical (column order is preserved).
not_categorical = set(forest_cont_names) | set(indicators_of_dep_var) | set(dep_var)
forest_cat_names = [col for col in column_names if col not in not_categorical]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# FillMissing is added because, unlike the id-only frame, the member columns contain
# missing values. Normalize rescales the continuous columns.
procs = [Categorify, Normalize, FillMissing]
# Seeded so the train/validation partition (and therefore the reported accuracy)
# is reproducible across runs.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(yuconnects_with_member_data))

to = TabularPandas(yuconnects_with_member_data,
                   procs=procs,
                   cat_names=forest_cat_names,
                   cont_names=forest_cont_names,
                   y_names=dep_var,
                   splits=splits,
                   y_block=CategoryBlock())
# Processed feature frames and label vectors for the sklearn / xgboost models below.
X_train = to.train.xs
y_train = to.train.y
X_valid = to.valid.xs
y_valid = to.valid.y

In [None]:
# 100-tree random forest with a fixed seed, trained on all cores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, oob_score=True)
rf_model.fit(X_train, y_train)

y_pred_valid = rf_model.predict(X_valid)
# Deliberately not named `accuracy`: that name is the fastai metric passed to
# tabular_learner earlier, and rebinding it to a float breaks re-running that cell.
rf_accuracy = accuracy_score(y_valid, y_pred_valid)
print(f"Accuracy: {rf_accuracy}")
# 75% accuracy

In [None]:
# List the feature columns the model actually receives.
# NOTE(review): the original note here read "male_pr, ms," -- presumably columns
# being checked for label leakage; both appear in indicators_of_dep_var.
list(X_train.columns)

In [None]:
# Confusion matrix of the latest validation predictions held in y_pred_valid.
class_names = ['Bad Match', 'Good Match']

disp = ConfusionMatrixDisplay.from_predictions(
    y_valid,
    y_pred_valid,
    display_labels=class_names,
    cmap=plt.cm.Blues,
)
disp.ax_.set_title("Confusion Matrix")
plt.show()

In [None]:
import xgboost as xgb

# Gradient-boosted trees on the same train/validation frames as the random forest.
# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases --
# confirm the installed version still accepts it.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=.1,
    max_depth=20,
    enable_categorical=True,
    subsample=1,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

xgb_model.fit(X_train, y_train)

y_pred_valid = xgb_model.predict(X_valid)

# Deliberately not named `accuracy`: that name is the fastai metric passed to
# tabular_learner earlier, and rebinding it to a float breaks re-running that cell.
xgb_accuracy = accuracy_score(y_valid, y_pred_valid)
print(f"Accuracy: {xgb_accuracy}")
# ~76.5% accuracy (playing with hyperparams didn't change much as long as max_depth was above 20)

In [None]:
# Confusion matrix of the latest validation predictions held in y_pred_valid.
class_names = ['Bad Match', 'Good Match']

matrix_options = dict(display_labels=class_names, cmap=plt.cm.Blues)
disp = ConfusionMatrixDisplay.from_predictions(y_valid, y_pred_valid, **matrix_options)

disp.ax_.set_title("Confusion Matrix")
plt.show()

#2a - Random Forest with Rejection Counts

In [None]:
import importlib
import rejection_type_counts

# Reload so edits to the helper module take effect without restarting the kernel.
importlib.reload(rejection_type_counts)

# Restrict to members that have at least one match before computing the counts.
has_any_match = df_members['num_matches'] > 0
members_with_matches = df_members[has_any_match]
rejection_count = rejection_type_counts.add_rejection_type_counts(
    members_with_matches, df_matches
)

In [None]:
# Attach the rejection-count table to every match twice: once for the woman
# (columns prefixed `female_`) and once for the man (columns prefixed `male_`).
female_rejections = rejection_count.add_prefix('female_')
male_rejections = rejection_count.add_prefix('male_')

matches_with_female_counts = yuconnects.merge(
    female_rejections,
    how='left',
    left_on='female_id',
    right_on='female_id',
    suffixes=('', '_female_y'),
)
matches_with_both_counts = matches_with_female_counts.merge(
    male_rejections,
    how='left',
    left_on='male_id',
    right_on='male_id',
    suffixes=('_female', '_male'),
)
# process_data gets rid of bad values, which allows more columns to be used as continuous.
yuconnects_with_rejection_count = process_matches_and_members.process_data(matches_with_both_counts)

In [None]:
column_names = yuconnects_with_rejection_count.columns

# Rejection-count columns, recognised by their doubled sex prefix.
rejection_prefixes = (
    'male_male_reason',
    'male_male_rejection',
    'female_female_reason',
    'female_female_rejection',
)
rejection_counts_columns = [col for col in column_names if col.startswith(rejection_prefixes)]

# Continuous features: every rejection count, then the same six per-member fields
# for the woman and the man (other seemingly continuous columns have bad data).
member_cont_names = [
    f'{sex}_{field}'
    for sex in ('female', 'male')
    for field in ('age', 'height_inches', 'num_matches',
                  'acceptance_rate', 'min_height_inches', 'max_height_inches')
]
forest_cont_names = rejection_counts_columns + member_cont_names

# Everything that is not continuous, not a label indicator and not the label itself
# is treated as categorical.
not_categorical = set(forest_cont_names) | set(indicators_of_dep_var) | set(dep_var)
forest_cat_names = [col for col in column_names if col not in not_categorical]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# FillMissing handles the missing values in the merged member columns.
# Normalize rescales the continuous columns.
procs = [Categorify, Normalize, FillMissing]
# Seeded so the train/validation partition (and therefore the reported accuracy)
# is reproducible across runs.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(yuconnects_with_rejection_count))
to = TabularPandas(yuconnects_with_rejection_count,
                   procs=procs,
                   cat_names=forest_cat_names,
                   cont_names=forest_cont_names,
                   y_names=dep_var,
                   splits=splits,
                   y_block=CategoryBlock())
# Processed feature frames and label vectors for the models below.
X_train = to.train.xs
y_train = to.train.y
X_valid = to.valid.xs
y_valid = to.valid.y

In [None]:
# Same forest configuration as before, now trained on the rejection-count features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, oob_score=True)
rf_model.fit(X_train, y_train)

y_pred_valid = rf_model.predict(X_valid)
# Deliberately not named `accuracy`: that name is the fastai metric passed to
# tabular_learner earlier, and rebinding it to a float breaks re-running that cell.
rf_accuracy = accuracy_score(y_valid, y_pred_valid)
print(f"Accuracy: {rf_accuracy}")
# 77% accuracy

In [None]:
# Confusion matrix of the latest validation predictions held in y_pred_valid.
class_names = ['Bad Match', 'Good Match']

disp = ConfusionMatrixDisplay.from_predictions(
    y_valid, y_pred_valid,
    cmap=plt.cm.Blues,
    display_labels=class_names,
)
disp.ax_.set_title("Confusion Matrix")
plt.show()

#2b - More Rejection Feature Engineering

In [None]:
# Pick up any edits to the helper module.
importlib.reload(rejection_type_counts)
# Recomputed here but not used by this cell.
members_with_matches = df_members.loc[df_members['num_matches'] > 0]
# Extends the rejection_count table built in the previous section.
# NOTE(review): the table is fed back into itself, so re-running this cell
# alone may not be idempotent -- confirm against add_rejection_sums.
rejection_count = rejection_type_counts.add_rejection_sums(rejection_count, df_matches)

In [None]:
# Replace missing acceptance rates with the mean of the observed ones.
mean_acceptance_rate = rejection_count['acceptance_rate'].mean()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and leaves the frame
# unchanged under pandas Copy-on-Write.
rejection_count['acceptance_rate'] = rejection_count['acceptance_rate'].fillna(mean_acceptance_rate)

In [None]:
# Rebuild the match frame from the extended rejection_count table: one join for
# the woman's columns (prefixed `female_`), one for the man's (prefixed `male_`).
female_side = rejection_count.add_prefix('female_')
male_side = rejection_count.add_prefix('male_')

joined_on_female = yuconnects.merge(
    female_side,
    how='left',
    left_on='female_id',
    right_on='female_id',
    suffixes=('', '_female_y'),
)
yuconnects_with_rejection_count = joined_on_female.merge(
    male_side,
    how='left',
    left_on='male_id',
    right_on='male_id',
    suffixes=('_female', '_male'),
)

In [None]:
yuconnects_with_rejection_count = rejection_type_counts.add_rejection_products(yuconnects_with_rejection_count)
# Missing male rejection percentages are treated as 0. The filled columns are assigned
# back instead of using fillna(inplace=True) on a column selection, which is
# deprecated and leaves the frame unchanged under pandas Copy-on-Write.
# NOTE(review): only the male columns are filled -- confirm the female ones cannot be missing.
male_percent_columns = ['male_percent_rejected', 'male_percent_rejecter']
yuconnects_with_rejection_count[male_percent_columns] = (
    yuconnects_with_rejection_count[male_percent_columns].fillna(0)
)

In [None]:
# Interaction features: how often one side rejects multiplied by how often the
# other side gets rejected. (The original cell opened with a bare column selection
# whose result was discarded; it had no effect and has been removed.)
percent_columns = ['male_percent_rejecter', 'female_percent_rejecter',
                   'male_percent_rejected', 'female_percent_rejected']
yuconnects_with_rejection_count['male_rejects_female_percentage_product'] = (
    yuconnects_with_rejection_count['male_percent_rejecter']
    * yuconnects_with_rejection_count['female_percent_rejected']
)
yuconnects_with_rejection_count['female_rejects_male_percentage_product'] = (
    yuconnects_with_rejection_count['female_percent_rejecter']
    * yuconnects_with_rejection_count['male_percent_rejected']
)
# Density estimate of the four percentages and the two products.
product_feature_columns = ['male_rejects_female_percentage_product',
                           'female_rejects_male_percentage_product']
yuconnects_with_rejection_count[percent_columns + product_feature_columns].plot.kde()

In [None]:
procs = [Categorify, Normalize, FillMissing]

column_names = yuconnects_with_rejection_count.columns

product_columns = [col for col in column_names if 'product' in col]

# 'rejecte' is the shared stem of 'rejecter' and 'rejected', so this picks up both kinds.
rejectered_columns = [col for col in column_names if 'rejecte' in col]

# Extend the continuous list from the previous section. dict.fromkeys drops repeats
# while keeping order, so a column matched by both substrings, or a second run of
# this cell, cannot feed the same feature to the model more than once.
forest_cont_names = list(dict.fromkeys(forest_cont_names + product_columns + rejectered_columns))

# Everything that is not continuous, not a label indicator and not the label itself
# is treated as categorical.
forest_cat_names = [col for col in column_names if (col not in forest_cont_names) and (col not in indicators_of_dep_var) and (col not in dep_var)]

# Seeded so the train/validation partition is reproducible across runs.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(yuconnects_with_rejection_count))
forest_cat_names


In [None]:
# Run the fastai preprocessing over the engineered frame and expose the
# train/validation features and labels for the models below.
to = TabularPandas(
    yuconnects_with_rejection_count,
    procs=procs,
    cat_names=forest_cat_names,
    cont_names=forest_cont_names,
    y_names=dep_var,
    y_block=CategoryBlock(),
    splits=splits,
)
X_train, y_train = to.train.xs, to.train.y
X_valid, y_valid = to.valid.xs, to.valid.y

In [None]:
# Number of feature columns in the training frame.
to.train.xs.shape[1]

In [None]:
# Inspect one engineered product feature in the processed training frame.
X_train.loc[:, 'male_rejects_female_percentage_product']

In [None]:
# Same forest configuration as before, now including the rejection product features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, oob_score=True)
rf_model.fit(X_train, y_train)

y_pred_valid = rf_model.predict(X_valid)
# Deliberately not named `accuracy`: that name is the fastai metric passed to
# tabular_learner earlier, and rebinding it to a float breaks re-running that cell.
rf_accuracy = accuracy_score(y_valid, y_pred_valid)
print(f"Accuracy: {rf_accuracy}")
# 78% accuracy

In [None]:
# Confusion matrix of the latest validation predictions held in y_pred_valid.
class_names = ['Bad Match', 'Good Match']

disp = ConfusionMatrixDisplay.from_predictions(
    y_valid,
    y_pred_valid,
    cmap=plt.cm.Blues,
    display_labels=class_names,
)

disp.ax_.set_title("Confusion Matrix")
plt.show()

#2c - throwing it in an XGBoost Model

In [None]:
import xgboost as xgb

# Larger, deeper boosted ensemble than the earlier XGBoost run, trained on the
# engineered rejection features.
# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases --
# confirm the installed version still accepts it.
xgb_model = xgb.XGBClassifier(
    n_estimators=250,
    learning_rate=.1,
    max_depth=25,
    enable_categorical=True,
    subsample=1,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

xgb_model.fit(X_train, y_train)

y_pred_valid = xgb_model.predict(X_valid)

# Deliberately not named `accuracy`: that name is the fastai metric passed to
# tabular_learner earlier, and rebinding it to a float breaks re-running that cell.
xgb_accuracy = accuracy_score(y_valid, y_pred_valid)
print(f"Accuracy: {xgb_accuracy}")
# ~79% accuracy (playing with hyperparams didn't change much as long as max_depth was above 20)

In [None]:
# Confusion matrix of the latest validation predictions held in y_pred_valid.
class_names = ['Bad Match', 'Good Match']

matrix_options = dict(display_labels=class_names, cmap=plt.cm.Blues)
disp = ConfusionMatrixDisplay.from_predictions(y_valid, y_pred_valid, **matrix_options)

disp.ax_.set_title("Confusion Matrix")
plt.show()

In [None]:
# Predicted probability of the positive class (second column of predict_proba).
y_pred_proba = xgb_model.predict_proba(X_valid)[:, 1]
# Two stricter-than-default cut-offs for calling a match good.
# NOTE(review): the original "# best accuracy" note sat above the 0.65 line --
# confirm which threshold it referred to.
y_pred_very_strict = (y_pred_proba >= 0.65).astype(int)
y_pred_strict = (y_pred_proba >= 0.565).astype(int)

# Accuracy under each threshold. Each label names the predictions it reports on
# (previously "strict" was printed for the very-strict set and "lenient" for the strict set).
from sklearn.metrics import accuracy_score
print(f"Accuracy (very strict): {accuracy_score(y_valid, y_pred_very_strict)}")
print(f"Accuracy (strict): {accuracy_score(y_valid, y_pred_strict)}")

In [None]:
# Confusion matrix for the 0.65 probability threshold.
class_names = ['Off Target', 'On Target']

disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_valid,
    y_pred=y_pred_very_strict,
    display_labels=class_names,
    cmap=plt.cm.Blues
)

# State the threshold in the title, as the 56.5% and 10% plots do, so the
# three figures can be told apart.
disp.ax_.set_title("Confusion Matrix (65% Threshold)")
plt.show()

In [None]:
# Confusion matrix for the 0.565 probability threshold.
class_names = ['Off Target', 'On Target']

disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_valid,
    y_pred=y_pred_strict,
    display_labels=class_names,
    cmap=plt.cm.Blues
)

# Title typo fixed ("Thesrold" -> "Threshold").
disp.ax_.set_title("Confusion Matrix (56.5% Threshold)")
plt.show()

In [None]:
# Very permissive cut-off: anything with at least 10% predicted probability counts as positive.
passes_lenient_cutoff = y_pred_proba >= 0.1
y_pred_lenient = passes_lenient_cutoff.astype(int)
print(f"Accuracy (lenient): {accuracy_score(y_valid, y_pred_lenient)}")

In [None]:
# Confusion matrix for the 0.1 probability threshold.
class_names = ['Off Target', 'On Target']

disp = ConfusionMatrixDisplay.from_predictions(
    y_valid,
    y_pred_lenient,
    cmap=plt.cm.Blues,
    display_labels=class_names,
)
disp.ax_.set_title("Confusion Matrix (10% Threshold)")
plt.show()