## **CISC873_Assignment1**

Mahsa Aghaeeaval (10177616)

# Loading Data / Libraries

In [None]:
# list of code resources used

# Steven Ding's Code for the assignment
# https://www.kaggle.com/aeshen/the-secret-to-getting-the-second-date
# https://www.kaggle.com/lucabasa/the-data-science-book-of-love
# https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
# https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [25]:
# libraries used are loaded
from google.colab import drive

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, Normalizer
from sklearn.svm import SVC
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from xgboost import XGBClassifier

In [None]:
# grab dataset from google drive
drive.mount('/content/drive')

# folder on the mounted drive that holds the assignment CSVs; edit this one
# line if the files live somewhere else
DATA_DIR = "/content/drive/My Drive/CISC873_Assignments/Assignment1"

# loading train.csv as a pandas dataframe (used for EDA and model fitting)
df = pd.read_csv(DATA_DIR + "/train.csv")

# loading test.csv as a pandas dataframe to be used later for kaggle prediction
df_test_for_kaggle = pd.read_csv(DATA_DIR + "/test.csv")

# Data exploration

In [None]:
# peek into data

# print first and last few rows of the dataset
print(df.head())
print(df.tail())

# print all column headers + type
# (df.info() writes its report itself and returns None, so it is not wrapped in print)
df.info()

# print the number of missing values per column using the isnull() function
# Series.items() is used because Series.iteritems() was removed in pandas 2.0
missing_counts = df.isnull().sum()
for column, n_missing in missing_counts.items():
    print(column, ",", n_missing)

#NOTES: These statistics show how the data is structured and what it contains, and give an idea of how much 'fun' this is going to be

In [None]:
# features / target feature distribution check

# checking the distribution of features
select_feature = df['match'] # replace 'match' with any feature you want to check

feature_distribution_hist = plt.hist(select_feature) # dist using hist
plt.show()

feature_distribution_kde = sns.kdeplot(select_feature) # dist using kde
plt.show()

# stats summary of features
print(round(df.describe().T))
# numeric_only=True restricts the median to numeric columns; since pandas 2.0
# the default no longer skips object columns and raises a TypeError instead
print(df.median(numeric_only=True))


#NOTES: checking the distribution helps understand the feature in terms of its range. It will help see if the values are centered or scattered.
#NOTES: I am checking with 2 graphs (kde and histogram) since some features are better represented using one of them
#NOTES: I included the stats summary (mean) and the median to mathematically check whether any features are heavily + or - skewed

In [10]:
# drop every column in which at least half of the values are missing
missing_fraction = df.isnull().mean()
columns_to_keep = missing_fraction[missing_fraction < .5].index
df = df[columns_to_keep]


#NOTE: the threshold (.5) can be changed to see which columns are removed

In [None]:
# print the data type of every column to see what mess we're dealing with
# Series.items() is used because Series.iteritems() was removed in pandas 2.0
column_dtypes = df.dtypes
for column, dtype in column_dtypes.items():
    print(column, ",", dtype)


# df.info() writes its report itself and returns None, so it is not wrapped in print
df.info()

In [None]:
# creating an object-free data-frame (JUST FOR EDA, df will be used for the pipeline)
# The object columns are selected by dtype instead of by a hard-coded name list,
# so this cell keeps working when the missing-value threshold used earlier is
# changed and one of those columns is no longer present in df.
no_object_df = df.select_dtypes(exclude=['object'])

# df.info() writes its report itself and returns None, so it is not wrapped in print
no_object_df.info()

In [None]:
# heatmap to check attribute correlation
# Use the axes returned by plt.subplots directly: calling plt.axes() afterwards
# adds a second axes on top of the one subplots already created.
fig, ax = plt.subplots(figsize=(20,15))
ax.set_title("Correlation Heatmap")
corr = no_object_df.corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

In [None]:
# relationship between gender and matches: one histogram of `match` per gender value
sns.set(style="ticks", color_codes=True)
g = sns.FacetGrid(no_object_df, col="gender").map(plt.hist, "match")
# plain tick labels (no offset / scientific notation)
plt.ticklabel_format(useOffset=False, style='plain')

In [None]:
# look into race
# assigning the number to their respective string for better EDA.
# The labels are kept in a separate Series instead of being written back into
# df: df is later used to build the model inputs (x = df.drop('match', ...)),
# and df_test_for_kaggle is never recoded, so overwriting df['race'] with
# strings would make the training and test columns disagree.
RACE_LABELS = {1: 'Black', 2: 'White', 3: 'Hispanic',
               4: 'Asian', 6: 'Other'}
race_named = df.race.map(RACE_LABELS).fillna(df.race)

print(race_named.value_counts(dropna=False))

# look into race by age
race_age = pd.DataFrame({'race': race_named, 'age': df['age']})
ax = race_age.groupby('race').mean().plot(kind='bar', figsize=(12,5), legend=False,
                                          title='Mean Age by Race',
                                          ylim=(24,28), color='rgbmy')
# reuse the tick labels taken from the group index so that each bar is always
# labelled with the group it actually shows
ax.set_xticklabels(ax.get_xticklabels(), fontsize=12, rotation='horizontal')
ax.set_xlabel('',fontsize=1)

# write the (rounded) mean age just above each bar
for bar in ax.patches:
    ax.text(bar.get_x()+.155, bar.get_height()+.05, \
            str(round((bar.get_height()), 1)), fontsize=12)

# NOTE: I found the documentation file for the dataset which is how I know what string to assign to each number

In [None]:
# look into fields of study of the participants
# The labels are kept in a separate Series instead of being written back into
# df: df is later used to build the model inputs, and df_test_for_kaggle is
# never recoded, so df['field_cd'] must keep its original coding.
FIELD_LABELS = {1: 'Law', 2: 'Math', 3: 'Soc. Sc.', 4: 'Med. Sc.',
                5: 'Eng.', 6: 'Journ.', 7: 'Hist.', 8: 'Econ', 9: 'Educ.',
                10: 'Nat. Sc.', 11: 'Soc. Wr.', 12: 'Und.', 13: 'Pol. Sc.',
                14: 'Film', 15: 'Arts', 16:'Lang.', 17: 'Arch.', 18: 'Oth.'}
field_named = df.field_cd.map(FIELD_LABELS).fillna(df.field_cd)
print(field_named.value_counts(dropna=False))

# small EDA-only frame holding the labelled field next to gender
field_gender = pd.DataFrame({'gender': df['gender'], 'field_cd': field_named})

# plot
plt.figure(figsize = (12,5))
ax = sns.countplot(x="field_cd", data=field_gender)
plt.title('Field of study', fontsize=18)

# field of study by gender plot
field_by_gender = field_gender.groupby(['field_cd', 'gender']).size().unstack().fillna(0)
ax = field_by_gender.plot(kind='bar', figsize=(12,6), stacked=True)
ax.set_xticklabels(ax.get_xticklabels(), fontsize=12, rotation=45)

ax.set_title('Field of study by gender', fontsize=18)
ax.set_xlabel('',fontsize=1)

In [None]:
# Is race important (is it a decided factor for match)?

# histogram of the imprace column, drawn without grid lines
ax = df['imprace'].hist(bins=10, figsize=(12,8), grid=False)
ax.set_title('How important is the race', fontsize=15)
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

In [None]:
# Is religion important (is it a decided factor for match)?

# histogram of the imprelig column, drawn without grid lines
ax = df['imprelig'].hist(bins=10, figsize=(12,8), grid=False)
ax.set_title('How important is the religion', fontsize=15)
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

In [None]:
# looking into why people are doing this (goals)
# NOTE: I found the documentation file for the dataset which is how I know what string to assign to each number
# The labels are kept in a separate Series instead of being written back into
# df: df is later used to build the model inputs, and df_test_for_kaggle is
# never recoded, so df['goal'] must keep its original coding.
GOAL_LABELS = {1: 'Fun', 2: 'Meet', 3: 'Date',
               4: 'Relationship', 5: 'IdidIt', 6: 'Other'}
goal_named = df.goal.map(GOAL_LABELS).fillna(df.goal)

goal_named.value_counts(dropna=False)

In [None]:
# looking into people's interests
def many_hist(cols):
    """Draw one histogram per column of the global ``df`` on a two-column grid.

    Parameters
    ----------
    cols : sequence of str
        Names of the ``df`` columns to plot. Panels are filled row by row,
        left to right, in the order given.

    Returns
    -------
    None
        The figure is created through pyplot; nothing is returned.
    """
    num = len(cols)
    if num == 0:
        # nothing to draw (plt.subplots cannot build a grid with zero rows)
        return
    rows = (num + 1) // 2
    # squeeze=False always yields a 2-D array of axes, so one or two columns
    # (a single row) are handled the same way as larger grids
    fig, axes = plt.subplots(rows, 2, figsize=(15, 5 * rows), squeeze=False)
    panels = axes.ravel()  # row-major: (0,0), (0,1), (1,0), ...
    for feat, panel in zip(cols, panels):
        df[feat].hist(label=feat, ax=panel)
        panel.set_title(feat, fontsize=12)
        panel.grid(False)
    # with an odd number of columns the last panel stays unused; hide it
    for panel in panels[num:]:
        panel.set_visible(False)
# hists for interests
interests = [
    'sports', 'tvsports', 'exercise', 'dining', 'museums',
    'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater',
    'movies', 'concerts', 'music', 'shopping', 'yoga',
]
many_hist(interests)


# heatmap of the pairwise correlations between the interest columns
corr = df[interests].corr()
plt.figure(figsize=(12,10))
ax = plt.gca()
sns.heatmap(corr, cmap='RdBu_r', ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_title('Correlation between interests', fontsize=18)

# Data preparation



In [12]:
# target (y) and feature frame (x) for PIPELINE 1
y = df['match']
x = df.drop(columns='match')
# column names handed to the numeric / categorical branches of the preprocessor
features_numeric = x.select_dtypes(include=['float64']).columns.tolist()
features_categorical = x.select_dtypes(include=['object']).columns.tolist()

# Pipelines

Since it would take up unnecessary space to have a cell for every variation of every model, I will show all variations of my XGBClassifier (since it was my best model) and only one cell for each of the other models (using grid search).

In [None]:
# Pipeline 1 (BASE LINE) grid search + XGclassifier

# numeric columns: impute with the column mean, then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())]
)

# categorical columns: impute with the most frequent value, then one-hot encode
# (categories not seen during fit are ignored at prediction time)
transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# route each group of columns through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numeric, features_numeric),
        ('cat', transformer_categorical, features_categorical)
    ]
)

full_pipline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('my_classifier', XGBClassifier(
            objective='binary:logistic', seed=1))
    ]
)



# `__` denotes attribute
# (e.g. my_classifier__n_estimators means the `n_estimators` param for `my_classifier`)
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'my_classifier__n_estimators': [5, 20, 100, 500],
    'my_classifier__max_depth':[3, 5, 10]
}

# 5-fold cross-validated grid search scored by ROC AUC
grid_search = GridSearchCV(
    full_pipline, param_grid, cv=5, verbose=3, n_jobs=2,
    scoring='roc_auc')

grid_search.fit(x, y)

print('best score {}'.format(grid_search.best_score_))
# labelled as params (this line used to be labelled "best score" as well)
print('best params {}'.format(grid_search.best_params_))

In [None]:
# Pipeline 2 (MODIFIED) grid search + XGclassifier

# numeric columns: impute (strategy is tuned by the search), then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]
)

# categorical columns: fill gaps with the literal 'missing', then one-hot encode
transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numeric, features_numeric),
        ('cat', transformer_categorical, features_categorical)
    ]
)


# running SMOTE to overcome unbalanced data problem
smote_enn = SMOTEENN()

# running for k fold validation
kf = StratifiedKFold()

# conducting preprocessor the same way as pipeline, then adding TruncatedSVD() as well
# ImbPipeline (imblearn.pipeline.Pipeline) is required here: SMOTEENN is a
# sampler, not a transformer, and scikit-learn's own Pipeline refuses it as an
# intermediate step.
estimators = [
    ('preprocessor', preprocessor),
    ('smote_enn', smote_enn),
    ('reduce_dim', TruncatedSVD()),
    ('my_classifier', XGBClassifier(objective='binary:logistic')),
]
pipe = ImbPipeline(estimators)




# `__` denotes attribute
# (e.g. my_classifier__n_estimators means the `n_estimators` param for `my_classifier`
#  which is our xgb)
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean','median','most_frequent'],
    'reduce_dim__algorithm' : ['randomized','arpack'],
    'my_classifier__n_estimators': [5, 20, 100, 500],
    'my_classifier__max_depth':[3, 5, 10],
    'my_classifier__min_child_weight': [0.01, 0.1 , 1],
    'my_classifier__subsample' : [0.8],
    'my_classifier__gamma' : [0.01, 0.1, 0.5]
}

grid_search = GridSearchCV(
    pipe, param_grid, cv= kf, verbose=3, n_jobs=2,
    scoring='roc_auc')

grid_search.fit(x, y)

print('best score {}'.format(grid_search.best_score_))
# labelled as params (this line used to be labelled "best score" as well)
print('best params {}'.format(grid_search.best_params_))


## VERY IMPORTANT NOTE: Compared with the previous pipeline I have added SMOTE, kf validation, TruncatedSVD() and more hyperparameters. HOWEVER, these steps were NOT all run together at once.
## I ran these additions one by one to observe model performance but wanted to keep my final code clean. SMOTEENN only works inside imblearn's Pipeline, which is why ImbPipeline is used for `pipe`.

In [None]:
# Pipeline 2 (MODIFIED) Bayes search + XGclassifier

# numeric columns: impute (strategy is tuned by the search), then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]
)

# categorical columns: fill gaps with the literal 'missing', then one-hot encode
transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numeric, features_numeric),
        ('cat', transformer_categorical, features_categorical)
    ]
)


# running SMOTE to overcome unbalanced data problem
smote_enn = SMOTEENN()

# running for k fold validation
kf = StratifiedKFold()

# conducting preprocessor the same way as pipeline, then adding TruncatedSVD() as well
# ImbPipeline (imblearn.pipeline.Pipeline) is required here: SMOTEENN is a
# sampler, not a transformer, and scikit-learn's own Pipeline refuses it as an
# intermediate step.
estimators = [
    ('preprocessor', preprocessor),
    ('smote_enn', smote_enn),
    ('reduce_dim', TruncatedSVD()),
    ('my_classifier', XGBClassifier(objective='binary:logistic')),
]
pipe = ImbPipeline(estimators)


# `__` denotes attribute
# (e.g. my_classifier__n_estimators means the `n_estimators` param for `my_classifier`
#  which is our xgb)
search_spaces = {
    'preprocessor__num__imputer__strategy': ['mean','median','most_frequent'],
    'reduce_dim__algorithm' : ['randomized','arpack'],
    'my_classifier__n_estimators': [5, 20, 100, 500],
    'my_classifier__max_depth':[3, 5, 10],
    'my_classifier__min_child_weight': [0.01, 0.1 , 1],
    'my_classifier__subsample' : [0.8],
    'my_classifier__gamma' : [0.01, 0.1, 0.5]
}

# cv=kf and scoring='roc_auc' match the grid-search cells, so the scores
# printed below are comparable with theirs (kf was previously created but unused)
bayes_search = BayesSearchCV(
    pipe,
    search_spaces,
    n_iter=3,
    cv=kf,
    scoring='roc_auc',
    random_state=0,
    verbose=3,
)

bayes_search.fit(x, y)

# report the Bayes search itself (these lines used to print grid_search's results)
print('best score {}'.format(bayes_search.best_score_))
print('best params {}'.format(bayes_search.best_params_))

# NOTE(review): a second BayesSearchCV over `full_pipline` used to follow here.
# It requested `reduce_dim__algorithm`, a step `full_pipline` does not contain,
# and it re-bound `bayes_search`, discarding the fitted search above. It has
# been removed so that `bayes_search` stays the fitted search used for the
# submission cell.

In [None]:
# Pipeline 2 (MODIFIED) Random search + XGclassifier

# numeric columns: impute (strategy is tuned by the search), then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]
)

# categorical columns: fill gaps with the literal 'missing', then one-hot encode
transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numeric, features_numeric),
        ('cat', transformer_categorical, features_categorical)
    ]
)


# running SMOTE to overcome unbalanced data problem
smote_enn = SMOTEENN()

# running for k fold validation
kf = StratifiedKFold()

# conducting preprocessor the same way as pipeline, then adding TruncatedSVD() as well
# ImbPipeline (imblearn.pipeline.Pipeline) is required here: SMOTEENN is a
# sampler, not a transformer, and scikit-learn's own Pipeline refuses it as an
# intermediate step.
estimators = [
    ('preprocessor', preprocessor),
    ('smote_enn', smote_enn),
    ('reduce_dim', TruncatedSVD()),
    ('my_classifier', XGBClassifier(objective='binary:logistic')),
]
pipe = ImbPipeline(estimators)




# `__` denotes attribute
# (e.g. my_classifier__n_estimators means the `n_estimators` param for `my_classifier`
#  which is our xgb)
param_distributions = {
    'preprocessor__num__imputer__strategy': ['mean','median','most_frequent'],
    'reduce_dim__algorithm' : ['randomized','arpack'],
    'my_classifier__n_estimators': [5, 20, 100, 500],
    'my_classifier__max_depth':[3, 5, 10],
    'my_classifier__min_child_weight': [0.01, 0.1 , 1],
    'my_classifier__subsample' : [0.8],
    'my_classifier__gamma' : [0.01, 0.1, 0.5]
}

# The search runs over `pipe` (built above), not `full_pipline`: only `pipe`
# has the `reduce_dim` step that the parameter space refers to.
# cv=kf and scoring='roc_auc' match the grid-search cells, so the scores
# printed below are comparable with theirs (kf was previously created but unused)
random_search = RandomizedSearchCV(
    pipe,
    param_distributions,
    n_iter=3,
    cv=kf,
    scoring='roc_auc',
    random_state=0,
    verbose=3,
)

random_search.fit(x, y)

# report the random search itself (these lines used to print grid_search's results)
print('best score {}'.format(random_search.best_score_))
print('best params {}'.format(random_search.best_params_))





I will now show only ONE example of each model with GRID search (the best-performing search) to let you see which parameters were used. Pipeline 2 was also run for these models, but that code is omitted since nothing really changes except for the hyperparameters (discussed in more detail in my document).

In [None]:
# Pipeline 1 (grid search) + Random forest

# numeric columns: impute (strategy is overridden by the grid), then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]
)


# only the numeric features are passed on to the random forest in this cell
preprocessor = ColumnTransformer(
    transformers=[('num', transformer_numeric, features_numeric)]
)


full_pipline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('my_classifier', RandomForestClassifier())])



param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'my_classifier__n_estimators': [500, 700],
    'my_classifier__max_depth':[10],
    'my_classifier__min_samples_leaf': [1, 2, 4],
    # 'auto' was dropped from the grid: for RandomForestClassifier it was an
    # alias of 'sqrt' (so it only duplicated fits) and it is no longer accepted
    # by scikit-learn >= 1.3
    'my_classifier__max_features': ['sqrt'],
    'my_classifier__min_samples_split': [2, 5, 10]

}

# 5-fold cross-validated grid search scored by ROC AUC
grid_search = GridSearchCV(
    full_pipline, param_grid, cv=5, verbose=3, n_jobs=2,
    scoring='roc_auc')

grid_search.fit(x, y)

print('best score {}'.format(grid_search.best_score_))
# labelled as params (this line used to be labelled "best score" as well)
print('best params {}'.format(grid_search.best_params_))

In [None]:
# Pipeline 1 (grid search) + Logistic regression

# numeric columns: impute (strategy is overridden by the grid), then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]
)

# categorical columns: fill gaps with the literal 'missing', then one-hot encode
transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numeric, features_numeric),
        ('cat', transformer_categorical, features_categorical)
    ]
)

# solver='liblinear' supports both the 'l1' and the 'l2' penalty searched
# below; the default solver ('lbfgs') does not support 'l1', so every 'l1'
# candidate would fail to fit
full_pipline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('my_classifier', LogisticRegression(class_weight='balanced',
                                             solver='liblinear'))])



param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'my_classifier__C':np.logspace(-3,3,7),  # 0.001, 0.01, ..., 1000
    'my_classifier__penalty':['l1','l2']
}

# 5-fold cross-validated grid search scored by ROC AUC, using all cores
grid_search = GridSearchCV(
    full_pipline, param_grid, cv=5, verbose=3, n_jobs=-1,
    scoring='roc_auc')

grid_search.fit(x, y)

print('best score {}'.format(grid_search.best_score_))
# labelled as params (this line used to be labelled "best score" as well)
print('best params {}'.format(grid_search.best_params_))

In [None]:
# Pipeline 1 (grid search) + SVC

# numeric columns: impute with the median, then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]
)

# categorical columns: fill gaps with the literal 'missing', then one-hot encode
transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numeric, features_numeric),
        ('cat', transformer_categorical, features_categorical)
    ]
)

full_pipline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('my_classifier', SVC(class_weight='balanced'))])



param_grid = {
    'my_classifier__kernel': ['rbf','poly'],
    'my_classifier__gamma': [1e-3, 1e-4],
    'my_classifier__C': [1, 10, 100, 1000]
}


# 5-fold cross-validated grid search scored by ROC AUC, using all cores
grid_search = GridSearchCV(
    full_pipline, param_grid, cv=5, verbose=3, n_jobs=-1,
    scoring='roc_auc')

grid_search.fit(x, y)

print('best score {}'.format(grid_search.best_score_))
# labelled as params (this line used to be labelled "best score" as well)
print('best params {}'.format(grid_search.best_params_))

In [None]:
# Pipeline 1 (grid search) + MLP (neural network)
# MLPClassifier is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.

# numeric columns: impute with the median, then standardise
transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())]
)

# categorical columns: fill gaps with the literal 'missing', then one-hot encode
transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_numeric, features_numeric),
        ('cat', transformer_categorical, features_categorical)
    ]
)

full_pipline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('my_classifier', MLPClassifier())])



param_grid = {
    'my_classifier__hidden_layer_sizes': [(50,100,50), (100,)],
    'my_classifier__activation': ['tanh', 'relu'],
    'my_classifier__solver': ['sgd', 'adam'],
    'my_classifier__alpha': [0.0001, 0.05],
    'my_classifier__learning_rate': ['constant','adaptive']
}


# 4-fold cross-validated grid search scored by ROC AUC, using all cores
grid_search = GridSearchCV(
    full_pipline, param_grid, cv=4, verbose=3, n_jobs=-1,
    scoring='roc_auc')

grid_search.fit(x, y)

print('best score {}'.format(grid_search.best_score_))
# labelled as params (this line used to be labelled "best score" as well)
print('best params {}'.format(grid_search.best_params_))

# Predict and save as csv

Grid search

In [None]:
# build the Kaggle submission from the most recently fitted grid_search:
# the test ids next to the predicted probability of class 1 (second column)
submission = pd.DataFrame({
    'id': df_test_for_kaggle['id'],
    'match': grid_search.predict_proba(df_test_for_kaggle)[:,1],
})
submission.to_csv('name_file_here.csv', index=False)
submission

Bayes search

In [None]:
# build the Kaggle submission from bayes_search:
# the test ids next to the predicted probability of class 1 (second column)
submission = pd.DataFrame({
    'id': df_test_for_kaggle['id'],
    'match': bayes_search.predict_proba(df_test_for_kaggle)[:,1],
})
submission.to_csv('name_file_here.csv', index=False)
submission

Random search

In [None]:
# build the Kaggle submission from random_search:
# the test ids next to the predicted probability of class 1 (second column)
submission = pd.DataFrame({
    'id': df_test_for_kaggle['id'],
    'match': random_search.predict_proba(df_test_for_kaggle)[:,1],
})
submission.to_csv('name_file_here.csv', index=False)
submission