# Setup

In [20]:
# Put the project's helper-code folder (../../code, relative to this notebook)
# at the front of the import path so local modules can be imported below.
import sys
src_path = '../../code' 
sys.path.insert(0, src_path) 

import matplotlib.pylab as plt
import numpy as np
import pandas as pd

# Project-local plotting helper; presumably resolved through the path entry added above.
from factor_scatter_matrix import factor_scatter_matrix

In [15]:
# Reload imported modules automatically before each cell runs, so edits to the
# helper code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Read the raw speed-dating CSV, decoding it as ISO-8859-1.
data_df = pd.read_csv(
    "../../data/raw/speed_dating_data.csv",
    encoding="ISO-8859-1",
)
# Show the first five rows as a quick sanity check.
data_df.head(n=5)

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


# Male Analysis

In [22]:
# Subset used for the male analysis: rows whose gender code equals 1.
data_m_df = data_df[data_df["gender"].eq(1)]

In [None]:
# Columns shown in the scatter matrix: the outcome first, then the
# self-reported background and interest ratings.
features_to_use = [
    # outcome
    'match',
    # background
    'age', 'go_out',
    # interests
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art',
    'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater',
    'movies', 'concerts', 'music', 'shopping', 'yoga',
    # expectation
    'exphappy',
]

In [None]:
# NOTE(review): an 18x18 figure is opened here, but the helper is also given its own
# figsize=(20,20); whether the helper draws on this figure or creates a new one is
# not visible from this notebook — confirm against factor_scatter_matrix.
plt.figure(figsize=(18,18))
# Scatter matrix of the selected male-participant columns; "match" is passed as the
# factor column (presumably used to colour the points — TODO confirm).
factor_scatter_matrix(data_m_df[features_to_use], "match", figsize=(20,20))
plt.show()

There are NaN values in both 'field' and 'field_cd', but for some entries 'field' is filled in while 'field_cd' is empty, so we fill in the missing codes from the field names here.

In [6]:
# Report how many rows are missing each of the three columns
# (message text is kept exactly as the notebook originally emitted it).
print("Numer of ppl did not fill field:", int(data_df['field'].isnull().sum()))
print("Numer of ppl did not fill field_cd:", int(data_df['field_cd'].isnull().sum()))
print("Number of ppl did not fill career:", int(data_df['career'].isnull().sum()))

# Build a field-name -> field-code lookup from the rows where both are present.
# If a name was ever recorded with more than one code, the first one seen wins,
# which is the same choice the previous row-by-row lookup made (it took f_cd[0]).
field_to_code = (
    data_df[['field', 'field_cd']]
    .dropna()
    .drop_duplicates(subset='field', keep='first')
    .set_index('field')['field_cd']
)

# For rows without a code, look the code up by field name. Rows whose field is
# itself missing, or whose field never appears with a code, map to NaN and are
# therefore left unfilled.
missing_code = data_df['field_cd'].isnull()
recovered = data_df.loc[missing_code, 'field'].map(field_to_code)
data_df.loc[missing_code, 'field_cd'] = recovered

# Number of codes actually filled in.
n = int(recovered.notnull().sum())
print("Done converting with {} filed_cd added!".format(n))

Numer of ppl did not fill field: 63
Numer of ppl did not fill field_cd: 82
Number of ppl did not fill career: 89
Done converting with 19 filed_cd added!


We will only use the features that are relevant and available.

In [7]:
# Columns kept for modelling.
# Left out on purpose: 'career_c' (incomplete, still to be added) and
# 'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1' (not filled in).
use_features = [
    # identifiers and pairing outcome
    'iid', 'gender', 'wave', 'pid', 'match', 'samerace',
    # partner columns
    'age_o', 'race_o',
    'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha',
    # participant background
    'age', 'field_cd', 'race', 'imprace', 'imprelig', 'goal', 'date', 'go_out',
    # interests
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking',
    'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
    'music', 'shopping', 'yoga',
    'exphappy',
    # attribute ratings, groups 1_1 / 2_1 / 3_1
    'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1',
    'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1',
    'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1',
]
# Working frame restricted to the selected columns. Note the capital "D":
# Data_df is a different name from the raw data_df loaded earlier.
Data_df = data_df[use_features]

The features 1_1 and 2_1 are measured differently from 3_1. The former follow this rule: "Waves 6-9: Please rate the importance of the following attributes on a scale of 1-10; Waves 1-5 and 10-21: Please distribute 100 points among the following attributes -- give more points to those attributes that you think are more important to members of the opposite sex when they are deciding whether to date someone. Total points must equal 100."

In [8]:
# Goal of this cell: decide whether the 100-point answers need rescaling to 1-10.
# The two attribute groups that the survey rule applies to:
f1 = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
f2 = ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1']

# Waves 1-5 and 10-21 are the ones documented as using the 100-point allocation.
pts = Data_df[(Data_df['wave'] < 6) | (Data_df['wave'] > 9)]
pts_ind = pts.index
print(Data_df.loc[pts_ind, f1].max())

# Waves 6-9 are documented as using a 1-10 scale, yet their maxima exceed 10.
scale = Data_df[(Data_df['wave'] > 5) & (Data_df['wave'] < 10)]
print("However, somehow some entries that are supposed to be scale also use pts\n", scale[f1].max())

# Keep the wave 6-9 rows in which no value of the group exceeds 10
# (a NaN never compares greater than 10, so all-NaN rows are kept too).
scale_pts1 = scale[~(scale[f1] > 10).any(axis=1)]
scale_pts2 = scale[~(scale[f2] > 10).any(axis=1)]
print("The only one that is not in pts measure are those with Nan value")
scale_pts1[f1 + f2]
# Conclusion: waves 6-9 use points as well, so the values are left unchanged.

attr1_1     100.0
sinc1_1      60.0
intel1_1     50.0
fun1_1       50.0
amb1_1       53.0
shar1_1      30.0
dtype: float64
However, somehow some entries that are supposed to be scale also use pts
 attr1_1     27.78
sinc1_1     23.81
intel1_1    23.81
fun1_1      27.78
amb1_1      20.59
shar1_1     23.81
dtype: float64
The only one that is not in pts measure are those with Nan value


Unnamed: 0,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1
1866,,,,,,,,,,,,
1867,,,,,,,,,,,,
1868,,,,,,,,,,,,
1869,,,,,,,,,,,,
1870,,,,,,,,,,,,


Build the feature vector for each pair, containing the information for both the male and the female participant.

In [9]:
# Drop every row that has a missing value in any of the selected columns.
has_nan = Data_df.isnull().any(axis=1)
print("number of rows contain Nan:", int(has_nan.sum()))
# pid is stored as float in the raw data; once the NaN rows are gone both ID
# columns can be cast to int. astype returns a new frame, so nothing is
# assigned into a slice of the original.
Data_df = Data_df.loc[~has_nan].astype({'pid': int, 'iid': int})

# One frame per gender code (1 -> "m" frame, 0 -> "f" frame).
mdata_df = Data_df[Data_df['gender'] == 1]
fdata_df = Data_df[Data_df['gender'] == 0]
print(mdata_df.shape)
print(fdata_df.shape)

# Partner IDs with no surviving row on the other side. same1/same2 keep one
# entry per affected row; the printed list is de-duplicated and sorted.
same1 = mdata_df.loc[~mdata_df['pid'].isin(fdata_df['iid']), 'pid'].tolist()
print("some guy's partener is not found in fdata:", sorted(set(same1)))
same2 = fdata_df.loc[~fdata_df['pid'].isin(mdata_df['iid']), 'pid'].tolist()
print("some girl's partener is not found in mdata:", sorted(set(same2)))

# Columns taken from the male row of each pair (includes the pair-level
# 'pid', 'match' and 'samerace').
cmfeatures = ['iid', 'pid', 'match', 'samerace', 'age', 'field_cd', 'race', 'imprace', 'imprelig', 'goal', 'date', 'go_out',
              'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing',
              'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga', 'exphappy',
              'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'attr2_1', 'sinc2_1',
              'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1', 'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1']
new_mdata = mdata_df[cmfeatures]

# Columns taken from the female row: her 'iid' plus everything from 'age'
# onwards, i.e. without 'pid', 'match' and 'samerace'.
cffeatures = [cmfeatures[0]] + cmfeatures[4:]
# The female frame has one row per date, so identical rows are collapsed.
new_fdata = fdata_df[cffeatures].drop_duplicates()
new_data = new_mdata.copy()
# Suffix the female columns with '_f' so they cannot collide with the male ones.
new_fdata.columns = [name + '_f' for name in cffeatures]

number of rows contain Nan: 336
(4007, 57)
(4035, 57)
some guy's partener is not found in fdata: [416]
some girl's partener is not found in mdata: [413, 414]


In [10]:
print("building data for pairs...")
# One lookup row per female participant: keep the first record for each iid_f,
# which is the row a first-match lookup by partner ID would select.
f_lookup = new_fdata.drop_duplicates(subset='iid_f', keep='first')

# Attach the partner's columns to every male row in a single join. An inner
# join drops male rows whose partner ID has no female record, preserves the
# order of the male rows, and yields a fresh 0..n-1 index.
df = new_mdata.merge(
    f_lookup,
    how='inner',
    left_on='pid',
    right_on='iid_f',
    validate='many_to_one',
)
# Keep the layout of the recorded output: columns in alphabetical order and
# every value stored as float (all selected columns are numeric in the
# recorded output — the cast will raise if that ever stops being true).
df = df.reindex(columns=sorted(df.columns)).astype(float)

# now drop features that we do not need for prediction
drop_features = ['iid', 'pid', 'iid_f']
print("Done making data for pairs")
pair_df = df.drop(drop_features, axis=1)
pair_df.head()

building data for pairs...
Done making data for pairs


Unnamed: 0,age,age_f,amb1_1,amb1_1_f,amb2_1,amb2_1_f,amb3_1,amb3_1_f,art,art_f,...,sports,sports_f,theater,theater_f,tv,tv_f,tvsports,tvsports_f,yoga,yoga_f
0,27.0,21.0,0.0,15.0,25.0,5.0,5.0,7.0,5.0,1.0,...,8.0,9.0,4.0,1.0,2.0,9.0,7.0,2.0,1.0,1.0
1,27.0,24.0,0.0,0.0,25.0,0.0,5.0,3.0,5.0,6.0,...,8.0,3.0,4.0,9.0,2.0,1.0,7.0,2.0,1.0,1.0
2,27.0,25.0,0.0,10.0,25.0,0.0,5.0,8.0,5.0,5.0,...,8.0,3.0,4.0,7.0,2.0,8.0,7.0,8.0,1.0,7.0
3,27.0,23.0,0.0,10.0,25.0,5.0,5.0,8.0,5.0,7.0,...,8.0,1.0,4.0,9.0,2.0,7.0,7.0,1.0,1.0,8.0
4,27.0,21.0,0.0,10.0,25.0,5.0,5.0,8.0,5.0,8.0,...,8.0,7.0,4.0,6.0,2.0,8.0,7.0,4.0,1.0,3.0


In [11]:
# List the columns of the pair frame: male columns are unsuffixed, female columns end in '_f'.
pair_df.columns

Index(['age', 'age_f', 'amb1_1', 'amb1_1_f', 'amb2_1', 'amb2_1_f', 'amb3_1',
       'amb3_1_f', 'art', 'art_f', 'attr1_1', 'attr1_1_f', 'attr2_1',
       'attr2_1_f', 'attr3_1', 'attr3_1_f', 'clubbing', 'clubbing_f',
       'concerts', 'concerts_f', 'date', 'date_f', 'dining', 'dining_f',
       'exercise', 'exercise_f', 'exphappy', 'exphappy_f', 'field_cd',
       'field_cd_f', 'fun1_1', 'fun1_1_f', 'fun2_1', 'fun2_1_f', 'fun3_1',
       'fun3_1_f', 'gaming', 'gaming_f', 'go_out', 'go_out_f', 'goal',
       'goal_f', 'hiking', 'hiking_f', 'imprace', 'imprace_f', 'imprelig',
       'imprelig_f', 'intel1_1', 'intel1_1_f', 'intel2_1', 'intel2_1_f',
       'intel3_1', 'intel3_1_f', 'match', 'movies', 'movies_f', 'museums',
       'museums_f', 'music', 'music_f', 'race', 'race_f', 'reading',
       'reading_f', 'samerace', 'shar1_1', 'shar1_1_f', 'shar2_1', 'shar2_1_f',
       'shopping', 'shopping_f', 'sinc1_1', 'sinc1_1_f', 'sinc2_1',
       'sinc2_1_f', 'sinc3_1', 'sinc3_1_f', 'sports',

Now let us build the model. First, let us look at the number of samples in each class (0 = no match, 1 = match).

In [12]:
# Separate the target column from the predictor columns.
pair_label = pair_df['match']
pair_train = pair_df.drop(labels=['match'], axis=1)
# The printed shape is that of the full pair frame, label column included.
print("total training data size:", pair_df.shape)
# Class balance of the target.
pair_label.value_counts()

total training data size: (3999, 88)


0.0    3345
1.0     654
Name: match, dtype: int64

We will use a tuned XGBoost model.

In [14]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV   #Performing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 22, 4

from time import time

# create train and test data
train_data, test_data = train_test_split(pair_df, test_size=0.1, random_state=42, stratify=pair_df['match'])
predictors = [x for x in pair_df.columns if x not in ['match']]
print("train shape:", train_data.shape)
print("test shape:", test_data.shape)

train shape: (3599, 88)
test shape: (400, 88)




### Train our model!

(see details :http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)

In [15]:
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=10, early_stopping_rounds=20,
             target='match'):
    """Fit an XGBoost classifier, print a train/test report and plot feature importances.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit. It is modified in place: when ``useTrainCV`` is true its
        ``n_estimators`` is replaced by the number of rounds kept by cross-validation.
    dtrain, dtest : pandas.DataFrame
        Training and test frames; each must contain the ``predictors`` columns and
        the ``target`` column.
    predictors : list of str
        Names of the feature columns.
    useTrainCV : bool, default True
        If true, run ``xgb.cv`` (AUC metric, stratified folds, early stopping) to
        choose the number of boosting rounds before the final fit.
    cv_folds : int, default 10
        Number of cross-validation folds.
    early_stopping_rounds : int, default 20
        Early-stopping patience passed to ``xgb.cv``.
    target : str, default 'match'
        Name of the label column.

    Returns
    -------
    tuple
        ``(dtest_predprob, feat_imp)``: the predicted positive-class probabilities
        for ``dtest`` and a Series of feature scores sorted in descending order.
    """
    t1 = time()
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['auc'],
                          early_stopping_rounds=early_stopping_rounds, stratified=True)
        # One row per boosting round that cross-validation kept.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. No eval_metric is passed: without an
    # eval_set nothing is evaluated during fit, and recent xgboost releases no
    # longer accept that argument here.
    alg.fit(dtrain[predictors], dtrain[target])

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Predict test set:
    dtest_predictions = alg.predict(dtest[predictors])
    dtest_predprob = alg.predict_proba(dtest[predictors])[:, 1]

    t2 = time()
    # Print model report. accuracy_score returns a fraction in [0, 1], so it is
    # scaled by 100 to agree with the '%' suffix.
    train_acc = metrics.accuracy_score(dtrain[target].values, dtrain_predictions)
    test_acc = metrics.accuracy_score(dtest[target].values, dtest_predictions)
    print("\nModel Report (took {0:.2f}sec)".format(t2 - t1))
    print("The result params is:\n", alg.get_xgb_params())
    print("Train Accuracy: {0:.2f}%".format(100 * train_acc))
    print("Train AUC Score: {0:.4f}".format(metrics.roc_auc_score(dtrain[target], dtrain_predprob)))
    print("Test Accuracy: {0:.2f}%".format(100 * test_acc))
    print("Test AUC Score: {0:.4f}".format(metrics.roc_auc_score(dtest[target], dtest_predprob)))

    # Current xgboost exposes the fitted booster as get_booster(); older
    # releases used booster(). Support both.
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    return dtest_predprob, feat_imp

In [16]:
# Classifier configuration, grouped by purpose.
clf = XGBClassifier(
    objective='binary:logistic',
    # boosting schedule: n_estimators is only an upper bound, modelfit's
    # cross-validation step replaces it with the number of rounds it keeps
    learning_rate=0.01,
    n_estimators=5000,
    # tree shape and split constraints
    max_depth=5,
    min_child_weight=6,
    gamma=0.4,
    # row / column subsampling
    subsample=0.95,
    colsample_bytree=0.65,
    # regularisation and class weighting
    reg_alpha=0.,
    scale_pos_weight=1,
    # execution
    nthread=4,
    seed=27,
)
# result is the pair (test-set probabilities, feature importances).
result = modelfit(alg=clf, dtrain=train_data, dtest=test_data, predictors=predictors)

KeyboardInterrupt: 

The 10 most important features (importance in descending order):

In [None]:
# result[1] is the importance Series sorted in descending order; show its first ten labels.
print(result[1].index[:10].values)

The 10 least important features (importance in ascending order):

In [None]:
# Walk the descending-sorted labels from the end: the ten weakest features, weakest first.
print(list(result[1].index.values[::-1][:10]))

### Plot ROC

In [None]:
from sklearn.metrics import roc_curve, auc
rcParams['figure.figsize'] = 10, 8

# ROC points and area under the curve for the held-out test set;
# result[0] holds the predicted positive-class probabilities.
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_data['match'], result[0])
roc_auc = auc(false_positive_rate, true_positive_rate)

# Draw on the current axes through the Axes interface.
roc_ax = plt.gca()
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.legend(loc='lower right')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

see more about auc_roc: https://datamize.wordpress.com/2015/01/24/how-to-plot-a-roc-curve-in-scikit-learn/ and http://gim.unmc.edu/dxtests/roc3.htm