In [None]:
# age in kategorien aufteilen? 3?

In [1]:
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
import time

from IPython.display import display, HTML

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance
from sklearn import metrics

pd.set_option('display.max_columns', 500)

In [2]:
filepath = "./dating.csv"
df_full = pd.read_csv(filepath, encoding = "ISO-8859-1")

In [3]:
def write_features_to_file(features, file_name="current_features.txt"):
    with open(file_name, 'w') as f:
        for feat in features:
            f.write(feat + '\n')

In [4]:
def check_missing_col_value(col_name):
    print('-'*75)
    id_set = set(df_clean[df_clean[col_name].isnull()]["iid"])
    id_set2 = set(df_clean[df_clean[col_name].notnull()]["iid"])
    ids = []
    
    for x in id_set:
        if x in id_set2:
            ids.append(x)
    if len(ids) > 0:
        print(f"Following IDs have a value for column {col_name}: {ids}")
    else:
        print(f"No values for {col_name}")
    

In [5]:
def add_missing_data(df, attr_list):
    pd.options.mode.chained_assignment = None  # default='warn'

    # missing ID of a single person
    df['id'].loc[(df['id'].isnull())] = 22
    
    # pid is missing for the same person in wave 5
    df['pid'].loc[(df['pid'].isnull())] = 118
    
    # adding missing code for lawyer and law
    df['career_c'].loc[(df['career_c'].isnull()) & (df['career']=="lawyer") | (df['career']=="law")  ] = 1
    
    # adding those careers to 'other' because only 1 person have them
    df['career_c'].loc[(df['career_c'].isnull()) & (df['career']=="tech professional") | (df['career']=="Economist")  ] = 15
    
    # add field value based on other participants with same field
    df['field_cd'].loc[(df['field_cd'].isnull()) & (df['field']=="Operations Research")  ] = 5
    
    # as suggested, making met and met_o a tertiary feature and fixing weird values
    df['met'].loc[(df.met.isnull())] = 0
    df['met_o'].loc[(df.met_o.isnull())] = 0
    df['met'].loc[ df.met>2] = 2
    df['met_o'].loc[ df.met_o>2] = 2
    
    # reading and gaming have values above 10, so we make them 10
    df['gaming'].loc[(df.gaming>10)] = 10
    df['reading'].loc[(df.reading>10)] = 10
    
    # these should be 0 because no points were left for the allocation of 100pt.
    attrs = ['fun1_1' ,'amb1_1'  ,'shar1_1']
    for attr in attrs:
        df[attr].loc[df.iid==130] = 0
    
    # these should be 0 because no points were left for the allocation of 100pt.
    attrs = ['amb2_1', 'shar2_1', 'shar1_1']
    for attr in attrs:
        df[attr].loc[ (df[attr].isnull())] = 0
        
    # for these values the range should be 1-10, not 0-10
    df['hiking'].loc[ (df.hiking==0) ] = 1
    df['gaming'].loc[ (df.gaming==0) ] = 1
    df['yoga'].loc[ (df.yoga==0) ] = 1
    
    for attr in attr_list:
        df[attr].loc[ (df[attr]==0) ] = 1
    
    return df.copy()

In [6]:
def remove_rows(df):
    ### Alternative is to use mean age / mean race?
    col_to_filt = ['age', 'race']
    mis_id = []

    # get iid and pid (which is the same) for missing age, age_o, race, race_o rows
    for col in col_to_filt:
        mis_id = list(set(mis_id + list(df[df[col].isna()].iid.values)))
        mis_id = list(set(mis_id + list(df[df[col + '_o'].isna()].pid.values)))
    print('Missing ages', len(mis_id))
    print(f'Original number of participants: {df.shape[0]}')

    mis_id += [28] # another respondent with a lot of missing values
    mis_id += [414, 416] # missing 3_1 values
    
   
    # remove participants with missing data
    df.drop(df[df.iid.isin(mis_id)].index, inplace=True)
    df.drop(df[df.pid.isin(mis_id)].index, inplace=True)
    
    
    print(f"Dropping {len(mis_id)} participants")
    print(f"New number of participants: {df.shape[0]}")
    print(f"Dropped shape: {df.shape}")
    print('-'*50)
    
    return df.copy()

In [7]:
def scale_attr_values(df):
    for col in df_clean[attr3_list]:
        df_clean[col] = df_clean[col] / df_clean[attr3_list].sum(axis=1) * 100
        
    return df.copy()

In [8]:
def replace_nan_by_mean(df, cols):
    
    def replace_in_col(col):
        ## maybe take mean from wave, gender and not overall??
        mode = df[col].mode()
        if len(mode) > 1:
            mode = random.choice(mode) 
        df[col].loc[(df[col].isnull())] = mode[0]
        
    for col in cols:
        replace_in_col(col)
 
    
    return df.copy()


In [9]:
def add_missing_attrs_by_mean(df, cols, filter_by_match=False):

    for col in cols:
        missing_pids = set(df[ df[col].isnull() ] ['pid'])

        for pid in list(missing_pids):
            if filter_by_match:
                mean_val = 0
                ## check if a match is available if not go to else block?
            else:
                mean_val = np.rint(df[df.iid==pid][col+'_o'].mean())
            df[col].loc[ (df_clean.pid==pid) & (df_clean[col].isnull()) ] = mean_val

        
    return df.copy()


In [10]:
def check_empty_columns(df):
    col_with_miss_values = []
    
    for col in df_clean.columns:
        mis = df_clean[col].isnull().sum()
        if mis > 0:
            col_with_miss_values.append(col)
            print("{}: {} missing, {}%".format(col, mis, round(mis/df_full.shape[0] * 100, 3)))
            
    if len(col_with_miss_values)     :
        print("There are no missing values")
    else:
        print("Following columns have missing values")
        print(col_with_miss_values)

In [11]:
def print_df_as_html(df):
    display(HTML(df.to_html()))

In [12]:
print(f"Original shape: {df_full.shape}")

interests = ['sports', 'tvsports', 'exercise', 'dining', 'museums', 'art',  'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga']
attr_list=['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']
attr_list_ex = ['like', 'prob', 'met']

attr1_list = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1',]
attr2_list = ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1',]
attr3_list = ['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']

all_attr_list = attr_list + attr_list_ex + attr1_list + attr2_list + attr3_list

df_clean = df_full.copy()
df_clean = add_missing_data(df_clean, (attr_list + ['like', 'prob']))
df_clean = add_missing_attrs_by_mean(df_clean, (attr_list + attr_list_ex))

unwished_cols = ['pf_o_att' ,'pf_o_sin', 'pf_o_int' ,'pf_o_fun' ,'pf_o_amb', 'pf_o_sha' ] 
unwished_cols += ['attr_o', 'sinc_o' ,'intel_o', 'fun_o', 'amb_o', 'shar_o' ,'prob_o', 'match', 'dec', 'like_o']
unwished_cols += ['length', 'you_call', 'them_cal', 'match',] # post date variables + unrelated date variables
unwished_cols += ['positin1', 'undergra', 'mn_sat', 'tuition', 'zipcode', 'income', 'expnum', 'match_es'] # variables which have missing values that can not be replaced
unwished_cols += ['career', 'field'] # not needed since they are represented by number values + missing values were added

unwished_cols += [col for col in df_full.columns if '1_s' in col or '3_s' in col] # asked mid session, not clear when so we drop it
unwished_cols += [col for col in df_full.columns if '_2' in col]  # asked after the session
unwished_cols += [col for col in df_full.columns if '_3' in col]  # asked after the session
unwished_cols += [col for col in df_full.columns if '4_1' in col]  # missed in wave 6-21 which makes 21%
unwished_cols += [col for col in df_full.columns if '5_1' in col]  # missed in wave 1-9 which makes 40%

print(f"Dropping {len(unwished_cols)} columns")


df_clean.drop(unwished_cols, axis=1, inplace=True)

print(f"Dropped shape: {df_clean.shape}")
print('-'*50)

df_clean = remove_rows(df_clean)
df_clean = replace_nan_by_mean(df_clean, ['career_c', 'date'])
df_clean = scale_attr_values(df_clean)
check_empty_columns(df_clean)

unwished_cols = ['from', 'iid', 'id', 'idg', 'condtn', 'partner', 'pid', 'round'] # rest unnecessary columns
df_clean.drop(unwished_cols, axis=1, inplace=True)


print('-'*50)
print('Current columns')
print(df_clean.columns.tolist())
print_df_as_html(df_clean.head())

Original shape: (8378, 195)
Dropping 125 columns
Dropped shape: (8378, 71)
--------------------------------------------------
Missing ages 9
Original number of participants: 8378
Dropping 12 participants
New number of participants: 8096
Dropped shape: (8096, 71)
--------------------------------------------------
Following columns have missing values
[]
--------------------------------------------------
Current columns
['gender', 'wave', 'position', 'order', 'int_corr', 'samerace', 'age_o', 'race_o', 'dec_o', 'met_o', 'age', 'field_cd', 'race', 'imprace', 'imprelig', 'goal', 'date', 'go_out', 'career_c', 'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga', 'exphappy', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1', 'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1', 'attr', 'sinc', 'int

Unnamed: 0,gender,wave,position,order,int_corr,samerace,age_o,race_o,dec_o,met_o,age,field_cd,race,imprace,imprelig,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr,sinc,intel,fun,amb,shar,like,prob,met
0,0,1,7,4,0.14,0,27.0,2.0,0,2.0,21.0,1.0,4.0,2.0,4.0,2.0,7.0,1.0,1.0,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,16.216216,16.943331,12.819552,14.245129,10.412912,6.0,9.0,7.0,7.0,6.0,5.0,7.0,6.0,2.0
1,0,1,7,3,0.54,0,22.0,2.0,0,2.0,21.0,1.0,4.0,2.0,4.0,2.0,7.0,1.0,1.0,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,16.216216,16.943331,12.819552,14.245129,10.412912,7.0,8.0,7.0,8.0,5.0,6.0,7.0,5.0,1.0
2,0,1,7,10,0.16,1,22.0,4.0,1,1.0,21.0,1.0,4.0,2.0,4.0,2.0,7.0,1.0,1.0,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,16.216216,16.943331,12.819552,14.245129,10.412912,5.0,8.0,9.0,8.0,5.0,7.0,7.0,5.0,1.0
3,0,1,7,5,0.61,0,23.0,2.0,1,2.0,21.0,1.0,4.0,2.0,4.0,2.0,7.0,1.0,1.0,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,16.216216,16.943331,12.819552,14.245129,10.412912,7.0,6.0,8.0,7.0,6.0,8.0,7.0,6.0,2.0
4,0,1,7,7,0.21,0,24.0,3.0,1,2.0,21.0,1.0,4.0,2.0,4.0,2.0,7.0,1.0,1.0,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,16.216216,16.943331,12.819552,14.245129,10.412912,5.0,6.0,7.0,7.0,6.0,6.0,6.0,6.0,2.0


In [13]:
nominal_cols = ['career_c','date','dec_o','field_cd','gender','go_out','goal','met','met_o','race','race_o','samerace']
ordinal_cols = [x for x in df_clean.columns.tolist() if x not in nominal_cols]

df_clean_categorized = pd.get_dummies(df_clean, columns=nominal_cols)
df_clean_categorized[ordinal_cols] = df_clean_categorized[ordinal_cols].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

#with normalization dec_o_0 | 1000 iter -> overfitting
#Training Accuracy: 0.9286890645586298
#Validation Accuracy: 0.6571146245059288

#test_set[ordinal_cols] = StandardScaler().fit_transform(test_set[ordinal_cols])
#with standardization dec_o_0 | 1000 iter -> overfitting
#Training Accuracy: 0.9993412384716732
#Validation Accuracy: 0.6353754940711462

#raw data dec_o_0 | 1000 iter -> overfitting
#Training Accuracy: 0.8280632411067194
#Validation Accuracy: 0.6635375494071146

print_df_as_html(df_clean_categorized.head())

Unnamed: 0,wave,position,order,int_corr,age_o,age,imprace,imprelig,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr,sinc,intel,fun,amb,shar,like,prob,career_c_1.0,career_c_2.0,career_c_3.0,career_c_4.0,career_c_5.0,career_c_6.0,career_c_7.0,career_c_8.0,career_c_9.0,career_c_10.0,career_c_11.0,career_c_12.0,career_c_13.0,career_c_14.0,career_c_15.0,career_c_16.0,career_c_17.0,date_1.0,date_2.0,date_3.0,date_4.0,date_5.0,date_6.0,date_7.0,dec_o_0,dec_o_1,field_cd_1.0,field_cd_2.0,field_cd_3.0,field_cd_4.0,field_cd_5.0,field_cd_6.0,field_cd_7.0,field_cd_8.0,field_cd_9.0,field_cd_10.0,field_cd_11.0,field_cd_12.0,field_cd_13.0,field_cd_14.0,field_cd_15.0,field_cd_16.0,field_cd_17.0,field_cd_18.0,gender_0,gender_1,go_out_1.0,go_out_2.0,go_out_3.0,go_out_4.0,go_out_5.0,go_out_6.0,go_out_7.0,goal_1.0,goal_2.0,goal_3.0,goal_4.0,goal_5.0,goal_6.0,met_0.0,met_1.0,met_2.0,met_o_0.0,met_o_1.0,met_o_2.0,race_1.0,race_2.0,race_3.0,race_4.0,race_6.0,race_o_1.0,race_o_2.0,race_o_3.0,race_o_4.0,race_o_6.0,samerace_0,samerace_1
0,0.0,0.285714,0.142857,0.557471,0.243243,0.081081,0.2,0.333333,0.888889,0.111111,0.777778,0.888889,0.0,0.0,0.444444,0.0,0.444444,0.555556,0.888889,0.0,1.0,1.0,0.888889,0.777778,0.0,0.222222,0.15,0.333333,0.4,0.3,0.283019,0.5,0.35,0.4,0.375,0.4,0.1,0.166667,0.509713,0.587688,0.71654,0.561018,0.581125,0.555556,0.888889,0.666667,0.666667,0.555556,0.444444,0.666667,0.555556,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0
1,0.0,0.285714,0.095238,0.787356,0.108108,0.081081,0.2,0.333333,0.888889,0.111111,0.777778,0.888889,0.0,0.0,0.444444,0.0,0.444444,0.555556,0.888889,0.0,1.0,1.0,0.888889,0.777778,0.0,0.222222,0.15,0.333333,0.4,0.3,0.283019,0.5,0.35,0.4,0.375,0.4,0.1,0.166667,0.509713,0.587688,0.71654,0.561018,0.581125,0.666667,0.777778,0.666667,0.777778,0.444444,0.555556,0.666667,0.444444,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0
2,0.0,0.285714,0.428571,0.568966,0.108108,0.081081,0.2,0.333333,0.888889,0.111111,0.777778,0.888889,0.0,0.0,0.444444,0.0,0.444444,0.555556,0.888889,0.0,1.0,1.0,0.888889,0.777778,0.0,0.222222,0.15,0.333333,0.4,0.3,0.283019,0.5,0.35,0.4,0.375,0.4,0.1,0.166667,0.509713,0.587688,0.71654,0.561018,0.581125,0.444444,0.777778,0.888889,0.777778,0.444444,0.666667,0.666667,0.444444,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1
3,0.0,0.285714,0.190476,0.827586,0.135135,0.081081,0.2,0.333333,0.888889,0.111111,0.777778,0.888889,0.0,0.0,0.444444,0.0,0.444444,0.555556,0.888889,0.0,1.0,1.0,0.888889,0.777778,0.0,0.222222,0.15,0.333333,0.4,0.3,0.283019,0.5,0.35,0.4,0.375,0.4,0.1,0.166667,0.509713,0.587688,0.71654,0.561018,0.581125,0.666667,0.555556,0.777778,0.666667,0.555556,0.777778,0.666667,0.555556,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0
4,0.0,0.285714,0.285714,0.597701,0.162162,0.081081,0.2,0.333333,0.888889,0.111111,0.777778,0.888889,0.0,0.0,0.444444,0.0,0.444444,0.555556,0.888889,0.0,1.0,1.0,0.888889,0.777778,0.0,0.222222,0.15,0.333333,0.4,0.3,0.283019,0.5,0.35,0.4,0.375,0.4,0.1,0.166667,0.509713,0.587688,0.71654,0.561018,0.581125,0.444444,0.555556,0.666667,0.666667,0.555556,0.555556,0.555556,0.555556,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0


In [14]:
feature_selection = ['prob', 'fun3_1', 'attr3_1', 'exercise', 'intel', 'sinc','attr2_1', 'dining', 'sports', 'clubbing']

X = df_clean_categorized.drop(['dec_o_0', 'dec_o_1'], axis=1)
y = df_clean_categorized['dec_o_0']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, )
#start = time.time()
#clf = MLPClassifier(random_state=1, max_iter=1000).fit(X_train, y_train)
#print(time.time() - start)
#predict_train_lrc = clf.predict(X_train)
#predict_test_lrc = clf.predict(X_test)

#print('Training Accuracy:', metrics.accuracy_score(y_train, predict_train_lrc))
#print('Validation Accuracy:', metrics.accuracy_score(y_test, predict_test_lrc))

In [15]:

# Number of random trials
NUM_TRIALS = 5

parameter_space = {
    'alpha': 10.0 ** -np.arange(1, 5), 
    'hidden_layer_sizes': [(i, ) for i in range(50,56,1)],
    'learning_rate': ['constant','adaptive'],
}

mlp = MLPClassifier(max_iter=1000)

nested_scores = np.zeros(NUM_TRIALS)
non_nested_scores = np.zeros(NUM_TRIALS)



for i in range(NUM_TRIALS):
    print(f'Trial#{i}')
    start = time.time()
    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=i)
    
    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=mlp, param_grid=parameter_space, cv=inner_cv, n_jobs=7)
    clf.fit(X, y)
    non_nested_scores[i] = clf.best_score_
    print("Done with Gridsearch", time.time() - start)
    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv, n_jobs=7)
    nested_scores[i] = nested_score.mean()
    end = time.time()
    print(end - start)
    

score_difference = non_nested_scores - nested_scores

print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))    


params={'alpha': 0.0001, 'learning_rate': 'adaptive', 'solver': 'adam'}


# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Trial#0
Done with Gridsearch 627.8424196243286
6741.470896244049
Trial#1
Done with Gridsearch 638.2851915359497
6400.939851522446
Trial#2
Done with Gridsearch 521.4737820625305
6100.930709838867
Trial#3
Done with Gridsearch 527.9417643547058
6076.782231330872
Trial#4
Done with Gridsearch 518.072470664978
6087.980185270309
Average difference of 0.003433 with std. dev. of 0.003394.
Best parameters found:
 {'alpha': 0.1, 'hidden_layer_sizes': (51,), 'learning_rate': 'constant'}
0.669 (+/-0.006) for {'alpha': 0.1, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.663 (+/-0.009) for {'alpha': 0.1, 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.677 (+/-0.018) for {'alpha': 0.1, 'hidden_layer_sizes': (51,), 'learning_rate': 'constant'}
0.661 (+/-0.009) for {'alpha': 0.1, 'hidden_layer_sizes': (51,), 'learning_rate': 'adaptive'}
0.671 (+/-0.008) for {'alpha': 0.1, 'hidden_layer_sizes': (52,), 'learning_rate': 'constant'}
0.667 (+/-0.011) for {'alpha': 0.1, 'hidden_layer_

In [None]:
feature_selection = ['prob', 'fun3_1', 'attr3_1', 'exercise', 'intel', 'sinc','attr2_1', 'dining', 'sports', 'clubbing']
#feature_selection = ['race_6.0','prob','gaming','field_cd_6.0','race_2.0','date_5.0','met_0.0','field_cd_17.0','date_6.0','field_cd_9.0','go_out_1.0','career_c_9.0','go_out_7.0','goal_6.0','go_out_2.0','met_1.0','race_1.0','date_3.0','field_cd_4.0','met_2.0', 'goal_2.0','career_c_4.0','field_cd_2.0','career_c_3.0','field_cd_5.0','field_cd_14.0']

df_test = df_clean.copy()
df_test[ordinal_cols] = df_test[ordinal_cols].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

#X = df_clean_categorized.drop(['dec_o_0', 'dec_o_1'], axis=1)
X = df_test[feature_selection]
y = df_test['dec_o']
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, )
clf = MLPClassifier(random_state=1, max_iter=1000, alpha=0.0001, learning_rate='adaptive', solver='adam').fit(X_train, y_train)
print(time.time() - start)
predict_train_lrc = clf.predict(X_train)
predict_test_lrc = clf.predict(X_test)

print('Training Accuracy:', metrics.accuracy_score(y_train, predict_train_lrc))
print('Validation Accuracy:', metrics.accuracy_score(y_test, predict_test_lrc))

In [None]:
from sklearn.inspection import permutation_importance

r = permutation_importance(clf, X_test, y_test,
                           n_repeats=30,
                           random_state=0)

for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{df_clean_categorized.columns.tolist()[i]:<8}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")


In [None]:
"""
mlp = MLPClassifier(max_iter=100)


x=

parameter_space = {
    'alpha': 10.0 ** -np.arange(1, 5), 
    'hidden_layer_sizes': x,
    'learning_rate': ['constant','adaptive'],
}

outer_cv = KFold(n_splits=4, shuffle=True, random_state=1)

start = time.time()

clf = GridSearchCV(mlp, parameter_space, n_jobs=7, cv=KFold(n_splits=4, shuffle=True, random_state=1))
clf.fit(X_train, y_train)
print(time.time() - start)


nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv, n_jobs=7)
print(time.time() - start)
"""
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in sorted(zip(means, stds, clf.cv_results_['params']), key = lambda t: t[0], reverse=True):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [17]:
best = clf.best_estimator_

predict_train_lrc = best.predict(X_train)
predict_test_lrc = best.predict(X_test)

print('Training Accuracy:', metrics.accuracy_score(y_train, predict_train_lrc))
print('Validation Accuracy:', metrics.accuracy_score(y_test, predict_test_lrc))

Training Accuracy: 0.7997364953886693
Validation Accuracy: 0.8112648221343873
