In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
adult_data = pd.read_csv('adult.data')
adult_data.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [3]:
#Going to rename all the columns to correct labels
adult_data.rename(columns={'39':'Age', ' State-gov':'Work-class', ' 77516': 'fnlwgt', ' Bachelors': 'Education',
                          ' 13': 'Education-num', ' Never-married': 'Marital-status', ' Adm-clerical': 'Occupation',
                           ' Not-in-family':'Relationship', ' White': 'Race', ' Male': 'Sex', ' 2174': 'Capital-gain',
                           ' 0' : 'Capital-loss', ' 40':'Hours-per-week',' United-States':'Country',' <=50K' : 'Salary>50K'}, inplace=True)
adult_data.head()

Unnamed: 0,Age,Work-class,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Country,Salary>50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
len(adult_data)

32560

In [5]:
#Seeing what values can be excluded from data based on their frequency of values in dataframe
adult_data['Work-class'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1297
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: Work-class, dtype: int64

In [6]:
adult_data['Education'].value_counts()

 HS-grad         10501
 Some-college     7291
 Bachelors        5354
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: Education, dtype: int64

In [7]:
adult_data['Marital-status'].value_counts()

 Married-civ-spouse       14976
 Never-married            10682
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: Marital-status, dtype: int64

In [8]:
adult_data['Occupation'].value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3769
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: Occupation, dtype: int64

In [9]:
adult_data['Relationship'].value_counts()

 Husband           13193
 Not-in-family      8304
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: Relationship, dtype: int64

In [10]:
adult_data['Race'].value_counts()

 White                 27815
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: Race, dtype: int64

In [12]:
adult_data['Country'].value_counts()

 United-States                 29169
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [13]:
#Dropping all rows with Work class value of ?
adult_data = adult_data.drop(adult_data.loc[adult_data['Work-class'] == ' ?'].index)

In [14]:
#Dropping all rows where 'without-pay' in work class column because it is only there 14 times
adult_data = adult_data.drop(adult_data.loc[adult_data['Work-class'] == ' Without-pay'].index)

In [15]:
#Dropping all the rows with value 'Never-worked' in work class column because it is only there 7 times
adult_data = adult_data.drop(adult_data.loc[adult_data['Work-class'] == 'Never-worked'].index)

In [16]:
#only keeping rows where Education column value is seen more than 500 times
counts = adult_data['Education'].value_counts()
adult_data = adult_data[adult_data['Education'].isin(counts[counts > 500].index)]

In [17]:
#only keeping rows where Marital status column value is seen more than 500 times
counts2 = adult_data['Marital-status'].value_counts()
adult_data = adult_data[adult_data['Marital-status'].isin(counts2[counts2 > 500].index)]

In [18]:
#only keeping rows where Occupation column value is seen more than 700 times
counts3 = adult_data['Occupation'].value_counts()
adult_data = adult_data[adult_data['Occupation'].isin(counts3[counts3 > 500].index)]

In [19]:
#Keeping only the 'United States' label because it holds the vast majority amount of rows
adult_data = adult_data.drop(adult_data.loc[adult_data['Country'] != ' United-States'].index)

In [20]:
#Because all the values in Country are United-states, we will drop this column completely
adult_data = adult_data.drop(columns = 'Country')

In [21]:
adult_data.head()

Unnamed: 0,Age,Work-class,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Salary>50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,<=50K
6,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,>50K


In [22]:
len(adult_data)

26039

In [26]:
#Now we have to standard scale all numerical data
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()

def scale_columns(dataset, cols_scale):
    for col in cols_scale:
        dataset[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(adult_data[col])), columns=[col])
    return dataset

In [27]:
scaled_adult_data = scale_columns(adult_data, ['fnlwgt', 'Education-num', 'Capital-gain', 'Capital-loss', 'Hours-per-week'])
scaled_adult_data

Unnamed: 0,Age,Work-class,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Salary>50K
0,50,Self-emp-not-inc,-0.994489,Bachelors,1.238847,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.149470,-0.220853,-2.353724,<=50K
1,38,Private,0.275282,HS-grad,-0.593128,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.149470,-0.220853,-0.088287,<=50K
2,53,Private,0.458309,11th,-1.509116,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.149470,-0.220853,-0.088287,<=50K
4,37,Private,0.217673,Masters,-0.593128,Married-civ-spouse,Exec-managerial,Wife,White,Female,-0.149470,-0.220853,0.331238,<=50K
6,52,Self-emp-not-inc,-0.263935,HS-grad,1.238847,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.555969,-0.220853,-0.088287,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,,Assoc-acdm,,Married-civ-spouse,Tech-support,Wife,White,Female,,,,<=50K
32556,40,Private,,HS-grad,,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,,,,>50K
32557,58,Private,,HS-grad,,Widowed,Adm-clerical,Unmarried,White,Female,,,,<=50K
32558,22,Private,,HS-grad,,Never-married,Adm-clerical,Own-child,White,Male,,,,<=50K


In [28]:
#Have to change salary columns to only be values 1 and -1
adult_data['Salary>50K'] = np.where((adult_data['Salary>50K'] == ' <=50K'), -1, adult_data['Salary>50K'])
adult_data['Salary>50K'] = np.where((adult_data['Salary>50K'] == ' >50K'), 1, adult_data['Salary>50K'])

In [29]:
adult_data.head()

Unnamed: 0,Age,Work-class,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Salary>50K
0,50,Self-emp-not-inc,-0.994489,Bachelors,1.238847,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14947,-0.220853,-2.353724,-1
1,38,Private,0.275282,HS-grad,-0.593128,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14947,-0.220853,-0.088287,-1
2,53,Private,0.458309,11th,-1.509116,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14947,-0.220853,-0.088287,-1
4,37,Private,0.217673,Masters,-0.593128,Married-civ-spouse,Exec-managerial,Wife,White,Female,-0.14947,-0.220853,0.331238,-1
6,52,Self-emp-not-inc,-0.263935,HS-grad,1.238847,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.555969,-0.220853,-0.088287,1


In [31]:
#Now I want to hot encode all categorical variables
work_class = pd.get_dummies(scaled_adult_data['Work-class'])
education = pd.get_dummies(scaled_adult_data['Education'])
marital_status = pd.get_dummies(scaled_adult_data['Marital-status'])
occupation = pd.get_dummies(scaled_adult_data['Occupation'])
relationship = pd.get_dummies(scaled_adult_data['Relationship'])
race = pd.get_dummies(scaled_adult_data['Race'])
sex = pd.get_dummies(scaled_adult_data['Sex'])

In [33]:
scaled_adult_data = scaled_adult_data.drop(columns = ['Work-class', 'Education', 'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex'],axis=1)

In [34]:
scaled_adult_data.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Salary>50K
0,50,-0.994489,1.238847,-0.14947,-0.220853,-2.353724,-1
1,38,0.275282,-0.593128,-0.14947,-0.220853,-0.088287,-1
2,53,0.458309,-1.509116,-0.14947,-0.220853,-0.088287,-1
4,37,0.217673,-0.593128,-0.14947,-0.220853,0.331238,-1
6,52,-0.263935,1.238847,0.555969,-0.220853,-0.088287,1


In [35]:
new_adult_data = pd.concat([scaled_adult_data , work_class, education, marital_status, occupation, relationship, race, sex], axis=1)

In [36]:
new_adult_data.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Salary>50K,Federal-gov,Local-gov,Private,...,Own-child,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male
0,50,-0.994489,1.238847,-0.14947,-0.220853,-2.353724,-1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,38,0.275282,-0.593128,-0.14947,-0.220853,-0.088287,-1,0,0,1,...,0,0,0,0,0,0,0,1,0,1
2,53,0.458309,-1.509116,-0.14947,-0.220853,-0.088287,-1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,37,0.217673,-0.593128,-0.14947,-0.220853,0.331238,-1,0,0,1,...,0,0,1,0,0,0,0,1,1,0
6,52,-0.263935,1.238847,0.555969,-0.220853,-0.088287,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [37]:
#Now I want to move the dependent variable of salary to the end 
salary = new_adult_data.pop('Salary>50K')

new_adult_data['Salary>50K'] = salary

new_adult_data.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Federal-gov,Local-gov,Private,Self-emp-inc,...,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male,Salary>50K
0,50,-0.994489,1.238847,-0.14947,-0.220853,-2.353724,0,0,0,0,...,0,0,0,0,0,0,1,0,1,-1
1,38,0.275282,-0.593128,-0.14947,-0.220853,-0.088287,0,0,1,0,...,0,0,0,0,0,0,1,0,1,-1
2,53,0.458309,-1.509116,-0.14947,-0.220853,-0.088287,0,0,1,0,...,0,0,0,0,1,0,0,0,1,-1
4,37,0.217673,-0.593128,-0.14947,-0.220853,0.331238,0,0,1,0,...,0,1,0,0,0,0,1,1,0,-1
6,52,-0.263935,1.238847,0.555969,-0.220853,-0.088287,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [38]:
new_adult_data['Salary>50K'] = pd.to_numeric(new_adult_data['Salary>50K'])
new_adult_data = new_adult_data.dropna()
new_adult_data.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Federal-gov,Local-gov,Private,Self-emp-inc,...,Unmarried,Wife,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male,Salary>50K
0,50,-0.994489,1.238847,-0.14947,-0.220853,-2.353724,0,0,0,0,...,0,0,0,0,0,0,1,0,1,-1
1,38,0.275282,-0.593128,-0.14947,-0.220853,-0.088287,0,0,1,0,...,0,0,0,0,0,0,1,0,1,-1
2,53,0.458309,-1.509116,-0.14947,-0.220853,-0.088287,0,0,1,0,...,0,0,0,0,1,0,0,0,1,-1
4,37,0.217673,-0.593128,-0.14947,-0.220853,0.331238,0,0,1,0,...,0,1,0,0,0,0,1,1,0,-1
6,52,-0.263935,1.238847,0.555969,-0.220853,-0.088287,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [39]:
len(new_adult_data)

20853

In [75]:
new_adult_data['Salary>50K'].value_counts()

-1    15543
 1     5310
Name: Salary>50K, dtype: int64

In [41]:
#Collecting all the samples we will need for each algorithm
all_samples = []

for sample in range(0,5):
    all_samples.append(new_adult_data.sample(n=5000, replace=False))

In [42]:
len(all_samples)

5

# Logistic Regression

In [43]:
#Lists to save models and training/testing sets
all_models = []
X_training_sets = []
y_training_sets = []
X_testing_sets = []
y_testing_sets = []

#lists to save best params
accuracy_best_params = []
roc_auc_best_params = []
f1_best_params = []

#looping across each sample for each trial
for sample in all_samples:
    X_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:, -1]
    X_training_sets.append(X_train)
    y_training_sets.append(y_train)
    
    #Separating rows for the test set that were not in the sample
    ix = [i for i in new_adult_data.index if i not in sample.index]
    test_set = new_adult_data.loc[ix]
    X_test = test_set.iloc[:, :-1]
    y_test = test_set.iloc[:, -1]
    X_testing_sets.append(X_test)
    y_testing_sets.append(y_test)
    
    #Initiating classifier
    log_reg = LogisticRegression(max_iter=10000)
    
    grid_values = [{'solver':['saga'],
                   'penalty': ['l1', 'l2'],
                   'C': [.00000001, .0000001, .000001, .00001, .0001, .001, .01, .1, 1, 10, 100, 1000, 10000]},
                   {'solver':['lbfgs'],
                   'penalty': ['l2'],
                   'C': [.00000001, .0000001, .000001, .00001, .0001, .001, .01, .1, 1, 10, 100, 1000, 10000]},
                   {'solver':['lbfgs', 'saga'],
                   'penalty': ['none'],}
                   ]
    clf = GridSearchCV(estimator = log_reg, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model = clf.fit(X_train, y_train)
    
    accuracy_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params.append(model.cv_results_['params'][np.argmin(model.cv_results_['rank_test_f1_micro'])])
    
    all_models.append(model)

In [44]:
accuracy_best_params

[{'C': 1, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 10, 'penalty': 'l2', 'solver': 'saga'},
 {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'},
 {'C': 1, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 1, 'penalty': 'l1', 'solver': 'saga'}]

In [45]:
roc_auc_best_params

[{'C': 1, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'},
 {'C': 1, 'penalty': 'l2', 'solver': 'saga'},
 {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'},
 {'C': 1, 'penalty': 'l2', 'solver': 'saga'}]

In [46]:
f1_best_params

[{'C': 1, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 10, 'penalty': 'l2', 'solver': 'saga'},
 {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'},
 {'C': 1, 'penalty': 'l1', 'solver': 'saga'},
 {'C': 1, 'penalty': 'l1', 'solver': 'saga'}]

In [76]:
accuracy_test_errors = []
accuracy_train_errors = []
for param in accuracy_best_params:
    
    n = 0
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred = model.predict(X_testing_sets[n])
    accuracy_test_errors.append(accuracy_score(y_testing_sets[n], y_pred))
    
    y_pred_train = model.predict(X_training_sets[n])
    accuracy_train_errors.append(accuracy_score(y_training_sets[n], y_pred_train))
    
    n += 1

In [77]:
accuracy_test_errors

[0.8191509493471267,
 0.8184570743707815,
 0.8181416766542611,
 0.8191509493471267,
 0.8191509493471267]

In [78]:
accuracy_train_errors

[0.8222, 0.8222, 0.8216, 0.8222, 0.8222]

In [79]:
roc_auc_test_errors = []
roc_auc_train_errors = []
for param in roc_auc_best_params:
    
    n = 0
    
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred2 = model.predict(X_testing_sets[n])
    roc_auc_test_errors.append(roc_auc_score(y_testing_sets[n], y_pred2))
    
    y_pred_train2 = model.predict(X_training_sets[n])
    roc_auc_train_errors.append(roc_auc_score(y_training_sets[n], y_pred_train2))
    
    n += 1

In [80]:
roc_auc_test_errors

[0.731348065923895,
 0.7303397857251025,
 0.7307176419383227,
 0.7303397857251025,
 0.7307176419383227]

In [81]:
roc_auc_train_errors

[0.7420446474648844,
 0.7406829457720823,
 0.7392864989704205,
 0.7406829457720823,
 0.7392864989704205]

In [82]:
f1_score_test_errors = []
f1_score_train_errors = []
for param in f1_best_params:
    
    n = 0
    
    if 'C' in param:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'], C = param['C'], max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    else:
        
        logreg_clf = LogisticRegression(penalty = param['penalty'], solver = param['solver'],  max_iter = 10000)
        model = logreg_clf.fit(X_training_sets[n], y_training_sets[n])
    
    y_pred3 = model.predict(X_testing_sets[n])
    f1_score_test_errors.append(f1_score(y_testing_sets[n], y_pred3))
    
    y_pred_train3 = model.predict(X_training_sets[n])
    f1_score_train_errors.append(f1_score(y_training_sets[n], y_pred_train3))
    
    
    n += 1

In [83]:
f1_score_test_errors

[0.6062354072242824,
 0.6064004376367614,
 0.6044724927973659,
 0.6062354072242824,
 0.6062354072242824]

In [84]:
f1_score_train_errors

[0.631273330568229,
 0.6327963651383726,
 0.629260182876143,
 0.631273330568229,
 0.631273330568229]

# KNN

In [54]:
#Lists to save models and training/testing sets
all_models_knn = []
X_training_sets_knn = []
y_training_sets_knn = []
X_testing_sets_knn = []
y_testing_sets_knn = []

#lists to save best params
accuracy_best_params_knn = []
roc_auc_best_params_knn = []
f1_best_params_knn = []

#looping across each sample for each trial
for sample in all_samples:
    X_train_knn = sample.iloc[:, :-1]
    y_train_knn = sample.iloc[:, -1]
    X_training_sets_knn.append(X_train_knn)
    y_training_sets_knn.append(y_train_knn)
    
    #Separating rows for the test set that were not in the sample
    ix_knn = [i for i in new_adult_data.index if i not in sample.index]
    test_set_knn = new_adult_data.loc[ix]
    X_test_knn = test_set_knn.iloc[:, :-1]
    y_test_knn = test_set_knn.iloc[:, -1]
    X_testing_sets_knn.append(X_test_knn)
    y_testing_sets_knn.append(y_test_knn)
    
    #Initiating classifier
    knn = KNeighborsClassifier()
    
    grid_values = {'n_neighbors' : list(range(1,105,4))}
    
    clf_knn = GridSearchCV(estimator = knn, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model_knn = clf_knn.fit(X_train_knn, y_train_knn)
    
    accuracy_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params_knn.append(model_knn.cv_results_['params'][np.argmin(model_knn.cv_results_['rank_test_f1_micro'])])
    
    all_models_knn.append(model_knn)

In [55]:
accuracy_best_params_knn

[{'n_neighbors': 45},
 {'n_neighbors': 61},
 {'n_neighbors': 85},
 {'n_neighbors': 13},
 {'n_neighbors': 49}]

In [56]:
roc_auc_best_params_knn

[{'n_neighbors': 37},
 {'n_neighbors': 49},
 {'n_neighbors': 29},
 {'n_neighbors': 25},
 {'n_neighbors': 49}]

In [57]:
f1_best_params_knn

[{'n_neighbors': 45},
 {'n_neighbors': 61},
 {'n_neighbors': 85},
 {'n_neighbors': 13},
 {'n_neighbors': 49}]

In [85]:
#Now we need to loop across all models from each metric, train on entire sample, and then predict on test set
accuracy_test_errors_knn = []
accuracy_train_errors_knn = []
for param in accuracy_best_params_knn:
    
    n_knn = 0
    
    knn_clf = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn = model.predict(X_testing_sets_knn[n_knn])
    accuracy_test_errors_knn.append(accuracy_score(y_testing_sets_knn[n_knn], y_pred_knn))
    
    y_pred_train_knn = model.predict(X_training_sets_knn[n_knn])
    accuracy_train_errors_knn.append(accuracy_score(y_training_sets_knn[n_knn], y_pred_train_knn))
    
    n_knn += 1

In [86]:
accuracy_test_errors_knn

[0.7978300637103387,
 0.7950545638049581,
 0.7895035639941966,
 0.7994070522929414,
 0.7970731091906894]

In [87]:
accuracy_train_errors_knn

[0.8016, 0.798, 0.7918, 0.8264, 0.8016]

In [88]:
roc_auc_test_errors_knn = []
roc_auc_train_errors_knn = []
for param in roc_auc_best_params_knn:
    
    n_knn = 0
    
    knn_clf2 = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf2.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn2 = model.predict(X_testing_sets_knn[n_knn])
    roc_auc_test_errors_knn.append(roc_auc_score(y_testing_sets_knn[n_knn], y_pred_knn2))
    
    y_pred_train_knn2 = model.predict(X_training_sets_knn[n_knn])
    roc_auc_train_errors_knn.append(roc_auc_score(y_training_sets_knn[n_knn], y_pred_train_knn2))
    
    n_knn += 1

In [89]:
roc_auc_test_errors_knn 

[0.7084236276356798,
 0.704563364859298,
 0.7113196957326107,
 0.714504951281424,
 0.704563364859298]

In [102]:
roc_auc_train_errors_knn

[0.7267108133932175,
 0.7156112839850597,
 0.7273581965244672,
 0.7311035148765271,
 0.7156112839850597]

In [90]:
f1_score_test_errors_knn = []
f1_score_train_errors_knn = []
for param in f1_best_params_knn:
    
    n_knn = 0
    
    knn_clf3 = KNeighborsClassifier(n_neighbors = param['n_neighbors'])
    model = knn_clf3.fit(X_training_sets_knn[n_knn], y_training_sets_knn[n_knn])
    
    y_pred_knn3 = model.predict(X_testing_sets_knn[n_knn])
    f1_score_test_errors_knn.append(f1_score(y_testing_sets_knn[n_knn], y_pred_knn3))
    
    y_pred_train_knn3 = model.predict(X_training_sets_knn[n_knn])
    f1_score_train_errors_knn.append(f1_score(y_training_sets_knn[n_knn], y_pred_train_knn3))
    
    
    n_knn += 1

In [91]:
f1_score_test_errors_knn 

[0.5628154412767699,
 0.5546264564770391,
 0.5374913374913375,
 0.5766773162939297,
 0.5618956829633663]

In [92]:
f1_score_train_errors_knn

[0.5876974231088944,
 0.5791666666666666,
 0.5620530079932688,
 0.653631284916201,
 0.5883817427385892]

# Decision Tree

In [65]:
#Lists to save models and training/testing sets
all_models_dt = []
X_training_sets_dt = []
y_training_sets_dt = []
X_testing_sets_dt = []
y_testing_sets_dt = []

#lists to save best params
accuracy_best_params_dt = []
roc_auc_best_params_dt = []
f1_best_params_dt = []

#looping across each sample for each trial
for sample in all_samples:
    X_train_dt = sample.iloc[:, :-1]
    y_train_dt = sample.iloc[:, -1]
    X_training_sets_dt.append(X_train_dt)
    y_training_sets_dt.append(y_train_dt)
    
    #Separating rows for the test set that were not in the sample
    ix_dt = [i for i in new_adult_data.index if i not in sample.index]
    test_set_dt = new_adult_data.loc[ix_dt]
    X_test_dt = test_set_dt.iloc[:, :-1]
    y_test_dt = test_set_dt.iloc[:, -1]
    X_testing_sets_dt.append(X_test_dt)
    y_testing_sets_dt.append(y_test_dt)
    
    #Initiating classifier
    dt = DecisionTreeClassifier()
    
    grid_values = [{'criterion': ['gini', 'entropy'], 'max_depth':list(range(1,100,3)), 'min_samples_leaf': list(range(10,100,10))}]
    
    clf_dt = GridSearchCV(estimator = dt, param_grid = grid_values, cv = StratifiedKFold(n_splits=5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, verbose=0)
    
    model_dt = clf_dt.fit(X_train_dt, y_train_dt)
    
    accuracy_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_accuracy'])])
    roc_auc_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params_dt.append(model_dt.cv_results_['params'][np.argmin(model_dt.cv_results_['rank_test_f1_micro'])])
    
    all_models_dt.append(model_dt)

In [66]:
accuracy_best_params_dt

[{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 70},
 {'criterion': 'entropy', 'max_depth': 13, 'min_samples_leaf': 60},
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 70},
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 60},
 {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 50}]

In [67]:
roc_auc_best_params_dt

[{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 70},
 {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 80},
 {'criterion': 'gini', 'max_depth': 16, 'min_samples_leaf': 70},
 {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 90},
 {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 50}]

In [68]:
f1_best_params_dt

[{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 70},
 {'criterion': 'entropy', 'max_depth': 13, 'min_samples_leaf': 60},
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 70},
 {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 60},
 {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 50}]

In [93]:
#Now we need to loop across all models from each metric, train on entire sample, and then predict on test set
accuracy_test_errors_dt = []
accuracy_train_errors_dt = []
for param in accuracy_best_params_dt:
    
    n_dt = 0
    
    dt_clf = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt = model.predict(X_testing_sets_dt[n_dt])
    accuracy_test_errors_dt.append(accuracy_score(y_testing_sets_dt[n_dt], y_pred_dt))
    
    y_pred_train_dt = model.predict(X_training_sets_dt[n_dt])
    accuracy_train_errors_dt.append(accuracy_score(y_training_sets_dt[n_dt], y_pred_train_dt))
    
    n_dt += 1

In [94]:
accuracy_test_errors_dt

[0.8137261086229736,
 0.802750268088059,
 0.8106982905443765,
 0.808679745158645,
 0.8109506087175928]

In [95]:
accuracy_train_errors_dt

[0.8172, 0.8192, 0.8148, 0.8154, 0.819]

In [96]:
roc_auc_test_errors_dt = []
roc_auc_train_errors_dt = []
for param in roc_auc_best_params_dt:
    
    n_dt = 0
    
    dt_clf2 = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf2.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt2 = model.predict(X_testing_sets_dt[n_dt])
    roc_auc_test_errors_dt.append(roc_auc_score(y_testing_sets_dt[n_dt], y_pred_dt2))
    
    y_pred_train_dt2 = model.predict(X_training_sets_dt[n_dt])
    roc_auc_train_errors_dt.append(roc_auc_score(y_training_sets_dt[n_dt], y_pred_train_dt2))
    
    n_dt += 1

In [97]:
roc_auc_test_errors_dt

[0.7192725942257816,
 0.7021183182057417,
 0.72860423742006,
 0.7021183182057417,
 0.7089700741606755]

In [98]:
roc_auc_train_errors_dt

[0.7352683293223169,
 0.7193474051024215,
 0.7598653116074212,
 0.7193474051024215,
 0.729385164860432]

In [99]:
f1_score_test_errors_dt = []
f1_score_train_errors_dt = []
for param in f1_best_params_dt:
    
    n_dt = 0
    
    dt_clf3 = DecisionTreeClassifier(criterion = param['criterion'], max_depth = param['max_depth'],
                                   min_samples_leaf = param['min_samples_leaf'])
    model = dt_clf3.fit(X_training_sets_dt[n_dt], y_training_sets_dt[n_dt])
    
    y_pred_dt3 = model.predict(X_testing_sets_dt[n_dt])
    f1_score_test_errors_dt.append(f1_score(y_testing_sets_dt[n_dt], y_pred_dt3))
    
    y_pred_train_dt3 = model.predict(X_training_sets_dt[n_dt])
    f1_score_train_errors_dt.append(f1_score(y_training_sets_dt[n_dt], y_pred_train_dt3))
    
    n_dt += 1

In [100]:
f1_score_test_errors_dt

[0.5628423390081422,
 0.5751935878277408,
 0.5974513749161637,
 0.5830927835051546,
 0.5722848579991437]

In [101]:
f1_score_train_errors_dt 

[0.5980650835532102,
 0.6340080971659918,
 0.6281124497991969,
 0.6200082338410868,
 0.6130825138948268]