## Round 2:

**Features**:

- age
- education-num
- marital-status
- sex
- workclass
- hours per week
- capital gains

In [185]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostRegressor, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve
pd.set_option("display.max_rows", 100)

sns.set()


In [186]:
dataset = pd.read_csv('./data/large_train_sample.csv')

test_data = pd.read_csv('./data/test_data.csv')

dataset.columns=[c.lower().replace('-','_') for c in dataset.columns]
test_data.columns=[c.lower().replace('-','_') for c in test_data.columns]
categorical_features = ['workclass', 'education',
       'marital_status', 'occupation', 'relationship', 'sex', 'native_country']

In [187]:
for c in categorical_features:
    dataset[c]=dataset[c].replace('?','MISSING')
    #test_data[c]=test_data[c].replace('?',None)

dataset[categorical_features]=dataset[categorical_features].astype(str)
test_data[categorical_features]=dataset[categorical_features].astype(str)

dataset=dataset.dropna()
#dataset['workclass']
#test_data=test_data.dropna()

In [188]:
dataset.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage              0
dtype: int64

In [189]:
dataset['wage']=dataset['wage'].astype(str)
dataset['wage']=dataset['wage'].str.strip()
dataset['wage']=dataset['wage'].map({'<=50K':0,'>50K':1})

In [190]:
dataset[categorical_features]=dataset[categorical_features].apply(lambda x: x.str.strip())
test_data[categorical_features]=test_data[categorical_features].apply(lambda x: x.str.strip())

In [191]:
dataset['sex']=dataset['sex'].map({'Male':1,'Female':0})
test_data['sex']=test_data['sex'].map({'Male':1,'Female':0})

In [192]:
dataset['marital_status']=dataset['marital_status'].str.lower().str.replace('-','_')
test_data['marital_status']=test_data['marital_status'].str.lower().str.replace('-','_')

In [193]:
marital_status = pd.get_dummies(dataset['marital_status'],drop_first=True, prefix='ms')
test_data_ms = pd.get_dummies(test_data['marital_status'],drop_first=True, prefix='ms')

In [194]:
dataset['workclass_rebinned']=dataset['workclass'].map({'Private':'Private',
                          'State-gov':'Gov',
                          'Federal-gov':'Gov',
                          'Local-gov':'Gov',
                          'Self-emp-not-inc':'Self',
                          'Self-emp-inc':'Self',
                          'Without-pay':'Other',
                          'Never-worked':'Other',
                          'MISSING':'Other'              
                         })

In [195]:
workclass = pd.get_dummies(dataset['workclass_rebinned'],drop_first=True,prefix='wc')
test_data_wc = pd.get_dummies(dataset['workclass_rebinned'],drop_first=True,prefix='wc')

In [196]:
features = ['age','education_num','sex','hours_per_week','capital_gain']+list(marital_status.columns)+list(workclass.columns)

In [197]:
len(features)

14

In [198]:
dataset = dataset.join(marital_status).join(workclass)
test_data = test_data.join(test_data_ms).join(test_data_wc)

In [199]:
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier,\
RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

pipes_dict = dict()
pipe_params = dict()
model_runs = dict()
model_scores = dict()
predicts_probs = dict()

#models = {'LOGREG':LogisticRegression()}
models = {'LOGREG_Jonna':LogisticRegression(),'KNN_Kemal':KNeighborsClassifier(),'Adaboost_Reem':AdaBoostClassifier(),
          'SVC_Reem':SVC(probability=True)}#,'RForest_Kemal':RandomForestClassifier(),'BagginClassifier_Jonna':BaggingClassifier(),}

X=dataset[features]
y=dataset['wage']

X_train, X_test,y_train, y_test =\
train_test_split(X,y,train_size=0.70,random_state=42)

sc = StandardScaler()
X_train_sc= sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

for mname, mvalue in models.items():    
    pipe_params[mname]={}

   
 
pipe_params['LOGREG_Jonna'] = {
    'LOGREG_Jonna__solver':['liblinear'],
    'LOGREG_Jonna__penalty':['l1'],#,'l2'],#,ExtraTreesRegressor()],
    'LOGREG_Jonna__C':[0.1]
}
   

pipe_params['BaggingClassifier_Jonna'] = {    
    'BaggingClassifier_Jonna__base_estimator':[None,RandomForestClassifier()],#,ExtraTreesRegressor()],
     'BTree__max_features':[1], 
     'BTree__max_samples':[1],
    'BTree__n_estimators':[10,50]#[5,10,20,50]
}
    
   

pipe_params['RForest_Kemal'] = {
    #'RForest__n_estimators':[10,20,50,100],
    #'RForest__criterion':["mse"],#,"mse", "mae"],
    #'RForest__splitter':["random"],#["best", "random"],
    'RForest_Kemal__min_samples_split':[4],#2,
    'RForest_Kemal__min_samples_leaf':[10],#,1,5,20],
    'RForest_Kemal__min_weight_fraction_leaf':[0.0],
    'RForest_Kemal__max_features':["sqrt"],#["auto", "sqrt", "log2"],
    'RForest_Kemal__max_leaf_nodes':[30],#,10,None],
    'RForest_Kemal__min_impurity_decrease':[0.0,0.5],
    #'RForest__ccp_alpha':[0.1,0.5,1]
}


pipe_params['Adaboost_Reem']=[{
    'Adaboost_Reem__base_estimator':[RandomForestClassifier()],#,ExtraTreesClassifier()],
    'Adaboost_Reem__n_estimators':[5],#,10,20],#[5,10,20],
    'Adaboost_Reem__learning_rate':[1],#[np.linspace(0.1,1,20)]0.5,
    'Adaboost_Reem__algorithm':['SAMME']
}]


pipe_params['SVC_Reem']=[{
    'SVC_Reem__kernel':['rbf'],#['linear', 'poly', 'rbf', 'sigmoid'],
    'SVC_Reem__degree':[2],#,3,4],
    'SVC_Reem__tol':[0.001],#[np.linspace(0.001,0.005,3),]
    'SVC_Reem__C':[0.5]#[np.linspace(0.1,1,20)]
}]
        
pipe_params['KNN_Kemal']=[{
    'KNN_Kemal__n_neighbors':[21],#19,5,10,30,40,50],
    'KNN_Kemal__algorithm':['brute'],#'auto', 'ball_tree', 'kd_tree',
    'KNN_Kemal__weights':['uniform'],#,'distance'],
    'KNN_Kemal__leaf_size':[30],#,40,50],
    'KNN_Kemal__p':[1],#,2],
    'KNN_Kemal__metric':['euclidean']#,'manhattan']
}]
    
    
for mname, mvalue in models.items():    
    pipes_dict[mname]=Pipeline([
        #('stsc', StandardScaler()),
        (mname,mvalue)
    ])
    model_runs[mname]=GridSearchCV(pipes_dict[mname],
                                   pipe_params[mname])
    
    print(f'Fitting {mname} on X_train_sc')
    model_runs[mname].fit(X_train_sc,y_train)
    preds_train = model_runs[mname].predict(X_train_sc)
    preds_test = model_runs[mname].predict(X_test_sc)
#     pred_proba_train = [i[1] for i in model_runs[mname].predict_proba(X_train_sc)]
#     pred_proba_test = [i[1] for i in model_runs[mname].predict_proba(X_test_sc)]
    
#     predicts_probs[mname] = {'train_pred':preds_train,
#                              'test_pred':preds_test,
#                              'train_proba':pred_proba_train,
#                              'test_proba':pred_proba_test}
    model_scores[mname] ={
        'model':model_runs[mname],
        'train':model_runs[mname].score(X_train_sc,y_train),
        'test':model_runs[mname].score(X_test_sc,y_test),
        'Mean_CV_Score':model_runs[mname].best_score_,
        'f1score_train':f1_score(y_train,preds_train),
        'f1score_test':f1_score(y_test,preds_test),
        
    }
    print(f"{mname} Scores:\n"
          f"Train:{model_scores[mname]['train']}\n"
          f"Test:{model_scores[mname]['test']}\n"
          f"F1 score Train:{model_scores[mname]['f1score_train']}\n"
          f"F1 score Test:{model_scores[mname]['f1score_test']}\n"
          f"Mean_CV_Score:{model_scores[mname]['Mean_CV_Score']}")
    print(f"Best Params {mname}:\n"
          f"{model_runs[mname].best_params_}")

Fitting LOGREG_Jonna on X_train_sc
LOGREG_Jonna Scores:
Train:0.8396805896805897
Test:0.839594636093766
F1 score Train:0.626685737637924
F1 score Test:0.6171512338138284
Mean_CV_Score:0.8394613223283514
Best Params LOGREG_Jonna:
{'LOGREG_Jonna__C': 0.1, 'LOGREG_Jonna__penalty': 'l1', 'LOGREG_Jonna__solver': 'liblinear'}
Fitting KNN_Kemal on X_train_sc
KNN_Kemal Scores:
Train:0.8502983502983503
Test:0.8367284266557478
F1 score Train:0.6603623332669719
F1 score Test:0.6185123176273618
Mean_CV_Score:0.8377062820543791
Best Params KNN_Kemal:
{'KNN_Kemal__algorithm': 'brute', 'KNN_Kemal__leaf_size': 30, 'KNN_Kemal__metric': 'euclidean', 'KNN_Kemal__n_neighbors': 21, 'KNN_Kemal__p': 1, 'KNN_Kemal__weights': 'uniform'}
Fitting Adaboost_Reem on X_train_sc
Adaboost_Reem Scores:
Train:0.9321691821691822
Test:0.8277203398505476
F1 score Train:0.8576689375805561
F1 score Test:0.6175869120654397
Mean_CV_Score:0.8269569154301927
Best Params Adaboost_Reem:
{'Adaboost_Reem__algorithm': 'SAMME', 'Adabo

In [200]:
len(features)

14

In [201]:
model_scores=pd.DataFrame(model_scores).T

In [208]:
model_scores.drop(columns='model').sort_values(by='test', ascending=False)

Unnamed: 0,train,test,Mean_CV_Score,f1score_train,f1score_test
SVC_Reem,0.845779,0.844406,0.844858,0.629102,0.612443
LOGREG_Jonna,0.839681,0.839595,0.839461,0.626686,0.617151
KNN_Kemal,0.850298,0.836728,0.837706,0.660362,0.618512
Adaboost_Reem,0.932169,0.82772,0.826957,0.857669,0.617587


In [203]:
chosen_model = model_scores.loc['SVC_Reem','model']
test_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country',
       'ms_married_af_spouse', 'ms_married_civ_spouse',
       'ms_married_spouse_absent', 'ms_never_married', 'ms_separated',
       'ms_widowed', 'wc_Other', 'wc_Private', 'wc_Self'],
      dtype='object')

In [204]:
X_output_test = test_data[features]
X_output_test_sc = sc.transform(X_output_test)
predictions = pd.Series(chosen_model.predict(X_output_test_sc))
probabilities = pd.DataFrame([i[1] for i in chosen_model.predict_proba(X_output_test_sc)])
output_to_file = probabilities
output_to_file.columns=['wage']
output_to_file

Unnamed: 0,wage
0,0.107464
1,0.142617
2,0.076890
3,0.901278
4,0.081437
...,...
16276,0.713760
16277,0.089920
16278,0.141114
16279,0.428212


In [205]:
output_to_file.to_csv('./data/predictons.csv',index=False,header=True)

In [206]:
output_to_file

Unnamed: 0,wage
0,0.107464
1,0.142617
2,0.076890
3,0.901278
4,0.081437
...,...
16276,0.713760
16277,0.089920
16278,0.141114
16279,0.428212


In [207]:
import pickle
import datetime
current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_")
file_names = [str(f'./models/Round_2_{mtr}_{current_time}.pkl')
              for mtr in model_scores.index]
for fn,model in zip(file_names,model_scores.items()):
    with open(fn, 'wb') as file:
        pickle.dump(model, file)
        file.close()