## Round 1:

**Features**: 
- age
- education-num
- marital-status
- sex

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostRegressor, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve
pd.set_option("display.max_rows", 100)

sns.set()

In [4]:
dataset = pd.read_csv('./data/large_train_sample.csv')

test_data = pd.read_csv('./data/test_data.csv')

dataset.columns=[c.lower().replace('-','_') for c in dataset.columns]
test_data.columns=[c.lower().replace('-','_') for c in test_data.columns]
categorical_features = ['workclass', 'education',
       'marital_status', 'occupation', 'relationship', 'sex', 'native_country']

In [5]:
for c in categorical_features:
    dataset[c]=dataset[c].replace('?','MISSING')
    #test_data[c]=test_data[c].replace('?',None)

In [6]:
dataset[categorical_features]=dataset[categorical_features].astype(str)
test_data[categorical_features]=dataset[categorical_features].astype(str)

In [7]:
dataset=dataset.dropna()
#test_data=test_data.dropna()

In [8]:
dataset.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage              0
dtype: int64

In [9]:
dataset[categorical_features]=dataset[categorical_features].apply(lambda x: x.str.strip())
test_data[categorical_features]=test_data[categorical_features].apply(lambda x: x.str.strip())

In [10]:
dataset['wage']=dataset['wage'].astype(str)
dataset['wage']=dataset['wage'].str.strip()
dataset['wage']=dataset['wage'].map({'<=50K':0,'>50K':1})

In [11]:
dataset['sex']=dataset['sex'].map({'Male':1,'Female':0})
test_data['sex']=test_data['sex'].map({'Male':1,'Female':0})

In [12]:
dataset['marital_status']=dataset['marital_status'].str.lower().str.replace('-','_')
test_data['marital_status']=test_data['marital_status'].str.lower().str.replace('-','_')

In [13]:
test_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country'],
      dtype='object')

In [14]:
marital_status = pd.get_dummies(dataset['marital_status'],drop_first=True, prefix='ms')
test_data_ms = pd.get_dummies(test_data['marital_status'],drop_first=True, prefix='ms')

In [15]:
features = ['age','education_num','sex']+list(marital_status.columns)

In [16]:
dataset.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'wage'],
      dtype='object')

In [17]:
dataset = dataset.join(marital_status)
test_data = test_data.join(test_data_ms)

In [18]:
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier,\
RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

pipes_dict = dict()
pipe_params = dict()
model_runs = dict()
model_scores = dict()
predicts_probs = dict()

#models = {'LOGREG':LogisticRegression()}
models = {'LOGREG_Jonna':LogisticRegression(),'KNN_Kemal':KNeighborsClassifier(),'Adaboost_Reem':AdaBoostClassifier(),
           'BagginClassifier_Jonna':BaggingClassifier(),'SVC_Reem':SVC(),'RForest_Kemal':RandomForestClassifier()}



X=dataset[features]
y=dataset['wage']

X_train, X_test,y_train, y_test =\
train_test_split(X,y,train_size=0.70,random_state=42)

sc = StandardScaler()
X_train_sc= sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

for mname, mvalue in models.items():    
    pipe_params[mname]={}

   
 
pipe_params['LOGREG_Jonna'] = {
    'LOGREG_Jonna__solver':['liblinear'],
    'LOGREG_Jonna__penalty':['l1','l2'],#,ExtraTreesRegressor()],
    'LOGREG_Jonna__C':[0.1,0.2,0.5,0.8,1]
}
   

pipe_params['BaggingClassifier_Jonna'] = {    
    'BaggingClassifier_Jonna__base_estimator':[None,RandomForestClassifier()],#,ExtraTreesRegressor()],
     'BTree__max_features':[1], 
     'BTree__max_samples':[1],
    'BTree__n_estimators':[10,50]#[5,10,20,50]
}
    
   

pipe_params['RForest_Kemal'] = {
    #'RForest__n_estimators':[10,20,50,100],
    #'RForest__criterion':["mse"],#,"mse", "mae"],
    #'RForest__splitter':["random"],#["best", "random"],
    'RForest_Kemal__min_samples_split':[4],#2,
    'RForest_Kemal__min_samples_leaf':[10],#,1,5,20],
    'RForest_Kemal__min_weight_fraction_leaf':[0.0],
    'RForest_Kemal__max_features':["sqrt"],#["auto", "sqrt", "log2"],
    'RForest_Kemal__max_leaf_nodes':[30],#,10,None],
    'RForest_Kemal__min_impurity_decrease':[0.0,0.5],
    #'RForest__ccp_alpha':[0.1,0.5,1]
}


pipe_params['Adaboost_Reem']=[{
    'Adaboost_Reem__base_estimator':[RandomForestClassifier(),ExtraTreesClassifier()],
    'Adaboost_Reem__n_estimators':[5],#,10,20],#[5,10,20],
    'Adaboost_Reem__learning_rate':[0.5,1],#[np.linspace(0.1,1,20)]
    'Adaboost_Reem__algorithm':['SAMME']
}]


pipe_params['SVC_Reem']=[{
    'SVC_Reem__kernel':['linear'],#['linear', 'poly', 'rbf', 'sigmoid'],
    'SVC_Reem__degree':[2],#,3,4],
    'SVC_Reem__tol':[0.001],#[np.linspace(0.001,0.005,3),]
    'SVC_Reem__C':[1]#[np.linspace(0.1,1,20)]
}]
        
pipe_params['KNN_Kemal']=[{
    'KNN_Kemal__n_neighbors':[19,21],#5,10,30,40,50],
    'KNN_Kemal__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'KNN_Kemal__weights':['uniform'],#,'distance'],
    'KNN_Kemal__leaf_size':[30],#,40,50],
    'KNN_Kemal__p':[1],#,2],
    'KNN_Kemal__metric':['euclidean']#,'manhattan']
}]
    
    
for mname, mvalue in models.items():    
    pipes_dict[mname]=Pipeline([
        #('stsc', StandardScaler()),
        (mname,mvalue)
    ])
    model_runs[mname]=GridSearchCV(pipes_dict[mname],
                                   pipe_params[mname])
    
    print(f'Fitting {mname} on X_train_sc')
    model_runs[mname].fit(X_train_sc,y_train)
    preds_train = model_runs[mname].predict(X_train_sc)
    preds_test = model_runs[mname].predict(X_test_sc)
#     pred_proba_train = [i[1] for i in model_runs[mname].predict_proba(X_train_sc)]
#     pred_proba_test = [i[1] for i in model_runs[mname].predict_proba(X_test_sc)]
    
#     predicts_probs[mname] = {'train_pred':preds_train,
#                              'test_pred':preds_test,
#                              'train_proba':pred_proba_train,
#                              'test_proba':pred_proba_test}
    model_scores[mname] ={
        'model':model_runs[mname],
        'train':model_runs[mname].score(X_train_sc,y_train),
        'test':model_runs[mname].score(X_test_sc,y_test),
        'Mean_CV_Score':model_runs[mname].best_score_,
        'f1score_train':f1_score(y_train,preds_train),
        'f1score_test':f1_score(y_test,preds_test),
        
    }
    print(f"{mname} Scores:\n"
          f"Train:{model_scores[mname]['train']}\n"
          f"Test:{model_scores[mname]['test']}\n"
          f"F1 score Train:{model_scores[mname]['f1score_train']}\n"
          f"F1 score Test:{model_scores[mname]['f1score_test']}\n"
          f"Mean_CV_Score:{model_scores[mname]['Mean_CV_Score']}")
    print(f"Best Params {mname}:\n"
          f"{model_runs[mname].best_params_}")

Fitting LOGREG_Jonna on X_train_sc
LOGREG_Jonna Scores:
Train:0.8185328185328186
Test:0.8191217115364929
F1 score Train:0.5609341825902336
F1 score Test:0.5442352334279082
Mean_CV_Score:0.8180941680146827
Best Params LOGREG_Jonna:
{'LOGREG_Jonna__C': 0.1, 'LOGREG_Jonna__penalty': 'l1', 'LOGREG_Jonna__solver': 'liblinear'}
Fitting KNN_Kemal on X_train_sc
KNN_Kemal Scores:
Train:0.8302474552474552
Test:0.8173815129491248
F1 score Train:0.6119747267074516
F1 score Test:0.5694980694980694
Mean_CV_Score:0.8208581822395675
Best Params KNN_Kemal:
{'KNN_Kemal__algorithm': 'ball_tree', 'KNN_Kemal__leaf_size': 30, 'KNN_Kemal__metric': 'euclidean', 'KNN_Kemal__n_neighbors': 21, 'KNN_Kemal__p': 1, 'KNN_Kemal__weights': 'uniform'}
Fitting Adaboost_Reem on X_train_sc
Adaboost_Reem Scores:
Train:0.8459108459108459
Test:0.808475790766711
F1 score Train:0.6604138464513633
F1 score Test:0.5693901035673188
Mean_CV_Score:0.8125657257038789
Best Params Adaboost_Reem:
{'Adaboost_Reem__algorithm': 'SAMME', '

In [19]:
print(len(features))
model_scores=pd.DataFrame(model_scores).T

9


In [20]:
model_scores.drop(columns='model').sort_values(by='Mean_CV_Score',ascending=False)

Unnamed: 0,train,test,Mean_CV_Score,f1score_train,f1score_test
RForest_Kemal,0.823447,0.82209,0.82099,0.544487,0.524096
KNN_Kemal,0.830247,0.817382,0.820858,0.611975,0.569498
SVC_Reem,0.819454,0.819224,0.819454,0.548942,0.532804
LOGREG_Jonna,0.818533,0.819122,0.818094,0.560934,0.544235
Adaboost_Reem,0.845911,0.808476,0.812566,0.660414,0.56939
BagginClassifier_Jonna,0.844989,0.808681,0.810679,0.653797,0.561987


In [21]:
chosen_model = model_scores.loc['RForest_Kemal','model']
test_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country',
       'ms_married_af_spouse', 'ms_married_civ_spouse',
       'ms_married_spouse_absent', 'ms_never_married', 'ms_separated',
       'ms_widowed'],
      dtype='object')

In [22]:
X_output_test = test_data[features]
X_output_test_sc = sc.transform(X_output_test)
predictions = pd.Series(chosen_model.predict(X_output_test_sc))
probabilities = pd.DataFrame([i[1] for i in chosen_model.predict_proba(X_output_test_sc)])
output_to_file = probabilities
output_to_file.columns=['wage']
output_to_file

Unnamed: 0,wage
0,0.019248
1,0.370805
2,0.090583
3,0.474994
4,0.185525
...,...
16276,0.697312
16277,0.108263
16278,0.142086
16279,0.244918


In [23]:
output_to_file.to_csv('./data/predictons.csv',index=False,header=True)

In [24]:
output_to_file

Unnamed: 0,wage
0,0.019248
1,0.370805
2,0.090583
3,0.474994
4,0.185525
...,...
16276,0.697312
16277,0.108263
16278,0.142086
16279,0.244918


In [25]:
import pickle
import datetime
current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_")
file_names = [str(f'./models/Round_1_{mtr}_{current_time}.pkl')
              for mtr in model_scores.index]
for fn,model in zip(file_names,model_scores.items()):
    with open(fn, 'wb') as file:
        pickle.dump(model, file)
        file.close()

FileNotFoundError: [Errno 2] No such file or directory: './models/Round_1_LOGREG_Jonna_2020_06_02_03_51_06_.pkl'