## Round 2 - Jonna:

**Features**:

- age
- education-num
- marital-status
- sex
- workclass
- hours per week
- capital gains

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostRegressor, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve
pd.set_option("display.max_rows", 100)

sns.set()


In [2]:
dataset = pd.read_csv('./data/large_train_sample.csv')

In [3]:
test_data = pd.read_csv('./data/test_data.csv')

In [4]:
dataset.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'wage'],
      dtype='object')

In [5]:
categorical_features = ['workclass', 'education',
       'marital-status', 'occupation', 'relationship', 'sex', 'native-country']

In [6]:
for c in categorical_features:
    dataset[c]=dataset[c].replace('?',None)
    #test_data[c]=test_data[c].replace('?',None)

In [7]:
dataset[categorical_features]=dataset[categorical_features].astype(str)
test_data[categorical_features]=dataset[categorical_features].astype(str)

In [8]:
dataset=dataset.dropna()
dataset['workclass']
#test_data=test_data.dropna()

0                State-gov
1         Self-emp-not-inc
2                  Private
3                  Private
4                  Private
               ...        
32556              Private
32557              Private
32558              Private
32559              Private
32560         Self-emp-inc
Name: workclass, Length: 32561, dtype: object

In [9]:
dataset.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
wage              0
dtype: int64

In [10]:
dataset[categorical_features]=dataset[categorical_features].apply(lambda x: x.str.strip())
test_data[categorical_features]=test_data[categorical_features].apply(lambda x: x.str.strip())

In [11]:
dataset['wage']=dataset['wage'].astype(str)
dataset['wage']=dataset['wage'].str.strip()
dataset['wage']=dataset['wage'].map({'<=50K':0,'>50K':1})

In [12]:
dataset['sex']=dataset['sex'].map({'Male':0,'Female':1})

In [13]:
dataset['marital-status']=dataset['marital-status'].str.lower().str.replace('-','_')

In [14]:
marital_status = pd.get_dummies(dataset['marital-status'],drop_first=True)

In [15]:
dataset['workclass']=dataset['workclass'].map({'State-gov':"government",
                                               'Federal-gov':"government", 
                                               'Local-gov':"government",
                                               'Without-pay':"no_work",
                                               'Never-worked':"no_work",
                                               "Private":"other",
                                               "Self-emp-not-inc":"other",
                                               "?":"other",
                                               "Self-emp-inc":"other",
                                              }
                                             )

In [16]:
workclass = pd.get_dummies(dataset['workclass'],drop_first=True)

In [17]:
dataset.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'wage'],
      dtype='object')

In [18]:
#REFERENCE
dataset.head()
# dataset['workclass'].value_counts()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,government,77516,Bachelors,13,never_married,Adm-clerical,Not-in-family,0,2174,0,40,United-States,0
1,50,other,83311,Bachelors,13,married_civ_spouse,Exec-managerial,Husband,0,0,0,13,United-States,0
2,38,other,215646,HS-grad,9,divorced,Handlers-cleaners,Not-in-family,0,0,0,40,United-States,0
3,53,other,234721,11th,7,married_civ_spouse,Handlers-cleaners,Husband,0,0,0,40,United-States,0
4,28,other,338409,Bachelors,13,married_civ_spouse,Prof-specialty,Wife,1,0,0,40,Cuba,0


In [19]:
features = ['age','education-num','sex', 'hours-per-week', 'capital-gain']+list(marital_status.columns)+list(workclass.columns)

In [20]:
dataset = dataset.join(marital_status)

In [21]:
dataset = dataset.join(workclass)

In [22]:
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier,\
RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

pipes_dict = dict()
pipe_params = dict()
model_runs = dict()
model_scores = dict()
models = {'LOGREG':LogisticRegression(), 'BaggingClassifier_Jonna':BaggingClassifier()} 
#           'KNN':KNeighborsClassifier()}
#           'Adaboost':AdaBoostClassifier(),
#         'BagginClassifier_Jonna':BaggingClassifier(),
#           'SVC':SVC(),'RForest':RandomForestClassifier()}



X=dataset[features]
y=dataset['wage']

X_train, X_test,y_train, y_test =\
train_test_split(X,y,train_size=0.70,random_state=42)

sc = StandardScaler()
X_train_sc= sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

for mname, mvalue in models.items():    
    pipe_params[mname]={}

   
 
pipe_params['LOGREG'] = {
    'LOGREG__solver':['liblinear'],
    'LOGREG__penalty':['l1','l2'],#,ExtraTreesRegressor()],
    'LOGREG__C':[0.1,0.2,0.5,0.8,1]
}
   

pipe_params['BaggingClassifier_Jonna'] = {    
    'BaggingClassifier_Jonna__base_estimator':[None,RandomForestClassifier(),ExtraTreesClassifier()],#,ExtraTreesRegressor()],
    'BaggingClassifier_Jonna__max_features':[1], 
    'BaggingClassifier_Jonna__max_samples':[1],
    'BaggingClassifier_Jonna__n_estimators':[5,10,20,50],
    'BaggingClassifier_Jonna__bootstrap_features': [True],
    'BaggingClassifier_Jonna__bootstrap': [True],
    'BaggingClassifier_Jonna__oob_score': [True],
    'BaggingClassifier_Jonna__warm_start': [False],
    'BaggingClassifier_Jonna__n_jobs': [-1],
    'BaggingClassifier_Jonna__random_state': [None],
    'BaggingClassifier_Jonna__verbose': [0]
}
    
# pipe_params['RForest'] = {
#     #'RForest__n_estimators':[10,20,50,100],
#     #'RForest__criterion':["mse"],#,"mse", "mae"],
#     #'RForest__splitter':["random"],#["best", "random"],
#     'RForest__min_samples_split':[4],#2,
#     'RForest__min_samples_leaf':[10],#,1,5,20],
#     'RForest__min_weight_fraction_leaf':[0.0],
#     'RForest__max_features':["sqrt"],#["auto", "sqrt", "log2"],
#     'RForest__max_leaf_nodes':[30],#,10,None],
#     'RForest__min_impurity_decrease':[0.0,0.5],
#     #'RForest__ccp_alpha':[0.1,0.5,1]
# }

# pipe_params['DTree'] = {
#     #'DTree__criterion':["mse"],#, "friedman_mse", "mae"],
#     'DTree__splitter':["random"],#["best", "random"],
#     'DTree__min_samples_split':[2,4],
#     'DTree__min_samples_leaf':[10],#,1,5,20],
#     'DTree__min_weight_fraction_leaf':[0.0],
#     'DTree__max_features':["auto"],#, "sqrt", "log2"],
#     'DTree__max_leaf_nodes':[None],#,10,30],
#     'DTree__min_impurity_decrease':[0.0],#,0.5],
#     'DTree__ccp_alpha':[0.1,0.5,1]
# }

# pipe_params['Adaboost']=[{
#     'Adaboost__base_estimator':[RandomForestClassifier(),ExtraTreesClassifier()],
#     'Adaboost__n_estimators':[5,10,20],#[5,10,20],
#     'Adaboost__learning_rate':[0.5,1],#[np.linspace(0.1,1,20)]
#     'Adaboost__algorithm':['SAMME']
# }]


# pipe_params['SVM']=[{
#     'SVM__kernel':['linear'],#['linear', 'poly', 'rbf', 'sigmoid'],
#     'SVM__degree':[2],#,3,4],
#     'SVM__tol':[0.001],#[np.linspace(0.001,0.005,3),]
#     'SVM__C':[1]#[np.linspace(0.1,1,20)]
# }]
        
# pipe_params['KNN']=[{
#     'KNN__n_neighbors':[19,21],#5,10,30,40,50],
#     'KNN__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
#     'KNN__weights':['uniform'],#,'distance'],
#     'KNN__leaf_size':[30],#,40,50],
#     'KNN__p':[1],#,2],
#     'KNN__metric':['euclidean']#,'manhattan']
# }]
    
    
for mname, mvalue in models.items():    
    pipes_dict[mname]=Pipeline([
        #('stsc', StandardScaler()),
        (mname,mvalue)
    ])
    model_runs[mname]=GridSearchCV(pipes_dict[mname],
                                   pipe_params[mname])
    
    print(f'Fitting {mname} on X_train_sc')
    model_runs[mname].fit(X_train_sc,y_train)
    preds_train = model_runs[mname].predict(X_train_sc)
    preds_test = model_runs[mname].predict(X_test_sc)
    model_scores[mname] ={
        'model':model_runs[mname],
        'train':model_runs[mname].score(X_train_sc,y_train),
        'test':model_runs[mname].score(X_test_sc,y_test),
        'Mean_CV_Score':model_runs[mname].best_score_,
        'f1score_train':f1_score(y_train,preds_train),
        'f1score_test':f1_score(y_test,preds_test)
    }
    print(f"{mname} Scores:\n"
          f"Train:{model_scores[mname]['train']}\n"
          f"Test:{model_scores[mname]['test']}\n"
          f"F1 score Train:{model_scores[mname]['f1score_train']}\n"
          f"F1 score Test:{model_scores[mname]['f1score_test']}\n"
          f"Mean_CV_Score:{model_scores[mname]['Mean_CV_Score']}")
    print(f"Best Params {mname}:\n"
          f"{model_runs[mname].best_params_}")

Fitting LOGREG on X_train_sc
LOGREG Scores:
Train:0.8395050895050895
Test:0.8390828129798341
F1 score Train:0.6260478429768963
F1 score Test:0.6169590643274854
Mean_CV_Score:0.8394174434340995
Best Params LOGREG:
{'LOGREG__C': 0.1, 'LOGREG__penalty': 'l2', 'LOGREG__solver': 'liblinear'}
Fitting BaggingClassifier_Jonna on X_train_sc
BaggingClassifier_Jonna Scores:
Train:0.7575026325026325
Test:0.7631282628723514
F1 score Train:0.0
F1 score Test:0.0
Mean_CV_Score:0.7575026412514927
Best Params BaggingClassifier_Jonna:
{'BaggingClassifier_Jonna__base_estimator': None, 'BaggingClassifier_Jonna__bootstrap': True, 'BaggingClassifier_Jonna__bootstrap_features': True, 'BaggingClassifier_Jonna__max_features': 1, 'BaggingClassifier_Jonna__max_samples': 1, 'BaggingClassifier_Jonna__n_estimators': 10, 'BaggingClassifier_Jonna__n_jobs': -1, 'BaggingClassifier_Jonna__oob_score': True, 'BaggingClassifier_Jonna__random_state': None, 'BaggingClassifier_Jonna__verbose': 0, 'BaggingClassifier_Jonna__warm

In [23]:
len(features)

13

In [24]:
model_scores=pd.DataFrame(model_scores).T.sort_values(by='test',ascending=False)
model_scores

Unnamed: 0,model,train,test,Mean_CV_Score,f1score_train,f1score_test
LOGREG,"GridSearchCV(cv=None, error_score=nan,\n ...",0.839505,0.839083,0.839417,0.626048,0.616959
BaggingClassifier_Jonna,"GridSearchCV(cv=None, error_score=nan,\n ...",0.757503,0.763128,0.757503,0.0,0.0
