## Import Libraries

In [110]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import time

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

## Read the data

In [112]:
my_training_data = pd.read_csv('train.csv')
my_testing_data = pd.read_csv('test.csv')
test_id = my_testing_data['ID']

## Convert to onehot 

In [113]:
categorical_cols = ['Cat_1','Cat_2','Cat_3']
my_training_data = pd.get_dummies(my_training_data, columns = categorical_cols)
my_testing_data = pd.get_dummies(my_testing_data, columns = categorical_cols)

## Fill missing values

In [114]:

my_training_data['num_4'].fillna((my_training_data['num_4'].median()), inplace=True)
my_training_data['num_5'].fillna((my_training_data['num_5'].median()), inplace=True)

my_testing_data['num_4'].fillna((my_testing_data['num_4'].median()), inplace=True)
my_testing_data['num_5'].fillna((my_testing_data['num_5'].median()), inplace=True)

## Normalize

In [115]:

scaled_columns=['num_1','num_2','num_3','num_4','num_5','num_6']
for column in scaled_columns:
    my_training_data[column] = (my_training_data[column] - my_training_data[column].mean()) / my_training_data[column].std() 
    my_testing_data[column] = (my_testing_data[column] - my_testing_data[column].mean()) / my_testing_data[column].std() 

## Remove unwanted features

In [117]:

Target =['target_class']
column_to_train = list(my_training_data.columns)
column_to_train.remove('target_class')
column_to_train.remove('ID')
column_to_train.remove('Cat_3_d1')
column_to_train.remove('Cat_3_d2')
column_to_train.remove('num_6')
column_to_train.remove('num_4')
column_to_train

['num_3',
 'num_1',
 'num_5',
 'num_2',
 'Cat_1_d1',
 'Cat_1_d2',
 'Cat_1_d4',
 'Cat_1_d5',
 'Cat_1_d6',
 'Cat_2_d1',
 'Cat_2_d2',
 'Cat_2_d3']

In [42]:
my_training_data.head()

Unnamed: 0,ID,num_3,num_4,num_1,num_5,num_6,num_2,target_class,Cat_1_d1,Cat_1_d2,Cat_1_d4,Cat_1_d5,Cat_1_d6,Cat_2_d1,Cat_2_d2,Cat_2_d3,Cat_3_d1,Cat_3_d2
0,1,-0.036001,-0.823512,-0.687687,-0.341508,1.882562,-1.471565,1,1,0,0,0,0,1,0,0,1,0
1,2,-0.036001,1.626235,0.126056,-1.046135,2.806383,-0.338369,1,0,1,0,0,0,1,0,0,0,1
2,3,-1.793255,-0.823512,-0.562495,0.363119,2.128914,-0.163655,1,0,0,0,0,1,1,0,0,0,1
3,4,-0.914628,-0.006929,-0.708552,0.363119,-0.211433,-1.505775,1,0,0,1,0,0,1,0,0,0,1
4,5,-1.793255,-0.006929,-0.54163,-0.341508,2.005738,-0.707345,1,0,0,0,1,0,1,0,0,1,0


In [118]:

cv_split = model_selection.StratifiedShuffleSplit(n_splits = 5, test_size = .1, random_state = 0 ) 

In [119]:

X = my_training_data[column_to_train]

y = my_training_data[Target[0]]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0,stratify=y)

## Test performance on all the models

In [45]:

print("\n ada:")
ada = ensemble.AdaBoostClassifier(learning_rate= 0.01,  n_estimators= 200,random_state=0)
cv_results = model_selection.cross_validate(ada, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
ada.fit(X_train, y_train)
score = ada.score(X_test, y_test)
print('RealTest_score: ',score)

print("\n bc:")
bc = ensemble.BaggingClassifier(max_samples= 0.01,n_estimators= 100,random_state=0)
cv_results = model_selection.cross_validate(bc, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
bc.fit(X_train, y_train)
score = bc.score(X_test, y_test)
print('RealTest_score: ',score)

print("\n etc:")
etc = ensemble.ExtraTreesClassifier(criterion= 'entropy', max_depth= 7, n_estimators= 300,random_state=0)
cv_results = model_selection.cross_validate(etc, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
etc.fit(X_train, y_train)
score = etc.score(X_test, y_test)
print('RealTest_score: ',score)

print("\n gbc:")
gbc = ensemble.GradientBoostingClassifier(learning_rate= 0.03, max_depth= 2, n_estimators= 200,random_state=0)
cv_results = model_selection.cross_validate(gbc, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
gbc.fit(X_train, y_train)
score = gbc.score(X_test, y_test)
print('RealTest_score: ',score)

print("\n rfc:")
rfc = ensemble.RandomForestClassifier(bootstrap=True,criterion= 'entropy', max_depth= 4,max_features=3,min_samples_leaf=2,min_samples_split=12, n_estimators= 50,oob_score=True)
cv_results = model_selection.cross_validate(rfc, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
rfc.fit(X_train, y_train)
score = rfc.score(X_test, y_test)
print('RealTest_score: ',score)

print("\n svc:")
svc = svm.SVC(C= 1, decision_function_shape= 'ovo', gamma= 0.1,probability=True)
cv_results = model_selection.cross_validate(svc, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
svc.fit(X_train, y_train)
score = svc.score(X_test, y_test)
print('RealTest_score: ',score)


 ada:
test_score:  0.6831515151515151
RealTest_score:  0.6812121212121212

 bc:
test_score:  0.6688484848484848
RealTest_score:  0.68

 etc:
test_score:  0.6172121212121212
RealTest_score:  0.6436363636363637

 gbc:
test_score:  0.6860606060606059
RealTest_score:  0.6787878787878788

 rfc:
test_score:  0.6535757575757575
RealTest_score:  0.6751515151515152

 svc:
test_score:  0.6652121212121213
RealTest_score:  0.6593939393939394


In [18]:

from sklearn.neural_network import MLPClassifier
mlp =  MLPClassifier(random_state=42,max_iter=1000,activation='logistic',hidden_layer_sizes= (100,))
cv_results = model_selection.cross_validate(mlp, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
mlp.fit(X_train, y_train)
score = mlp.score(X_test, y_test)
print('RealTest_score: ',score)

test_score:  0.6696969696969697
RealTest_score:  0.6636363636363637


In [259]:

import xgboost as xgb
clf_xgb=xgb.XGBClassifier(learning_rate=0.01,gamma=5,n_estimators=50,reg_alpha=0,reg_lambda  =0.9,subsample =0.8)
cv_results = model_selection.cross_validate(clf_xgb, my_training_data[column_to_train], my_training_data[Target[0]], cv  = cv_split,return_train_score=True)
print('test_score: ',cv_results['test_score'].mean())
clf_xgb.fit(X_train, y_train)
score = clf_xgb.score(X_test, y_test)
print('RealTest_score: ',score)

test_score:  0.6846060606060606
RealTest_score:  0.6824242424242424


### Voting classifier tuning

In [63]:

from sklearn.ensemble import VotingClassifier
named_estimators = [ ("ada",ada),("bc", bc), ("gbc?", gbc), ("rfc", rfc), ("svc", svc),("mlp",mlp)]
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.6824242424242424

In [33]:
voting_clf.fit(X,y)
voting_clf.score(X_test, y_test)

0.6909090909090909

In [21]:

# voting_clf.fit(X,y)
results = pd.DataFrame(data=[],columns=['ID','target_class'])
results['ID']=test_id
results['target_class'] =voting_clf.predict(my_testing_data[column_to_train])
results.to_csv(voting_clf.__class__.__name__+'score_'+str(0.6836363636363636_2)+'_tuned_ada_bc_gbc_rtc_svc_mlp.csv',index=False)

In [134]:
results = pd.DataFrame(data=[],columns=['ID','target_class'])
results['ID']=test_id
results['target_class'] =gbc.predict(my_testing_data[column_to_train])
results.to_csv(gbc.__class__.__name__+'score_'+str(0.6836363636363636)+'_tuned_gbc.csv',index=False)

## Grid Search

In [16]:


param_grid = {
    'bootstrap': [True],
    'max_depth': [15,20,30],
    'max_features': [ 3,4,5,6],
    'min_samples_leaf': [2,3, 4],
    'min_samples_split': [4,5,6,7],
    'n_estimators': [50,100,150],
    'criterion' :[ 'entropy', 'gini']
}

# Create a based model
rf = ensemble.ExtraTreesClassifier()
# Instantiate the grid search model
grid_search = model_selection.GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X,y)
grid_search.best_score_ ,grid_search.best_params_

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


(0.5800500016534939,
 {'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': 30,
  'max_features': 6,
  'min_samples_leaf': 2,
  'min_samples_split': 4,
  'n_estimators': 50})

### use best param to check the score

In [18]:
rf.set_params(**grid_search.best_params_) 
rf.fit(X_train,y_train)
clf_score = rf.score(X_test, y_test)

print('grid_search_std:',grid_search.cv_results_['std_test_score'][grid_search.best_index_]*100*3)  
print('grid_search_best_score:',grid_search.best_score_)
print('fitted_test_score',clf_score)

grid_search_std: 37.53328576919699
grid_search_best_score: 0.5800500016534939
fitted_test_score 0.658989898989899


In [14]:
rf.set_params(**grid_search.best_params_) 
rf.fit(X_train,y_train)
clf_score = rf.score(X_test, y_test)

print('grid_search_std:',grid_search.cv_results_['std_test_score'][grid_search.best_index_]*100*3)  
print('grid_search_best_score:',grid_search.best_score_)
print('fitted_test_score',clf_score)

grid_search_std: 29.824640656735674
grid_search_best_score: 0.5514440292337709
fitted_test_score 0.6505050505050505


### Save the prediction on test data

In [None]:
rf.fit(X,y)
results = pd.DataFrame(data=[],columns=['ID','target_class'])
results['ID']=test_id
results['target_class'] =rf.predict(my_testing_data[column_to_train])
results.to_csv(rf.__class__.__name__+'score_'+str(clf_score)+'_tuned.csv',index=False)

## Random search

In [None]:
from sklearn.model_selection import RandomizedSearchCV


n_estimators = [int(x) for x in range(100,2000,200)]

max_features = ['d1', 'sqrt',2,3,4,5]

max_depth = [int(x) for x in np.linspace(2, 110, num = 5)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 300, 500, 700, 900, 1100, 1300, 1500, 1700, 1900], 'max_features': ['d1', 'sqrt', 2, 3, 4, 5], 'max_depth': [2, 29, 56, 83, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:

rf = ensemble.RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits




RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 29, 56, 83, 110, None],
                                        'max_features': ['d1', 'sqrt', 2, 3,
                                                         4, 5],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 300, 500, 700,
                                                         900, 1100, 1300, 1500,
                                                         1700, 1900]},
                   random_state=42, verbose=2)

In [None]:
rf_random.best_params_

{'n_estimators': 1300,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 5,
 'max_depth': 2,
 'bootstrap': True}

## Grid seach on all the models

In [None]:
vote_est = [
    ('ada', ensemble.AdaBoostClassifier()),
    ('bc', ensemble.BaggingClassifier()),
    ('etc',ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('gpc', gaussian_process.GaussianProcessClassifier()),
    ('lr', linear_model.LogisticRegressionCV()),
    ('knn', neighbors.KNeighborsClassifier()),
    ('svc', svm.SVC(probability=True)),
   ('xgb', XGBClassifier())

]


In [None]:

grid_n_estimator = [10, 50, 100, 300,400,500]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2, 4, 6, 8, 10, 50,80,100]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]


grid_param = [
            [{
            'n_estimators': grid_n_estimator, 
            'learning_rate': grid_learn, 
            'random_state': grid_seed
            }],
    
            [{
            'n_estimators': grid_n_estimator, 
            'max_samples': grid_ratio,
             }],
    
            [{
            'n_estimators': grid_n_estimator, 
            'criterion': grid_criterion, 
            'max_depth': grid_max_depth, 
             }],

            [{
            'learning_rate': [0.01,0.02,0.03,0.05,0.1], 
            'n_estimators': [10,50,100,300,400], 
            'max_depth': grid_max_depth, 
             }],

            [{
      
            'n_estimators': grid_n_estimator, 
            'criterion': grid_criterion,
            'max_depth': grid_max_depth, 
            'max_features': [2, 3,4,5],
            'min_samples_leaf': [3, 4, 5],
            'min_samples_split': [8, 10, 12],
            'bootstrap': [True],
            'oob_score': [True], 
             }
            ], 
    
            [{    
            'max_iter_predict': grid_n_estimator,
            'random_state': grid_seed
            }],
        

            [{
            'fit_intercept': grid_bool, 
            'penalty': ['l1','l2'],
            'solver': [ 'liblinear',  'saga'], 
            'random_state': grid_seed,
            'max_iter':[5000]
             },
            {
            'fit_intercept': grid_bool, 
            'penalty': ['l2'],
            'solver': [ 'lbfgs','newton-cg','sag'],
            'random_state': grid_seed,
            'max_iter':[5000]
             }],
            
            [{
            'n_neighbors': [1,2,3,4,5,6,7],
            'weights': ['uniform', 'distance'],
            'algorithm': ['d1', 'ball_tree', 'kd_tree', 'brute']
            }],
            
    
            [{
            'C': [1,2,3,4,5], 
            'gamma': grid_ratio, 
            'decision_function_shape': ['ovo', 'ovr'],
            'probability': [True],
            'random_state': grid_seed
             }],

    
            [{
            'learning_rate': grid_learn, 
            'max_depth': [1,2,4,6,8,10],
            'n_estimators': grid_n_estimator, 
             }]   
        ]


In [None]:


all_models_results_col = [ 'Search Parameters', 'Search score', 'Search score 3*STD' ,'Time(s)','Clf_Score']
all_models_results = pd.DataFrame(data=[],columns = all_models_results_col,index=[clf[1].__class__.__name__ for clf in vote_est])
start_total = time.perf_counter() 
for clf, param in zip (vote_est, grid_param): 
  
    
    start = time.perf_counter()        
    best_search = model_selection.GridSearchCV(estimator = clf[1], param_grid = param, cv = cv_split, scoring = 'roc_auc')
    best_search.fit(X_train, y_train)
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    clf[1].set_params(**best_param) 
    clf[1].fit(X_train,y_train)
    clf_score = clf[1].score(x_test, y_test)


    clf[1].fit(X,y)
    results = pd.DataFrame(data=[],columns=['ID','target_class'])
    results['ID']=test_id
    results['target_class'] =clf[1].predict(my_testing_data[column_to_train])
    results.to_csv(clf[1].__class__.__name__+'score_'+str(clf_score)+'_tuned.csv',index=False)
    
  

    all_models_results.loc[clf[1].__class__.__name__]['Search Parameters']=best_search.best_params_
    all_models_results.loc[clf[1].__class__.__name__]['Search score']=best_search.best_score_
    all_models_results.loc[clf[1].__class__.__name__]['Search score 3*STD']=best_search.cv_results_['std_test_score'][best_search.best_index_]*100*3
    all_models_results.loc[clf[1].__class__.__name__]['Time(s)']=run
    all_models_results.loc[clf[1].__class__.__name__]['Clf_Score']=clf_score
    
    print('The best parameter for {} is {} with a search score of {:.2f} clf score of {:.2f} and runtime of {:.2f} seconds.'.format(clf[1].__class__.__name__, best_param,best_search.best_score_,clf_score, run))

run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))

print('-'*10)
all_models_results.sort_values(by = ['Search score'], ascending = False, inplace = True)
all_models_results