# Meta Techniques with classifiers

In this session, we'll play around with classifiers, and techniques to optimize them.

## Step 0 - Imports and load training data

In [16]:
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

import warnings
warnings.filterwarnings('ignore')

# Add input as import path
sys.path.insert(0,'../input')

import joblib #or your dataset handler
X, Y = joblib.load("traindata.pkl")
X_test, Y_test = joblib.load("testdata.pkl")

#Double check if all is well
print("TrainData=",X[0:1]," with shape ", X.shape)
print("TrainLables=", Y[0:1], Y.shape) 

print("TestData=",X_test[0:1], " with shape ",X_test.shape)
print("TestLabels=", Y_test[0:1], Y.shape) # We don't have test labels. Should be NaNs

TrainData= [[ -1.54609786e+00  -1.34499549e+00   2.75868709e-16   4.81287772e-01
   -4.44999502e-01  -1.56828509e+00   1.27192065e+00  -2.40989649e+00
   -1.90080686e+00   5.98139336e-01   7.33522906e-02]]  with shape  (654, 11)
TrainLables= id
277    1
Name: survived, dtype: int64 (654,)
TestData= [[ 0.84191642 -1.34499549 -0.8449216   0.48128777 -0.4449995   0.53185321
  -1.30008367  0.50783544  0.59567091 -0.719428    0.07335229]]  with shape  (655, 11)
TestLabels= id
621   NaN
Name: survived, dtype: float64 (654,)


In [30]:
import sklearn

# Initialize different models
# Random_state 42 https://en.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy
random_state=42
multiple_classifier = []
multiple_classifier.append(SVC(random_state=random_state))
multiple_classifier.append(DecisionTreeClassifier(random_state=random_state))
multiple_classifier.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),learning_rate=0.1))
multiple_classifier.append(RandomForestClassifier(random_state=random_state))
multiple_classifier.append(ExtraTreesClassifier(random_state=random_state))
multiple_classifier.append(GradientBoostingClassifier(random_state=random_state))
multiple_classifier.append(MLPClassifier(random_state=random_state))
multiple_classifier.append(KNeighborsClassifier())
multiple_classifier.append(LogisticRegression(random_state=random_state))
multiple_classifier.append(LinearDiscriminantAnalysis())

cv_results = []
for classifier in multiple_classifier :
    cv_results.append(cross_val_score(classifier, X, y = Y, scoring = "accuracy", cv = StratifiedKFold(n_splits=10), n_jobs=4))

cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":["SVC","DecisionTree","AdaBoost",
"RandomForest","ExtraTrees","GradientBoosting","MultipleLayerPerceptron","KNeighboors","LogisticRegression","LinearDiscriminantAnalysis"]})


In [31]:
print(cv_res)

                    Algorithm  CrossValMeans  CrossValerrors
0                         SVC       0.814895        0.044701
1                DecisionTree       0.750653        0.046052
2                    AdaBoost       0.758392        0.032068
3                RandomForest       0.784406        0.049920
4                  ExtraTrees       0.770629        0.047003
5            GradientBoosting       0.801166        0.040181
6     MultipleLayerPerceptron       0.808858        0.042147
7                 KNeighboors       0.798112        0.032997
8          LogisticRegression       0.785921        0.032841
9  LinearDiscriminantAnalysis       0.787622        0.043737


In [36]:
# RandomForest Hyperparameter tuning
random_forest = RandomForestClassifier()
kfold = StratifiedKFold(n_splits=10)

## Optimal parameters grid
random_forest_params = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[100, 300],
                  "criterion": ["gini"]}


opt_random_forest = GridSearchCV(random_forest,param_grid = random_forest_params, cv=kfold, scoring="accuracy", n_jobs= 4, verbose=2)
opt_random_forest.fit(X,Y)
opt_random_forest_best = opt_random_forest.best_estimator_

# Best score
print("random_forest:", opt_random_forest.best_score_)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    5.7s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   13.4s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:   27.7s
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed:   44.5s finished


random_forest: 0.824159021407


In [42]:
# Gradient boosting tunning

gradient_boost = GradientBoostingClassifier()
gradient_boost_params = {'loss' : ["deviance"],
              'n_estimators' : [100, 200, 300, 500, 800, 1000],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8, 16],
              'min_samples_leaf': [150, 400, 100, 1000],
              'max_features': [0.3, 0.1] 
              }

opt_gradient_boost = GridSearchCV(gradient_boost,param_grid = gradient_boost_params, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
opt_gradient_boost.fit(X,Y)
opt_gradient_boost_best = opt_gradient_boost.best_estimator_

# Best score
print("random_forest:", opt_gradient_boost.best_score_)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done 530 tasks      | elapsed:   23.3s
[Parallel(n_jobs=4)]: Done 1530 tasks      | elapsed:   57.7s
[Parallel(n_jobs=4)]: Done 2930 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 4320 out of 4320 | elapsed:  2.6min finished


random_forest: 0.804281345566


In [44]:
### SVC tunning
svc = SVC(probability=True)
svc_params = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100,200,300, 1000]}

opt_svc = GridSearchCV(svc,param_grid = svc_params, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

opt_svc.fit(X,Y)

opt_svc_best = opt_svc.best_estimator_

# Best score
opt_svc.best_score_

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=4)]: Done  50 tasks      | elapsed:    5.9s
[Parallel(n_jobs=4)]: Done 280 out of 280 | elapsed:   18.4s finished


0.8165137614678899

In [47]:
voting = VotingClassifier(estimators=[('svc', opt_svc), ('gb', opt_gradient_boost), ('rf', opt_random_forest)], voting='soft', n_jobs=4)

voting = voting.fit(X, Y)


AttributeError: 'VotingClassifier' object has no attribute 'best_score_'

In [55]:
from utils import accuracy_score_numpy

# Ensemble
Y_test = voting.predict(X_test)
print(accuracy_score_numpy(Y_test))

# opt_svc
Y_test = opt_svc.predict(X_test)
print(accuracy_score_numpy(Y_test))

# opt_random_forest
Y_test = opt_random_forest.predict(X_test)
print(accuracy_score_numpy(Y_test))

# opt_gradient_boost
Y_test = opt_gradient_boost.predict(X_test)
print(accuracy_score_numpy(Y_test))



0.803053435115
0.793893129771
0.806106870229
0.798473282443
