# Meta Techniques with classifiers

In this session, we'll play around with classifiers, and techniques to optimize them.

## Step 0 - Imports and load training data

In [81]:
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.preprocessing import Normalizer
import warnings
warnings.filterwarnings('ignore')

# Add input as import path
sys.path.insert(0,'../input')

import joblib #or your dataset handler
X, Y = joblib.load("traindata.pkl")
X_test, Y_test = joblib.load("testdata.pkl")

#Double check if all is well
print("TrainData=",X[0:1]," with shape ", X.shape)
print("TrainLables=", Y[0:1], Y.shape) 

print("TestData=",X_test[0:1], " with shape ",X_test.shape)
print("TestLabels=", Y_test[0:1], Y.shape) # We don't have test labels. Should be NaNs

TrainData= [[-1.54609786e+00 -1.34499549e+00  2.75868709e-16  4.81287772e-01
  -4.44999502e-01 -1.56828509e+00  1.27192065e+00 -2.40989649e+00
  -1.90080686e+00  5.98139336e-01  7.33522906e-02]]  with shape  (654, 11)
TrainLables= id
277    1
Name: survived, dtype: int64 (654,)
TestData= [[ 0.84191642 -1.34499549 -0.8449216   0.48128777 -0.4449995   0.53185321
  -1.30008367  0.50783544  0.59567091 -0.719428    0.07335229]]  with shape  (655, 11)
TestLabels= id
621   NaN
Name: survived, dtype: float64 (654,)


In [38]:
import sklearn

# Initialize different models
# Random_state 42 https://en.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy
random_state=42
multiple_classifier = []
multiple_classifier.append(SVC(random_state=random_state))
multiple_classifier.append(DecisionTreeClassifier(random_state=random_state))
multiple_classifier.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),learning_rate=0.1))
multiple_classifier.append(RandomForestClassifier(random_state=random_state))
multiple_classifier.append(ExtraTreesClassifier(random_state=random_state))
multiple_classifier.append(GradientBoostingClassifier(random_state=random_state))
multiple_classifier.append(MLPClassifier(random_state=random_state))
multiple_classifier.append(KNeighborsClassifier())
multiple_classifier.append(LogisticRegression(random_state=random_state))
multiple_classifier.append(LinearDiscriminantAnalysis())
multiple_classifier.append(Perceptron())

cv_results = []
for classifier in multiple_classifier :
    cv_results.append(cross_val_score(classifier, X, y = Y, scoring = "accuracy", cv = StratifiedKFold(n_splits=10), n_jobs=4))
    classifier.fit(X,Y)
cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":["SVC","DecisionTree","AdaBoost",
"RandomForest","ExtraTrees","GradientBoosting","MultipleLayerPerceptron","KNeighboors","LogisticRegression","LinearDiscriminantAnalysis","Perceptron"]})


In [39]:
print(cv_res)

                     Algorithm  CrossValMeans  CrossValerrors
0                          SVC       0.814895        0.044701
1                 DecisionTree       0.750653        0.046052
2                     AdaBoost       0.766014        0.027747
3                 RandomForest       0.784406        0.049920
4                   ExtraTrees       0.770629        0.047003
5             GradientBoosting       0.801166        0.040181
6      MultipleLayerPerceptron       0.808858        0.042147
7                  KNeighboors       0.798112        0.032997
8           LogisticRegression       0.785921        0.032841
9   LinearDiscriminantAnalysis       0.787622        0.043737
10                  Perceptron       0.700303        0.053873


In [89]:
# RandomForest Hyperparameter tuning
random_forest = RandomForestClassifier()
kfold = StratifiedKFold(n_splits=10)

## Optimal parameters grid
random_forest_params = {"max_depth": [None],
                  "max_features": [3],
                  "min_samples_split": [2],
                  "min_samples_leaf": [10,30],
                  "bootstrap": [True],
                  "n_estimators" :[500,5000],
                  "criterion": ["gini"]}


opt_random_forest = GridSearchCV(random_forest,param_grid = random_forest_params, cv=kfold, scoring="accuracy", n_jobs= 4, verbose=1)
opt_random_forest.fit(X,Y)
opt_random_forest_best = opt_random_forest.best_estimator_
print(opt_random_forest_best)
# Best score
print("random_forest:", opt_random_forest.best_score_)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:  1.1min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
random_forest: 0.8241590214067278


In [93]:
# X_TEST -> Y_TEST
multiple_classifier.append(opt_random_forest)

new_X = np.zeros((len(Y), len(multiple_classifier)))
for classifier in multiple_classifier:
    try:
        new_X[:,multiple_classifier.index(classifier)] = classifier.predict_proba(X)[:,1]
    except:
        try:
            new_X[:,multiple_classifier.index(classifier)] = classifier.decision_function(X)[:]
        except:
            pass
    
# Scale data
norm = Normalizer()
norm.fit(new_X)
X_new_normalized= norm.transform(new_X)

print(X_new_normalized.shape)

# Train the ensamble
mlp = MLPClassifier(hidden_layer_sizes=(20,20))
mlp.fit(X_new_normalized,Y)




(654, 13)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [94]:
new_X_test = np.zeros((len(X_test), len(multiple_classifier)))
for classifier in multiple_classifier:
    try:
        new_X_test[:,multiple_classifier.index(classifier)] = classifier.predict_proba(X_test)[:,1]
    except:
        try:
            new_X_test[:,multiple_classifier.index(classifier)] = classifier.decision_function(X_test)[:]
        except:
            pass
    
# Scale data
X_new_normalized_test = norm.transform(new_X_test)

print(X_new_normalized_test[1,:])
print(X_new_normalized_test.shape)

# Train the ensamble

y_new_predict = mlp.predict(X_new_normalized_test)
# Score
print(accuracy_score_numpy(y_new_predict))


[ 0.1590304   0.18818362  0.18818362  0.15054689  0.18818362  0.09609291
  0.1082718   0.18818362  0.09329373  0.10303251 -0.87153291  0.10269861
  0.        ]
(655, 13)
0.7709923664122137


In [14]:
from utils import accuracy_score_numpy

# Ensemble
Y_test = voting.predict(X_test)
print(accuracy_score_numpy(Y_test))

# opt_svc
Y_test = opt_svc.predict(X_test)
print(accuracy_score_numpy(Y_test))

# opt_random_forest
Y_test = opt_random_forest.predict(X_test)
print(accuracy_score_numpy(Y_test))

# opt_gradient_boost
Y_test = opt_gradient_boost.predict(X_test)
print(accuracy_score_numpy(Y_test))



0.8045801526717558
0.7938931297709924
0.8045801526717558
0.8030534351145038
