In [33]:
from mlxtend.classifier import StackingClassifier
from numpy import recfromcsv

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import RFE

from sklearn.preprocessing import StandardScaler, Imputer, PolynomialFeatures
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


from sklearn.metrics import classification_report, accuracy_score

In [34]:
dataset_x = recfromcsv("train.csv", names=None)
dataset_y = recfromcsv("trainLabels.csv", names=None)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(dataset_x, dataset_y, test_size=0.2)

## Preprocsessing

In [36]:
scaler = StandardScaler()
imputer = Imputer(strategy='mean') # if value is nan replace with mean of the column
select = RFE(LinearSVC(), n_features_to_select=20)
poly = PolynomialFeatures(2)
pca = KernelPCA(kernel="rbf")

In [37]:
cv = StratifiedKFold(n_splits=5, shuffle=True)

## Gradient Boosting Classifier

In [38]:
gbt = GradientBoostingClassifier(n_estimators=300,
                                 learning_rate=0.02,
                                 min_samples_leaf=7, # since training examples is less
                                 max_features="sqrt")

In [39]:
gbt_pipeline = Pipeline([
                     ('imputer', imputer),
                     ('scaler', scaler), 
#                     ('select', select),
#                      ('pca', pca), 
                     ('poly', poly), 
                     ('gbt', gbt)]
)
param_grid = [{
             'gbt__loss': ['deviance', 'exponential'],
             'gbt__max_depth': [9, 13, 17]
}]

In [None]:
gbt_estimator = GridSearchCV(gbt_pipeline, 
                         param_grid, 
                         cv=cv, 
                         scoring='accuracy',
                         n_jobs=8,
                         iid=False
)

gbt_estimator.fit(X_train, y_train)

In [41]:
gbt_model = gbt_estimator.best_estimator_

gbt_model = gbt_model.fit(X_train, y_train) # train with full training data

y_pred = gbt_model.predict(X_test) # test with only test data
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.89      0.92      0.91       100
          1       0.92      0.89      0.90       100

avg / total       0.91      0.91      0.90       200



0.90500000000000003

Since f1-score is relativelely good we can conclude that we did not <strong>overfit</strong> our model.

## RandomForestClassifier

In [42]:
rfc = RandomForestClassifier(n_estimators=300,
                             min_samples_leaf=7,
                             max_features="sqrt")

In [43]:
rfc_pipeline = Pipeline([
                     ('imputer', imputer),
                     ('scaler', scaler),
#                     ('select', select), #did not work better
                     ('poly', poly), 
                     ('rfc', rfc)]
)
param_grid = [{'rfc__max_depth': [9, 13, 17]}]

In [None]:
rfc_estimator = GridSearchCV(rfc_pipeline, 
                         param_grid, 
                         cv=cv,
                         scoring='accuracy',
                         n_jobs=8,
                        )
rfc_estimator.fit(X_train, y_train)

In [45]:
rfc_model = rfc_estimator.best_estimator_

rfc_model = rfc_model.fit(X_train, y_train) # train with full training data

y_pred = gbt_model.predict(X_test)
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.89      0.92      0.91       100
          1       0.92      0.89      0.90       100

avg / total       0.91      0.91      0.90       200



0.90500000000000003

## Stacking Classifier

In [52]:
lr = LogisticRegression()

sclf = StackingClassifier(classifiers=[gbt_model, rfc_model], meta_classifier=lr)

In [None]:
sclf.fit(X_train, y_train)

In [54]:
y_pred = sclf.predict(X_test)
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.92      0.91      0.91       100
          1       0.91      0.92      0.92       100

avg / total       0.92      0.92      0.91       200



0.91500000000000004