In [None]:
def confusion_mat(y_pred, y_test):
    """Plot the confusion matrix of a set of predictions as an annotated heatmap.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels (second argument of ``confusion_matrix``).
    y_test : array-like
        True labels (first argument of ``confusion_matrix``).

    Notes
    -----
    ``sns.set(font_scale=1.5)`` changes seaborn's global settings, so the
    larger font stays in effect for plots drawn after this call.
    """
    plt.figure()
    sns.set(font_scale=1.5)
    # The heatmap is drawn on the current figure; label the Axes it returns.
    heat_ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='g')
    heat_ax.set_title('Confusion matrix')
    heat_ax.set_ylabel('True label')
    heat_ax.set_xlabel('Predicted label')
    plt.show()

In [None]:
# k-nearest-neighbours model (k=11), fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)

# Accuracy estimate from repeated stratified CV over the full X / Y.
knn_cv = RepeatedStratifiedKFold(n_repeats=CV_N_REPEATS)
scores = cross_val_score(knn, X, Y, cv=knn_cv)
# Report the mean together with two standard deviations of the fold scores.
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

In [None]:
# Shallow decision tree (depth 2, fixed seed), fitted on the training split.
dt = DecisionTreeClassifier(max_depth=2, random_state=1).fit(X_train, y_train)

# Accuracy estimate from repeated stratified CV over the full X / Y.
dt_cv = RepeatedStratifiedKFold(n_repeats=CV_N_REPEATS)
dt_scores = cross_val_score(dt, X, Y, cv=dt_cv)
# Report the mean together with two standard deviations of the fold scores.
print('Accuracy: %0.2f (+/- %0.2f)' % (dt_scores.mean(), 2 * dt_scores.std()))

In [None]:
# Entropy-based decision tree used only for visualisation.
# The estimator is created before it is fitted, so the cell also runs on a
# fresh kernel (previously `tree.fit` was called before `tree` existed).
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5,random_state=0)
tree.fit(X_train, y_train)

# Write the fitted tree to 'tree.dot' in Graphviz format.
# NOTE(review): feature names come from X while the tree is fitted on X_train —
# confirm both share the same columns in the same order.
export_graphviz(tree, out_file='tree.dot', feature_names=X.columns, rounded=True, filled=True)


In [None]:
# Bagging ensemble of 100 estimators with out-of-bag scoring, fitted on the
# training split.
bag = BaggingClassifier(n_estimators=100, oob_score=True, random_state=1).fit(X_train, y_train)

# Accuracy estimate from repeated stratified CV over the full X / Y.
bag_cv = RepeatedStratifiedKFold(n_repeats=CV_N_REPEATS)
bag_scores = cross_val_score(bag, X, Y, cv=bag_cv)
# Report the mean together with two standard deviations of the fold scores.
print('Accuracy: %0.2f (+/- %0.2f)' % (bag_scores.mean(), 2 * bag_scores.std()))

In [None]:
# scikit-learn's random forest estimator is `RandomForestClassifier`; it is
# imported here so this cell does not depend on an undefined `RandomForest`.
from sklearn.ensemble import RandomForestClassifier

num_estimators = 100
rf = RandomForestClassifier(n_estimators=num_estimators)
rf.fit(X_train, y_train)

# `score` returns one accuracy value for the hold-out split, so there is no
# spread to report here (the cross-validated spread is computed separately).
rf_scores = rf.score(X_test, y_test)
print('Accuracy: %0.2f' % rf_scores)

# Confusion matrix of the hold-out predictions.
y_pred = rf.predict(X_test)
confusion_mat(y_pred, y_test)


In [None]:
# Column names of the training frame, handed to show_weights as feature labels.
feature_names = X_train.columns.to_numpy()
# Last expression of the cell: whatever show_weights returns is displayed.
show_weights(rf, feature_names=feature_names)

In [None]:
# Repeated stratified cross-validation of the random forest.
# The splitter must be passed as `cv=`: given positionally it is either taken
# as the `groups` argument or rejected, depending on the scikit-learn version.
rf_cv = RepeatedStratifiedKFold(n_repeats=CV_N_REPEATS)
scores = cross_val_score(rf, X, Y, cv=rf_cv)
# Keep a copy under a dedicated name for the comparison with the dummy scores.
scores_est = scores.copy()
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

score_line = f"Scores (Accuracy) mean={scores.mean():0.2f} std={scores.std():0.2f}"
# plt.subplots() already creates the figure; a preceding plt.figure() would
# only leave an extra empty figure in the output.
fig, ax = plt.subplots()
pd.Series(scores).hist(ax=ax, bins=BINS)
ax.set_title(f"RepeatedKFold({len(scores)} folds) with randomForest \n"+score_line)
ax.set_xlabel("Score")
ax.set_ylabel("Frequency")

In [None]:
# Overlaid histograms of the dummy-baseline and random-forest CV scores.
# plt.subplots() creates the figure itself; the extra plt.figure() call only
# produced an additional empty figure.
fig, ax = plt.subplots()
# NOTE(review): both score arrays must have the same length to share a frame.
df_dummy_est_scores = pd.DataFrame({'dummy': scores_dummy, 'RF': scores_est})
df_dummy_est_scores.plot(kind='hist', ax=ax, bins=20)
ax.set_xlabel('Score')
ax.set_title("Dummy vs RandomForest scores")

In [None]:
from sklearn.metrics import mean_squared_log_error

# Hyper-parameters are kept in a dict because later cells read
# params['n_estimators'] when plotting the per-stage deviance.
params = dict(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    loss='deviance',
    random_state=0,
)
gbm = GradientBoostingClassifier(**params)
# Fit on the training split; as the last expression, the fitted estimator is
# also shown as the cell output.
gbm.fit(X_train, y_train)

In [None]:
# Hold-out evaluation of the gradient boosting model.
# The fitted estimator is named `gbm`; `gmb` was a typo for it.
y_pred = gbm.predict(X_test)
gmb_score = accuracy_score(y_test,y_pred)
# The accuracy is printed as a percentage (score * 100).
print("Accuracy of GMB Classifier: {0:0.2f}".format(gmb_score*100.0))

confusion_mat(y_pred, y_test)

In [None]:
# Test-set deviance after each boosting stage.
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

# The loss object works on raw decision-function scores, not on predicted
# class labels, so iterate over staged_decision_function.
# NOTE(review): `gbm.loss_` only exists in older scikit-learn releases, and
# for binary problems it presumably expects y_test encoded as 0/1 — confirm.
for i, raw_pred in enumerate(gbm.staged_decision_function(X_test)):
    test_score[i] = gbm.loss_(y_test, raw_pred)

# 1-based iteration numbers for the x axis (np.arange; `np.avarange` does not exist).
iterations = np.arange(params['n_estimators']) + 1

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('GMB Deviance w.r.t Number of Estimators')
plt.plot(iterations, gbm.train_score_, 'b-', label='Training Set Deviance')
plt.plot(iterations, test_score, 'r-', label='Test Set Deviance')
plt.legend(loc='best')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

In [None]:
# Relative feature importance of the gradient boosting model.
# The fitted estimator is named `gbm`; `gmb` was a typo for it.
feature_importance = gbm.feature_importances_
# Rescale so that the most important feature is 100.
feature_importance = 100*(feature_importance / feature_importance.max())
# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = np.argsort(feature_importance)

# Bar positions centred on 0.5, 1.5, ...
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X_train.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

In [None]:
pip install mlxtend

In [None]:
from mlxtend.classifier import StackingCVClassifier

# Stack KNN, random forest, XGB and gradient boosting, with the random forest
# as the meta-classifier.
# The gradient boosting model is named `gbm`; `gmb` was a typo for it.
sclf = StackingCVClassifier(classifiers=[knn, rf, xgb, gbm], meta_classifier=rf)
print('10-fold cross validation:\n')

In [None]:
# 10-fold CV accuracy of every base model and of the stacked model.
# - `sclf` is included so that each of the five labels has an estimator;
#   zip() would otherwise silently drop the last label.
# - The print is inside the loop so every model is reported, not only the last.
# - `gbm` is the fitted gradient boosting model (`gmb` was a typo).
# After the loop, `sclf_scores` holds the scores of the stacked model.
for clf, label in zip([knn, rf, xgb, gbm, sclf],
                      ['KNearest Neighbors', 'Random Forest', 'XGB', 'GMB', 'MetaClassifier']):
    sclf_scores = cross_val_score(clf, X, Y, cv=10, scoring='accuracy')
    print('Accuracy: %0.2f (+/- %0.2f)[%s]' % (sclf_scores.mean(), sclf_scores.std(), label))

In [None]:
# (label, estimator) pairs compared in the final model comparison.
# The gradient boosting model is named `gbm`; `gmb` was a typo for it.
models = [
    ('KNN', knn),
    ('DT', dt),
    ('RF', rf),
    ('XGB', xgb),
    ('GMB', gbm),
    ('Voting', ensemble_knn_rf_xgb),
]


In [None]:
# Collect the repeated stratified CV accuracy scores of every model in `models`.
results, names = [], []
for name, model in models:
    # A fresh splitter is built for each model.
    kfold = RepeatedStratifiedKFold(n_repeats=CV_N_REPEATS)
    cv_results = cross_val_score(model, X, Y, scoring='accuracy', cv=kfold)
    names.append(name)
    results.append(cv_results)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# The stacking scores were computed in an earlier cell; append them as the
# final entry so they appear in the same comparison.
names.append('Stacking')
results.append(np.asarray(sclf_scores))

In [None]:
# Box plot comparing the CV accuracy distributions of all models.
fig, ax = plt.subplots(figsize=(10, 6))
fig.suptitle('Algorithm Comparison')

# Build a long-form frame with one row per CV score. The score arrays may
# differ in length, so each name is repeated once per score of its model.
comparison = pd.DataFrame({
    'Classifiers': np.repeat(names, [len(model_scores) for model_scores in results]),
    'Accuracy': np.concatenate(results),
})
sns.boxplot(x='Classifiers', y='Accuracy', data=comparison, ax=ax)
ax.set_xlabel('Classifiers')
ax.set_ylabel('Accuracy')
plt.show()

In [None]:
# Build a ClassificationReport visualizer around the `xgb` model, passing a
# single class label "N".
# NOTE(review): only one class name is given — confirm it matches the classes
# of the target variable.
visualizer = ClassificationReport(xgb, classes = ["N"])