## Set Up

In [None]:
#Load basic packages
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
import time
from operator import itemgetter
import os

## Clean Data

-no missing values

-make target variable binary and move to column position 0

-transform any categorical/factor variables

-delete any variables that do not have predictive power (i.e. ID)

-check for skewness of any variables

-check for balance with target variable

## Create Test/Train Dataset

Choose size of test/train datasets by changing "test_size"

In [None]:
#column 0 holds the target; every remaining column is a feature
feature_matrix = df.iloc[:, 1:].values
target_vector = df.iloc[:, 0].values

#hold out 30% of the rows for testing; fixed seed keeps the split repeatable
split_parts = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.30,
    random_state=0,
)
features_train, features_test, target_train, target_test = split_parts

## KNN (K Nearest Neighbor)

"Lazy learner," looks at k nearest neighbors to determine where to classify a datapoint

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#build model
knn = KNeighborsClassifier()
knn = knn.fit(features_train, target_train) #Fit to training data
target_predicted_knn = knn.predict(features_test) #Predict against the held-out test data

#check accuracy
print("Knn Accuracy Score", accuracy_score(target_test, target_predicted_knn))
print(classification_report(target_test, target_predicted_knn))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(target_test, target_predicted_knn)
#print/plot the matrix bound to `cm` on the line above (there is no `cm2`)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
#(the "K" in the printed label is the fold, not the number of neighbors)
scores = cross_val_score(knn, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Decision Tree

"Greedy algorithm," variables are split into buckets based on how much they contribute to the decision, attribute with highest information gain is first place to split, then repeat

In [None]:
from sklearn import tree 

#build model
#fixed seed (same value as the train/test split) so the fitted tree and
#every score below are reproducible when the notebook is re-run
dt = tree.DecisionTreeClassifier(random_state=0)
dt = dt.fit(features_train, target_train)
target_predicted_dt = dt.predict(features_test)

#check accuracy on the held-out test split
print("DT Accuracy Score", accuracy_score(target_test, target_predicted_dt))
print(classification_report(target_test, target_predicted_dt))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(target_test, target_predicted_dt)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(dt, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Extra Trees

Algorithm fits a number of randomized ("extra") decision trees on subsets of the data, then aggregates their predictions using an average

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

#build model
#fixed seed (same value as the train/test split) so the randomized trees,
#and therefore every score below, are reproducible when the notebook is re-run
xdt = ExtraTreesClassifier(random_state=0)
xdt.fit(features_train, target_train)
predicted_xdt=xdt.predict(features_test)
expected = target_test

#checking accuracy on the held-out test split
print("Extra Trees", accuracy_score(expected,predicted_xdt))
print(classification_report(expected, predicted_xdt))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_xdt)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(xdt, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Random Forest

Average (or mode) of many decision trees, often less overfitting than decision trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

#build model
#fixed seed (same value as the train/test split) so the forest, and
#therefore every score below, is reproducible when the notebook is re-run
rf = RandomForestClassifier(random_state=0)
rf = rf.fit(features_train, target_train)
target_predicted_rf = rf.predict(features_test)

#check accuracy on the held-out test split
print("RF Accuracy Score", accuracy_score(target_test, target_predicted_rf))
print(classification_report(target_test, target_predicted_rf))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(target_test, target_predicted_rf)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(rf, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Bagging

Bagging resamples training data, performs bootstrapping, and then aggregates (either by voting or an average); this method improves on the performance of weak learners (in this example I improve upon a knn model)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

#build model: bagging ensemble whose base learner is a KNN classifier
#fixed seed (same value as the train/test split) so the resampling, and
#therefore every score below, is reproducible when the notebook is re-run
clf_bag = BaggingClassifier(KNeighborsClassifier(), random_state=0)
clf_bag.fit(features_train, target_train)
predicted_bag=clf_bag.predict(features_test)
expected = target_test

#check accuracy on the held-out test split
print("Bagging Accuracy", accuracy_score(expected,predicted_bag))
print(classification_report(expected, predicted_bag))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_bag)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(clf_bag, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Adaboost

Algorithm can also be used to boost performance of weak learners (this example boosts a typical decision tree); combines output of weaker learners using a weighted sum; uses an iterative procedure so that new models are influenced by previously built ones (the weighting allows there to be a focus on what previous models got wrong)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#build model: AdaBoost ensemble whose base learner is a decision tree
#fixed seed (same value as the train/test split) so boosting, and
#therefore every score below, is reproducible when the notebook is re-run
#NOTE(review): the base tree is unpruned, which is a strong rather than weak
#learner; a shallow tree (e.g. max_depth=1) may be what was intended - confirm
bdt = AdaBoostClassifier(DecisionTreeClassifier(), random_state=0)
bdt.fit(features_train, target_train)
predicted_bdt=bdt.predict(features_test)
expected = target_test

#check accuracy on the held-out test split
print("Adaboost Accuracy", accuracy_score(expected,predicted_bdt))
print(classification_report(expected, predicted_bdt))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_bdt)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 3 folds over the training split, one score per fold
#NOTE(review): cv=3 here while the other sections use cv=10 - presumably to
#limit runtime; confirm this is deliberate
scores = cross_val_score(bdt, features_train, target_train, cv=3)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Stochastic Gradient Descent

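Finds the minimum of functions where a closed-form solution is not easily obtainable or available; rather than setting the partial derivatives to 0 and solving directly, it repeatedly takes small steps in the direction of the negative gradient, estimated from a random sample of the training data at each step; we want to find the minimum because that's where the lowest error is (and the best model); the "gradient" is the vector of partial derivatives of the function you are trying to minimize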

In [None]:
from sklearn.linear_model import SGDClassifier

#build model
#fixed seed (same value as the train/test split) so the stochastic training,
#and therefore every score below, is reproducible when the notebook is re-run
sgd = SGDClassifier(random_state=0)
sgd.fit(features_train, target_train)
predicted_sgd=sgd.predict(features_test)
expected = target_test

#check accuracy on the held-out test split
print("SGD Accuracy Score", accuracy_score(expected,predicted_sgd))
print(classification_report(expected, predicted_sgd))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_sgd)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(sgd, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Gradient Boosting

Combination of gradient descent and boosting; uses weak learners to progressively improve upon shortcomings of previous weak learners 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#build model
#fixed seed (same value as the train/test split) so the boosted trees, and
#therefore every score below, are reproducible when the notebook is re-run
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(features_train, target_train)
predicted_GBC=GBC.predict(features_test)
expected = target_test

#checking accuracy on the held-out test split
print("Gradient Boost Accuracy", accuracy_score(expected,predicted_GBC))
print(classification_report(expected, predicted_GBC))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_GBC)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(GBC, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## SVM Linear

Support Vector Machine model; searches for linear optimal separating hyperplane (decision boundary) using support vectors and margins; the support vectors define the distance and direction of the hyperplane while the margin gives the distance between support vectors; a kernel chooses a hyperplane that gives the largest margin

In [None]:
from sklearn.svm import LinearSVC

#build model
#fixed seed (same value as the train/test split) so the solver, and
#therefore every score below, is reproducible when the notebook is re-run
clf_linSVC=LinearSVC(random_state=0)
clf_linSVC.fit(features_train, target_train)
predicted_SVC=clf_linSVC.predict(features_test)
expected = target_test

#checking accuracy on the held-out test split
print(accuracy_score(expected,predicted_SVC))
print(classification_report(expected, predicted_SVC))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_SVC)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(clf_linSVC, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## SVM RBF

Support Vector Machine using Radial Basis Function kernel; "kernel trick" = mapping original feature space to some higher-dimensional feature space where the training set is separable

*may take a while to run

In [None]:
from sklearn.svm import SVC

#build model: RBF-kernel SVM, fitted on the training split in one step
clf_rbf = SVC(kernel='rbf').fit(features_train, target_train)
expected = target_test
predicted_rbf = clf_rbf.predict(features_test)

#checking accuracy on the held-out test split
print(accuracy_score(y_true=expected, y_pred=predicted_rbf))
print(classification_report(y_true=expected, y_pred=predicted_rbf))

#confusion matrix, printed and then drawn as a heat map
cm = confusion_matrix(y_true=expected, y_pred=predicted_rbf)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(
    estimator=clf_rbf,
    X=features_train,
    y=target_train,
    cv=10,
)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## ANN

Artificial Neural Network, set of connected input/output units where each connection has a weight associated with it; during learning phase, network readjusts the weights so it can better predict the correct class label; similar to way the brain learns

In [None]:
from sklearn.neural_network import MLPClassifier

#build model
#fixed seed (same value as the train/test split) so network training, and
#therefore every score below, is reproducible when the notebook is re-run
mlp = MLPClassifier(random_state=0)
mlp.fit(features_train, target_train)
predicted_mlp=mlp.predict(features_test)
expected = target_test

#checking accuracy on the held-out test split
print(accuracy_score(expected,predicted_mlp))
print(classification_report(expected, predicted_mlp))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_mlp)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(mlp, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output

## Stacking

"Stacks" several different predictive models on top of each other to form a new model; often has better performance than the individual models alone; this example combines Random Forest, Gaussian Naive Bayes, and Adaboost Classifiers in a voting ensemble (scikit-learn's VotingClassifier, a simpler relative of true stacking, which would instead train a meta-model on the base models' predictions) with "hard" voting (meaning the final class label is whichever class has been predicted most frequently by the classification models)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

#build model: hard-voting ensemble over three base classifiers
#fixed seed (same value as the train/test split) on the two randomized members
#so the ensemble, and every score below, is reproducible when re-run
clf1 = RandomForestClassifier(random_state=0)
clf2 = GaussianNB()
#NOTE: rebinding `bdt` here replaces the boosted-tree model from the Adaboost section
bdt = AdaBoostClassifier(random_state=0)
eclf2 = VotingClassifier(estimators=[('rf', clf1), ('gnb', clf2), ('bdt', bdt)], voting='hard')
eclf2.fit(features_train, target_train)
predicted_eclf2=eclf2.predict(features_test)
expected = target_test

#checking accuracy on the held-out test split
print(accuracy_score(expected,predicted_eclf2))
print(classification_report(expected, predicted_eclf2))
#confusion matrix (plotted with true labels on the y axis, predicted labels on the x axis)
cm=confusion_matrix(expected, predicted_eclf2)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

#cross validate: 10 folds over the training split, one score per fold
scores = cross_val_score(eclf2, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean() #mean fold score, shown as the cell output