## Set Up

In [None]:
#Load basic packages
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
import time
from operator import itemgetter
import os

## Clean Data

- no missing values

- make target variable binary and move to column position 0

- transform any categorical/factor variables

- delete any variables that do not have predictive power (i.e. ID)

- check for skewness of any variables

- check for balance with target variable

## Create Test/Train Dataset

Choose size of test/train datasets by changing "test_size"

In [None]:
# Hold out 30% of the rows for testing. Column 0 of `df` is used as the target
# and every remaining column as a feature; the fixed seed keeps the split
# identical from run to run.
# NOTE(review): `df` is not created in any cell visible here -- it has to be
# loaded and cleaned before this cell is executed.
features_train, features_test, target_train, target_test = train_test_split(
    df.iloc[:, 1:].values,  # features: all columns after the first
    df.iloc[:, 0].values,   # target: first column
    test_size=0.30,
    random_state=0,
)

## KNN (K Nearest Neighbor)

"Lazy learner," looks at k nearest neighbors to determine where to classify a datapoint

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build model: fit a k-nearest-neighbours classifier (library defaults) on the
# training split, then predict labels for the held-out test split.
knn = KNeighborsClassifier()
knn = knn.fit(features_train, target_train)
target_predicted_knn = knn.predict(features_test)

# Check accuracy on the test split
print("Knn Accuracy Score", accuracy_score(target_test, target_predicted_knn))
print(classification_report(target_test, target_predicted_knn))

# Confusion matrix (rows = true labels, columns = predicted labels).
# Fix: the matrix is stored in `cm`; the cell previously printed and plotted an
# undefined `cm2`, which raises NameError on a fresh kernel (or silently shows
# a stale matrix left over from an earlier session).
cm = confusion_matrix(target_test, target_predicted_knn)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# Cross validate: 10-fold CV on the training split only; the last expression
# displays the mean fold score as the cell output.
scores = cross_val_score(knn, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean()

## Decision Tree

"Greedy algorithm," variables are split into buckets based on how much they contribute to the decision, attribute with highest information gain is first place to split, then repeat

In [None]:
from sklearn import tree

# Build model. random_state is pinned (same seed the train/test split uses) so
# that tie-breaking between equally good splits -- and therefore the fitted
# tree and every score below -- is reproducible after Restart & Run All.
dt = tree.DecisionTreeClassifier(random_state=0)
dt = dt.fit(features_train, target_train)
target_predicted_dt = dt.predict(features_test)

# Check accuracy on the held-out test split
print("DT Accuracy Score", accuracy_score(target_test, target_predicted_dt))
print(classification_report(target_test, target_predicted_dt))

# Confusion matrix (rows = true labels, columns = predicted labels)
cm = confusion_matrix(target_test, target_predicted_dt)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# Cross validate: 10-fold CV on the training split only; the last expression
# displays the mean fold score as the cell output.
scores = cross_val_score(dt, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean()

## Random Forest

Average (or mode) of many decision trees, often less overfitting than decision trees

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build model. A random forest bootstraps rows and subsamples features, so an
# unseeded fit gives different scores on every run; pin random_state (same seed
# the train/test split uses) to make the reported numbers reproducible.
rf = RandomForestClassifier(random_state=0)
rf = rf.fit(features_train, target_train)
target_predicted_rf = rf.predict(features_test)

# Check accuracy on the held-out test split
print("RF Accuracy Score", accuracy_score(target_test, target_predicted_rf))
print(classification_report(target_test, target_predicted_rf))

# Confusion matrix (rows = true labels, columns = predicted labels)
cm = confusion_matrix(target_test, target_predicted_rf)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# Cross validate: 10-fold CV on the training split only; the last expression
# displays the mean fold score as the cell output.
scores = cross_val_score(rf, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean()

## Bagging

Bagging resamples training data, performs bootstrapping, and then aggregates (either by voting or an average); this method improves on the performance of weak learners (in this example I improve upon a knn model)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Build model: bag KNN base learners. Bagging draws random bootstrap samples,
# so random_state is pinned (same seed the train/test split uses) to keep the
# fitted ensemble and the scores below reproducible across runs.
clf_bag = BaggingClassifier(KNeighborsClassifier(), random_state=0)
clf_bag.fit(features_train, target_train)
predicted_bag=clf_bag.predict(features_test)
expected = target_test

# Check accuracy on the held-out test split
print("Bagging Accuracy", accuracy_score(expected,predicted_bag))
print(classification_report(expected, predicted_bag))

# Confusion matrix (rows = true labels, columns = predicted labels)
cm=confusion_matrix(expected, predicted_bag)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# Cross validate: 10-fold CV on the training split only; the last expression
# displays the mean fold score as the cell output.
scores = cross_val_score(clf_bag, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean()

## Adaboost

Algorithm can also be used to boost performance of weak learners (this example boosts a typical decision tree); combines output of weaker learners using a weighted sum; uses an iterative procedure so that new models are influenced by previously built ones (the weighting allows there to be a focus on what previous models got wrong)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Build model: boost 200 depth-3 decision trees with the SAMME algorithm.
# random_state is pinned (same seed the train/test split uses) so the base
# trees are seeded deterministically and the scores below are reproducible.
bdt = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    algorithm="SAMME",
    n_estimators=200,
    random_state=0,
)
bdt.fit(features_train, target_train)
predicted_bdt=bdt.predict(features_test)
expected = target_test

# Check accuracy on the held-out test split
print("Adaboost Accuracy", accuracy_score(expected,predicted_bdt))
print(classification_report(expected, predicted_bdt))

# Confusion matrix (rows = true labels, columns = predicted labels)
cm=confusion_matrix(target_test, predicted_bdt)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# Cross validate: only 3 folds here (the other models use 10), which keeps the
# number of 200-estimator refits small; the last expression displays the mean.
scores = cross_val_score(bdt, features_train, target_train, cv=3)
print("Cross Validation Score for each K",scores)
scores.mean()

## Stochastic Gradient Descent

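Finds the minimum value of functions where a closed-form solution is not easily obtainable or available; instead of forming the partial-derivative equations, setting them to 0, and solving them directly, it repeatedly takes small steps in the direction of the negative gradient, estimated from a random sample of the training data at each step; we want to find the minimum because that's where the lowest error is (and the best model); the "gradient" refers to the vector of partial derivatives of the function you are trying to minimize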

In [None]:
from sklearn.linear_model import SGDClassifier

# Build model. SGD shuffles the training data between epochs, so an unseeded
# fit gives different coefficients (and scores) on every run; pin random_state
# (same seed the train/test split uses) to make the results reproducible.
sgd = SGDClassifier(random_state=0)
sgd.fit(features_train, target_train)
predicted_sgd=sgd.predict(features_test)
expected = target_test

# Check accuracy on the held-out test split
print("SGD Accuracy Score", accuracy_score(expected,predicted_sgd))
print(classification_report(expected, predicted_sgd))

# Confusion matrix (rows = true labels, columns = predicted labels)
cm=confusion_matrix(target_test, predicted_sgd)
print(cm)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# Cross validate: 10-fold CV on the training split only; the last expression
# displays the mean fold score as the cell output.
scores = cross_val_score(sgd, features_train, target_train, cv=10)
print("Cross Validation Score for each K",scores)
scores.mean()

In [None]:
# NOTE(review): `test1` is not defined in any cell visible here; this looks like
# a leftover scratch cell and would raise NameError on a fresh kernel unless the
# name is created elsewhere -- confirm and remove.
test1

In [None]:
# NOTE(review): `test2` is not defined in any cell visible here; this looks like
# a leftover scratch cell and would raise NameError on a fresh kernel unless the
# name is created elsewhere -- confirm and remove.
test2

In [None]:
# NOTE(review): `test3` is not defined in any cell visible here; this looks like
# a leftover scratch cell and would raise NameError on a fresh kernel unless the
# name is created elsewhere -- confirm and remove.
test3