# Non-neural classifiers

In [52]:
"""
Import standard libraries, helper functions and sklearn classifiers.
"""
import csv
import numpy as np
import common
import preprocess

from sklearn import svm, neighbors, tree
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, f1_score
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

## Load and Pre-process Datasets

In [53]:
"""
This cell loads the training and the testing data.
"""

# Load the raw feature matrices and the training labels from the paths
# configured in the project-local `common` module.
X_train = preprocess.load_data(common.X_train_file)
X_test = preprocess.load_data(common.X_test_file)
y_train = preprocess.load_label(common.y_train_file)

# Normalise each split with the project helper.
# NOTE(review): `common.normalize` is called independently on the training
# and the test split. If it derives its statistics from its argument, the two
# splits end up on different scales -- confirm it uses training statistics.
X_train = common.normalize(X_train)
X_test = common.normalize(X_test)

# Reduce both splits to `common.DIMENSION` features.
# NOTE(review): `common.reduce_dimension` is likewise applied per split. If it
# fits a fresh projection on each call, the train and test features would not
# share a basis -- confirm against the helper's implementation.
X_train_pca = common.reduce_dimension(X_train, common.DIMENSION)
X_test_pca = common.reduce_dimension(X_test, common.DIMENSION)

## Classifiers

In [54]:
"""
This cell defines factory functions for all non-neural classifiers used in
our submission. Each function returns an unfitted estimator object 'clf'
which implements 'fit'.
"""

def dtree(max_depth=None):
    """
    Build an unfitted decision tree classifier.

    max_depth is forwarded unchanged to DecisionTreeClassifier; the default
    of None leaves the depth unrestricted by this function.
    """
    return tree.DecisionTreeClassifier(max_depth=max_depth)

def dtree_adaboost(n_estimators=20):
    """
    Build an unfitted AdaBoost ensemble of single-level decision trees.

    n_estimators is the number of boosting rounds handed to
    AdaBoostClassifier. It defaults to 20, the value that used to be
    hard-coded, so existing zero-argument calls behave as before.
    """
    # Depth-1 trees ("stumps") act as the weak learners.
    stump = dtree(max_depth=1)
    # The base estimator is passed positionally on purpose: its keyword name
    # differs between scikit-learn releases, the position does not.
    clf = AdaBoostClassifier(stump, algorithm='SAMME', n_estimators=n_estimators)
    return clf

def SVM(kernel, degree=3, C=1.0, gamma='auto'):
    """
    Build an unfitted support vector classifier.

    All four arguments are forwarded unchanged to svm.SVC under the
    keyword of the same name.
    """
    return svm.SVC(
        kernel=kernel,
        degree=degree,
        C=C,
        gamma=gamma,
    )

def kNN(n_neighbor=15):
    """
    Build an unfitted k-nearest-neighbours classifier that votes among
    n_neighbor neighbours.
    """
    return neighbors.KNeighborsClassifier(n_neighbors=n_neighbor)

## Accuracy Evaluators

In [None]:
"""
This cell defines functions to compute the performance of our model.
"""

def compute_f1(model, X, y):
    """
    Cross-validate the model on (X, y) with common.N_FOLDS folds and
    return the mean of the per-fold weighted F1 scores.
    """
    fold_scores = cross_val_score(
        model, X, y, cv=common.N_FOLDS, scoring='f1_weighted'
    )
    return np.mean(fold_scores)

def accuracy(model, X, y):
    """
    Cross-validate the model on (X, y) with common.N_FOLDS folds and
    return the mean of the per-fold accuracy scores.
    """
    fold_scores = cross_val_score(
        model, X, y, cv=common.N_FOLDS, scoring='accuracy'
    )
    return np.mean(fold_scores)

## Evaluation 

In [None]:
"""
This cell uses different classifiers and prints their F1 scores and accuracies.
"""

# Build every classifier up front; the names stay available to later cells.
svm_linear = SVM('linear')
# Upon experimentation, we found that the results are the best
# when degree = 2
svm_poly = SVM('poly', degree=2)
svm_RBF = SVM('rbf', C=1000, gamma=0.001)
decision_tree = dtree()
dtree_ada = dtree_adaboost()

# One row per classifier: (F1 message, accuracy message, estimator).
# The rows are evaluated in order on the un-reduced training features.
evaluation_plan = (
    ("F1 of linear model is:", "Accuracy of linear model is", svm_linear),
    ("F1 of polynomial model is:", "Accuracy of polynomial model is", svm_poly),
    ("F1 of RBF model is:", "Accuracy of RBF model is", svm_RBF),
    ("F1 of decision tree model is:", "Accuracy of decision tree model is",
     decision_tree),
    ("F1 of decision tree model with adaboost is:",
     "Accuracy of decision tree model with adaboost is", dtree_ada),
)

for f1_message, acc_message, estimator in evaluation_plan:
    f1 = compute_f1(estimator, X_train, y_train)
    print(f1_message, f1)
    acc = accuracy(estimator, X_train, y_train)
    print(acc_message, acc)

# Sweep the odd neighbourhood sizes 1, 3, ..., 15 for kNN.
for k in range(1, 16, 2):
    print("For ", k, "neighbours")
    knn = kNN(n_neighbor=k)
    f1 = compute_f1(knn, X_train, y_train)
    print("F1 of kNN model is:", f1)
    acc = accuracy(knn, X_train, y_train)
    print("Accuracy of kNN is", acc)

F1 of linear model is: 0.806997267283
Accuracy of linear model is 0.808477296359
F1 of polynomial model is: 0.811523934767
Accuracy of polynomial model is 0.813680232456


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


F1 of RBF model is: 0.23737993217


## Evaluation after Dimensionality Reduction with PCA

In [62]:
"""
This cell evaluates the same classifiers on the PCA-reduced features and prints their F1 scores and accuracies.
"""

# Build every classifier up front; the names stay available to later cells.
svm_linear = SVM('linear')
# Upon experimentation, we found that the results are the best
# when degree = 2
svm_poly = SVM('poly', degree=2)
svm_RBF = SVM('rbf', C=1000, gamma=0.001)
decision_tree = dtree()
dtree_ada = dtree_adaboost()

# One row per classifier: (F1 message, accuracy message, estimator).
# The rows are evaluated in order on the PCA-reduced training features.
evaluation_plan = (
    ("F1 of linear model is:", "Accuracy of linear model is", svm_linear),
    ("F1 of polynomial model is:", "Accuracy of polynomial model is", svm_poly),
    ("F1 of RBF model is:", "Accuracy of RBF model is", svm_RBF),
    ("F1 of decision tree model is:", "Accuracy of decision tree model is",
     decision_tree),
    ("F1 of decision tree model with adaboost is:",
     "Accuracy of decision tree model with adaboost is", dtree_ada),
)

for f1_message, acc_message, estimator in evaluation_plan:
    f1 = compute_f1(estimator, X_train_pca, y_train)
    print(f1_message, f1)
    acc = accuracy(estimator, X_train_pca, y_train)
    print(acc_message, acc)

# Sweep the odd neighbourhood sizes 1, 3, ..., 15 for kNN.
for k in range(1, 16, 2):
    print("For ", k, "neighbours")
    knn = kNN(n_neighbor=k)
    f1 = compute_f1(knn, X_train_pca, y_train)
    print("F1 of kNN model is:", f1)
    acc = accuracy(knn, X_train_pca, y_train)
    print("Accuracy of kNN is", acc)

F1 of linear model is: 0.803147509028
Accuracy of linear model is 0.804395663706


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


F1 of polynomial model is: 0.23737993217
Accuracy of polynomial model is 0.408926148179
F1 of RBF model is: 0.815126354916
Accuracy of RBF model is 0.817820243382
F1 of decision tree model is: 0.354563405262
Accuracy of decision tree model is 0.359029026118


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


F1 of decision tree model with adaboost is: 0.346508389508
Accuracy of decision tree model with adaboost is 0.42979627084


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


F1 of kNN model is: 0.480730096533
Accuracy of kNN model is 0.558203469212


## Optimal Hyper-parameter Search Using Grid Search

In [63]:
# Commented out as this is time-consuming

# RBF with optimal hyperparameters
# param_grid = {'C': [1000, 50000, 100000],
#              'gamma': [0.0005, 0.005, 0.001, 0.1], }
# svm_RBF_optimal = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
# f1 = compute_f1(svm_RBF_optimal, X_train, y_train)
# print("F1 of RBF model with optimal hyperparameters is:", f1)

# f1 = compute_f1(svm_RBF_optimal, X_train_pca, y_train)
# print("F1 of RBF model with optimal hyperparameters and reduced dimensions is:", f1)