In [11]:
from __future__ import print_function

import os
import pprint
from time import time

import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn import svm
from sklearn.calibration import calibration_curve
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, \
    QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
import pandas as pd
from IPython.display import display, HTML
import warnings

import __root__

# Apply the 'ggplot' style sheet to all figures and render them inline in the notebook.
plt.style.use('ggplot')
%matplotlib inline


from sklearn import decomposition
from sklearn import datasets

# Project root as reported by the local `__root__` helper module; Data.load_data
# joins the "data" folder onto this path.
PROJECT_ROOT = __root__.path()

# NOTE(review): catch_warnings() restores the previous warning filters when the
# `with` block exits, so the "ignore" filter installed here is only active inside
# this (otherwise empty) block and does not silence warnings in later cells.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

In [None]:
class Data:
    """Container for the train/test feature matrices and label vectors.

    All four attributes are ``None`` until ``load_data`` is called.
    """

    def __init__(self):
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

    def load_data(self, train_rows=None, test_rows=None, shfl=False, scl=True):
        """Read the space-delimited text files under ``<PROJECT_ROOT>/data``.

        Parameters
        ----------
        train_rows : int or None
            Keep only the first ``train_rows`` training rows (``None`` keeps all).
        test_rows : int or None
            Keep only the first ``test_rows`` test rows (``None`` keeps all).
        shfl : bool
            When true, shuffle the training rows and labels together.
        scl : bool
            When true, standardise the features. The scaler is fitted on the
            training rows only and then applied to both train and test, so
            both matrices live in the same feature space.
        """
        data_folder = os.path.join(PROJECT_ROOT, "data")

        def read_rows(relative_path, rows):
            # Load the whole file, then keep the leading `rows` rows.
            table = np.loadtxt(os.path.join(data_folder, relative_path),
                               delimiter=' ')
            return table[:rows]

        self.X_train = read_rows("Train/X_train.txt", train_rows)
        self.y_train = read_rows("Train/y_train.txt", train_rows)
        self.X_test = read_rows("Test/X_test.txt", test_rows)
        self.y_test = read_rows("Test/y_test.txt", test_rows)

        if scl:
            # The previous code called fit_transform() and threw the result
            # away, so no scaling ever happened. Keep the transformed arrays.
            scaler = StandardScaler()
            self.X_train = scaler.fit_transform(self.X_train)
            self.X_test = scaler.transform(self.X_test)
        if shfl:
            self.X_train, self.y_train = shuffle(self.X_train, self.y_train)


class Classifiers:
    """Fit, evaluate and compare a named collection of classifiers.

    Classifiers are registered under a name with ``add_classifier``; ``fit``,
    ``predict`` and ``get_scores`` then operate on every registered classifier
    using the train/test arrays copied from the ``data`` object.
    """

    def __init__(self, data):
        """Copy the four train/test arrays from ``data`` and reset all results."""
        self.X_train = data.X_train
        self.y_train = data.y_train
        self.X_test = data.X_test
        self.y_test = data.y_test
        self.y_predict = {}  # {k - name_of_classifier, v - [predictions]}
        self.cls = {}  # {k - name_of_classifier, v - sklearn_classifier_object}
        self.scores = {}  # {k - name_of_classifier, { k - score_metric,
        # v - score}}

    def add_classifier(self, classifier_name, classifier):
        """Register ``classifier`` under ``classifier_name`` (replaces any
        classifier already stored under that name)."""
        self.cls[classifier_name] = classifier

    def fit(self):
        """Fit every registered classifier on the training data and print the
        wall-clock time each fit took."""
        # .items() instead of .iteritems() so this runs on Python 2 and 3.
        for classifier_name, classifier in self.cls.items():
            t0 = time()
            classifier.fit(self.X_train, self.y_train)
            print("%s done in %0.3fs" % (classifier_name, time() - t0))

    def predict(self):
        """Store each classifier's predictions for ``X_test`` in ``y_predict``."""
        for classifier_name, classifier in self.cls.items():
            self.y_predict[classifier_name] = classifier.predict(self.X_test)

    def get_scores(self):
        """Compute the score dictionary of every registered classifier.

        ``predict`` must have been called first; otherwise looking up the
        stored predictions raises ``KeyError``.
        """
        for classifier_name, classifier in self.cls.items():
            self.__get_score(classifier_name, classifier)

    def __get_score(self, classifier_name, classifier):
        """Store accuracy and weighted precision/recall for one classifier."""
        y_pred = self.y_predict[classifier_name]
        self.scores[classifier_name] = {
            "accuracy": accuracy_score(self.y_test, y_pred),
            "precision": precision_score(self.y_test, y_pred,
                                         average='weighted'),
            "recall": recall_score(self.y_test, y_pred,
                                   average='weighted')
        }

    def print_scores(self):
        """Display the scores as a table (one column per classifier)."""
        df = pd.DataFrame(self.scores)
        display(df)

    def show_plt(self):
        """Draw a horizontal bar chart of accuracy per classifier, sorted by
        ascending accuracy."""
        if not self.scores:
            # Nothing to rank; unpacking an empty zip would raise ValueError.
            print("No scores to plot; call get_scores() first.")
            return
        # Sort by accuracy (ties broken by name), lowest first.
        ranked = sorted((metrics['accuracy'], name)
                        for name, metrics in self.scores.items())
        accuracy = [value for value, _ in ranked]
        cls_names = [name for _, name in ranked]
        positions = np.arange(len(cls_names))

        fig, ax = plt.subplots()
        fig.set_size_inches(7, 7)
        ax.barh(positions, accuracy, align='center', alpha=0.4)
        # Set tick positions and labels separately: passing the names as the
        # second positional argument of set_yticks() is interpreted as the
        # `minor` flag by older matplotlib releases, leaving bars unlabelled.
        ax.set_yticks(positions)
        ax.set_yticklabels(cls_names)
        ax.set_ylabel("Classifiers")
        ax.set_xlabel("Accuracy")
        ax.set_title("Accuracy vs. Classifiers")
        plt.show()


def cls_compare_no_shuff():
    """Compare all classifiers on the training data in its on-disk order.

    Loads the data without shuffling, registers eight classifiers (the three
    SVC kernels use ``class_weight='balanced'``), then fits, predicts, scores,
    prints the score table and draws the accuracy chart.
    """
    dataset = Data()
    dataset.load_data(shfl=False)
    bench = Classifiers(dataset)

    # LinearSVC minimizes the squared hinge loss while SVC minimizes the
    # regular hinge loss; LinearSVC uses One-vs-Rest while SVC uses the
    # One-vs-One multiclass reduction. Only SVC is used here.
    lineup = [
        ("decision-trees", DecisionTreeClassifier()),
        ("random-forest", RandomForestClassifier()),
        ("gaussian-naive-bayes", GaussianNB()),
        ("linear-discriminant-analysis", LinearDiscriminantAnalysis()),
        ("quadratic-discriminant-analysis", QuadraticDiscriminantAnalysis()),
        ("linear-support-vector-machine",
         svm.SVC(kernel='linear', class_weight='balanced')),
        ("poly-support-vector-machine",
         svm.SVC(kernel='poly', class_weight='balanced')),
        ("rbf-support-vector-machine",
         svm.SVC(kernel='rbf', class_weight='balanced')),
    ]
    for label, estimator in lineup:
        bench.add_classifier(label, estimator)

    bench.fit()
    bench.predict()
    bench.get_scores()
    bench.print_scores()
    bench.show_plt()


def cls_compare_shuff():
    """Compare all classifiers on shuffled training data.

    Loads the data with ``shfl=True``, registers eight classifiers (the SVC
    kernels use default class weights), then runs the fit / predict / score /
    report / plot sequence.
    """
    dataset = Data()
    dataset.load_data(shfl=True)
    bench = Classifiers(dataset)

    # LinearSVC minimizes the squared hinge loss while SVC minimizes the
    # regular hinge loss; LinearSVC uses One-vs-Rest while SVC uses the
    # One-vs-One multiclass reduction. Only SVC is used here.
    bench.add_classifier("decision-trees", DecisionTreeClassifier())
    bench.add_classifier("random-forest", RandomForestClassifier())
    bench.add_classifier("gaussian-naive-bayes", GaussianNB())
    bench.add_classifier("linear-discriminant-analysis",
                         LinearDiscriminantAnalysis())
    bench.add_classifier("quadratic-discriminant-analysis",
                         QuadraticDiscriminantAnalysis())
    bench.add_classifier("linear-support-vector-machine",
                         svm.SVC(kernel='linear'))
    bench.add_classifier("poly-support-vector-machine",
                         svm.SVC(kernel='poly'))
    bench.add_classifier("rbf-support-vector-machine",
                         svm.SVC(kernel='rbf'))

    bench.fit()
    bench.predict()
    bench.get_scores()
    bench.print_scores()
    bench.show_plt()


def svm_all_unweighted():
    """Compare the linear, poly and rbf SVC kernels with default class
    weights on shuffled training data, then report and plot the scores."""
    dataset = Data()
    dataset.load_data(shfl=True)
    runner = Classifiers(dataset)

    runner.add_classifier("linear-support-vector-machine",
                          svm.SVC(kernel='linear'))
    runner.add_classifier("poly-support-vector-machine",
                          svm.SVC(kernel='poly'))
    runner.add_classifier("rbf-support-vector-machine",
                          svm.SVC(kernel='rbf'))

    # Run the evaluation pipeline in order.
    for stage in ("fit", "predict", "get_scores", "print_scores", "show_plt"):
        getattr(runner, stage)()


def svm_all_weighted():
    """Compare the linear, poly and rbf SVC kernels with
    ``class_weight='balanced'`` on shuffled training data.

    The linear kernel additionally uses a fixed ``C`` of 1.00092594323.
    """
    dataset = Data()
    dataset.load_data(shfl=True)
    runner = Classifiers(dataset)

    weighted_svcs = (
        ("linear-support-vector-machine",
         svm.SVC(kernel='linear', C=1.00092594323, class_weight='balanced')),
        ("poly-support-vector-machine",
         svm.SVC(kernel='poly', class_weight='balanced')),
        ("rbf-support-vector-machine",
         svm.SVC(kernel='rbf', class_weight='balanced')),
    )
    for label, estimator in weighted_svcs:
        runner.add_classifier(label, estimator)

    runner.fit()
    runner.predict()
    runner.get_scores()
    runner.print_scores()
    runner.show_plt()


def svm_linear_grid_search():
    """Grid-search ``C`` for a balanced linear-kernel SVC on shuffled data.

    Prints the best estimator found by the search, then scores, tabulates and
    plots the result.
    """
    label = "linear-support-vector-machine"

    dataset = Data()
    dataset.load_data(shfl=True)
    runner = Classifiers(dataset)

    # Candidate penalties; the SVC default is C = 1.
    c_candidates = {'C': [1, 1e3, 5e3, 1e4, 5e4, 1e5]}
    base_svc = svm.SVC(kernel='linear', class_weight='balanced')
    runner.add_classifier(label, GridSearchCV(base_svc, c_candidates))

    runner.fit()
    print("Best estimator found by grid search:")
    print(runner.cls[label].best_estimator_)

    runner.predict()
    runner.get_scores()
    runner.print_scores()
    runner.show_plt()


def cls_compare_pca(pca_values=None):
    """Plot classifier accuracy against the number of PCA components.

    For every component count, PCA is fitted on the training features, both
    the training and test features are projected, and seven classifiers are
    trained and scored on the projected data. One accuracy curve per
    classifier is plotted at the end.

    Parameters
    ----------
    pca_values : sequence of int, optional
        Component counts to evaluate. Defaults to the original sweep
        ``[5, 10, 25, 30, 40, 50, 100, 150, 200, 250, 300, 561]``.
    """
    if pca_values is None:
        pca_values = [5, 10, 25, 30, 40, 50, 100, 150, 200, 250, 300, 561]
    data = Data()
    data.load_data(shfl=True)
    scores = {}  # {classifier name: [accuracy for each entry of pca_values]}
    for n_components in pca_values:
        print("Running PCA with %s components" % n_components)
        pca = PCA(n_components=n_components)
        # The previous code discarded the PCA output, so every iteration
        # trained on the full feature set. Project into a fresh container so
        # the loaded data is left untouched for the next component count.
        reduced = Data()
        reduced.X_train = pca.fit_transform(data.X_train)
        reduced.y_train = data.y_train
        reduced.X_test = pca.transform(data.X_test)
        reduced.y_test = data.y_test

        cls = Classifiers(reduced)
        dtc = DecisionTreeClassifier()
        gnb = GaussianNB()
        lda = LinearDiscriminantAnalysis()
        qda = QuadraticDiscriminantAnalysis()
        linear_svc = svm.SVC(kernel='linear', class_weight='balanced')
        poly_svc = svm.SVC(kernel='poly', class_weight='balanced')
        rbf_svc = svm.SVC(kernel='rbf', class_weight='balanced')
        cls.add_classifier("decision-trees", dtc)
        cls.add_classifier("gaussian-naive-bayes", gnb)
        cls.add_classifier("linear-discriminant-analysis", lda)
        cls.add_classifier("quadratic-discriminant-analysis", qda)
        cls.add_classifier("linear-support-vector-machine", linear_svc)
        cls.add_classifier("poly-support-vector-machine", poly_svc)
        cls.add_classifier("rbf-support-vector-machine", rbf_svc)
        cls.fit()
        cls.predict()
        cls.get_scores()
        cls.print_scores()
        # Distinct loop names: the old inner loop reused the outer `v`.
        for name, metrics in cls.scores.items():
            scores.setdefault(name, []).append(metrics["accuracy"])
    for name, accuracies in scores.items():
        plt.plot(pca_values, accuracies, label=name)
    plt.xlabel("PCA Values")
    plt.ylabel("Accuracy")
    plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=1, mode="expand", borderaxespad=0.)
    plt.show()

def svm_rbf_grid_search():
    """Grid-search ``C`` and ``gamma`` for a balanced rbf-kernel SVC on
    shuffled data.

    Prints the best estimator found by the search, then scores, tabulates and
    plots the result.
    """
    classifier_name = "rbf-support-vector-machine"

    data = Data()
    data.load_data(shfl=True)
    cls = Classifiers(data)

    # Default c = 1 , gamma = 1/no_of_features
    # 1.0 / n keeps the division floating-point on Python 2 as well; integer
    # division would have produced gamma = 0.
    n_features = data.X_train.shape[1]
    param_grid = {'C': [1, 1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [1.0 / n_features, 0.0001, 0.0005, 0.001,
                            0.005, 0.01, 0.1], }

    rbf_search = GridSearchCV(svm.SVC(kernel='rbf', class_weight='balanced'),
                              param_grid)
    cls.add_classifier(classifier_name, rbf_search)
    cls.fit()
    print("Best estimator found by grid search:")
    # Look the search up under the name it was registered with; the old code
    # used the linear-SVM key here and raised KeyError.
    print(cls.cls[classifier_name].best_estimator_)
    cls.predict()
    cls.get_scores()
    cls.print_scores()
    cls.show_plt()

In [None]:
cls_compare_no_shuff()

linear-discriminant-analysis done in 2.427s
poly-support-vector-machine done in 37.249s
gaussian-naive-bayes done in 0.177s
quadratic-discriminant-analysis done in 3.243s
random-forest done in 2.081s
decision-trees done in 8.801s
rbf-support-vector-machine done in 24.838s


In [None]:
cls_compare_shuff()

In [None]:
cls_compare_pca()