In [4]:
import numpy as np
import pandas as pd
import os.path, matplotlib
import matplotlib.pyplot
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pickle
import time

In [5]:
def preprocess_data(data, center=True, test_size=0.5, random_state=None):
    '''
    Splits the data for testing and training and optionally standardizes it

    data: DataFrame with a 'label' column (the target) and a 'filename'
          column; both are removed from the feature matrix
    center: when True, the features are standardized with a StandardScaler
            that is fitted on the training rows only
    test_size: forwarded to train_test_split (default 0.5, the value that
               was previously hard-coded)
    random_state: forwarded to train_test_split; pass an int to get the
                  same split on every call

    returns x_train, x_test, y_train, y_test
    '''
    label = data['label']
    features = data.drop(['label', 'filename'], axis=1)

    # Split BEFORE scaling: fitting the scaler on every row would let the
    # mean/variance of the held-out rows leak into the training features.
    x_train, x_test, y_train, y_test = train_test_split(
        features, label, test_size=test_size, random_state=random_state)

    if center:
        scaler = StandardScaler().fit(x_train)
        x_train = scaler.transform(x_train)
        # The test rows reuse the statistics learned from the training rows.
        x_test = scaler.transform(x_test)
    return [x_train, x_test, y_train, y_test]

In [6]:
def gridsearch(classifier, params, x_train, y_train, name="Test_"):
    '''
    Uses GridSearchCV (10-fold CV, all cores) to tune hyperparameters and
    saves the fitted GridSearchCV object to a pickle

    classifier: estimator handed to GridSearchCV
    params: parameter grid handed to GridSearchCV
    x_train, y_train: data the search is fitted on
    name: prefix of the pickle file, written as "<name>GridSearch.sav"
          relative to the current working directory

    Prints the elapsed fitting time and returns the fitted GridSearchCV
    '''
    start_time = time.time()
    search = GridSearchCV(classifier, params, n_jobs=-1, cv=10)
    grid = search.fit(x_train, y_train)
    print("GridSearchCV elapsed time: {}".format(time.time() - start_time))

    # Saves GridSearch Result; the context manager guarantees the file is
    # flushed and closed even if pickling raises.
    filename = "{}GridSearch.sav".format(name)
    with open(filename, 'wb') as out_file:
        pickle.dump(grid, out_file)
    return grid

In [31]:
def rfc(data, center=True):
    '''
    Uses GridSearchCV to tune hyperparameters for RandomForestClassification
    Saves the grid results to a pickle (file prefix "RandomForest_")

    data: DataFrame handed to preprocess_data
    center: forwarded to preprocess_data to switch feature scaling on/off

    returns the fitted grid search object produced by gridsearch
    '''
    # Only the training half is used for the search; the test half is discarded.
    x_train, _, y_train, _ = preprocess_data(data, center=center)

    # Five evenly spaced integer values between 10 and 500 for both settings.
    n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=5)]
    max_depth = [int(x) for x in np.linspace(start=10, stop=500, num=5)]
    max_features = ['sqrt', 'log2', None]
    params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'max_features': max_features,
    }
    return gridsearch(RandomForestClassifier(), params, x_train, y_train, name="RandomForest_")

In [8]:
# Load the dataset from a path relative to the working directory; the
# preprocessing step reads its 'label' column and drops 'label' and 'filename'.
data = pd.read_csv('data/data.csv')

In [None]:
# Run the random-forest hyperparameter search and keep the returned grid object.
rfc_grid = rfc(data)

In [9]:
def non_linear_svm(data, center=True):
    '''
    Uses GridSearchCV to tune hyperparameters for a non-linear SVC
    Saves the grid results to a pickle (file prefix "NonLinearSVC_")

    data: DataFrame handed to preprocess_data
    center: forwarded to preprocess_data to switch feature scaling on/off

    returns the fitted grid search object produced by gridsearch
    '''
    # Only the training half is used for the search; the test half is discarded.
    x_train, _, y_train, _ = preprocess_data(data, center=center)

    params = {
        'C': np.logspace(-2, 4, 7),       # 1e-2 ... 1e4, one value per decade
        'gamma': np.logspace(-3, 3, 7),   # 1e-3 ... 1e3, one value per decade
        'kernel': ['poly', 'rbf', 'sigmoid'],
    }
    return gridsearch(SVC(), params, x_train, y_train, name="NonLinearSVC_")

In [11]:
# Run the non-linear SVM hyperparameter search and keep the returned grid object.
svm_grid = non_linear_svm(data)

GridSearchCV elapsed time: 4.168776988983154


In [22]:
# Cross-validation results of the SVM search; indexed by 'mean_test_score' below.
res = svm_grid.cv_results_

In [23]:
import matplotlib.pyplot as plt


In [24]:
# Display the mean test score recorded for each candidate of the SVM search.
res['mean_test_score']

array([0.112, 0.112, 0.112, 0.112, 0.112, 0.112, 0.414, 0.112, 0.208,
       0.576, 0.112, 0.224, 0.576, 0.112, 0.226, 0.576, 0.112, 0.228,
       0.576, 0.112, 0.228, 0.112, 0.112, 0.112, 0.112, 0.36 , 0.358,
       0.584, 0.198, 0.356, 0.576, 0.112, 0.308, 0.576, 0.112, 0.3  ,
       0.576, 0.112, 0.302, 0.576, 0.112, 0.308, 0.112, 0.362, 0.358,
       0.192, 0.528, 0.49 , 0.586, 0.61 , 0.282, 0.576, 0.132, 0.2  ,
       0.576, 0.128, 0.218, 0.576, 0.12 , 0.218, 0.576, 0.12 , 0.216,
       0.112, 0.542, 0.492, 0.414, 0.644, 0.584, 0.576, 0.64 , 0.266,
       0.576, 0.146, 0.228, 0.576, 0.128, 0.23 , 0.576, 0.12 , 0.24 ,
       0.576, 0.12 , 0.23 , 0.112, 0.614, 0.592, 0.584, 0.596, 0.568,
       0.576, 0.64 , 0.274, 0.576, 0.146, 0.234, 0.576, 0.128, 0.214,
       0.576, 0.12 , 0.236, 0.576, 0.12 , 0.23 , 0.192, 0.6  , 0.584,
       0.586, 0.6  , 0.526, 0.576, 0.64 , 0.264, 0.576, 0.146, 0.23 ,
       0.576, 0.128, 0.216, 0.576, 0.12 , 0.23 , 0.576, 0.12 , 0.23 ,
       0.414, 0.6  ,

In [26]:
# GridSearchCV evaluates candidates in ParameterGrid order: parameter names are
# sorted alphabetically and the last name varies fastest. Giving each sorted
# parameter its own axis therefore lines the scores up as (C, gamma, kernel)
# for the SVM search, instead of relying on undefined grid_param_* names.
param_grid = svm_grid.param_grid
score_shape = tuple(len(param_grid[key]) for key in sorted(param_grid))
np.asarray(res['mean_test_score']).reshape(score_shape)