In [111]:
import numpy as np
import pandas as pd
import os.path, matplotlib
import matplotlib.pyplot
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
import pickle
import time

In [2]:
def preprocess_data(data, center=True):
    '''
    centers and splits the data for testing and training
    returns x_train, x_test, y_train, y_test
    '''
    label = data.label
    df = data.drop(['label','filename'], axis=1)
    if center:
        scaler = StandardScaler()
        scaler.fit(df)
        df = scaler.transform(df)
    return train_test_split(df, label, test_size=0.5)

In [3]:
def gridsearch(classifier, params, x_train, y_train, name="Test_"):
    '''
    Uses GridSearchCV to tune hyperparameters and saves the GridSearchCV results
    Trains the classifier with the best parameters and scores the model
    '''
    start_time = time.time()
    clf = GridSearchCV(classifier, params, n_jobs=-1, cv=10)
    grid = clf.fit(x_train, y_train)
    print("GridSearchCV elapsed time: {}".format(time.time() - start_time))

    # best_params = grid.best_params_
    # best_score = grid.best_score_
    # print("{}GridSearch \nBest params: {} \nScore: {}".format(name, best_params, best_score))
    
    # Saves GridSearch Result
    filename = "{}GridSearch.sav".format(name)
    pickle.dump(grid, open(filename, 'wb'))
    return grid

In [107]:
def rfc(data, center=True):
    '''
    Uses GridSearchCV to tune hyperparameters for RandomForestClassification
    Saves the grid results to a pickle
    '''
    x_train, x_test, y_train, y_test = preprocess_data(data)
    n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 5)]
    max_depth = [int(x) for x in np.linspace(start = 10, stop = 500, num = 5)]
    max_features = ['sqrt','log2', None]
    params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'max_features': max_features,
    }
    grid = gridsearch(RandomForestClassifier(), params, x_train, y_train, name="RandomForest_")
    return grid, x_train, x_test, y_train, y_test

In [5]:
data = pd.read_csv('data/data.csv')

In [None]:
rfc_grid = rfc(data)

In [6]:
def non_linear_svm(data, center=True):
    x_train, x_test, y_train, y_test = preprocess_data(data)
    C = np.logspace(-2, 4, 7)
    gamma = np.logspace(-3, 3, 7)
    kernel = ['poly', 'rbf', 'sigmoid']
    params = {
        'C': C,
        'gamma': gamma,
        'kernel': kernel,
    }
    grid = gridsearch(SVC(), params, x_train, y_train, name="NonLinearSVC_")
    return grid, x_train, x_test, y_train, y_test

In [23]:
svm_grid, x_train, x_test, y_train, y_test = non_linear_svm(data)
svm_res = svm_grid.cv_results_

GridSearchCV elapsed time: 4.502078056335449


In [None]:
rfc_grid = pickle.load(open('RandomForest_GridSearch.sav', 'rb'))
svm_grid = pickle.load(open('NonLinearSVC_GridSearch.sav', 'rb'))
rfc_res = rfc_grid.cv_results_
svm_res = svm_grid.cv_results_

In [92]:
svm_grid.best_estimator_

SVC(C=1000.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [83]:
svm_clf = svm_grid.best_estimator_
scores = cross_val_score(svm_clf, x_test, y_test, cv=10)

In [93]:
# 9 degrees of freedome 95% two tailed CI
t = 2.262
mean = np.mean(scores)
se = np.std(scores)/10
ci = [mean - (t*se), mean + (t*se)]

In [94]:
print("95% Confidence Interval: [{}, {}]".format(ci[0], ci[1]))

95% Confidence Interval: [0.5745889293401935, 0.6134110706598065]


In [96]:
np.concatenate((scores, scores))

array([0.58, 0.62, 0.62, 0.64, 0.64, 0.42, 0.76, 0.6 , 0.5 , 0.56, 0.58,
       0.62, 0.62, 0.64, 0.64, 0.42, 0.76, 0.6 , 0.5 , 0.56])

In [100]:
len(scores)

10

In [102]:
def get_ci(scores):
    '''
    19 degrees of freedome 95% two tailed CI
    '''
    t = 2.093
    mean = np.mean(scores)
    se = np.std(scores)/len(scores)
    ci = [mean - (t*se), mean + (t*se)]
    print("95% Confidence Interval: [{}, {}]".format(ci[0], ci[1]))

In [108]:
x_train, x_test, y_train, y_test = preprocess_data(data)
grid = pickle.load(open('RandomForest0_GridSearch.sav', 'rb'))
res = grid.cv_results_
clf = grid.best_estimator_
scores = cross_val_score(clf, x_test, y_test, cv=10)

grid = pickle.load(open('RandomForest0_GridSearch.sav', 'rb'))
res = grid.cv_results_
clf = grid.best_estimator_
scores = np.concatenate((scores, cross_val_score(clf, x_train, y_train, cv=10)))
get_ci(scores)

95% Confidence Interval: [0.5962719059880272, 0.607728094011973]


In [110]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   filename            1000 non-null   object 
 1   tempo               1000 non-null   float64
 2   beats               1000 non-null   int64  
 3   chroma_stft         1000 non-null   float64
 4   rmse                1000 non-null   float64
 5   spectral_centroid   1000 non-null   float64
 6   spectral_bandwidth  1000 non-null   float64
 7   rolloff             1000 non-null   float64
 8   zero_crossing_rate  1000 non-null   float64
 9   mfcc1               1000 non-null   float64
 10  mfcc2               1000 non-null   float64
 11  mfcc3               1000 non-null   float64
 12  mfcc4               1000 non-null   float64
 13  mfcc5               1000 non-null   float64
 14  mfcc6               1000 non-null   float64
 15  mfcc7               1000 non-null   float64
 16  mfcc8  

In [125]:
x_train, x_test, y_train, y_test = preprocess_data(data)
mlp = MLPClassifier(hidden_layer_sizes=(10,10,10),learning_rate_init=0.001, max_iter=3000)
mlp.fit(x_train, y_train)
print("train")
print(mlp.score(x_train, y_train))
print("test")
print(mlp.score(x_test, y_test))

train
0.998
test
0.488


