# Preprocessing

In [20]:
# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
# import other libraries
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# load data
X = np.load('data/samples.npy')
y = np.load('data/labels.npy')

In [16]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Create models

In [4]:
## Initialize models
tree = DecisionTreeClassifier()
logreg = LogisticRegression(max_iter=1000)
# tktk lasso (TODO)
# tktk polyreg (TODO)
ridge = RidgeClassifier()
knearest = neighbors.KNeighborsClassifier(n_jobs=-1)
svm_model = svm.SVC()
naive = GaussianNB()

In [5]:
# models dictionary
# add lasso, polyreg to dictionary (TODO)
models = {
    "Decision Tree": tree,
    "Logistic Regression": logreg,
    "Ridge Regression": ridge,
    "K-Nearest Neighbors": knearest,
    "SVM": svm_model,
    "Naive Bayes": naive
}

# Initial evaluation of models, default parameters

## Average accuracy

In [21]:
# avg_accuracy dictionary
avg_accuracy = {}
avg_accuracy["Multilayer Perceptron"] = 0.92 # mlp.py

In [23]:
for name, m in models.items():
  scores = cross_val_score(m, X, y, cv=100, n_jobs=-1) # 100-fold cross-validation
  avg_accuracy[name] = sum(scores) / len(scores)

In [24]:
# display avg_accuracy
for i in sorted(avg_accuracy, key=avg_accuracy.get, reverse=True):
  print("%-30s%-20s" % (i, "{:.2f}".format(avg_accuracy[i]*100)))

Multilayer Perceptron         92.00               
Ridge Regression              82.73               
Logistic Regression           79.29               
SVM                           73.38               
Decision Tree                 71.84               
K-Nearest Neighbors           70.38               
Naive Bayes                   53.64               


# Grid Search

## Decision Tree

In [36]:
tree2 = DecisionTreeClassifier(min_impurity_decrease=0.2)
# param grid
param_grid_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4],
    'min_samples_leaf': [2, 3, 4, 5],
    'max_samples_leaf': list(range(2, 100)),
    'min_impurity_decrease': [0, 0.1, 0.2, 0.4, 0.5],
    'max_features': [20, 50, 70, 90, 110]
}

In [37]:
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_tree, verbose=1, cv=3)

In [38]:
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 28800 candidates, totalling 86400 fits


ValueError: Invalid parameter max_samples_leaf for estimator DecisionTreeClassifier(max_depth=2, max_features=20, random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.

In [25]:
for x in min_impurity:
    tree2 = DecisionTreeClassifier(min_impurity_decrease=x)
    scores = cross_val_score(tree2, X, y, cv=100, n_jobs=-1) # 100-fold cross-validation
    avg_accuracy2 = sum(scores) / len(scores)
    print(x, avg_accuracy2)

0 0.7230919540229885
0.1 0.7337931034482759
0.2 0.7337931034482759
0.4 0.7337931034482759
0.5 0.7337931034482759
0.6 0.7337931034482759
0.7 0.7337931034482759
0.8 0.7337931034482759
2 0.7337931034482759


In [19]:
# create param_grids for each model
## decision tree

## logistic regression
param_grid_logreg = {
    
}
## ridge regression
param_grid_ridge = {
    
}
## k-nearest neighbors
param_grid_knearest = {
    
}
## svm
param_grid_svm_model = {
    
}
## naive bayes
param_grid_naive = {
    
}

NameError: name 'criterion' is not defined

# Final evaluation of models, fine-tuned hyperparameters

## Average accuracy

In [None]:
# avg_accuracy dictionary
avg_accuracy = {}
avg_accuracy["Multilayer Perceptron"] = 0.92 # mlp.py

In [None]:
for name, m in models.items():
  scores = cross_val_score(m, X, y, cv=100, n_jobs=-1) # 100-fold cross-validation
  avg_accuracy[name] = sum(scores) / len(scores)

In [None]:
# display avg_accuracy
for i in sorted(avg_accuracy, key=avg_accuracy.get, reverse=True):
  print("%-30s%-20s" % (i, "{:.2f}".format(avg_accuracy[i]*100)))

Multilayer Perceptron         92.00               
Ridge Regression              82.73               
Logistic Regression           79.29               
SVM                           73.38               
Decision Tree                 71.84               
K-Nearest Neighbors           70.38               
Naive Bayes                   53.64               
