# Preprocessing

In [3]:
# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
# import other libraries
import numpy as np
from sklearn.model_selection import cross_val_score
import csv

In [4]:
# load data
# Both arrays come from .npy dumps under data/: samples -> X, labels -> y.
X, y = (np.load(npy_path) for npy_path in ('data/samples.npy', 'data/labels.npy'))

In [5]:
# Path of the CSV file that the grid-search cells append their results to.
# NOTE: this cell only defines the path; the file itself is opened (in append
# mode) by each search cell, so re-running a search adds rows rather than
# replacing earlier ones.
OUTPUT_FILE = "data/grid_search/grid_search_results.csv"

# Grid Search

### Decision Tree

In [7]:
## Hyperparameter grid
# Each list holds the candidate values tried for the DecisionTreeClassifier
# keyword argument of the same name.
criterion = ["gini", "entropy"]
max_depth = [3, 4, 5]
max_features = [55, 56]
max_leaf_nodes = [*range(2, 30)]  # 2, 3, ..., 29
min_impurity_decrease = [0, 0.1]
min_samples_leaf = [2, 3]

# Collected in the same order the search cell nests its loops; this list is
# what gets written into the CSV header row.
hyperparameters = [
    criterion,
    max_depth,
    max_features,
    max_leaf_nodes,
    min_impurity_decrease,
    min_samples_leaf,
]

In [17]:
# Exhaustive sweep over the decision-tree grid. A combination is printed and
# appended to the results CSV only when it strictly beats the best mean
# cross-validated accuracy seen so far.
best_acc = -1
features = []

with open(OUTPUT_FILE, "a", newline='') as csvfile:
    csvWriter = csv.writer(csvfile)
    # Header row: model name, metric name, then the whole grid in a single cell.
    header = ["DecisionTreeClassifier", "Accuracy", hyperparameters]
    csvWriter.writerow(header)

    # Flattened form of six nested loops; iteration order is criterion
    # outermost ... min_samples_leaf innermost.
    for c, d, f, l, i, ml in (
        (c, d, f, l, i, ml)
        for c in criterion
        for d in max_depth
        for f in max_features
        for l in max_leaf_nodes
        for i in min_impurity_decrease
        for ml in min_samples_leaf
    ):
        model = DecisionTreeClassifier(
            criterion=c,
            max_depth=d,
            max_features=f,
            max_leaf_nodes=l,
            min_impurity_decrease=i,
            min_samples_leaf=ml,
        )
        scores = cross_val_score(model, X, y, cv=100, n_jobs=-1)  # 100-fold cross-validation
        avg_accuracy = sum(scores) / len(scores)  # evaluation metric

        # Written as `not a > b` (rather than `a <= b`) so a NaN average is
        # also treated as "no improvement".
        if not avg_accuracy > best_acc:
            continue

        # update best accuracy score
        best_acc = avg_accuracy
        features = [c, d, f, l, i, ml]
        print(best_acc, features)
        row = ["DecisionTreeClassifier", best_acc, features]
        csvWriter.writerow(row)

0.7337931034482759 ['gini', 3, 55, 2, 0, 2]
0.7441494252873566 ['gini', 3, 55, 3, 0, 2]
0.7527126436781613 ['gini', 3, 55, 4, 0, 2]
0.7535172413793105 ['gini', 3, 55, 5, 0, 3]
0.7541379310344829 ['gini', 3, 55, 6, 0, 2]
0.7555172413793106 ['gini', 3, 55, 8, 0, 2]
0.7564137931034487 ['gini', 3, 55, 14, 0, 2]
0.7571379310344831 ['gini', 3, 55, 15, 0, 2]
0.7574367816091958 ['gini', 3, 55, 15, 0, 3]
0.7596551724137933 ['gini', 3, 55, 16, 0, 3]
0.7674252873563222 ['gini', 3, 55, 19, 0, 3]
0.7743218390804598 ['entropy', 4, 55, 28, 0, 2]


### Logistic Regression

In [30]:
## Hyperparameter grid
# Candidate values for the LogisticRegression arguments C, penalty and solver.
C = [
    0.001, 0.01, 0.1,
    1, 10, 100, 1000,
]
penalty = ["l1", "l2", "elasticnet", "none"]
solver = ["saga", "sag", "liblinear", "lbfgs", "newton-cg"]

# Written to the CSV header in this order: C, penalty, solver.
hyperparameters = [C, penalty, solver]

In [31]:
## Initialize model, variables
# Grid search over LogisticRegression settings. A combination is printed and
# appended to the results CSV only when it strictly beats the best mean
# cross-validated accuracy seen so far.
best_acc = -1
features = []

# Penalties each solver can fit, per the scikit-learn LogisticRegression docs.
# Both spellings of "no penalty" ('none' and None) are listed so the table
# works whichever one the grid uses. Solvers missing from the table are not
# filtered at all.
_LR_SOLVER_PENALTIES = {
    'saga': {'l1', 'l2', 'elasticnet', 'none', None},
    'sag': {'l2', 'none', None},
    'liblinear': {'l1', 'l2'},
    'lbfgs': {'l2', 'none', None},
    'newton-cg': {'l2', 'none', None},
}
# The elastic-net penalty cannot be fitted without an explicit l1_ratio;
# 0.5 is an even L1/L2 mix.
_LR_ELASTICNET_L1_RATIO = 0.5

with open(OUTPUT_FILE, "a", newline='') as csvfile:
    csvWriter = csv.writer(csvfile)
    header = ["Logistic Regression", "Accuracy"]
    header.append(hyperparameters)  # the whole grid is stored in one CSV cell
    csvWriter.writerow(header)

    for c in C:
        for p in penalty:
            for s in solver:
                # Unsupported solver/penalty pairs fail in every fold, so
                # cross-validating them only yields NaN scores (or an
                # "all fits failed" error on recent scikit-learn releases).
                supported = _LR_SOLVER_PENALTIES.get(s)
                if supported is not None and p not in supported:
                    continue

                # l1_ratio is passed only for elastic-net; other penalties
                # do not use it.
                extra_params = {'l1_ratio': _LR_ELASTICNET_L1_RATIO} if p == 'elasticnet' else {}
                # NOTE(review): per the scikit-learn docs C has no effect when
                # the penalty is disabled, so those rows are expected to be
                # independent of C - confirm before interpreting them.
                model = LogisticRegression(C=c, penalty=p, solver=s, **extra_params)
                scores = cross_val_score(model, X, y, cv=100, n_jobs=-1) # 100-fold cross-validation
                avg_accuracy = sum(scores) / len(scores) # evaluation metric

                # update best accuracy score
                if avg_accuracy > best_acc:
                    best_acc = avg_accuracy
                    features = [c, p, s]
                    print(best_acc, features)
                    row = ["Logistic Regression", best_acc]
                    row.append(features)
                    csvWriter.writerow(row)

0.7337931034482759 [0.001, 'l1', 'saga']
0.8120459770114944 [0.001, 'none', 'saga']
0.829977011494253 [0.001, 'none', 'sag']
0.8357701149425288 [0.001, 'none', 'lbfgs']
0.8833333333333333 [0.001, 'none', 'newton-cg']


KeyboardInterrupt: 

### Ridge Regression

In [32]:
## Hyperparameter grid
# Candidate values for the RidgeClassifier arguments alpha and solver.
alpha = [
    0.0001, 0.001, 0.01, 0.1,
    1, 2, 4, 10,
]
solver = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

# Written to the CSV header in this order: alpha, solver.
hyperparameters = [alpha, solver]

In [33]:
## Initialize model, variables
# Sweep of RidgeClassifier over every (alpha, solver) pair. Only strict
# improvements in mean cross-validated accuracy are printed and appended to
# the results CSV.
best_acc = -1
features = []
with open(OUTPUT_FILE, "a", newline='') as csvfile:
    csvWriter = csv.writer(csvfile)
    # Header row: model label, metric name, then the grid itself in one cell.
    header = ["Ridge Regression", "Accuracy", hyperparameters]
    csvWriter.writerow(header)

    # alpha varies slowest, solver fastest.
    for a, s in ((a, s) for a in alpha for s in solver):
        model = RidgeClassifier(alpha=a, solver=s)
        scores = cross_val_score(model, X, y, cv=100, n_jobs=-1)  # 100-fold cross-validation
        avg_accuracy = sum(scores) / len(scores)  # evaluation metric

        # `not a > b` keeps NaN averages on the "no improvement" side.
        if not avg_accuracy > best_acc:
            continue

        # update best accuracy score
        best_acc = avg_accuracy
        features = [a, s]
        print(best_acc, features)
        row = ["Ridge Regression", best_acc, features]
        csvWriter.writerow(row)

0.8796091954022991 [0.0001, 'auto']
0.8799425287356324 [0.001, 'sparse_cg']
0.881264367816092 [0.01, 'auto']
0.8815977011494255 [0.01, 'sparse_cg']


### SVM

In [None]:
## Hyperparameter grid
# Candidate values for the svm.SVC arguments degree, C, gamma and kernel.
degree = [*range(7)]  # 0, 1, ..., 6
C = [0.0001, 100]
gamma = [
    0.0001, 0.001, 0.01, 0.1,
    1, 2, 4, 10, 100,
]
kernel = ["rbf", "linear"]

# Written to the CSV header in this order: degree, C, gamma, kernel.
hyperparameters = [degree, C, gamma, kernel]

In [None]:
# Grid search over svm.SVC settings. A combination is printed and appended to
# the results CSV only when it strictly beats the best mean cross-validated
# accuracy seen so far.
best_acc = -1
features = []

with open(OUTPUT_FILE, "a", newline='') as csvfile:
    csvWriter = csv.writer(csvfile)
    header = ["SVM", "Accuracy"]
    header.append(hyperparameters)  # the whole grid is stored in one CSV cell
    csvWriter.writerow(header)

    for d in degree:
        for c in C:
            for g in gamma:
                for k in kernel:
                    # Per the scikit-learn SVC docs, `degree` is only used by
                    # the 'poly' kernel and the 'linear' kernel does not use
                    # `gamma`. Varying an ignored parameter refits the same
                    # model and reproduces a score already seen, which can
                    # never pass the strict `>` check below. Evaluate such a
                    # kernel only for the first value of the ignored parameter.
                    if k != 'poly' and d != degree[0]:
                        continue
                    if k == 'linear' and g != gamma[0]:
                        continue

                    model = svm.SVC(degree=d, C=c, gamma=g, kernel=k)
                    scores = cross_val_score(model, X, y, cv=100, n_jobs=-1) # 100-fold cross-validation
                    avg_accuracy = sum(scores) / len(scores) # evaluation metric

                    # update best accuracy score
                    if avg_accuracy > best_acc:
                        best_acc = avg_accuracy
                        features = [d, c, g, k]
                        print(best_acc, features)
                        row = ["SVM", best_acc]
                        row.append(features)
                        csvWriter.writerow(row)

0.7337931034482759 [0, 0.0001, 0.0001, 'rbf']
0.8799770114942531 [0, 100, 0.0001, 'linear']


### K-Nearest Neighbors

In [1]:
## Hyperparameter grid
# Candidate values for the KNeighborsClassifier arguments leaf_size,
# n_neighbors and p.
leaf_size = [*range(1, 10)]      # 1, 2, ..., 9
n_neighbors = [*range(20, 40)]   # 20, 21, ..., 39
p = [1, 2, 3]

# Written to the CSV header in this order: leaf_size, n_neighbors, p.
hyperparameters = [leaf_size, n_neighbors, p]

In [6]:
# Sweep of KNeighborsClassifier over every (leaf_size, n_neighbors, p)
# combination. Only strict improvements in mean cross-validated accuracy are
# printed and appended to the results CSV.
best_acc = -1
features = []

with open(OUTPUT_FILE, "a", newline='') as csvfile:
    csvWriter = csv.writer(csvfile)
    # Header row: model label, metric name, then the grid itself in one cell.
    header = ["K-Nearest Neighbors", "Accuracy", hyperparameters]
    csvWriter.writerow(header)

    # leaf_size varies slowest, p fastest. The loop target is `p_iter` so the
    # grid list `p` is not overwritten.
    for ls, n, p_iter in (
        (ls, n, p_iter)
        for ls in leaf_size
        for n in n_neighbors
        for p_iter in p
    ):
        model = neighbors.KNeighborsClassifier(leaf_size=ls, n_neighbors=n, p=p_iter)
        scores = cross_val_score(model, X, y, cv=100, n_jobs=-1)  # 100-fold cross-validation
        avg_accuracy = sum(scores) / len(scores)  # evaluation metric

        # `not a > b` keeps NaN averages on the "no improvement" side.
        if not avg_accuracy > best_acc:
            continue

        # update best accuracy score
        best_acc = avg_accuracy
        features = [ls, n, p_iter]
        print(best_acc, features)
        row = ["K-Nearest Neighbors", best_acc, features]
        csvWriter.writerow(row)

### Naive Bayes

In [5]:
## Hyperparameter grid
# 100 log-spaced candidates for GaussianNB's var_smoothing, running from
# 10**0 down to 10**-9.
var_smoothing = np.logspace(start=0, stop=-9, num=100)

# Single-entry grid, written to the CSV header.
hyperparameters = [var_smoothing]

In [6]:
# Sweep of GaussianNB over the var_smoothing candidates. Only strict
# improvements in mean cross-validated accuracy are printed and appended to
# the results CSV.
best_acc = -1
features = []

with open(OUTPUT_FILE, "a", newline='') as csvfile:
    csvWriter = csv.writer(csvfile)
    # Header row: model label, metric name, then the grid itself in one cell.
    header = ["Naive Bayes", "Accuracy", hyperparameters]
    csvWriter.writerow(header)

    for vs in var_smoothing:
        model = GaussianNB(var_smoothing=vs)
        scores = cross_val_score(model, X, y, cv=100, n_jobs=-1)  # 100-fold cross-validation
        avg_accuracy = sum(scores) / len(scores)  # evaluation metric

        # `not a > b` keeps NaN averages on the "no improvement" side.
        if not avg_accuracy > best_acc:
            continue

        # update best accuracy score
        best_acc = avg_accuracy
        features = [vs]
        print(best_acc, features)
        row = ["Naive Bayes", best_acc, features]
        csvWriter.writerow(row)

0.7337931034482759 [1.0]
