# Preprocessing

In [20]:
# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeClassifier
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
# import other libraries
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [21]:
# load data
X = np.load('data/samples.npy')
y = np.load('data/labels.npy')

In [22]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Grid Search

### Decision Tree

In [38]:
# param grid
param_grid_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [2, 3],
    'max_leaf_nodes': list(range(1, 100)),
    'min_impurity_decrease': [0, 0.1],
    'max_features': [45, 50, 55]
}

In [39]:
grid_search_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_tree, verbose=1, cv=100, n_jobs=-1)

In [40]:
grid_search_tree.fit(X, y)

Fitting 100 folds for each of 9504 candidates, totalling 950400 fits


KeyboardInterrupt: 

In [26]:
print(grid_search_tree.best_estimator_)
print(grid_search_tree.best_score_)
print(grid_search_tree.best_params_)

DecisionTreeClassifier(criterion='entropy', max_depth=6, max_features=45,
                       max_leaf_nodes=16, min_impurity_decrease=0,
                       min_samples_leaf=2, random_state=42)
0.7754528985507245
{'criterion': 'entropy', 'max_depth': 6, 'max_features': 45, 'max_leaf_nodes': 16, 'min_impurity_decrease': 0, 'min_samples_leaf': 2}


### Logistic Regression

In [27]:
# param grid
param_grid_logreg = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['saga', 'sag', 'liblinear', 'lbfgs', 'newton-cg']
}

In [28]:
grid_search_logreg = GridSearchCV(LogisticRegression(random_state=42), param_grid_logreg, verbose=1, cv=100, n_jobs=-1)

In [29]:
grid_search_logreg.fit(X_train, y_train)

Fitting 100 folds for each of 140 candidates, totalling 14000 fits


 0.73355072 0.73355072 0.73355072 0.73355072        nan        nan
        nan        nan        nan 0.81139493 0.8240942         nan
 0.83268116 0.87432971 0.73355072        nan 0.73355072        nan
        nan 0.73355072 0.73355072 0.73355072 0.73355072 0.73355072
        nan        nan        nan        nan        nan 0.81139493
 0.8240942         nan 0.83268116 0.87432971 0.73355072        nan
 0.73355072        nan        nan 0.73438406 0.73438406 0.73565217
 0.73438406 0.73438406        nan        nan        nan        nan
        nan 0.81139493 0.8240942         nan 0.83268116 0.87432971
 0.79030797        nan 0.83              nan        nan 0.78061594
 0.78393116 0.78487319 0.78442029 0.7890942         nan        nan
        nan        nan        nan 0.81139493 0.8240942         nan
 0.83268116 0.87432971 0.80932971        nan 0.86099638        nan
        nan 0.80679348 0.81815217 0.84146739 0.82574275 0.84152174
        nan        nan        nan        nan        nan 0.8113

GridSearchCV(cv=100, estimator=LogisticRegression(random_state=42), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['saga', 'sag', 'liblinear', 'lbfgs',
                                    'newton-cg']},
             verbose=1)

In [30]:
print(grid_search_logreg.best_estimator_)
print(grid_search_logreg.best_score_)
print(grid_search_logreg.best_params_)

LogisticRegression(C=1000, random_state=42, solver='liblinear')
0.8785507246376812
{'C': 1000, 'penalty': 'l2', 'solver': 'liblinear'}


### Ridge Regression

In [31]:
# param grid
param_grid_ridge = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 2, 4, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [32]:
grid_search_ridge = GridSearchCV(RidgeClassifier(random_state=42), param_grid_ridge, verbose=1, cv=100, n_jobs=-1)

In [33]:
grid_search_ridge.fit(X_train, y_train)

Fitting 100 folds for each of 56 candidates, totalling 5600 fits


GridSearchCV(cv=100, estimator=RidgeClassifier(random_state=42), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 2, 4, 10],
                         'solver': ['auto', 'svd', 'cholesky', 'lsqr',
                                    'sparse_cg', 'sag', 'saga']},
             verbose=1)

In [34]:
print(grid_search_ridge.best_estimator_)
print(grid_search_ridge.best_score_)
print(grid_search_ridge.best_params_)

RidgeClassifier(alpha=0.001, random_state=42, solver='sparse_cg')
0.8764673913043479
{'alpha': 0.001, 'solver': 'sparse_cg'}


### SVM

In [35]:
# param grid
param_grid_svm = {
    'degree': [0, 1, 2, 3, 4, 5, 6],
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 2, 4, 10, 100],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 2, 4, 10, 100],
    'kernel': ['rbf', 'linear']
}

In [36]:
grid_search_svm = GridSearchCV(svm.SVC(), param_grid_svm, verbose=1, cv=100, n_jobs=-1)

In [37]:
grid_search_svm.fit(X_train, y_train)

Fitting 100 folds for each of 1134 candidates, totalling 113400 fits


KeyboardInterrupt: 

In [None]:
print(grid_search_svm.best_estimator_)
print(grid_search_svm.best_score_)
print(grid_search_svm.best_params_)

SVC(C=100, degree=0, gamma=0.0001, kernel='linear')
0.8713948450322254
{'C': 100, 'degree': 0, 'gamma': 0.0001, 'kernel': 'linear'}


### K-Nearest Neighbors

In [None]:
# param grid
param_grid_knearest = {
    'leaf_size': list(range(1, 50)),
    'n_neighbors': list(range(1, 30)),
    'p': [1, 2]
}

In [None]:
grid_search_knearest = GridSearchCV(neighbors.KNeighborsClassifier(), param_grid_knearest, verbose=1, cv=100, n_jobs=-1)

In [None]:
grid_search_knearest.fit(X_train, y_train)

Fitting 3 folds for each of 2842 candidates, totalling 8526 fits


GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29],
                         'p': [1, 2]},
             verbose=1)

In [None]:
print(grid_search_knearest.best_estimator_)
print(grid_search_knearest.best_score_)
print(grid_search_knearest.best_params_)

KNeighborsClassifier(leaf_size=1, n_neighbors=4, p=1)
0.780559472293804
{'leaf_size': 1, 'n_neighbors': 4, 'p': 1}


### Naive Bayes

In [None]:
# param grid
param_grid_naive = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

In [None]:
grid_search_naive = GridSearchCV(GaussianNB(), param_grid_naive, verbose=1, cv=100, n_jobs=-1)

In [None]:
grid_search_naive.fit(X_train, y_train)

Fitting 100 folds for each of 100 candidates, totalling 10000 fits


KeyboardInterrupt: 

In [None]:
print(grid_search_naive.best_estimator_)
print(grid_search_naive.best_score_)
print(grid_search_naive.best_params_)