# Notebook for ML methods

We use a simple grid-search approach to find the optimal hyperparameters for each of the following methods:
* KNeighborsClassifier
* LogisticRegression
* SVC
* GaussianNB
* Decision Tree
* Random Forest

A comparison table is created at the end of this notebook, and the full results are also written to a log file.

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from os.path import join as p_join

##################################
## GLOBAL SETTINGS ###############
##################################
# Default size of every matplotlib figure, given as (width, height) in inches.
plt.rcParams["figure.figsize"] = (12, 8)

def seed_all(seed=42):
    """Seed Python's `random` module and NumPy's global RNG, then report the seed.

    Args:
        seed: Value passed to both `random.seed` and `np.random.seed`
            (defaults to 42).
    """
    # Same order as before: the stdlib generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    print("[ Using Seed : ", seed, " ]")

####################################
#####   SEED ALL EXPERIMENTS   #####
####################################
# Seed the `random` module and NumPy's global RNG with the default seed (42)
# before any experiment runs.
seed_all()

In [None]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utils import utils

In [None]:
L_PARAMETR = 1.6         # Value of the L parameter; selects which data folders form the dataset
data_path = p_join('..', '..', 'data')
# Keep every entry of the data directory whose name contains 'L=<L_PARAMETR>'
# (a plain substring match) and store it as an absolute path.
# os.listdir() returns entries in arbitrary order, so the list is sorted to make
# the folder order -- and anything built from it -- identical on every run and machine.
FOLDERS = sorted(p_join(os.path.abspath(data_path), item)
                 for item in os.listdir(data_path) if f'L={L_PARAMETR}' in item)

In [None]:
###############################
##### IMPORT ML METHODS   #####
###############################
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#################################
####### GLOBAL CONFIG ###########
#################################
# Settings shared by every grid search in this notebook; the whole mapping is
# passed to utils.greed_searc_cv unchanged.
CONFIG = dict(
    scoring=['accuracy', 'f1_macro'],  # also used as the column names of the summary table
    cv=5,
    n_jobs=-1,
)

### Create dataset

In [None]:
# Build the dataset from the selected folders (presumably X = features, Y = labels;
# both are later passed to every grid search).
X, Y = utils.create_dataset(FOLDERS)
# NOTE(review): normalization is applied to the full dataset before cross-validation.
# If utils.normalize_data fits statistics across samples, information from the
# validation folds leaks into training -- confirm against the helper's implementation.
X = utils.normalize_data(X)

### Try `KNeighborsClassifier` Grid Search

In [None]:
# Four neighbourhood sizes crossed with both vote-weighting schemes.
params = dict(n_neighbors=[3, 5, 7, 9], weights=['uniform', 'distance'])
knn_grid_search_res = utils.greed_searc_cv(
    KNeighborsClassifier(), params, CONFIG, X, Y
)

### Try `LogisticRegression` Grid Search

In [None]:
# Inverse regularisation strengths shared by both sub-grids: 1, 1/4, 1/7, ..., 1/19.
log_reg_c_grid = [1/i for i in range(1, 20, 3)]

# Two sub-grids because `l1_ratio` is only meaningful for the elastic-net penalty.
# NOTE(review): per the scikit-learn docs l1_ratio=0 / l1_ratio=1 correspond to the
# pure l2 / l1 penalties, so the end points of the second grid repeat the first one.
params = [{'penalty': ['l1', 'l2'],
           'C': log_reg_c_grid,
           'solver': ['saga'],
           'max_iter': [2000]},
          {'penalty': ['elasticnet'],
           'C': log_reg_c_grid,
           'solver': ['saga'],
           'l1_ratio': np.linspace(0, 1, 5),
           'max_iter': [2000]}
          ]

# `saga` shuffles the data, so the estimator gets an explicit random_state
# (same value as the seed_all() default). Global seeding alone is not enough:
# the result would depend on how much randomness earlier cells consumed and on
# the RNG state inside parallel worker processes.
log_reg_grid_search_res = utils.greed_searc_cv(LogisticRegression(random_state=42),
                                               params,
                                               CONFIG,
                                               X, Y)

### Try `SVC` Grid Search (note: very long operation)

In [None]:
# Three kernels crossed with a wide sweep of the penalty strength C
# (36 configurations in total, hence the long runtime).
params = dict(
    kernel=('linear', 'rbf', 'poly'),
    C=[1, 3, 5, 10, 15, 20, 25, 30, 100, 200, 1000, 2000],
)
svc_grid_search_res = utils.greed_searc_cv(
    svm.SVC(), params, CONFIG, X, Y
)

### Try `GaussianNB` Grid Search

In [None]:
# The grid holds a single value, so exactly one configuration is evaluated;
# the search is run only to get scores in the same format as the other methods.
params = dict(var_smoothing=[1e-9])
gnb_grid_search_res = utils.greed_searc_cv(
    GaussianNB(), params, CONFIG, X, Y
)

### Try `Decision Tree` Grid Search 

In [None]:
# Both split criteria crossed with a range of depth limits, from shallow (4) to
# effectively unconstrained (150).
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]}
# Tree construction is randomised (feature permutation / tie breaking), so the
# estimator gets an explicit random_state (same value as the seed_all() default).
# Relying on the global NumPy seed would make the result depend on cell
# execution order and on the RNG state inside parallel worker processes.
dt_grid_search_res = utils.greed_searc_cv(DecisionTreeClassifier(random_state=42),
                                          params,
                                          CONFIG,
                                          X, Y)

### Try `Random Forest` Grid Search

In [None]:
# 2 criteria x 3 forest sizes x 3 depth limits x 3 split thresholds = 54 configurations.
params = {'criterion': ['gini', 'entropy'],
          'n_estimators': [10, 50, 100],
          'max_depth': [3, 5, 10],
          'min_samples_split': [2, 5, 10]}
# Bootstrapping and feature sub-sampling are random, so the estimator gets an
# explicit random_state (same value as the seed_all() default). Relying on the
# global NumPy seed would make the result depend on cell execution order and on
# the RNG state inside parallel worker processes.
rf_grid_search_res = utils.greed_searc_cv(RandomForestClassifier(random_state=42),
                                          params,
                                          CONFIG,
                                          X, Y)

### Summarize the results in a pandas DataFrame

In [None]:
# Display name of each method -> result of its grid search.
# These names become the row labels of the summary table and appear in the log file.
all_results = {'KNN': knn_grid_search_res,
               'Logistic regression': log_reg_grid_search_res,
               'SVM': svc_grid_search_res,
               'GaussianNB': gnb_grid_search_res,
               'Decision Tree': dt_grid_search_res,
               'Random Forest': rf_grid_search_res}

# For every method, collect the mean test score of each metric at the best
# parameter combination: every 'cv_results' entry whose key contains 'mean_test',
# read at position 'best_index'.
# NOTE(review): the values are later labelled with CONFIG['scoring'], which assumes
# the 'mean_test*' keys appear in the same order as the scoring list -- confirm.
data = {}
for method, res in all_results.items():
    best = res['best_index']
    data[method] = [res['cv_results'][key][best]
                    for key in res['cv_results']
                    if 'mean_test' in key]

In [None]:
# Summary table: one row per method, one column per scoring metric.
df = pd.DataFrame(list(data.values()),
                  index=list(data.keys()),
                  columns=CONFIG['scoring'])
df

In [None]:
# Create the log directory if it is missing. exist_ok=True replaces the
# check-then-create sequence, which can fail if the directory appears in between.
os.makedirs("logs", exist_ok=True)

# Dump the string representation of all grid-search results, one file per L value.
# The encoding is set explicitly so the file does not depend on the platform default.
with open(f'logs/all_res_ml_l_{L_PARAMETR}.txt', 'w', encoding='utf-8') as f:
    f.write(str(all_results))