In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from os.path import join as p_join

import seaborn as sns

from tqdm import tqdm


##################################
## GLOBAL SETTINGS ###############
##################################
# Default size applied to matplotlib figures created in this notebook.
plt.rcParams.update({"figure.figsize": (12, 8)})

def seed_all(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG, then report the seed.

    Args:
        seed: Value passed to both ``random.seed`` and ``np.random.seed``.
            Defaults to 42.

    Returns:
        None. The seed in use is printed to stdout.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    print("[ Using Seed : ", seed, " ]")

####################################
#####   SEED ALL EXPERIMENTS   #####
####################################
# Called with the default seed (42) so both global RNGs are fixed before the cells below run.
seed_all()

[ Using Seed :  42  ]


In [2]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utils import utils

In [3]:
# Dataset root, resolved relative to the current working directory.
data_path = p_join('..', '..', 'data_2')
# Absolute paths of the entries whose name contains 'L=1.6'.
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# give FOLDERS (and whatever is built from it) the same order on every run.
FOLDERS = [p_join(os.path.abspath(data_path), item)
           for item in sorted(os.listdir(data_path))
           if 'L=1.6' in item]

In [4]:
###############################
##### IMPORT ML METHODS   #####
###############################
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#################################
####### GLOBAL CONFIG ###########
#################################
# Settings handed to the utils model-selection helpers:
# two scoring metrics, cv=5 and n_jobs=-1.
CONFIG = dict(scoring=['accuracy', 'f1_macro'], cv=5, n_jobs=-1)

### Create dataset

In [5]:
# Build X and Y from the selected folders
# (presumably features and labels -- confirm against utils.create_dataset).
X, Y = utils.create_dataset(FOLDERS)
# NOTE(review): normalisation is applied to the full dataset before any
# cross-validation split; if normalize_data fits statistics on X, the
# validation folds leak into them -- confirm against utils.normalize_data.
X = utils.normalize_data(X)

100%|███████████████████████████████████████████| 16/16 [00:00<00:00, 22.06it/s]


### Try `KNeighborsClassifier` Grid Search

In [None]:
# k-NN grid: odd neighbour counts 3..9 crossed with both weighting schemes.
params = {
    'n_neighbors': list(range(3, 10, 2)),
    'weights': ['uniform', 'distance'],
}
knn_grid_search_res = utils.greed_searc_cv(
    KNeighborsClassifier(), params, CONFIG, X, Y
)

### Try `LogisticRegression` Grid Search

In [None]:
# NOTE(review): this rebinds the global CONFIG. In top-to-bottom execution every
# cell from here on sees `scoring` as the single string 'accuracy' instead of
# the two-metric list defined earlier.
CONFIG = dict(scoring='accuracy', cv=5, n_jobs=-1)

In [None]:
# Two sub-grids: `l1_ratio` is only searched together with the elastic-net
# penalty. Both use the 'saga' solver with max_iter=250, and C takes the
# values 1/1, 1/4, 1/7, ..., 1/19.
# NOTE(review): l1_ratio=0 and l1_ratio=1 presumably repeat the pure l2 / l1
# candidates of the first sub-grid -- confirm and trim if so.
params = [{'penalty': ['l1', 'l2'],
           'C': [1/i for i in range(1, 20, 3)],
           'solver': ['saga'],
           'max_iter': [250]},
          {'penalty': ['elasticnet'],
           'C': [1/i for i in range(1, 20, 3)],
           'solver': ['saga'],
           'l1_ratio': np.linspace(0, 1, 5),
           'max_iter': [250]}
         ]

# 'saga' is a stochastic solver. random_state is pinned (same value as the
# seed_all() default) so the fit does not depend on global RNG state or on
# which worker process runs it.
log_reg_grid_search_res = utils.greed_searc_cv(LogisticRegression(random_state=42),
                                               params,
                                               CONFIG,
                                               X, Y)

In [None]:
# Display the object returned by the logistic-regression grid search.
log_reg_grid_search_res

### Try `SVC` Grid Search

In [None]:
# SVM grid: linear vs. RBF kernel, each at two values of C.
params = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10],
}
svc_grid_search_res = utils.greed_searc_cv(svm.SVC(), params, CONFIG, X, Y)

### Try `GNB` Grid Search

In [None]:
# Single-point grid: only var_smoothing=1e-9 is evaluated.
params = dict(var_smoothing=[1e-9])
gnb_grid_search_res = utils.greed_searc_cv(GaussianNB(), params, CONFIG, X, Y)

### Try `Decision Tree` Grid Search 

In [59]:
# Decision-tree grid: both split criteria crossed with depths from 4 to 150.
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]}
# Tree construction draws random numbers. random_state is pinned (same value
# as the seed_all() default) so repeated runs give the same scores regardless
# of global RNG state or which worker process runs the fit.
dt_grid_search_res = utils.greed_searc_cv(DecisionTreeClassifier(random_state=42),
                                          params,
                                          CONFIG,
                                          X, Y)

### Try `Random Forest` Grid Search

In [None]:
# Random-forest grid: criterion x number of trees x depth x min_samples_split.
params = {'criterion': ['gini', 'entropy'],
          'n_estimators': [10, 50, 100],
          'max_depth': [3, 5, 10],
          'min_samples_split': [2, 5, 10]}
# A random forest is stochastic by construction. random_state is pinned (same
# value as the seed_all() default) so repeated runs give the same scores
# regardless of global RNG state or which worker process runs the fit.
rf_grid_search_res = utils.greed_searc_cv(RandomForestClassifier(random_state=42),
                                          params,
                                          CONFIG,
                                          X, Y)

In [40]:
### Baseline: a linear classifier trained with SGD, default hyper-parameters (no grid search)
# SGD training is stochastic. random_state is pinned (same value as the
# seed_all() default) so the scores are reproducible.
accuracies_lin_reg = utils.calc_ml_method(SGDClassifier(random_state=42), CONFIG, X, Y)
accuracies_lin_reg

{"['accuracy', 'f1_micro']": {'fit_time': array([1.03328466, 0.92777061, 0.92106748, 1.01769853, 0.95516992]),
  'score_time': array([0.00574589, 0.00654411, 0.00691605, 0.00567198, 0.00566912]),
  'test_accuracy': array([0.49347503, 0.49605376, 0.49386575, 0.49581933, 0.49695217]),
  'test_f1_micro': array([0.49347503, 0.49605376, 0.49386575, 0.49581933, 0.49695217])}}

### Plot Pandas DataFrame with summary

In [60]:
# Grid-search results to summarise. Only the searches listed here are included;
# add further *_grid_search_res objects under a short model label as needed.
all_results = {'KNN': knn_grid_search_res,
               'DT': dt_grid_search_res}

# For every model, take each 'mean_test_<metric>' entry at the best index.
# Column names come from the result keys themselves, not from CONFIG['scoring'],
# so the table stays correct if CONFIG is rebound between searches or if the
# models were scored with different metric sets (missing metrics become NaN).
prefix = 'mean_test_'
data = {}
for method, res in all_results.items():
    bi = res['best_index']
    best_scores = {}
    for kk, values in res['cv_results'].items():
        if kk.startswith(prefix):
            print(kk)
            best_scores[kk[len(prefix):]] = values[bi]
    data[method] = best_scores
    print(list(best_scores.values()))

# One record per model; the explicit index keeps rows in insertion order.
df = pd.DataFrame(list(data.values()), index=list(data.keys()))

mean_test_accuracy
mean_test_f1_macro
[0.4984214916051631, 0.49838445073286647]
mean_test_accuracy
mean_test_f1_macro
[0.5019849073723247, 0.49847959992454527]


In [61]:
# Rows: models; columns: each mean_test_* value taken at the search's best index.
df

Unnamed: 0,accuracy,f1_macro
KNN,0.498421,0.498384
DT,0.501985,0.49848
