In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from os.path import join as p_join

import seaborn as sns

from tqdm import tqdm


##################################
## GLOBAL SETTINGS ###############
##################################
# Default size (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (12,8)

def seed_all(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG, then report the seed.

    Args:
        seed: Value passed to both ``random.seed`` and ``np.random.seed``
            (defaults to 42).

    Returns:
        None. The seed in use is printed to stdout.
    """
    # Same two generators, seeded in the same order: stdlib first, then NumPy.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    print("[ Using Seed : ", seed, " ]")

####################################
#####   SEED ALL EXPERIMENTS   #####
####################################
# Seed the global RNGs once, with the default seed (42), before any experiment cell runs.
seed_all()

[ Using Seed :  42  ]


In [2]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utils import utils

In [3]:
# Location of the raw data, relative to the notebook's working directory.
data_path = p_join('..', '..', 'data_2')
# Absolute paths of every entry in data_path whose name contains 'L=10.6'.
# os.listdir() returns names in arbitrary order, so the listing is sorted to keep the
# folder order (and everything built from it downstream) stable across runs and machines.
FOLDERS = [p_join(os.path.abspath(data_path), item)
           for item in sorted(os.listdir(data_path))
           if 'L=10.6' in item]

In [4]:
###############################
##### IMPORT ML METHODS   #####
###############################
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#################################
####### GLOBAL CONFIG ###########
#################################
# Settings shared by every model evaluation below; passed unchanged to the utils helpers.
# NOTE(review): presumably forwarded to scikit-learn's cross-validation routines
# (metrics to report, number of folds, number of parallel jobs) — confirm in utils.
CONFIG = dict(
    scoring=['accuracy', 'f1_macro'],
    cv=5,
    n_jobs=-1,
)

### Create dataset

In [5]:
# Build the dataset from the selected folders (presumably X = features, Y = labels —
# confirm in utils.create_dataset), then replace X with its normalized version.
# NOTE(review): normalization is applied to the whole dataset before cross-validation.
# If utils.normalize_data uses dataset-wide statistics, validation folds leak into
# training — confirm against utils.
X, Y = utils.create_dataset(FOLDERS)
X = utils.normalize_data(X)

100%|███████████████████████████████████████████| 16/16 [00:00<00:00, 19.88it/s]


### Try `KNeighborsClassifier` Grid Search

In [6]:
# k-NN grid: four neighbourhood sizes x two vote weightings (8 candidates).
params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
)
# Evaluate every candidate on (X, Y) with the shared CONFIG settings.
knn_grid_search_res = utils.greed_searc_cv(
    KNeighborsClassifier(), params, CONFIG, X, Y
)

### Try `LogisticRegression` Grid Search

In [7]:
# Logistic-regression grid, all with the 'saga' solver and max_iter=250. It is split into
# two sub-grids because `l1_ratio` only applies to the elastic-net penalty.
# NOTE(review): l1_ratio=0 and l1_ratio=1 are equivalent to the 'l2' and 'l1' penalties
# already covered by the first sub-grid, so those candidates are fitted twice.
params = [{'penalty': ['l1', 'l2'],
           'C': [1/i for i in range(1, 20, 3)],
           'solver': ['saga'],
           'max_iter': [250]},
          {'penalty': ['elasticnet'],
           'C': [1/i for i in range(1, 20, 3)],
           'solver': ['saga'],
           'l1_ratio': np.linspace(0, 1, 5),
           'max_iter': [250]}
         ]

# 'saga' shuffles the data, so with random_state=None the result depends on the global
# NumPy RNG state, i.e. on which cells ran before this one (and, if the helper forwards
# CONFIG's n_jobs=-1, possibly on worker processes that seed_all() never seeded —
# confirm in utils). Pin the estimator's own seed (same value as seed_all's default)
# so re-running this cell reproduces its result.
log_reg_grid_search_res = utils.greed_searc_cv(LogisticRegression(random_state=42),
                                               params,
                                               CONFIG,
                                               X, Y)

### Try `SVC` Grid Search

In [8]:
# SVC grid: linear vs. RBF kernel at two values of C (4 candidates).
params = dict(kernel=('linear', 'rbf'), C=[1, 10])
# Evaluate every candidate on (X, Y) with the shared CONFIG settings.
svc_grid_search_res = utils.greed_searc_cv(
    svm.SVC(), params, CONFIG, X, Y
)

### Try `GaussianNB` Grid Search

In [9]:
# Single-candidate grid: only var_smoothing=1e-9 is evaluated. The helper is still used
# so the result has the same structure as the other models' results.
params = dict(var_smoothing=[1e-9])
gnb_grid_search_res = utils.greed_searc_cv(
    GaussianNB(), params, CONFIG, X, Y
)

### Try `Decision Tree` Grid Search 

In [10]:
# Decision-tree grid: two split criteria x 18 maximum depths (36 candidates).
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]}

# Tree construction permutes features at each split, so ties between equally good splits
# are broken randomly. With random_state=None the fitted trees depend on the global RNG
# state (hence on cell execution order). Pin the estimator's own seed (same value as
# seed_all's default) so re-running this cell reproduces its result.
dt_grid_search_res = utils.greed_searc_cv(DecisionTreeClassifier(random_state=42),
                                          params,
                                          CONFIG,
                                          X, Y)

### Try `Random Forest` Grid Search

In [11]:
# Random-forest grid: 2 criteria x 3 forest sizes x 3 depths x 3 split thresholds
# (54 candidates).
params = {'criterion': ['gini', 'entropy'],
          'n_estimators': [10, 50, 100],
          'max_depth': [3, 5, 10],
          'min_samples_split': [2, 5, 10]}

# Bootstrapping and feature sub-sampling are random. With random_state=None the forest
# depends on the global RNG state (hence on cell execution order). Pin the estimator's
# own seed (same value as seed_all's default) so re-running this cell reproduces its result.
rf_grid_search_res = utils.greed_searc_cv(RandomForestClassifier(random_state=42),
                                          params,
                                          CONFIG,
                                          X, Y)

In [12]:
### Baseline: linear classifier trained with SGD (SGDClassifier's default loss is hinge,
### i.e. a linear SVM — not a linear regression, despite the variable name).
# SGD shuffles the training data every epoch, so the estimator's seed is pinned (same
# value as seed_all's default) to make the scores reproducible when the cell is re-run.
accuracies_lin_reg = utils.calc_ml_method(SGDClassifier(random_state=42), CONFIG, X, Y)
accuracies_lin_reg

{"['accuracy', 'f1_macro']": {'fit_time': array([1.36417985, 1.38120723, 1.59769154, 1.52987623, 1.32992411]),
  'score_time': array([0.00665188, 0.00638509, 0.00531602, 0.00560117, 0.00663424]),
  'test_accuracy': array([0.31343284, 0.30483707, 0.31710557, 0.33203094, 0.32041263]),
  'test_f1_macro': array([0.3089507 , 0.29651253, 0.30639866, 0.32732993, 0.3042673 ])}}

### Build a pandas DataFrame summarizing the best scores

In [13]:
# Every model's grid-search result, keyed by the label used as the summary-table row.
all_results = {'KNN': knn_grid_search_res, 'Logistic regression': log_reg_grid_search_res,
               'SVM': svc_grid_search_res, 'GaussianNB': gnb_grid_search_res,
               'Decision Tree': dt_grid_search_res, 'Ramdom Forest': rf_grid_search_res}

# For each model keep the mean cross-validated score of its best candidate, one value
# per metric. Each metric is looked up explicitly, in CONFIG['scoring'] order, because the
# summary table labels its columns positionally from CONFIG['scoring']. Relying on the
# iteration order of the cv_results keys could silently mislabel the columns.
# NOTE(review): assumes cv_results uses scikit-learn's multi-metric key naming
# ('mean_test_<metric>') — confirm in utils.greed_searc_cv.
data = {}
for method, res in all_results.items():
    bi = res['best_index']
    data[method] = [res['cv_results'][f'mean_test_{metric}'][bi]
                    for metric in CONFIG['scoring']]

In [14]:
# One row per model (the dict keys), one column per metric named in CONFIG['scoring'].
df = pd.DataFrame.from_dict(
    data,
    columns=CONFIG['scoring'],
    orient='index',
)
df

Unnamed: 0,accuracy,f1_macro
KNN,0.643036,0.643162
Logistic regression,0.350697,0.346904
SVM,0.716304,0.71618
GaussianNB,0.314,0.292555
Decision Tree,0.581614,0.581595
Ramdom Forest,0.570768,0.568279


In [15]:
# Persist the raw results as text for later inspection.
log_path = 'logs/all_res_ml_l_10_6.txt'
# open(..., 'w') does not create missing parent directories. Create 'logs/' first so the
# cell does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
# NOTE(review): str() of the results dict is human-readable but not machine-parseable;
# consider JSON or pickle if these logs need to be loaded back.
with open(log_path, 'w') as f:
    f.write(str(all_results))