# Classification with classic Machine Learning

### Load training and evaluation sets

In [1]:
from pathlib import Path
import pickle
import numpy as np

base_dir = "./features/"


def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only point this at files produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Training features and labels.
X = _load_pickle(f"{base_dir}X_train.pkl")
y = _load_pickle(f"{base_dir}y_train.pkl")

# Validation features and labels.
X_val = _load_pickle(f"{base_dir}X_val.pkl")
y_val = _load_pickle(f"{base_dir}y_val.pkl")

### Load labels

In [2]:
# Restore the pickled label encoder stored next to the features
# (presumably a fitted MultiLabelBinarizer, judging by the file name — confirm).
mlb_path = f"{base_dir}multiLabelBinarizer.pkl"
with open(mlb_path, "rb") as mlb_file:
    mlb = pickle.load(mlb_file)

### Test different models

In [3]:
def _to_class_index(label_rows):
    """Return an array holding, for every row, the position of its largest entry.

    Reduces each label row to a single class index for testing.
    NOTE(review): if a row can have several active labels, ``argmax`` keeps
    only one of them — confirm that is acceptable here.
    """
    return np.array([row.argmax() for row in label_rows])


# Reduce to one dimension for testing
y_red = _to_class_index(y)
y_val_red = _to_class_index(y_val)

In [4]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import ComplementNB # Dice que es bueno para sets no balanceados
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier # Especiales multiclass
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold

# Since the data are not well balanced, we evaluate with a StratifiedKFold
# (even though we already have a 1.0).
models = {
    'SGDClassifier': SGDClassifier(),
    'SGDCStandarized': Pipeline([
        ('standarize', StandardScaler()),
        ('sgd', SGDClassifier()),
    ]),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'LrStandarized': Pipeline([
        ('standarize', StandardScaler()),
        ('lr', LogisticRegression(max_iter=5000)),
    ]),
    'RandomForestClassifier': RandomForestClassifier(),
    'ComplementNB': ComplementNB(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(),
    'SVC': SVC(),
}

# For every candidate model, run a 5-fold stratified cross-validation on the
# training set and print one line of metrics per fold.
for model_tag, model in models.items():
    print(f"************{model_tag}************")
    skf = StratifiedKFold(n_splits=5)

    for fold_train, fold_test in skf.split(X, y_red):
        X_train, y_train = X[fold_train], y_red[fold_train]
        X_test, y_test = X[fold_test], y_red[fold_test]

        # Refit the same estimator object on this fold, then score the held-out part.
        y_pred = model.fit(X_train, y_train).predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        bal_accuracy = balanced_accuracy_score(y_test, y_pred)
        # zero_division=0: classes that are never predicted count as precision 0.
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted")
        f1 = f1_score(y_test, y_pred, average="weighted")

        print(f"Acc: {accuracy}, Bal_acc: {bal_accuracy}, Prec: {precision}, Recall: {recall}, F1: {f1}")
        print("-------------------------------------------")

************SGDClassifier************
Acc: 0.5593457943925234, Bal_acc: 0.5236324731468529, Prec: 0.5898132929266139, Recall: 0.5593457943925234, F1: 0.5569301356174161
-------------------------------------------
Acc: 0.5359813084112149, Bal_acc: 0.507179791399078, Prec: 0.6438168709241187, Recall: 0.5359813084112149, F1: 0.5509031978550087
-------------------------------------------
Acc: 0.6044880785413744, Bal_acc: 0.5823679287726027, Prec: 0.6541573300548035, Recall: 0.6044880785413744, F1: 0.613818645174068
-------------------------------------------
Acc: 0.5642823749415615, Bal_acc: 0.5692967275454861, Prec: 0.6398387560072258, Recall: 0.5642823749415615, F1: 0.5849699907223391
-------------------------------------------
Acc: 0.5488546049555867, Bal_acc: 0.5293686706788285, Prec: 0.6369648244156135, Recall: 0.5488546049555867, F1: 0.5219629569145579
-------------------------------------------
************SGDCStandarized************
Acc: 0.5738317757009346, Bal_acc: 0.5070831248056

### Try OneVsRest techniques

In [5]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Base estimators that will each be wrapped in a OneVsRestClassifier.
models = {
    'RandomForestClassifier': RandomForestClassifier(),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(),
    'LinearSVC': LinearSVC(max_iter=10000),
}

# Same 5-fold stratified evaluation as for the plain models, but every fold
# trains a fresh one-vs-rest wrapper around the base estimator.
for model_tag, model in models.items():
    print(f"************{model_tag}************")
    skf = StratifiedKFold(n_splits=5)

    for fold_train, fold_test in skf.split(X, y_red):
        X_train, y_train = X[fold_train], y_red[fold_train]
        X_test, y_test = X[fold_test], y_red[fold_test]

        ovrc = OneVsRestClassifier(model)
        ovrc.fit(X_train, y_train)
        y_pred = ovrc.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        bal_accuracy = balanced_accuracy_score(y_test, y_pred)
        # zero_division=0: classes that are never predicted count as precision 0.
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted")
        f1 = f1_score(y_test, y_pred, average="weighted")

        print(f"Acc: {accuracy}, Bal_acc: {bal_accuracy}, Prec: {precision}, Recall: {recall}, F1: {f1}")
        print("-------------------------------------------")

************RandomForestClassifier************
Acc: 0.6322429906542056, Bal_acc: 0.5576368548699788, Prec: 0.6341390996701167, Recall: 0.6322429906542056, F1: 0.6184009081998519
-------------------------------------------
Acc: 0.6355140186915887, Bal_acc: 0.5574181231210386, Prec: 0.6364151808464261, Recall: 0.6355140186915887, F1: 0.6164339553925604
-------------------------------------------
Acc: 0.6470313230481534, Bal_acc: 0.5715425226278865, Prec: 0.6556866693933593, Recall: 0.6470313230481534, F1: 0.6290085397249165
-------------------------------------------
Acc: 0.6484338475923329, Bal_acc: 0.5821079058003412, Prec: 0.6538555804304789, Recall: 0.6484338475923329, F1: 0.6309784070300946
-------------------------------------------
Acc: 0.6470313230481534, Bal_acc: 0.5626207296844117, Prec: 0.6625688075292161, Recall: 0.6470313230481534, F1: 0.6291343720735867
-------------------------------------------
************HistGradientBoostingClassifier************
Acc: 0.6224299065420561



Acc: 0.4897196261682243, Bal_acc: 0.3508998220635418, Prec: 0.5956801982868698, Recall: 0.4897196261682243, F1: 0.43985892680140887
-------------------------------------------




Acc: 0.4995327102803738, Bal_acc: 0.4080614205211727, Prec: 0.6078611164359724, Recall: 0.4995327102803738, F1: 0.47205258055753274
-------------------------------------------




Acc: 0.4927536231884058, Bal_acc: 0.52321799670838, Prec: 0.6647185307034893, Recall: 0.4927536231884058, F1: 0.524359576469929
-------------------------------------------




Acc: 0.4941561477325853, Bal_acc: 0.4037935078516054, Prec: 0.6112593992207624, Recall: 0.4941561477325853, F1: 0.4650391778206846
-------------------------------------------




Acc: 0.6002805049088359, Bal_acc: 0.519164434263372, Prec: 0.6202787500055076, Recall: 0.6002805049088359, F1: 0.5909176602421848
-------------------------------------------




### Test different hyperparams for RandomForest
RandomForestClassifier seems to perform best, so let's try different hyperparameters for it.

In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyperparameter.
n_estimators = list(map(int, np.linspace(start=200, stop=2000, num=10)))
max_features = ['log2', 'sqrt']
max_depth = list(map(int, np.linspace(10, 110, num=11))) + [None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

forest_params = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
# The forest is wrapped in OneVsRestClassifier, so its parameters are
# addressed through the wrapper's `estimator__` prefix.
random_grid = {f"estimator__{name}": values for name, values in forest_params.items()}

ovr = OneVsRestClassifier(RandomForestClassifier())
# n_jobs=2 on purpose: n_jobs=-1 hogs the whole CPU.
model_tunning = RandomizedSearchCV(
    estimator=ovr,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    n_jobs=2,
)
# Fit the random search model
# NOTE(review): this search fits on the original `y`, while the other
# cross-validation cells use the reduced `y_red` — confirm which target is intended.
model_tunning.fit(X, y)

print(model_tunning.best_score_)
print(model_tunning.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [7]:
# Cross-validate a single random forest with hand-picked hyperparameters.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=None,
    bootstrap=False,
)

skf = StratifiedKFold(n_splits=5)

for fold_train, fold_test in skf.split(X, y_red):
    X_train, y_train = X[fold_train], y_red[fold_train]
    X_test, y_test = X[fold_test], y_red[fold_test]

    # The forest is fitted directly here, without a OneVsRest wrapper.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    bal_accuracy = balanced_accuracy_score(y_test, y_pred)
    # zero_division=0: classes that are never predicted count as precision 0.
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Acc: {accuracy}, Bal_acc: {bal_accuracy}, Prec: {precision}, Recall: {recall}, F1: {f1}")

Acc: 0.6350467289719626, Bal_acc: 0.5568489420464032, Prec: 0.6302391324628442, Recall: 0.6350467289719626, F1: 0.617750679689817
Acc: 0.6308411214953271, Bal_acc: 0.5499028275187472, Prec: 0.6332506733594748, Recall: 0.6308411214953271, F1: 0.6117795514513703
Acc: 0.648901355773726, Bal_acc: 0.5702959488139603, Prec: 0.6597347686370506, Recall: 0.648901355773726, F1: 0.6315333705600262
Acc: 0.6465638148667602, Bal_acc: 0.579677042823108, Prec: 0.6569017797598655, Recall: 0.6465638148667602, F1: 0.6284655048281669
Acc: 0.6432912575970079, Bal_acc: 0.5550280071217134, Prec: 0.6553652955796706, Recall: 0.6432912575970079, F1: 0.6238180899118312


In [8]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyperparameter.
n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=20)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier()

## RUNNING THIS TAKES A LOT OF TIME (50 candidates x 5 folds = 250 fits)
# The search object must be created in this cell: with the assignment
# commented out, the indented continuation line is a syntax error and the
# `fit` below would act on whatever `model_tunning` an earlier cell left behind.
# n_jobs=4 on purpose: n_jobs=-1 hogs the whole CPU.
model_tunning = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, scoring='f1_weighted',
                                   n_iter=50, cv=5, verbose=2, n_jobs=4)
# Fit the random search model
model_tunning.fit(X, y_red)

print(model_tunning.best_score_)
print(model_tunning.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
0.6283313337738358
{'n_estimators': 739, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}


In [9]:
import pandas as pd
# Tabulate the per-candidate results of the randomized search.
df = pd.DataFrame(data=model_tunning.cv_results_)

In [10]:
# Show the five candidates with the best (lowest) rank.
df.sort_values(by="rank_test_score").head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
17,169.823963,3.46502,1.365002,0.184131,739,5,1,sqrt,100.0,False,"{'n_estimators': 739, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}",0.625889,0.613578,0.639824,0.634983,0.627382,0.628331,0.008954,1
31,85.309796,4.169874,0.538199,0.155772,374,5,2,sqrt,,False,"{'n_estimators': 374, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}",0.623442,0.612791,0.632557,0.630678,0.624244,0.624742,0.006943,2
13,185.918416,4.231527,1.416732,0.155078,843,2,2,sqrt,50.0,False,"{'n_estimators': 843, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}",0.625812,0.607099,0.63564,0.63149,0.62236,0.62448,0.009819,3
21,216.571228,5.534254,1.492288,0.15589,947,10,1,sqrt,40.0,False,"{'n_estimators': 947, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}",0.626371,0.607433,0.633672,0.626755,0.626405,0.624127,0.008797,4
28,30.848386,0.913423,0.273792,0.021918,218,2,2,log2,70.0,False,"{'n_estimators': 218, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 70, 'bootstrap': False}",0.62472,0.60961,0.635107,0.626176,0.624569,0.624036,0.008198,5


In [11]:
# Train a forest with the chosen hyperparameters on the full training set
# and score it on the validation set.
model = RandomForestClassifier(
    n_estimators=739,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=100,
    bootstrap=False,
)

model.fit(X, y_red)
y_pred = model.predict(X_val)

accuracy = accuracy_score(y_val_red, y_pred)
bal_accuracy = balanced_accuracy_score(y_val_red, y_pred)
# zero_division=0: classes that are never predicted count as precision 0.
precision = precision_score(y_val_red, y_pred, average="weighted", zero_division=0)
recall = recall_score(y_val_red, y_pred, average="weighted")
f1 = f1_score(y_val_red, y_pred, average="weighted")

print(f"Acc: {accuracy}, Bal_acc: {bal_accuracy}, Prec: {precision}, Recall: {recall}, F1: {f1}")

Acc: 0.6119847399274216, Bal_acc: 0.5257601053229649, Prec: 0.6444412191273847, Recall: 0.6119847399274216, F1: 0.5934413707722972


## Save model

In [12]:
import pickle

# Save model for our project (Accuracy 0.612)
base_dir = "./models/"
# Create the output directory if needed so that open() below cannot fail on a missing folder.
Path(base_dir).mkdir(parents=True, exist_ok=True)

dump_filename = f"{base_dir}random_forest_classifier.pkl"
with open(dump_filename, "wb") as dump_file:
    # Persist the fitted classifier itself (not the label encoder `mlb`),
    # so that the file contents match the file name.
    pickle.dump(model, dump_file)