# 02 - Treinamento dos Modelos

## Importação

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import scipy.stats as ss

from tqdm.notebook import tqdm

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report

# Scaler
from sklearn.preprocessing import StandardScaler

# Pipeline
from sklearn.pipeline import Pipeline

# Gráficos
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV, StratifiedKFold

## Constantes e Sets

In [2]:
# PALETTE = 'RdYlGn'
PALETTE = 'viridis'
SEED = 42
N_ITER = 10
TRAINVAL_SPLITS = 5  # Ver se tudo bem!

pd.set_option('display.max_columns', None)
np.random.seed(SEED)

## Funções

In [3]:
def generate_colors(num_colors):
    colors = px.colors.sample_colorscale(PALETTE, [n/(num_colors - 1) for n in range(num_colors)])
    
    return colors

In [4]:
def validate_model(model, X, y, param_grid, n_iter=10, n_trainval_splits=10):
    skf = StratifiedKFold(n_splits=n_iter, shuffle=True)   # Shuffle?
    skf_folds = skf.split(X, y)
    
    runs_metrics = {}
    for n, (trainval_idx, test_idx) in enumerate(tqdm(skf_folds, total=n_iter)):
        X_trainval = X[trainval_idx]
        y_trainval = y[trainval_idx]
        
        X_test = X[test_idx]
        y_test = y[test_idx]

        best_params = get_best_params(model, X_trainval, y_trainval, param_grid, n_trainval_splits=n_trainval_splits, 
                                      display_results=False)
        
        model.set_params(**best_params)
        model.fit(X_trainval, y_trainval)
        
        model_metrics = evaluate_model_performance(model, X_test, y_test)
        model_metrics['best_params'] = best_params
        runs_metrics[n] = model_metrics

    runs_metrics = aggregate_run_metrics(runs_metrics)
    return runs_metrics

In [5]:
def get_best_params(model, X_trainval, y_trainval, param_grid, n_trainval_splits=10, display_results=False):
    skf = StratifiedKFold(n_splits=n_trainval_splits, shuffle=True)  # Shuffle?
    
    grid_search = GridSearchCV(model, param_grid=param_grid, refit=False, cv=skf, n_jobs=-1)
    grid_search.fit(X_trainval, y_trainval)
    
    best_params = grid_search.best_params_

    if display_results:
        df_res = pd.DataFrame(grid_search.cv_results_)
        df_res = df_res.sort_values('rank_test_score', ascending=True)
        display(df_res)

    return best_params

In [6]:
def evaluate_model_performance(model, X, y):
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)

    report_dict = classification_report(y, y_pred, output_dict=True)
    report_dict['cm'] = cm

    return report_dict

In [7]:
def aggregate_run_metrics(runs_res):
    runs_ids = runs_res.keys()
    labels = [label for label in runs_res[0].keys() if label not in ['accuracy', 'micro avg', 'best_params', 'macro avg', 
                                                                     'weighted avg', 'cm']]  # Porco?

    accuracies = [runs_res[i]['accuracy'] for i in runs_ids]
    cms = [runs_res[i]['cm'] for i in runs_ids]
    best_params = [runs_res[i]['best_params'] for i in runs_ids]
    recalls = {i: [runs_res[j][i]['recall'] for j in runs_ids] for i in labels}
    precisions = {i: [runs_res[j][i]['precision'] for j in runs_ids] for i in labels}
    f1_scores = {i: [runs_res[j][i]['f1-score'] for j in runs_ids] for i in labels}

    metrics = {
        'accuracies': accuracies,
        'cms': cms,
        'f1-scores': f1_scores,
        'recalls': recalls,
        'precisions': precisions,
        'best_params': best_params
      }

    return metrics

In [8]:
def print_res(res):  # Cumulo da porquice, mas funciona bem
    print(f"===> ACURÁCIA MÉDIA <===\n{np.mean(res['accuracies']):.4f}", end='\n\n')
    print(f"===> MATRIZ DE CONFUSÃO GERAL <===\n{np.sum(res['cms'], axis=0)}", end='\n\n')
    
    print('===> RECALL, PRECISION E F1-SCORE MÉDIO <===') 
    print(f"{'Label'.ljust(10)} | {'Recall'.ljust(10)} | {'Precision'.ljust(10)} | {'F1-Score'.ljust(10)}")
    print('-'*48)
    for l in res['recalls'].keys():  # Igual para todos
        mean_recall = np.mean(res['recalls'][l])
        mean_precision = np.mean(res['precisions'][l])
        mean_f1_score = np.mean(res['f1-scores'][l])
        print(f'{l.ljust(10)} | {str(np.round(mean_recall, 4)).ljust(10)} | {str(np.round(mean_precision, 4)).ljust(10)} | '
              f'{str(np.round(mean_f1_score, 4)).ljust(10)}')

    print('\n===> MELHORES HIPERPARÂMETROS <===') 
    print(f"{'Ocorrências'.ljust(12)} | {'Valores'.ljust(75)}")
    print('-'*130)
    params_counts = Counter(tuple(param.items()) for param in res['best_params'])
    params_counts_mc = params_counts.most_common()
    
    for pcm in params_counts_mc:  # [:5] para mostrar apenas top 5
        pcm_values = pcm[0]
        pcm_occ = pcm[1]
    
        print(f'{str(pcm_occ).ljust(12)} | {pcm_values}')

In [9]:
def plot_accuracies(accuracies):
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=accuracies, nbinsx=5))
    fig.add_vline(x=np.mean(accuracies), line_dash='dash', annotation_text=f'Acurácia Média: {np.mean(accuracies):.2f}')
    fig.update_layout(title='Histograma da Acurácia', height=600)
    fig.show()

In [10]:
def plot_label_metrics(label_score, score_title):
    labels = list(label_score.keys())
    marker_colors = generate_colors(len(labels))

    fig = go.Figure()
    for i, l in enumerate(labels):
        name = f'Label {l}'
        color = marker_colors[i]
        f1_score = label_score[l]

        fig.add_trace(go.Box(y=f1_score, name=name, marker_color=color, legendgroup=i))

    fig.update_layout(title=f'Boxplots dos {score_title.title()} por label', height=600)
    fig.show()

In [11]:
def plot_confusion_matrix(cms):
    cm = np.sum(cms, axis=0)
    cm_mean = np.mean(cms, axis=0)
    cm_recall = cm/np.sum(cm, axis=1)
    cm_precision = cm/np.sum(cm, axis=0)
    # cm_f1score = np.nan_to_num(2*(cm_precision*cm_recall)/(cm_precision + cm_recall))  # Faz sentido?
    
    axis_labels = list(range(len(cm)))

    fig = make_subplots(rows=2, cols=2, shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.1,
                        subplot_titles=['Padrão', 'Média', 'Recall', 'Precision'])
    
    fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=cm, text=cm, texttemplate='%{text}', 
                             showscale=False, colorscale=PALETTE), 
                  row=1, col=1)

    # Média
    fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=cm_mean, text=cm_mean, texttemplate='%{text:.2f}', 
                             showscale=False, colorscale=PALETTE), 
                  row=1, col=2)
    
    # F1-score
    # fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=cm_f1score, text=cm_f1score, texttemplate='%{text:.2f}', 
    #                          showscale=False, colorscale=PALETTE), 
    #               row=1, col=2)

    fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=cm_recall, text=cm_recall, texttemplate='%{text:.2f}', 
                             showscale=False, colorscale=PALETTE), 
                  row=2, col=1)
    fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=cm_precision, text=cm_precision, texttemplate='%{text:.2f}', 
                             showscale=False, colorscale=PALETTE), 
                  row=2, col=2)

    fig.update_layout(title='Matrizes de Confusão', yaxis1_title='Real', yaxis3_title='Real', xaxis3_title='Predito', 
                      xaxis4_title='Predito', yaxis1_autorange='reversed', yaxis2_autorange='reversed', 
                      yaxis3_autorange='reversed', height=800)

    return fig

## Scripts

### Leitura

In [12]:
df = pd.read_pickle('../data/processed/steel-plates-fault.pkl')

df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,108.0,1687.0,1.0,0.0,80.0,0.0498,0.2415,0.1818,0.0047,0.4706,1.0000,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,0
1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,123.0,1687.0,1.0,0.0,80.0,0.7647,0.3793,0.2069,0.0036,0.6000,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,0
2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,125.0,1623.0,1.0,0.0,100.0,0.9710,0.3426,0.3333,0.0037,0.7500,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150,0
3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,126.0,1353.0,0.0,1.0,290.0,0.7287,0.4413,0.1556,0.0052,0.5385,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,0
4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,126.0,1353.0,0.0,1.0,185.0,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1263,221.0,242.0,3948212.0,3948253.0,519.0,33.0,41.0,48309.0,65.0,124.0,1360.0,1.0,0.0,200.0,0.3250,0.3972,0.5122,0.0154,0.6364,1.0000,1.0,2.7152,1.3222,1.6128,0.4878,-0.2728,0.9765,5
1264,1111.0,1121.0,4032298.0,4032320.0,110.0,20.0,22.0,12351.0,100.0,127.0,1354.0,1.0,0.0,200.0,0.3442,0.5000,0.4545,0.0074,0.5000,1.0000,1.0,2.0414,1.0000,1.3424,0.5454,-0.1228,0.3663,5
1265,995.0,1006.0,4085316.0,4085344.0,140.0,25.0,28.0,16076.0,103.0,132.0,1356.0,1.0,0.0,200.0,0.5162,0.5454,0.3929,0.0081,0.4400,1.0000,1.0,2.1461,1.0414,1.4472,0.6071,-0.1029,0.5096,5
1266,396.0,418.0,4116853.0,4116868.0,231.0,26.0,16.0,25096.0,56.0,141.0,1356.0,1.0,0.0,200.0,0.5841,0.3000,0.6818,0.0162,0.8461,0.9375,0.0,2.3636,1.3424,1.1761,-0.3182,-0.1512,0.5461,5


In [13]:
df.describe()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
count,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0
mean,508.635647,565.126183,1759791.0,1759855.0,2588.908517,143.205836,103.716088,283134.8,79.209779,130.299685,1445.250789,0.402997,0.597003,66.072555,0.310388,0.409378,0.589186,0.04052,0.597253,0.8026,0.562303,2.603358,1.388906,1.456804,0.067459,-0.135566,0.61065,2.695584
std,513.760815,481.284469,1719432.0,1719449.0,6107.39222,362.087198,524.113646,600689.8,32.625475,16.654437,137.299471,0.490694,0.490694,35.923484,0.303147,0.134019,0.264234,0.062152,0.241613,0.241219,0.484225,0.880245,0.535012,0.468135,0.483909,0.137824,0.3496,1.802939
min,0.0,6.0,7430.0,7458.0,2.0,2.0,1.0,250.0,0.0,70.0,1227.0,0.0,0.0,40.0,0.0,0.0,0.0083,0.0015,0.0144,0.105,0.0,0.301,0.301,0.0,-0.9319,-0.9989,0.119,0.0
25%,41.0,191.0,626630.0,626635.8,87.0,15.0,13.0,10110.0,46.0,124.0,1358.0,0.0,0.0,40.0,0.0585,0.31485,0.3757,0.0066,0.4,0.53985,0.0,1.9395,1.0,1.0792,-0.3648,-0.192625,0.2482,1.0
50%,283.0,330.5,1412536.0,1412546.0,200.5,28.0,28.0,21351.5,85.5,127.0,1362.0,0.0,1.0,60.0,0.18145,0.40745,0.5714,0.01015,0.6335,0.9565,1.0,2.3021,1.1761,1.38905,0.07225,-0.143,0.5708,2.0
75%,955.0,963.25,2246608.0,2246677.0,3638.5,184.25,115.0,369638.8,103.0,140.0,1624.0,1.0,1.0,70.0,0.53305,0.493725,0.837025,0.067625,0.7778,1.0,1.0,3.5609,2.01175,1.8129,0.468275,-0.08165,1.0,5.0
max,1688.0,1696.0,12987660.0,12987690.0,152655.0,10449.0,18152.0,11591410.0,196.0,252.0,1794.0,1.0,1.0,290.0,0.9923,0.9439,1.0,0.6226,1.0,1.0,1.0,5.1837,2.9385,4.2587,0.9917,0.5917,1.0,5.0


### Divisão

In [14]:
X = df.drop(columns=['target']).values
y = df['target'].values

X.shape, y.shape

((1268, 27), (1268,))

### Treinamento

#### KNN

In [15]:
knn_param_grid = {
    'model__n_neighbors': [5, 1, 3, 10],
    'model__p': [2, 1],
    'model__weights': ['uniform', 'distance']    
}

knn_model = Pipeline([('scaler', StandardScaler()) , ('model', KNeighborsClassifier())])
knn_res = validate_model(knn_model, X, y, knn_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(knn_res)

  0%|          | 0/10 [00:00<?, ?it/s]

  _data = np.array(data, dtype=dtype, copy=copy,


===> ACURÁCIA MÉDIA <===
0.8880

===> MATRIZ DE CONFUSÃO GERAL <===
[[103   4   1   0   1  49]
 [  2 174   6   0   0   8]
 [  3   2 380   0   0   6]
 [  0   1   0  68   0   3]
 [  3   1   0   0  50   1]
 [ 25  14   4   2   6 351]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Label      | Recall     | Precision  | F1-Score  
------------------------------------------------
0          | 0.6512     | 0.7588     | 0.6926    
1          | 0.9158     | 0.8969     | 0.9023    
2          | 0.9718     | 0.9723     | 0.9717    
3          | 0.9446     | 0.9732     | 0.9557    
4          | 0.91       | 0.8981     | 0.8971    
5          | 0.873      | 0.8424     | 0.8562    

===> MELHORES HIPERPARÂMETROS <===
Ocorrências  | Valores                                                                    
----------------------------------------------------------------------------------------------------------------------------------
4            | (('model__n_neighbors', 5), ('model__p', 1), ('mode

In [16]:
knn_acc = knn_res['accuracies']

plot_accuracies(knn_acc)

In [17]:
knn_f1 = knn_res['f1-scores']

plot_label_metrics(knn_f1, 'f1-scores')

In [18]:
knn_cms = knn_res['cms']

plot_confusion_matrix(knn_cms)

#### Árvore de Decisão

In [19]:
tree_param_grid = {
    'max_depth': [None, 5, 10, 20, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

tree_model = DecisionTreeClassifier()
tree_res = validate_model(tree_model, X, y, tree_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(tree_res)

  0%|          | 0/10 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
0.8612

===> MATRIZ DE CONFUSÃO GERAL <===
[[105   3   0   1   5  44]
 [  3 175   2   1   0   9]
 [  3   1 377   0   1   9]
 [  0   1   0  65   0   6]
 [  5   1   0   0  42   7]
 [ 46   8  13   4   3 328]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Label      | Recall     | Precision  | F1-Score  
------------------------------------------------
0          | 0.6642     | 0.669      | 0.6576    
1          | 0.9211     | 0.9278     | 0.9219    
2          | 0.9642     | 0.963      | 0.9631    
3          | 0.9036     | 0.9224     | 0.9108    
4          | 0.7667     | 0.8324     | 0.7827    
5          | 0.8162     | 0.8164     | 0.8151    

===> MELHORES HIPERPARÂMETROS <===
Ocorrências  | Valores                                                                    
----------------------------------------------------------------------------------------------------------------------------------
1            | (('criterion', 'entropy'), ('max_depth', 20), ('min

In [20]:
tree_acc = tree_res['accuracies']

plot_accuracies(tree_acc)

In [21]:
tree_f1 = tree_res['f1-scores']

plot_label_metrics(tree_f1, 'f1-scores')

In [22]:
tree_cms = tree_res['cms']

plot_confusion_matrix(tree_cms)

#### Floresta Aleatória

In [23]:
forest_param_grid = {
    'max_depth': [None, 5, 10, 20, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    #'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 50, 200, 500]
}

forest_model = RandomForestClassifier()
forest_res = validate_model(forest_model, X, y, forest_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(forest_res)

  0%|          | 0/10 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
0.9133

===> MATRIZ DE CONFUSÃO GERAL <===
[[121   4   1   0   0  32]
 [  3 172   5   0   0  10]
 [  3   0 384   0   0   4]
 [  0   0   0  67   0   5]
 [  5   0   0   0  46   4]
 [ 25   2   3   2   2 368]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Label      | Recall     | Precision  | F1-Score  
------------------------------------------------
0          | 0.7667     | 0.7784     | 0.7683    
1          | 0.9053     | 0.9685     | 0.9336    
2          | 0.9821     | 0.9775     | 0.9796    
3          | 0.9304     | 0.9778     | 0.9488    
4          | 0.84       | 0.9633     | 0.8881    
5          | 0.9155     | 0.8725     | 0.8924    

===> MELHORES HIPERPARÂMETROS <===
Ocorrências  | Valores                                                                    
----------------------------------------------------------------------------------------------------------------------------------
2            | (('max_depth', 100), ('min_samples_leaf', 1), ('min

In [24]:
forest_acc = forest_res['accuracies']

plot_accuracies(forest_acc)

In [25]:
forest_f1 = forest_res['f1-scores']

plot_label_metrics(forest_f1, 'f1-scores')

In [26]:
forest_cms = forest_res['cms']

plot_confusion_matrix(forest_cms)

#### Regressão Logística

In [27]:
lr_param_grid = {
    'model__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
    'model__penalty': ['l2', 'l1', 'elasticnet', None],
    'model__C': [1, 0.1, 0.01, 10, 100]
}

lr_model = Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression(max_iter=5000))])
lr_res = validate_model(lr_model, X, y, lr_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(lr_res)

  0%|          | 0/10 [00:00<?, ?it/s]



175 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mauri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mauri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mauri\AppDat

===> ACURÁCIA MÉDIA <===
0.8682

===> MATRIZ DE CONFUSÃO GERAL <===
[[117   8   0   1   4  28]
 [  3 164   6   0   4  13]
 [  1   3 375   2   2   8]
 [  0   1   0  69   0   2]
 [  6   1   0   0  37  11]
 [ 26  16  10   4   7 339]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Label      | Recall     | Precision  | F1-Score  
------------------------------------------------
0          | 0.7404     | 0.7791     | 0.7471    
1          | 0.8632     | 0.8564     | 0.8563    
2          | 0.9591     | 0.9601     | 0.9592    
3          | 0.9607     | 0.9163     | 0.934     
4          | 0.6667     | 0.7414     | 0.6851    
5          | 0.8437     | 0.8464     | 0.8432    

===> MELHORES HIPERPARÂMETROS <===
Ocorrências  | Valores                                                                    
----------------------------------------------------------------------------------------------------------------------------------
2            | (('model__C', 1), ('model__penalty', 'l1'), ('model

In [28]:
lr_acc = lr_res['accuracies']

plot_accuracies(lr_acc)

In [29]:
lr_f1 = lr_res['f1-scores']

plot_label_metrics(lr_f1, 'f1-scores')

In [30]:
lr_cms = lr_res['cms']

plot_confusion_matrix(lr_cms)

#### Naive Bayes

In [31]:
norm_test = ss.normaltest(X[:,1])
p_values = norm_test.pvalue
p_values >= 0.01

np.False_

In [32]:
p_values

np.float64(2.4442545452454702e-45)

In [33]:
p_values

np.float64(2.4442545452454702e-45)

#### SVM

In [34]:
svc_param_grid = {
    'model__C': [1, 0.1, 0.01, 10, 100],
    'model__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'model__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}

svc_model = Pipeline([('scaler', StandardScaler()), ('model', SVC())])
svc_res = validate_model(svc_model, X, y, svc_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(svc_res)

  0%|          | 0/10 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
0.8959

===> MATRIZ DE CONFUSÃO GERAL <===
[[114   2   2   0   0  40]
 [  4 173   7   0   0   6]
 [  2   2 381   2   0   4]
 [  0   0   0  68   0   4]
 [  5   0   0   0  47   3]
 [ 23  14   2   5   5 353]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Label      | Recall     | Precision  | F1-Score  
------------------------------------------------
0          | 0.7221     | 0.7788     | 0.7413    
1          | 0.9105     | 0.9139     | 0.9084    
2          | 0.9744     | 0.9722     | 0.9731    
3          | 0.9429     | 0.92       | 0.9236    
4          | 0.8533     | 0.919      | 0.8726    
5          | 0.8784     | 0.866      | 0.8701    

===> MELHORES HIPERPARÂMETROS <===
Ocorrências  | Valores                                                                    
----------------------------------------------------------------------------------------------------------------------------------
4            | (('model__C', 100), ('model__gamma', 0.01), ('model

In [35]:
svc_acc = svc_res['accuracies']

plot_accuracies(svc_acc)

In [36]:
svc_f1 = svc_res['f1-scores']

plot_label_metrics(svc_f1, 'f1-scores')

In [37]:
svc_cms = svc_res['cms']

plot_confusion_matrix(svc_cms)

#### Multi Layer Perceptron

In [38]:
mlp_param_grid = {
    'model__hidden_layer_sizes': [(10, ), (50,), (10, 10), (10, 30, 10)],
    'model__activation': ['relu', 'tanh'],
    'model__solver': ['adam', 'sgd'], # Justificar ausência do lfbgf
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__learning_rate': ['constant', 'invscaling', 'adaptive']
    #'model__batch_size': ['auto', 4, 8, 16, 32]
}

mlp_model = Pipeline([('scaler', StandardScaler()), ('model', MLPClassifier(max_iter=1000))])
mlp_res = validate_model(mlp_model, X, y, mlp_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(mlp_res)

  0%|          | 0/10 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
0.9109

===> MATRIZ DE CONFUSÃO GERAL <===
[[126   1   0   1   2  28]
 [  1 178   2   0   0   9]
 [  2   2 382   0   0   5]
 [  0   0   0  69   0   3]
 [  4   0   0   0  47   4]
 [ 20  11   7   4   7 353]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Label      | Recall     | Precision  | F1-Score  
------------------------------------------------
0          | 0.7975     | 0.8348     | 0.8105    
1          | 0.9368     | 0.9311     | 0.9318    
2          | 0.9769     | 0.9775     | 0.9768    
3          | 0.9571     | 0.9408     | 0.9449    
4          | 0.86       | 0.8539     | 0.847     
5          | 0.8783     | 0.8834     | 0.8783    

===> MELHORES HIPERPARÂMETROS <===
Ocorrências  | Valores                                                                    
----------------------------------------------------------------------------------------------------------------------------------
3            | (('model__activation', 'tanh'), ('model__alpha', 0.

In [39]:
mlp_acc = mlp_res['accuracies']

plot_accuracies(mlp_acc)

In [40]:
mlp_f1 = mlp_res['f1-scores']

plot_label_metrics(mlp_f1, 'f1-scores')

In [41]:
mlp_cms = mlp_res['cms']

plot_confusion_matrix(mlp_cms)