# 04 - Treinamento dos Modelos com Ensemble

Neste notebook, são realizados o treinamento e a avaliação de modelos ensemble.

## Importação

In [89]:
# Bibliotecas padrão
import pickle
import warnings
import os
from datetime import datetime as dt
from collections import Counter

# Bibliotecas utilitárias de terceiros
import numpy as np
import pandas as pd
import scipy.stats as ss
from tqdm.notebook import tqdm

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# Funções de avaliação e seleção de modelos
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Para escalonamento de dados e pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Para utilização de bagging
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Gráficos
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

## Constantes e Sets

In [90]:
# --- Plot styling ---
PALETTE = 'balance'  # other colorscales tried: 'RdYlGn', 'viridis', 'inferno'
BLUE = '#346CB5'
FONT_SIZE = 16

# --- Experiment settings ---
SEED = 42
N_ITER = 20  # number of outer cross-validation folds (can be lowered)
TRAINVAL_SPLITS = 3  # number of inner folds for the hyperparameter search (3, 5, 10, ...)
EPS = np.finfo(float).eps

# Silence warnings that only clutter the notebook output.
warnings.simplefilter('ignore')
# NOTE(review): presumably set so that worker subprocesses also inherit the filter - confirm.
os.environ['PYTHONWARNINGS'] = 'ignore'

pd.set_option('display.max_columns', None)
np.random.seed(SEED)

# Template with a larger font, applied by export_fig when writing PNG files.
pio.templates['bigger-font'] = pio.templates['plotly']
pio.templates['bigger-font']['layout']['font']['size'] = FONT_SIZE

In [176]:
# Fixed hyperparameters applied to each base estimator through `set_params`.
# Keys prefixed with `model__` address the 'model' step of a scaler+model Pipeline.
# NOTE(review): presumably these values come from an earlier tuning notebook - confirm.
KNN_BEST_PARAMS = {
    'model__n_neighbors': 5,
    'model__p': 1,
    'model__weights': 'distance'    
}

# No `model__` prefix: the decision tree is used directly, without a Pipeline.
DT_BEST_PARAMS = {
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2    
}

LR_BEST_PARAMS = {
    'model__solver': 'saga',
    'model__penalty': 'l1',
    'model__C': 1
}

# NOTE(review): despite the `_PARAM_GRID` suffix, this dict holds single values and is
# applied with `set_params`, exactly like the `*_BEST_PARAMS` dicts above.
SVM_PARAM_GRID = {
    'model__C': 10,
    'model__kernel': 'rbf',
    'model__gamma': 'scale'
}

# NOTE(review): same naming caveat as SVM_PARAM_GRID - single values, not a search grid.
MLP_PARAM_GRID = {
    'model__hidden_layer_sizes': (50,),
    'model__activation': 'tanh',
    'model__solver': 'adam',
    'model__alpha': 0.0001,
    'model__learning_rate': 'constant'
}


# The only real search grid: ensemble sizes explored by the grid search in validate_ensemble.
BAGGING_PARAM_GRID = {
    'n_estimators': [10, 50, 100, 200]
}

## Funções

In [92]:
def generate_colors(num_colors):
    """Sample `num_colors` evenly spaced colors from the PALETTE colorscale.

    Parameters
    ----------
    num_colors : int
        Number of colors wanted.

    Returns
    -------
    list
        Colors as returned by `px.colors.sample_colorscale`; empty when
        `num_colors` is zero or negative.
    """
    if num_colors <= 0:
        return []

    if num_colors == 1:
        # A single color has no spacing to compute; n / (num_colors - 1) would divide by zero.
        sample_points = [0.0]
    else:
        sample_points = [n / (num_colors - 1) for n in range(num_colors)]

    return px.colors.sample_colorscale(PALETTE, sample_points)

In [93]:
def validate_ensemble(model, X, y, param_grid, targets_map={}, n_iter=10, n_trainval_splits=10):
    """Evaluate `model` with nested stratified cross-validation.

    For each of the `n_iter` outer folds, hyperparameters are selected on the
    train/validation part via `get_best_params`, the model is refit on that whole
    part with the selected values, and it is then scored on the held-out test part.

    Parameters
    ----------
    model : estimator exposing `set_params`, `fit` and `predict`. It is modified in place.
    X, y : arrays indexable by integer index arrays.
    param_grid : grid forwarded to `get_best_params`.
    targets_map : mapping from class id to display name, forwarded to
        `evaluate_model_performance`.
    n_iter : number of outer folds.
    n_trainval_splits : number of inner folds used for the hyperparameter search.

    Returns
    -------
    dict
        Per-run metrics merged by `aggregate_run_metrics`.
    """
    outer_cv = StratifiedKFold(n_splits=n_iter, random_state=SEED, shuffle=True)

    per_fold = {}
    for fold_no, (trainval_idx, test_idx) in enumerate(tqdm(outer_cv.split(X, y), total=n_iter)):
        X_trainval, y_trainval = X[trainval_idx], y[trainval_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        best_params = get_best_params(model, X_trainval, y_trainval, param_grid,
                                      n_trainval_splits=n_trainval_splits, display_results=False)

        # Training time covers applying the selected parameters and the refit itself.
        fit_started = dt.now()
        model.set_params(**best_params)
        model.fit(X_trainval, y_trainval)
        fit_elapsed = dt.now() - fit_started

        fold_metrics = evaluate_model_performance(model, X_test, y_test, targets_map=targets_map)
        fold_metrics.update(best_params=best_params, training_time=fit_elapsed)
        per_fold[fold_no] = fold_metrics

    return aggregate_run_metrics(per_fold)

In [94]:
def compute_feasibility(base_model, X, y, n_runs=3, n_splits=20, n_estimators=100):
    """Time fitting and prediction of a bagging ensemble built on `base_model`.

    A single stratified train/test split (the first of `n_splits` folds) is used,
    and the same `BaggingClassifier` is fit and queried `n_runs` times on it.

    Parameters
    ----------
    base_model : estimator wrapped by `BaggingClassifier`.
    X, y : arrays indexable by integer index arrays.
    n_runs : number of timing repetitions.
    n_splits : number of stratified folds; only the first one is used.
    n_estimators : ensemble size.

    Returns
    -------
    dict
        Keys 'mean_pred_time', 'std_pred_time', 'mean_fit_time', 'std_fit_time',
        all in seconds. Standard deviations use ddof=1, so `n_runs=1` yields NaN.
    """
    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    # Only the first fold is needed: this mimics one run of the original problem.
    train_idx, test_idx = next(skf.split(X, y))

    # The split does not change between runs, so slice it once outside the loop.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test = X[test_idx]

    bagging_model = BaggingClassifier(base_model, n_estimators=n_estimators, random_state=SEED)

    fit_times = []
    pred_times = []
    for _ in range(n_runs):
        fit_started = dt.now()
        bagging_model.fit(X_train, y_train)
        fit_times.append((dt.now() - fit_started).total_seconds())

        pred_started = dt.now()
        _ = bagging_model.predict(X_test)  # result discarded: only the elapsed time matters
        pred_times.append((dt.now() - pred_started).total_seconds())

    feasibility_results = {'mean_pred_time': np.mean(pred_times),
                           'std_pred_time': np.std(pred_times, ddof=1),
                           'mean_fit_time': np.mean(fit_times),
                           'std_fit_time': np.std(fit_times, ddof=1)
                           }

    return feasibility_results

In [95]:
def get_best_params(model, X_trainval, y_trainval, param_grid, n_trainval_splits=10, display_results=False):
    """Pick the best hyperparameter combination with a stratified grid search.

    Parameters
    ----------
    model : estimator to tune.
    X_trainval, y_trainval : data used for the cross-validated search.
    param_grid : grid passed to `GridSearchCV`.
    n_trainval_splits : number of stratified folds.
    display_results : when True, show the full results table ordered by rank.

    Returns
    -------
    dict
        `best_params_` of the search. The search runs with `refit=False`, so no
        fitted estimator is produced here.
    """
    inner_cv = StratifiedKFold(n_splits=n_trainval_splits, random_state=SEED, shuffle=True)

    search = GridSearchCV(model, param_grid=param_grid, refit=False, cv=inner_cv, n_jobs=-1)
    search.fit(X_trainval, y_trainval)

    if display_results:
        ranked_results = pd.DataFrame(search.cv_results_).sort_values('rank_test_score', ascending=True)
        display(ranked_results)

    return search.best_params_

In [96]:
def evaluate_model_performance(model, X, y, targets_map=None):
    """Predict on `X` and summarize the quality of the predictions against `y`.

    Parameters
    ----------
    model : fitted estimator exposing `predict`.
    X : samples to predict.
    y : true labels for `X`.
    targets_map : optional mapping from class id to display name. When empty or
        None, the report is keyed by the raw class labels.

    Returns
    -------
    dict
        The `classification_report(..., output_dict=True)` dictionary, extended
        with 'cm' (confusion matrix) and 'prediction_time' (a timedelta covering
        the `predict` call only).
    """
    predict_started = dt.now()
    y_pred = model.predict(X)
    prediction_time = dt.now() - predict_started

    cm = confusion_matrix(y, y_pred)

    if targets_map:
        # classification_report infers its labels from the union of true and predicted
        # classes, so the names must cover that union as well. Using only the classes
        # present in `y` fails whenever the model predicts a class absent from `y`.
        present_labels = np.union1d(y, y_pred)
        target_names = [targets_map[label] for label in present_labels]
    else:
        target_names = None

    report_dict = classification_report(y, y_pred, output_dict=True, target_names=target_names)
    report_dict['cm'] = cm
    report_dict['prediction_time'] = prediction_time

    return report_dict

In [97]:
def aggregate_run_metrics(runs_res):
    """Merge per-run metric dictionaries into per-metric lists.

    Parameters
    ----------
    runs_res : dict
        Maps a run id to the dictionary produced for that run (classification
        report entries plus 'cm', 'best_params', 'training_time' and
        'prediction_time'). Run ids can be any keys; runs are processed in the
        dictionary's iteration order.

    Returns
    -------
    dict
        'accuracies', 'cms', 'best_params', 'training_time' and 'prediction_time'
        are lists with one entry per run. 'f1-scores', 'recalls' and 'precisions'
        map each class label to a list with one entry per run. Every container is
        empty when `runs_res` is empty.
    """
    # Report entries that are not per-class results.
    non_label_keys = {'accuracy', 'micro avg', 'best_params', 'macro avg',
                      'weighted avg', 'training_time', 'prediction_time', 'cm'}

    runs = list(runs_res.values())
    # Class labels come from the first run, whatever its id is; indexing by the
    # literal key 0 would break for ids that do not start at 0.
    labels = [label for label in runs[0] if label not in non_label_keys] if runs else []

    def collect(key):
        # One value per run for a top-level entry.
        return [run[key] for run in runs]

    def collect_per_label(score):
        # One value per run for each class label.
        return {label: [run[label][score] for run in runs] for label in labels}

    metrics = {
        'accuracies': collect('accuracy'),
        'cms': collect('cm'),
        'f1-scores': collect_per_label('f1-score'),
        'recalls': collect_per_label('recall'),
        'precisions': collect_per_label('precision'),
        'best_params': collect('best_params'),
        'training_time': collect('training_time'),
        'prediction_time': collect('prediction_time')
    }

    return metrics

In [98]:
def print_res(res):
    """Print a text summary of the aggregated cross-validation results.

    Sections, in order: mean accuracy, summed confusion matrix, per-class recall /
    precision / F1 (mean ± sample standard deviation across runs, in percent), the
    mean and spread of those per-class means, how often each hyperparameter
    combination was selected, and the mean training and prediction times.

    Parameters
    ----------
    res : dict
        Structure returned by `aggregate_run_metrics`.
    """
    def pct_mean_std(values):
        # Mean and sample standard deviation (ddof=1), both scaled to percent.
        return np.mean(values) * 100, np.std(values, ddof=1) * 100

    def table_cell(mean, std):
        # "mean ± std" rounded to two decimals and padded to the column width.
        return (str(np.round(mean, 2)) + " ± " + str(np.round(std, 2))).ljust(16)

    mean_acc, std_acc = pct_mean_std(res['accuracies'])

    print(f"===> ACURÁCIA MÉDIA <===\n({mean_acc:.2f} ± {std_acc:.2f})%", end='\n\n')
    print(f"===> MATRIZ DE CONFUSÃO GERAL <===\n{np.sum(res['cms'], axis=0)}", end='\n\n')

    print('===> RECALL, PRECISION E F1-SCORE MÉDIO <===')
    print(f"{'Target'.ljust(12)} | {'Recall'.ljust(12)} (%) | {'Precision'.ljust(12)} (%) | {'F1-Score'.ljust(12)} (%)")
    print('-'*70)

    # Per-class means, kept to summarize each metric across classes afterwards.
    class_means = {'recalls': [], 'precisions': [], 'f1-scores': []}
    for label in res['recalls'].keys():  # the same labels are present for every metric
        cells = []
        for metric in ('recalls', 'precisions', 'f1-scores'):
            mean_pct, std_pct = pct_mean_std(res[metric][label])
            class_means[metric].append(mean_pct)
            cells.append(table_cell(mean_pct, std_pct))

        print(f'{label.ljust(12)} | ' + ' | '.join(cells))

    recall_spread = np.std(class_means['recalls'], ddof=1)
    precision_spread = np.std(class_means['precisions'], ddof=1)
    f1_spread = np.std(class_means['f1-scores'], ddof=1)

    recall_avg = np.mean(class_means['recalls'])
    precision_avg = np.mean(class_means['precisions'])
    f1_avg = np.mean(class_means['f1-scores'])

    print(f"\n===> MÉDIA DO RECALL MÉDIO <===\n({recall_avg:.2f} ± {recall_spread:.2f})%", end='\n\n')
    print(f"===> MÉDIA DO PRECISION MÉDIO <===\n({precision_avg:.2f} ± {precision_spread:.2f})%", end='\n\n')
    print(f"===> MÉDIA DO F1-SCORE MÉDIO <===\n({f1_avg:.2f} ± {f1_spread:.2f})%", end='\n\n')

    print('\n===> MELHORES HIPERPARÂMETROS <===')
    print(f"{'Ocorrências'.ljust(12)} | {'Valores'.ljust(75)}")
    print('-'*130)
    # Dicts are not hashable, so each combination is counted as a tuple of its items.
    combo_counts = Counter(tuple(params.items()) for params in res['best_params'])

    for combo, occurrences in combo_counts.most_common():  # slice with [:5] to show only the top 5
        print(f'{str(occurrences).ljust(12)} | {combo}')

    print(f"\n===> TEMPO DE TREINAMENTO MÉDIO <===\n{np.mean(res['training_time'])}")

    print(f"\n===> TEMPO DE INFERÊNCIA MÉDIO <===\n{np.mean(res['prediction_time'])}")

In [99]:
def plot_accuracies(res, export=False, filename='acc', path='.'):
    """Show a histogram of the per-run accuracies, with the mean marked.

    Parameters
    ----------
    res : dict
        Aggregated results; only `res['accuracies']` (fractions) is read.
    export : bool
        When True, the figure is also written to disk through `export_fig`.
    filename, path : str
        Base file name (without extension) and target directory for the export.
    """
    accuracies = [r*100 for r in res['accuracies']]  # percentages read better than fractions
    mean_accuracy = np.mean(accuracies)
    # Report the number of runs actually plotted rather than the global N_ITER,
    # so the title stays correct for results produced with a different setting.
    n_runs = len(accuracies)

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=accuracies, nbinsx=5, marker_color=BLUE))
    fig.add_vline(x=mean_accuracy, line_dash='dash', annotation_text=f' Acurácia Média: {mean_accuracy:.2f}%')
    fig.update_layout(title=f'Distribuição das Acurácias após {n_runs} iterações', yaxis_title='Frequência', xaxis_title='Acurácia (%)',
                      height=600, autosize=True)

    if export:
        export_fig(fig, filename, path)

    fig.show()

In [100]:
def plot_label_metrics(res, metric, export=False, filename='label_metric', path='.'):
    """Show one boxplot per class for the chosen per-class metric.

    Parameters
    ----------
    res : dict
        Aggregated results.
    metric : str
        Key of `res` holding a {label: [value per run]} mapping of fractions,
        e.g. 'f1-scores', 'recalls' or 'precisions'.
    export : bool
        When True, the figure is also written to disk through `export_fig`.
    filename, path : str
        Base file name (without extension) and target directory for the export.
    """
    label_scores = {label: [v*100 for v in values] for label, values in res[metric].items()}  # percentages

    labels = list(label_scores.keys())
    marker_colors = generate_colors(len(labels))
    # Report the number of runs actually plotted rather than the global N_ITER.
    n_runs = max((len(scores) for scores in label_scores.values()), default=0)

    fig = go.Figure()
    for i, label in enumerate(labels):
        fig.add_trace(go.Box(y=label_scores[label], name=label, marker_color=marker_colors[i], legendgroup=i))

    # Fixed y range keeps the plots comparable across models.
    fig.update_layout(title=f'Boxplots de {metric.title()} por label após {n_runs} iterações', yaxis_title=f"{metric.title()} (%)",
                      xaxis_title="Classes", yaxis_range=(0, 101), height=600, autosize=True)

    if export:
        export_fig(fig, filename, path)

    fig.show()

In [101]:
def plot_confusion_matrix(res, export=False, filename='cm', path='.'):
    """Show four confusion-matrix heatmaps: summed, mean, row-normalized and column-normalized.

    Parameters
    ----------
    res : dict
        Aggregated results. `res['cms']` holds one confusion matrix per run, and the
        keys of `res['recalls']` provide the axis labels.
    export : bool
        When True, the figure is also written to disk through `export_fig`.
    filename, path : str
        Base file name (without extension) and target directory for the export.
    """
    cms = res['cms']
    axis_labels = list(res['recalls'].keys())  # the same labels are present for every metric
    # Report the number of runs actually aggregated rather than the global N_ITER.
    n_runs = len(cms)

    cm = np.sum(cms, axis=0)
    cm_mean = np.mean(cms, axis=0)

    # A class with no true samples (empty row) or no predictions (empty column) would
    # divide by zero; those cells are reported as 0 instead of NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    col_totals = cm.sum(axis=0, keepdims=True)
    cm_recall = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    cm_precision = np.divide(cm, col_totals, out=np.zeros(cm.shape, dtype=float), where=col_totals != 0)

    fig = make_subplots(rows=2, cols=2, shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.1,
                        subplot_titles=['Matriz de Confusão agregada', 'Matriz de Confusão média',
                                        'Matriz de Confusão normalizada pelas linhas',
                                        'Matriz de Confusão normalizada pelas colunas'])

    # (matrix, cell text format, subplot row, subplot column)
    panels = [
        (cm, '%{text}', 1, 1),
        (cm_mean, '%{text:.2f}', 1, 2),
        (cm_recall, '%{text:.2f}', 2, 1),
        (cm_precision, '%{text:.2f}', 2, 2),
    ]
    for matrix, text_format, row, col in panels:
        fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=matrix, text=matrix, texttemplate=text_format,
                                 showscale=False, colorscale=PALETTE),
                      row=row, col=col)

    # Reversed y axes put the first class at the top, as in a printed confusion matrix.
    fig.update_layout(title=f'Matrizes de Confusão geradas após {n_runs} iterações', yaxis1_title='Real', yaxis3_title='Real',
                      xaxis3_title='Predito', xaxis4_title='Predito', yaxis1_autorange='reversed', yaxis2_autorange='reversed',
                      yaxis3_autorange='reversed', height=1000, autosize=True)

    if export:
        export_fig(fig, filename, path)

    fig.show()

In [102]:
def export_fig(fig, filename, path):
    """Write `fig` to `{path}/{filename}.png` and `{path}/{filename}.html`.

    The PNG uses the 'bigger-font' template and a fixed width of 1100; the HTML
    keeps automatic sizing so it adapts to the page. The figure passed in is not
    modified. The target directory is created when it does not exist.

    Parameters
    ----------
    fig : plotly figure to export.
    filename : str
        Base file name, without extension.
    path : str
        Target directory.
    """
    # Exporting into a missing directory would fail, so create it first.
    if path:
        os.makedirs(path, exist_ok=True)

    # Work on copies so the caller's figure keeps its layout.
    fig_png = go.Figure(fig)
    fig_html = go.Figure(fig)

    fig_png = fig_png.update_layout(template='bigger-font', width=1100)  # wide enough to span a page
    fig_html = fig_html.update_layout(width=None, height=None, autosize=True)  # resizes with the HTML page

    fig_png.write_image(f'{path}/{filename}.png', scale=3)
    fig_html.write_html(f'{path}/{filename}.html')

## Scripts

### Leitura

#### Base de dados

Como primeiro passo, vamos realizar a leitura dos dados pré-tratados anteriormente.

In [103]:
# Load the preprocessed dataset. read_pickle can execute arbitrary code, so the
# file must come from a trusted source.
df = pd.read_pickle('../data/processed/steel-plates-fault.pkl')
df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Target
0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,108.0,1687.0,1.0,0.0,80.0,0.0498,0.2415,0.1818,0.0047,0.4706,1.0000,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,0
1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,123.0,1687.0,1.0,0.0,80.0,0.7647,0.3793,0.2069,0.0036,0.6000,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,0
2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,125.0,1623.0,1.0,0.0,100.0,0.9710,0.3426,0.3333,0.0037,0.7500,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150,0
3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,126.0,1353.0,0.0,1.0,290.0,0.7287,0.4413,0.1556,0.0052,0.5385,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,0
4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,126.0,1353.0,0.0,1.0,185.0,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1263,221.0,242.0,3948212.0,3948253.0,519.0,33.0,41.0,48309.0,65.0,124.0,1360.0,1.0,0.0,200.0,0.3250,0.3972,0.5122,0.0154,0.6364,1.0000,1.0,2.7152,1.3222,1.6128,0.4878,-0.2728,0.9765,5
1264,1111.0,1121.0,4032298.0,4032320.0,110.0,20.0,22.0,12351.0,100.0,127.0,1354.0,1.0,0.0,200.0,0.3442,0.5000,0.4545,0.0074,0.5000,1.0000,1.0,2.0414,1.0000,1.3424,0.5454,-0.1228,0.3663,5
1265,995.0,1006.0,4085316.0,4085344.0,140.0,25.0,28.0,16076.0,103.0,132.0,1356.0,1.0,0.0,200.0,0.5162,0.5454,0.3929,0.0081,0.4400,1.0000,1.0,2.1461,1.0414,1.4472,0.6071,-0.1029,0.5096,5
1266,396.0,418.0,4116853.0,4116868.0,231.0,26.0,16.0,25096.0,56.0,141.0,1356.0,1.0,0.0,200.0,0.5841,0.3000,0.6818,0.0162,0.8461,0.9375,0.0,2.3636,1.3424,1.1761,-0.3182,-0.1512,0.5461,5


In [104]:
# Check column types: every feature is float64 and the target is an integer code.
df.dtypes

X_Minimum                float64
X_Maximum                float64
Y_Minimum                float64
Y_Maximum                float64
Pixels_Areas             float64
X_Perimeter              float64
Y_Perimeter              float64
Sum_of_Luminosity        float64
Minimum_of_Luminosity    float64
Maximum_of_Luminosity    float64
Length_of_Conveyer       float64
TypeOfSteel_A300         float64
TypeOfSteel_A400         float64
Steel_Plate_Thickness    float64
Edges_Index              float64
Empty_Index              float64
Square_Index             float64
Outside_X_Index          float64
Edges_X_Index            float64
Edges_Y_Index            float64
Outside_Global_Index     float64
LogOfAreas               float64
Log_X_Index              float64
Log_Y_Index              float64
Orientation_Index        float64
Luminosity_Index         float64
SigmoidOfAreas           float64
Target                     int64
dtype: object

In [105]:
# Summary statistics of every column, including the encoded target.
df.describe()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Target
count,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0
mean,508.635647,565.126183,1759791.0,1759855.0,2588.908517,143.205836,103.716088,283134.8,79.209779,130.299685,1445.250789,0.402997,0.597003,66.072555,0.310388,0.409378,0.589186,0.04052,0.597253,0.8026,0.562303,2.603358,1.388906,1.456804,0.067459,-0.135566,0.61065,2.695584
std,513.760815,481.284469,1719432.0,1719449.0,6107.39222,362.087198,524.113646,600689.8,32.625475,16.654437,137.299471,0.490694,0.490694,35.923484,0.303147,0.134019,0.264234,0.062152,0.241613,0.241219,0.484225,0.880245,0.535012,0.468135,0.483909,0.137824,0.3496,1.802939
min,0.0,6.0,7430.0,7458.0,2.0,2.0,1.0,250.0,0.0,70.0,1227.0,0.0,0.0,40.0,0.0,0.0,0.0083,0.0015,0.0144,0.105,0.0,0.301,0.301,0.0,-0.9319,-0.9989,0.119,0.0
25%,41.0,191.0,626630.0,626635.8,87.0,15.0,13.0,10110.0,46.0,124.0,1358.0,0.0,0.0,40.0,0.0585,0.31485,0.3757,0.0066,0.4,0.53985,0.0,1.9395,1.0,1.0792,-0.3648,-0.192625,0.2482,1.0
50%,283.0,330.5,1412536.0,1412546.0,200.5,28.0,28.0,21351.5,85.5,127.0,1362.0,0.0,1.0,60.0,0.18145,0.40745,0.5714,0.01015,0.6335,0.9565,1.0,2.3021,1.1761,1.38905,0.07225,-0.143,0.5708,2.0
75%,955.0,963.25,2246608.0,2246677.0,3638.5,184.25,115.0,369638.8,103.0,140.0,1624.0,1.0,1.0,70.0,0.53305,0.493725,0.837025,0.067625,0.7778,1.0,1.0,3.5609,2.01175,1.8129,0.468275,-0.08165,1.0,5.0
max,1688.0,1696.0,12987660.0,12987690.0,152655.0,10449.0,18152.0,11591410.0,196.0,252.0,1794.0,1.0,1.0,290.0,0.9923,0.9439,1.0,0.6226,1.0,1.0,1.0,5.1837,2.9385,4.2587,0.9917,0.5917,1.0,5.0


#### Mapa dos targets

Em seguida, importaremos o mapa dos targets. Isso facilitará a leitura dos gráficos e a validação do problema:

In [106]:
# Load the mapping from integer class code to class name.
# pickle.load can execute arbitrary code, so the file must come from a trusted source.
with open('../data/processed/target_maps.pkl', 'rb') as file: 
    targets_map = pickle.load(file)

targets_map

{0: 'Pastry',
 1: 'Z_Scratch',
 2: 'K_Scatch',
 3: 'Stains',
 4: 'Dirtiness',
 5: 'Bumps',
 6: 'Other_Faults'}

### Divisão

Para este projeto, utilizaremos todas as variáveis disponíveis (exceto `Target`) para realizar as predições. Assim, teremos:

In [107]:
# Feature matrix: every column except the label. Target vector: the 'Target' column.
# Both are converted to NumPy arrays so they can be indexed by fold index arrays.
X = df.drop(columns=['Target']).values
y = df['Target'].values

X.shape, y.shape

((1268, 27), (1268,))

### Instanciação dos modelos base

\# TODO: ESCREVER SOBRE POR QUE ESTAMOS DEFININDO OS MODELOS BASE ANTES DE TUDO!

In [108]:
%%capture

# Base estimators with fixed hyperparameters applied through set_params.
# All of them except the decision tree are wrapped in a Pipeline that standardizes
# the features before the 'model' step.
knn_base = Pipeline([('scaler', StandardScaler()) , ('model', KNeighborsClassifier())])
knn_base.set_params(**KNN_BEST_PARAMS)

# The decision tree is used without a scaler.
dt_base = DecisionTreeClassifier(random_state=SEED)
dt_base.set_params(**DT_BEST_PARAMS)

lr_base = Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression(random_state=SEED, max_iter=5000))])
lr_base.set_params(**LR_BEST_PARAMS)

# NOTE: SVM_PARAM_GRID and MLP_PARAM_GRID hold single values despite their names.
svm_base = Pipeline([('scaler', StandardScaler()), ('model', SVC(random_state=SEED))])
svm_base.set_params(**SVM_PARAM_GRID)

# Early stopping holds out 20% of the training data and stops after 200 epochs
# without improvement.
mlp_base = Pipeline([('scaler', StandardScaler()), ('model', MLPClassifier(random_state=SEED, max_iter=100_000, validation_fraction=0.2,
                                                                            n_iter_no_change=200, tol=0.0001, early_stopping=True))])
mlp_base.set_params(**MLP_PARAM_GRID)

### Análise de Viabilidade

\# TODO: EXPLICAR A IDEIA POR TRÁS DESTA ANÁLISE!

In [109]:
# NOTE(review): this feasibility analysis is disabled, yet the saved notebook still shows
# timing output from an earlier execution. Uncomment and re-run to regenerate it; the
# retained numbers may not reflect the current code.
# base_models = {'KNN': knn_base,
#                'DT': dt_base,
#                'LR': lr_base,
#                'SVM': svm_base,
#                'MLP': mlp_base}

# for model_name, base_model in tqdm(base_models.items()):
#     mr = compute_feasibility(base_model, X, y, n_runs=3, n_splits=N_ITER, n_estimators=100)
#     print(f'========================> {model_name.ljust(3)} <========================')
#     print(f'TEMPO DE TREINAMENTO MÉDIO: ({mr["mean_fit_time"]:.4E} ± {mr["std_fit_time"]:.4E})s')
#     print(f'TEMPO DE INFERÊNCIA MÉDIO: ({mr["mean_pred_time"]:.4E} ± {mr["std_pred_time"]:.4E})s')
#     print(f'========================================================', end='\n\n')

  0%|          | 0/5 [00:00<?, ?it/s]

TEMPO DE TREINAMENTO MÉDIO: (1.7104E-01 ± 4.4439E-03)s
TEMPO DE INFERÊNCIA MÉDIO: (1.0147E+00 ± 1.3729E-02)s

TEMPO DE TREINAMENTO MÉDIO: (1.0230E+00 ± 3.6551E-02)s
TEMPO DE INFERÊNCIA MÉDIO: (5.3790E-03 ± 1.4757E-03)s

TEMPO DE TREINAMENTO MÉDIO: (2.4094E+02 ± 9.2746E+00)s
TEMPO DE INFERÊNCIA MÉDIO: (9.5133E-03 ± 8.8643E-04)s

TEMPO DE TREINAMENTO MÉDIO: (1.5081E+00 ± 5.0651E-02)s
TEMPO DE INFERÊNCIA MÉDIO: (2.5450E-01 ± 8.3830E-03)s

TEMPO DE TREINAMENTO MÉDIO: (1.0457E+02 ± 5.5871E+00)s
TEMPO DE INFERÊNCIA MÉDIO: (1.3354E-02 ± 2.4049E-03)s



### Treinamento Ensemble Não Balanceado

Definidas as variáveis independentes $\mathbf{X}$ e a variável alvo $\mathbf{y}$, podemos começar os treinamentos.

#### K-Nearest Neighbors (KNN)

In [160]:
# Plain bagging over the KNN pipeline, trained on the original class distribution.
knn_imb_bag = BaggingClassifier(knn_base, random_state=SEED)

# Nested CV: N_ITER outer folds; n_estimators is chosen from BAGGING_PARAM_GRID
# with TRAINVAL_SPLITS inner folds.
knn_imb_bag_res = validate_ensemble(knn_imb_bag, X, y, BAGGING_PARAM_GRID, targets_map=targets_map, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(knn_imb_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(89.59 ± 3.59)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[101   2   0   0   2  53]
 [  2 170   3   0   0  15]
 [  2   2 382   0   0   5]
 [  0   0   0  68   0   4]
 [  5   3   0   0  44   3]
 [ 17   4   3   0   7 371]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 63.84 ± 15.48    | 82.0 ± 14.24     | 70.32 ± 10.85   
Z_Scratch    | 89.33 ± 10.36    | 94.74 ± 6.27     | 91.41 ± 5.41    
K_Scatch     | 97.71 ± 3.06     | 98.51 ± 2.34     | 98.08 ± 2.01    
Stains       | 94.58 ± 13.86    | 100.0 ± 0.0      | 96.62 ± 8.85    
Dirtiness    | 80.83 ± 23.74    | 88.67 ± 17.4     | 80.67 ± 15.89   
Bumps        | 92.31 ± 7.55     | 82.71 ± 7.01     | 87.01 ± 5.7     

===> MÉDIA DO RECALL MÉDIO <===
(86.43 ± 12.48)%

===> MÉDIA DO PRECISION MÉDIO <===
(91.11 ± 7.83)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(87.35 ± 10.51)%

In [111]:
# Accuracy histogram across runs; also exported as PNG and HTML.
plot_accuracies(knn_imb_bag_res, export=True, filename='knn_imb_bag_acc', path='../figs/metrics')

In [112]:
# Per-class F1-score boxplots across runs; also exported as PNG and HTML.
plot_label_metrics(knn_imb_bag_res, 'f1-scores', export=True, filename='knn_imb_bag_f1', path='../figs/metrics')

In [113]:
# Summed, mean and normalized confusion matrices; also exported as PNG and HTML.
plot_confusion_matrix(knn_imb_bag_res, export=True, filename='knn_imb_bag_cm', path='../figs/metrics')

In [114]:
# Persist the aggregated metrics dictionary to disk.
with open('../data/processed/knn_imb_bag_res.pkl', 'wb') as file:
    pickle.dump(knn_imb_bag_res, file)

#### Árvore de Decisão

In [172]:
# Plain bagging over the decision tree, trained on the original class distribution.
dt_imb_bag = BaggingClassifier(dt_base, random_state=SEED)

# Nested CV: N_ITER outer folds; n_estimators is chosen from BAGGING_PARAM_GRID
# with TRAINVAL_SPLITS inner folds.
dt_imb_bag_res = validate_ensemble(dt_imb_bag, X, y, BAGGING_PARAM_GRID, targets_map=targets_map, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(dt_imb_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(89.20 ± 4.08)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[113   0   0   0   1  44]
 [  3 169   4   0   0  14]
 [  4   2 376   0   0   9]
 [  0   0   0  68   0   4]
 [  6   1   0   0  43   5]
 [ 30   3   3   1   3 362]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 71.52 ± 12.9     | 74.81 ± 16.5     | 72.29 ± 12.45   
Z_Scratch    | 88.83 ± 8.34     | 96.98 ± 5.6      | 92.44 ± 5.38    
K_Scatch     | 96.17 ± 4.67     | 98.23 ± 2.47     | 97.13 ± 2.81    
Stains       | 94.58 ± 13.86    | 99.0 ± 4.47      | 96.06 ± 8.97    
Dirtiness    | 78.33 ± 24.24    | 95.0 ± 13.08     | 82.57 ± 16.33   
Bumps        | 90.06 ± 6.75     | 83.01 ± 7.06     | 86.23 ± 5.89    

===> MÉDIA DO RECALL MÉDIO <===
(86.58 ± 9.68)%

===> MÉDIA DO PRECISION MÉDIO <===
(91.17 ± 9.94)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(87.79 ± 9.46)%



In [116]:
# Accuracy histogram across runs; also exported as PNG and HTML.
plot_accuracies(dt_imb_bag_res, export=True, filename='dt_imb_bag_acc', path='../figs/metrics')

In [117]:
# Per-class F1-score boxplots across runs; also exported as PNG and HTML.
plot_label_metrics(dt_imb_bag_res, 'f1-scores', export=True, filename='dt_imb_bag_f1', path='../figs/metrics')

In [118]:
# Summed, mean and normalized confusion matrices; also exported as PNG and HTML.
plot_confusion_matrix(dt_imb_bag_res, export=True, filename='dt_imb_bag_cm', path='../figs/metrics')

In [119]:
# Persist the aggregated metrics dictionary to disk.
with open('../data/processed/dt_imb_bag_res.pkl', 'wb') as file:
    pickle.dump(dt_imb_bag_res, file)

#### Support Vector Machine (SVM)

In [162]:
# Plain bagging over the SVM pipeline, trained on the original class distribution.
svm_imb_bag = BaggingClassifier(svm_base, random_state=SEED)

# Nested CV: N_ITER outer folds; n_estimators is chosen from BAGGING_PARAM_GRID
# with TRAINVAL_SPLITS inner folds.
svm_imb_bag_res = validate_ensemble(svm_imb_bag, X, y, BAGGING_PARAM_GRID, targets_map=targets_map, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(svm_imb_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(87.39 ± 4.19)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[102   6   0   0   0  50]
 [  5 159   5   0   1  20]
 [  5   0 374   1   0  11]
 [  0   0   0  68   0   4]
 [  7   1   0   0  41   6]
 [ 17  11   4   0   6 364]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 64.38 ± 13.2     | 76.35 ± 14.29    | 69.22 ± 12.12   
Z_Scratch    | 83.61 ± 11.17    | 91.0 ± 10.09     | 86.52 ± 7.81    
K_Scatch     | 95.67 ± 4.46     | 97.71 ± 2.6      | 96.62 ± 2.8     
Stains       | 94.58 ± 13.86    | 98.75 ± 5.59     | 95.9 ± 9.13     
Dirtiness    | 75.0 ± 27.84     | 84.58 ± 26.39    | 76.86 ± 23.66   
Bumps        | 90.56 ± 7.11     | 80.38 ± 6.98     | 84.99 ± 5.84    

===> MÉDIA DO RECALL MÉDIO <===
(83.97 ± 12.31)%

===> MÉDIA DO PRECISION MÉDIO <===
(88.13 ± 9.21)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(85.02 ± 10.69)%

In [121]:
# Accuracy histogram across runs; also exported as PNG and HTML.
plot_accuracies(svm_imb_bag_res, export=True, filename='svm_imb_bag_acc', path='../figs/metrics')

In [122]:
# Per-class F1-score boxplots across runs; also exported as PNG and HTML.
plot_label_metrics(svm_imb_bag_res, 'f1-scores', export=True, filename='svm_imb_bag_f1', path='../figs/metrics')

In [123]:
# Summed, mean and normalized confusion matrices; also exported as PNG and HTML.
plot_confusion_matrix(svm_imb_bag_res, export=True, filename='svm_imb_bag_cm', path='../figs/metrics')

In [124]:
# Persist the aggregated metrics dictionary to disk.
with open('../data/processed/svm_imb_bag_res.pkl', 'wb') as file:
    pickle.dump(svm_imb_bag_res, file)

### Treinamento Ensemble Balanceado (via undersampling)

#### K-Nearest Neighbors (KNN)

In [125]:
# Bagging over the KNN pipeline where each bootstrap sample is rebalanced by random
# undersampling. NOTE(review): per the imbalanced-learn docs, 'not minority' resamples
# every class except the minority one - confirm against the installed version.
knn_bal_und_bag = BalancedBaggingClassifier(knn_base, sampling_strategy='not minority', sampler=RandomUnderSampler(), random_state=SEED)

# Nested CV: N_ITER outer folds; n_estimators is chosen from BAGGING_PARAM_GRID
# with TRAINVAL_SPLITS inner folds.
knn_bal_und_bag_res = validate_ensemble(knn_bal_und_bag, X, y, BAGGING_PARAM_GRID, targets_map=targets_map, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(knn_bal_und_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(82.96 ± 5.25)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[114   9   1   2  17  15]
 [  3 173   2   2   7   3]
 [  1   7 368   5   3   7]
 [  0   0   0  70   0   2]
 [  1   1   0   0  53   0]
 [ 37  48   4  15  24 274]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 72.32 ± 18.26    | 73.59 ± 14.74    | 71.9 ± 14.77    
Z_Scratch    | 90.94 ± 8.83     | 73.67 ± 10.47    | 81.03 ± 8.33    
K_Scatch     | 94.14 ± 5.27     | 98.21 ± 2.51     | 96.03 ± 2.81    
Stains       | 97.08 ± 9.08     | 78.58 ± 16.77    | 85.34 ± 9.89    
Dirtiness    | 96.67 ± 14.91    | 54.62 ± 18.92    | 68.23 ± 17.36   
Bumps        | 68.12 ± 11.47    | 91.29 ± 8.01     | 77.52 ± 9.07    

===> MÉDIA DO RECALL MÉDIO <===
(86.55 ± 12.90)%

===> MÉDIA DO PRECISION MÉDIO <===
(78.33 ± 15.29)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(80.01 ± 9.97)%

In [126]:
# Accuracy histogram across runs; also exported as PNG and HTML.
plot_accuracies(knn_bal_und_bag_res, export=True, filename='knn_bal_und_bag_acc', path='../figs/metrics')

In [127]:
# Per-class F1-score boxplots across runs; also exported as PNG and HTML.
plot_label_metrics(knn_bal_und_bag_res, 'f1-scores', export=True, filename='knn_bal_und_bag_f1', path='../figs/metrics')

In [128]:
# Summed, mean and normalized confusion matrices; also exported as PNG and HTML.
plot_confusion_matrix(knn_bal_und_bag_res, export=True, filename='knn_bal_und_bag_cm', path='../figs/metrics')

In [129]:
# Persist the aggregated metrics dictionary to disk.
with open('../data/processed/knn_bal_und_bag_res.pkl', 'wb') as file:
    pickle.dump(knn_bal_und_bag_res, file)

#### Árvore de Decisão

In [130]:
# Bagging over the decision tree where each bootstrap sample is rebalanced by random
# undersampling. NOTE(review): per the imbalanced-learn docs, 'not minority' resamples
# every class except the minority one - confirm against the installed version.
dt_bal_und_bag = BalancedBaggingClassifier(dt_base, sampling_strategy='not minority', sampler=RandomUnderSampler(), random_state=SEED)

# Nested CV: N_ITER outer folds; n_estimators is chosen from BAGGING_PARAM_GRID
# with TRAINVAL_SPLITS inner folds.
dt_bal_und_bag_res = validate_ensemble(dt_bal_und_bag, X, y, BAGGING_PARAM_GRID, targets_map=targets_map, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(dt_bal_und_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(89.42 ± 4.81)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[131   3   0   1   6  17]
 [  2 178   4   0   2   4]
 [  5   3 372   2   0   9]
 [  0   0   0  67   0   5]
 [  4   0   0   0  48   3]
 [ 36   6   7   3  12 338]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 82.77 ± 14.06    | 74.75 ± 15.07    | 78.1 ± 13.32    
Z_Scratch    | 93.5 ± 7.47      | 94.37 ± 7.64     | 93.63 ± 5.56    
K_Scatch     | 95.17 ± 4.51     | 97.29 ± 3.59     | 96.12 ± 2.66    
Stains       | 93.33 ± 14.46    | 93.25 ± 10.67    | 92.27 ± 10.07   
Dirtiness    | 87.5 ± 20.86     | 76.67 ± 22.47    | 78.66 ± 18.41   
Bumps        | 84.07 ± 7.66     | 90.2 ± 7.47      | 86.84 ± 6.49    

===> MÉDIA DO RECALL MÉDIO <===
(89.39 ± 5.32)%

===> MÉDIA DO PRECISION MÉDIO <===
(87.75 ± 9.62)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(87.60 ± 7.77)%



In [131]:
# Accuracy histogram across runs; also exported as PNG and HTML.
plot_accuracies(dt_bal_und_bag_res, export=True, filename='dt_bal_und_bag_acc', path='../figs/metrics')

In [132]:
# Per-label 'f1-scores' plot for the decision-tree balanced-bagging (undersampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_label_metrics(
    dt_bal_und_bag_res,
    'f1-scores',
    export=True,
    filename='dt_bal_und_bag_f1',
    path='../figs/metrics',
)

In [133]:
# Confusion-matrix plot for the decision-tree balanced-bagging (undersampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_confusion_matrix(
    dt_bal_und_bag_res,
    export=True,
    filename='dt_bal_und_bag_cm',
    path='../figs/metrics',
)

In [134]:
# Serialize the decision-tree balanced-bagging (undersampling) results to disk with pickle.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('../data/processed/dt_bal_und_bag_res.pkl', mode='wb') as file:
    pickle.dump(dt_bal_und_bag_res, file)

#### Support Vector Machine (SVM)

In [135]:
# Balanced bagging around `svm_base` (defined earlier in the notebook).
# The RandomUnderSampler is applied with the 'not minority' strategy, i.e. every
# class except the minority one is resampled.
svm_bal_und_bag = BalancedBaggingClassifier(
    svm_base,
    sampling_strategy='not minority',
    sampler=RandomUnderSampler(),
    random_state=SEED,
)

# Validate with the shared bagging grid and the notebook-wide iteration/split
# constants, then print the collected metrics.
svm_bal_und_bag_res = validate_ensemble(
    svm_bal_und_bag,
    X,
    y,
    BAGGING_PARAM_GRID,
    targets_map=targets_map,
    n_iter=N_ITER,
    n_trainval_splits=TRAINVAL_SPLITS,
)
print_res(svm_bal_und_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(87.38 ± 4.05)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[135   3   0   0   4  16]
 [  4 179   5   0   0   2]
 [  2   9 372   2   0   6]
 [  0   0   0  69   0   3]
 [  5   0   0   0  50   0]
 [ 51  21   6   8  13 303]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 85.54 ± 15.32    | 70.26 ± 12.54    | 76.02 ± 9.58    
Z_Scratch    | 94.17 ± 6.52     | 86.15 ± 11.53    | 89.41 ± 6.44    
K_Scatch     | 95.14 ± 4.54     | 97.28 ± 2.93     | 96.11 ± 2.31    
Stains       | 95.83 ± 10.3     | 90.17 ± 14.4     | 91.78 ± 9.0     
Dirtiness    | 90.83 ± 19.85    | 76.75 ± 20.8     | 81.58 ± 17.22   
Bumps        | 75.37 ± 11.29    | 92.27 ± 6.16     | 82.39 ± 7.42    

===> MÉDIA DO RECALL MÉDIO <===
(89.48 ± 7.89)%

===> MÉDIA DO PRECISION MÉDIO <===
(85.48 ± 10.16)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(86.22 ± 7.47)%


In [136]:
# Accuracy plot for the SVM balanced-bagging (undersampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_accuracies(
    svm_bal_und_bag_res,
    export=True,
    filename='svm_bal_und_bag_acc',
    path='../figs/metrics',
)

In [137]:
# Per-label 'f1-scores' plot for the SVM balanced-bagging (undersampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_label_metrics(
    svm_bal_und_bag_res,
    'f1-scores',
    export=True,
    filename='svm_bal_und_bag_f1',
    path='../figs/metrics',
)

In [138]:
# Confusion-matrix plot for the SVM balanced-bagging (undersampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_confusion_matrix(
    svm_bal_und_bag_res,
    export=True,
    filename='svm_bal_und_bag_cm',
    path='../figs/metrics',
)

In [139]:
# Serialize the SVM balanced-bagging (undersampling) results to disk with pickle.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('../data/processed/svm_bal_und_bag_res.pkl', mode='wb') as file:
    pickle.dump(svm_bal_und_bag_res, file)

### Treinamento Ensemble Balanceado (via oversampling)

#### K-Nearest Neighbors (KNN)

In [140]:
# Balanced bagging around `knn_base` (defined earlier in the notebook).
# SMOTE is applied with the 'not majority' strategy, i.e. every class except
# the majority one is resampled.
knn_bal_ovr_bag = BalancedBaggingClassifier(
    knn_base,
    sampling_strategy='not majority',
    sampler=SMOTE(),
    random_state=SEED,
)

# Validate with the shared bagging grid and the notebook-wide iteration/split
# constants, then print the collected metrics.
knn_bal_ovr_bag_res = validate_ensemble(
    knn_bal_ovr_bag,
    X,
    y,
    BAGGING_PARAM_GRID,
    targets_map=targets_map,
    n_iter=N_ITER,
    n_trainval_splits=TRAINVAL_SPLITS,
)
print_res(knn_bal_ovr_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(88.64 ± 3.43)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[119   3   0   0   5  31]
 [  4 180   4   0   0   2]
 [  2   2 381   1   2   3]
 [  0   1   0  68   0   3]
 [  2   1   0   0  52   0]
 [ 41  20   3   4  10 324]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 75.45 ± 14.18    | 71.54 ± 9.42     | 72.72 ± 9.31    
Z_Scratch    | 94.67 ± 5.49     | 88.52 ± 11.12    | 91.08 ± 6.79    
K_Scatch     | 97.46 ± 3.07     | 98.27 ± 2.42     | 97.82 ± 1.9     
Stains       | 94.58 ± 13.86    | 94.5 ± 9.85      | 93.52 ± 9.33    
Dirtiness    | 95.0 ± 16.31     | 79.25 ± 21.97    | 84.35 ± 17.57   
Bumps        | 80.58 ± 8.25     | 89.86 ± 7.7      | 84.6 ± 5.7      

===> MÉDIA DO RECALL MÉDIO <===
(89.62 ± 9.20)%

===> MÉDIA DO PRECISION MÉDIO <===
(86.99 ± 9.93)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(87.35 ± 8.86)%



In [141]:
# Accuracy plot for the KNN balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_accuracies(
    knn_bal_ovr_bag_res,
    export=True,
    filename='knn_bal_ovr_bag_acc',
    path='../figs/metrics',
)

In [142]:
# Per-label 'f1-scores' plot for the KNN balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_label_metrics(
    knn_bal_ovr_bag_res,
    'f1-scores',
    export=True,
    filename='knn_bal_ovr_bag_f1',
    path='../figs/metrics',
)

In [143]:
# Confusion-matrix plot for the KNN balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_confusion_matrix(
    knn_bal_ovr_bag_res,
    export=True,
    filename='knn_bal_ovr_bag_cm',
    path='../figs/metrics',
)

In [144]:
# Serialize the KNN balanced-bagging (oversampling) results to disk with pickle.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('../data/processed/knn_bal_ovr_bag_res.pkl', mode='wb') as file:
    pickle.dump(knn_bal_ovr_bag_res, file)

#### Árvore de Decisão

In [145]:
# Balanced bagging around `dt_base` (defined earlier in the notebook).
# SMOTE is applied with the 'not majority' strategy, i.e. every class except
# the majority one is resampled.
dt_bal_ovr_bag = BalancedBaggingClassifier(
    dt_base,
    sampling_strategy='not majority',
    sampler=SMOTE(),
    random_state=SEED,
)

# Validate with the shared bagging grid and the notebook-wide iteration/split
# constants, then print the collected metrics.
dt_bal_ovr_bag_res = validate_ensemble(
    dt_bal_ovr_bag,
    X,
    y,
    BAGGING_PARAM_GRID,
    targets_map=targets_map,
    n_iter=N_ITER,
    n_trainval_splits=TRAINVAL_SPLITS,
)
print_res(dt_bal_ovr_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(91.95 ± 4.02)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[129   2   0   0   5  22]
 [  3 181   0   0   0   6]
 [  4   0 381   0   0   6]
 [  0   0   0  68   0   4]
 [  4   0   0   0  48   3]
 [ 28   7   5   2   1 359]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 81.61 ± 13.59    | 78.04 ± 14.52    | 79.25 ± 12.37   
Z_Scratch    | 95.11 ± 8.36     | 95.89 ± 7.17     | 95.12 ± 5.66    
K_Scatch     | 97.47 ± 3.86     | 98.75 ± 2.23     | 98.06 ± 2.37    
Stains       | 94.58 ± 13.86    | 98.0 ± 6.16      | 95.51 ± 9.06    
Dirtiness    | 87.5 ± 20.86     | 91.25 ± 13.91    | 87.02 ± 14.45   
Bumps        | 89.31 ± 5.99     | 89.91 ± 5.52     | 89.51 ± 5.03    

===> MÉDIA DO RECALL MÉDIO <===
(90.93 ± 5.92)%

===> MÉDIA DO PRECISION MÉDIO <===
(91.97 ± 7.70)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(90.75 ± 6.97)%



In [146]:
# Accuracy plot for the decision-tree balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_accuracies(
    dt_bal_ovr_bag_res,
    export=True,
    filename='dt_bal_ovr_bag_acc',
    path='../figs/metrics',
)

In [147]:
# Per-label 'f1-scores' plot for the decision-tree balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_label_metrics(
    dt_bal_ovr_bag_res,
    'f1-scores',
    export=True,
    filename='dt_bal_ovr_bag_f1',
    path='../figs/metrics',
)

In [148]:
# Confusion-matrix plot for the decision-tree balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_confusion_matrix(
    dt_bal_ovr_bag_res,
    export=True,
    filename='dt_bal_ovr_bag_cm',
    path='../figs/metrics',
)

In [149]:
# Serialize the decision-tree balanced-bagging (oversampling) results to disk with pickle.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('../data/processed/dt_bal_ovr_bag_res.pkl', mode='wb') as file:
    pickle.dump(dt_bal_ovr_bag_res, file)

#### Support Vector Machine (SVM)

In [150]:
# Balanced bagging around `svm_base` (defined earlier in the notebook).
# SMOTE is applied with the 'not majority' strategy, i.e. every class except
# the majority one is resampled.
svm_bal_ovr_bag = BalancedBaggingClassifier(
    svm_base,
    sampling_strategy='not majority',
    sampler=SMOTE(),
    random_state=SEED,
)

# Validate with the shared bagging grid and the notebook-wide iteration/split
# constants, then print the collected metrics.
svm_bal_ovr_bag_res = validate_ensemble(
    svm_bal_ovr_bag,
    X,
    y,
    BAGGING_PARAM_GRID,
    targets_map=targets_map,
    n_iter=N_ITER,
    n_trainval_splits=TRAINVAL_SPLITS,
)
print_res(svm_bal_ovr_bag_res)

  0%|          | 0/20 [00:00<?, ?it/s]

===> ACURÁCIA MÉDIA <===
(90.61 ± 3.80)%

===> MATRIZ DE CONFUSÃO GERAL <===
[[122   3   3   0   3  27]
 [  2 177   5   0   0   6]
 [  2   3 382   0   0   4]
 [  0   0   0  68   0   4]
 [  5   0   0   0  47   3]
 [ 29  10   4   3   3 353]]

===> RECALL, PRECISION E F1-SCORE MÉDIO <===
Target       | Recall       (%) | Precision    (%) | F1-Score     (%)
----------------------------------------------------------------------
Pastry       | 77.32 ± 14.45    | 77.92 ± 14.76    | 76.79 ± 11.8    
Z_Scratch    | 93.0 ± 7.34      | 92.49 ± 9.34     | 92.44 ± 6.53    
K_Scatch     | 97.72 ± 3.05     | 97.06 ± 2.88     | 97.34 ± 2.07    
Stains       | 94.58 ± 13.86    | 96.5 ± 8.6       | 94.63 ± 9.38    
Dirtiness    | 85.83 ± 21.13    | 88.75 ± 18.39    | 86.12 ± 17.71   
Bumps        | 87.8 ± 8.3       | 89.09 ± 6.54     | 88.26 ± 6.33    

===> MÉDIA DO RECALL MÉDIO <===
(89.38 ± 7.35)%

===> MÉDIA DO PRECISION MÉDIO <===
(90.30 ± 7.01)%

===> MÉDIA DO F1-SCORE MÉDIO <===
(89.26 ± 7.36)%



In [151]:
# Accuracy plot for the SVM balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_accuracies(
    svm_bal_ovr_bag_res,
    export=True,
    filename='svm_bal_ovr_bag_acc',
    path='../figs/metrics',
)

In [152]:
# Per-label 'f1-scores' plot for the SVM balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_label_metrics(
    svm_bal_ovr_bag_res,
    'f1-scores',
    export=True,
    filename='svm_bal_ovr_bag_f1',
    path='../figs/metrics',
)

In [153]:
# Confusion-matrix plot for the SVM balanced-bagging (oversampling) results.
# export=True together with filename/path asks the helper to also save the figure.
plot_confusion_matrix(
    svm_bal_ovr_bag_res,
    export=True,
    filename='svm_bal_ovr_bag_cm',
    path='../figs/metrics',
)

In [154]:
# Serialize the SVM balanced-bagging (oversampling) results to disk with pickle.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('../data/processed/svm_bal_ovr_bag_res.pkl', mode='wb') as file:
    pickle.dump(svm_bal_ovr_bag_res, file)