# 02 - Treinamento dos Modelos

## Importação

In [3]:
from collections import Counter

import numpy as np
import pandas as pd
import scipy.stats as ss

from tqdm.notebook import tqdm

# Estimator utilities
from sklearn.base import clone

# Learning model classes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Model evaluation functions
from sklearn.metrics import classification_report

# Scaler
from sklearn.preprocessing import StandardScaler

# Pipeline
from sklearn.pipeline import Pipeline

# Plots
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV, StratifiedKFold

ImportError: cannot import name 'DecisionforestClassifier' from 'sklearn.tree' (C:\Users\mauri\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\tree\__init__.py)

## Constantes e Sets

In [None]:
# Plotly colorscale name used by generate_colors and by the confusion-matrix heatmaps.
# Alternative palette kept for reference: PALETTE = 'RdYlGn'
PALETTE = 'viridis'
# Seed applied to NumPy's global random generator below.
SEED = 42
# Passed as `n_iter` to validate_model, where it sets the number of outer StratifiedKFold splits.
N_ITER = 10
# Passed as `n_trainval_splits` to validate_model (folds of the inner grid-search CV).
TRAINVAL_SPLITS = 5  # TODO: confirm this value is acceptable

# Show every DataFrame column when a frame is displayed.
pd.set_option('display.max_columns', None)
np.random.seed(SEED)

## Funções

In [None]:
def generate_colors(num_colors):
    """Sample `num_colors` evenly spaced colors from the PALETTE colorscale.

    Args:
        num_colors: Number of colors wanted.

    Returns:
        The list returned by `px.colors.sample_colorscale`, one entry per
        requested color. An empty list when `num_colors` is not positive.
    """
    if num_colors <= 0:
        return []

    if num_colors == 1:
        # A single color has no spacing to compute; dividing by
        # (num_colors - 1) would raise ZeroDivisionError, so use the start
        # of the scale.
        sample_points = [0.0]
    else:
        sample_points = [n / (num_colors - 1) for n in range(num_colors)]

    return px.colors.sample_colorscale(PALETTE, sample_points)

In [None]:
def validate_model(model, X, y, param_grid, n_iter=10, n_trainval_splits=10):
    """Evaluate `model` with nested, stratified cross-validation.

    The data is split into `n_iter` stratified outer folds. For each fold the
    hyperparameters are selected by `get_best_params` on the train/validation
    part, a fresh copy of the estimator is fitted with them, and it is scored
    on the held-out test part with `evaluate_model_performance`.

    Args:
        model: Estimator (or Pipeline) to evaluate. It is only cloned; the
            object passed in is neither re-parameterised nor fitted here.
        X: Feature array, indexed positionally with the fold indices.
        y: Target array, indexed positionally with the fold indices.
        param_grid: Grid forwarded to `get_best_params`.
        n_iter: Number of outer folds.
        n_trainval_splits: Number of inner folds used by the grid search.

    Returns:
        The dictionary built by `aggregate_run_metrics` from the per-fold
        metrics.
    """
    # No random_state is passed, so the shuffle is not seeded at this call site.
    outer_cv = StratifiedKFold(n_splits=n_iter, shuffle=True)

    runs_metrics = {}
    for run_id, (trainval_idx, test_idx) in enumerate(tqdm(outer_cv.split(X, y), total=n_iter)):
        X_trainval, y_trainval = X[trainval_idx], y[trainval_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        best_params = get_best_params(model, X_trainval, y_trainval, param_grid,
                                      n_trainval_splits=n_trainval_splits, display_results=False)

        # Fit a fresh clone so the caller's estimator is left untouched and no
        # fitted state or parameter from one fold can carry over to the next.
        fold_model = clone(model).set_params(**best_params)
        fold_model.fit(X_trainval, y_trainval)

        model_metrics = evaluate_model_performance(fold_model, X_test, y_test)
        model_metrics['best_params'] = best_params
        runs_metrics[run_id] = model_metrics

    return aggregate_run_metrics(runs_metrics)

In [None]:
def get_best_params(model, X_trainval, y_trainval, param_grid, n_trainval_splits=10, display_results=False):
    """Grid-search `param_grid` with stratified k-fold CV and return the best combination.

    Args:
        model: Estimator handed to GridSearchCV.
        X_trainval: Features used for the search.
        y_trainval: Targets used for the search.
        param_grid: Parameter grid handed to GridSearchCV.
        n_trainval_splits: Number of stratified folds.
        display_results: When True, display the full `cv_results_` table
            sorted by `rank_test_score`.

    Returns:
        The `best_params_` of the fitted search.
    """
    # Shuffled folds; no random_state is passed here.
    inner_cv = StratifiedKFold(n_splits=n_trainval_splits, shuffle=True)

    # refit=False: only the parameters are needed, the caller fits the model itself.
    search = GridSearchCV(model, param_grid=param_grid, refit=False, cv=inner_cv, n_jobs=-1)
    search.fit(X_trainval, y_trainval)

    if display_results:
        ranked_results = pd.DataFrame(search.cv_results_).sort_values('rank_test_score', ascending=True)
        display(ranked_results)

    return search.best_params_

In [None]:
def evaluate_model_performance(model, X, y):
    """Score a fitted model on (X, y).

    Returns:
        The `classification_report` dictionary for the model's predictions,
        extended with the confusion matrix under the key 'cm'.
    """
    predictions = model.predict(X)

    metrics = classification_report(y, predictions, output_dict=True)
    metrics['cm'] = confusion_matrix(y, predictions)
    return metrics

In [None]:
def aggregate_run_metrics(runs_res):
    """Collect the per-run metric dictionaries into per-metric lists.

    Args:
        runs_res: Mapping of run id -> metrics dictionary. Each dictionary
            holds 'accuracy', 'cm', 'best_params' and one entry per label with
            'recall', 'precision' and 'f1-score'.

    Returns:
        A dictionary with:
            'accuracies', 'cms', 'best_params': one value per run, in the
                iteration order of `runs_res`;
            'recalls', 'precisions', 'f1-scores': mapping label -> list with
                one value per run.
        All of them are empty when `runs_res` is empty.
    """
    # Keys of a run's dictionary that are not class labels.
    non_label_keys = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'best_params', 'cm'}

    runs = list(runs_res.values())
    # Labels come from the first run instead of the hard-coded key 0, so empty
    # input and run ids that do not start at 0 are handled.
    labels = [key for key in runs[0] if key not in non_label_keys] if runs else []

    def per_label(metric_name):
        return {label: [run[label][metric_name] for run in runs] for label in labels}

    return {
        'accuracies': [run['accuracy'] for run in runs],
        'cms': [run['cm'] for run in runs],
        'f1-scores': per_label('f1-score'),
        'recalls': per_label('recall'),
        'precisions': per_label('precision'),
        'best_params': [run['best_params'] for run in runs],
    }

In [None]:
def print_res(res):
    """Print a plain-text summary of the aggregated validation metrics.

    Prints, in order: the mean accuracy, the element-wise sum of the
    confusion matrices, a table with the mean recall/precision/F1-score per
    label, and how often each best-hyperparameter combination occurred.

    Args:
        res: Dictionary with the keys 'accuracies', 'cms', 'recalls',
            'precisions', 'f1-scores' and 'best_params'.
    """
    def as_row(cells, width=10):
        # Left-justify every cell to `width` and join them with a column separator.
        return ' | '.join(cell.ljust(width) for cell in cells)

    mean_accuracy = np.mean(res['accuracies'])
    print(f"===> ACURÁCIA MÉDIA <===\n{mean_accuracy:.4f}", end='\n\n')

    summed_cm = np.sum(res['cms'], axis=0)
    print(f"===> MATRIZ DE CONFUSÃO GERAL <===\n{summed_cm}", end='\n\n')

    print('===> RECALL, PRECISION E F1-SCORE MÉDIO <===')
    print(as_row(['Label', 'Recall', 'Precision', 'F1-Score']))
    print('-'*48)
    # The labels of 'recalls' are used for all three metrics.
    for label in res['recalls'].keys():
        label_means = [np.mean(res[metric][label]) for metric in ('recalls', 'precisions', 'f1-scores')]
        print(as_row([label] + [str(np.round(value, 4)) for value in label_means]))

    print('\n===> MELHORES HIPERPARÂMETROS <===')
    print(' | '.join(['Ocorrências'.ljust(12), 'Valores'.ljust(75)]))
    print('-'*130)

    # Dicts are not hashable, so each combination is counted as a tuple of its items.
    combination_counts = Counter(tuple(params.items()) for params in res['best_params'])
    for combination, occurrences in combination_counts.most_common():
        print(f'{str(occurrences).ljust(12)} | {combination}')

In [None]:
def plot_accuracies(accuracies):
    """Show a histogram of `accuracies` with a dashed vertical line at their mean.

    Args:
        accuracies: Sequence of accuracy values.
    """
    mean_accuracy = np.mean(accuracies)

    fig = go.Figure(data=[go.Histogram(x=accuracies, nbinsx=5)])
    # Mark the mean and annotate it with its value rounded to two decimals.
    fig.add_vline(x=mean_accuracy, line_dash='dash', annotation_text=f'Acurácia Média: {mean_accuracy:.2f}')
    fig.update_layout(title='Histograma da Acurácia', height=600)
    fig.show()

In [None]:
def plot_label_metrics(label_score, score_title):
    """Show one box plot per label for the given scores.

    Args:
        label_score: Mapping label -> sequence of score values.
        score_title: Name of the score; title-cased and used in the figure title.
    """
    labels = list(label_score.keys())
    # One color per label, sampled by generate_colors.
    palette = generate_colors(len(labels))

    fig = go.Figure()
    for position, label in enumerate(labels):
        box = go.Box(y=label_score[label], name=f'Label {label}', marker_color=palette[position],
                     legendgroup=position)
        fig.add_trace(box)

    fig.update_layout(title=f'Boxplots dos {score_title.title()} por label', height=600)
    fig.show()

In [None]:
def plot_confusion_matrix(cms):
    """Build a 2x2 grid of heatmaps summarising a collection of confusion matrices.

    Panels: the element-wise sum ('Padrão'), the element-wise mean ('Média'),
    the sum normalised by row totals ('Recall') and the sum normalised by
    column totals ('Precision'). The y axes are labelled 'Real' and the x axes
    'Predito'.

    Args:
        cms: Sequence of equally shaped square confusion matrices.

    Returns:
        The plotly figure (not shown here; the caller displays it).
    """
    cms = np.asarray(cms)
    cm = cms.sum(axis=0)
    cm_mean = cms.mean(axis=0)

    # keepdims keeps the totals two-dimensional so that each row is divided by
    # its own total. Dividing by the 1-D row totals would broadcast them across
    # columns and divide column j by the total of row j.
    row_totals = cm.sum(axis=1, keepdims=True)
    col_totals = cm.sum(axis=0, keepdims=True)

    # Rows/columns whose total is zero are left at 0 instead of producing NaN.
    cm_recall = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    cm_precision = np.divide(cm, col_totals, out=np.zeros(cm.shape, dtype=float), where=col_totals != 0)

    axis_labels = list(range(len(cm)))

    fig = make_subplots(rows=2, cols=2, shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.1,
                        subplot_titles=['Padrão', 'Média', 'Recall', 'Precision'])

    # (values, text template, row, col) for each panel.
    panels = [
        (cm, '%{text}', 1, 1),
        (cm_mean, '%{text:.2f}', 1, 2),
        (cm_recall, '%{text:.2f}', 2, 1),
        (cm_precision, '%{text:.2f}', 2, 2),
    ]
    for values, template, row, col in panels:
        fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=values, text=values, texttemplate=template,
                                 showscale=False, colorscale=PALETTE),
                      row=row, col=col)

    # Reverse every y axis so the first label is at the top in all four panels.
    fig.update_layout(title='Matrizes de Confusão', yaxis1_title='Real', yaxis3_title='Real', xaxis3_title='Predito',
                      xaxis4_title='Predito', yaxis1_autorange='reversed', yaxis2_autorange='reversed',
                      yaxis3_autorange='reversed', yaxis4_autorange='reversed', height=800)

    return fig

## Scripts

### Leitura

In [None]:
# Load the processed dataset from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('../data/processed/steel-plates-fault.pkl')

# Display the frame as the cell output.
df

In [None]:
# Summary statistics of the loaded frame.
df.describe()

### Divisão

In [None]:
# Separate the features (every column except 'target') from the target, as NumPy arrays
# so that validate_model can index them positionally.
X = df.drop(columns=['target']).values
y = df['target'].values

# Shapes of both arrays, shown as the cell output.
X.shape, y.shape

### Treinamento

#### KNN

In [None]:
# Hyperparameter grid for KNN; the 'model__' prefix targets the Pipeline step named 'model'.
knn_param_grid = {
    'model__n_neighbors': [5, 1, 3, 10],
    'model__p': [2, 1],
    'model__weights': ['uniform', 'distance']    
}

# Standardise the features, then classify with KNN.
knn_model = Pipeline([('scaler', StandardScaler()) , ('model', KNeighborsClassifier())])
knn_res = validate_model(knn_model, X, y, knn_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(knn_res)

In [None]:
# Histogram of the KNN accuracies collected by validate_model.
knn_acc = knn_res['accuracies']

plot_accuracies(knn_acc)

In [None]:
# Box plots of the KNN F1-scores, one box per label.
knn_f1 = knn_res['f1-scores']

plot_label_metrics(knn_f1, 'f1-scores')

In [None]:
# Confusion-matrix heatmaps for KNN; the returned figure is the cell output.
knn_cms = knn_res['cms']

plot_confusion_matrix(knn_cms)

#### Árvore de Decisão

In [None]:
# Hyperparameter grid for the decision tree (no Pipeline here, so parameter names are unprefixed).
tree_param_grid = {
    'max_depth': [None, 5, 10, 20, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# The tree is trained on the raw features, without scaling.
tree_model = DecisionTreeClassifier()
tree_res = validate_model(tree_model, X, y, tree_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(tree_res)

In [None]:
# Histogram of the decision-tree accuracies collected by validate_model.
tree_acc = tree_res['accuracies']

plot_accuracies(tree_acc)

In [None]:
# Box plots of the decision-tree F1-scores, one box per label.
tree_f1 = tree_res['f1-scores']

plot_label_metrics(tree_f1, 'f1-scores')

In [None]:
# Confusion-matrix heatmaps for the decision tree; the returned figure is the cell output.
tree_cms = tree_res['cms']

plot_confusion_matrix(tree_cms)

#### Floresta Aleatória

In [None]:
# Hyperparameter grid for the random forest.
# NOTE(review): 'criterion' is commented out, so it is not searched — confirm this is intentional.
forest_param_grid = {
    'max_depth': [None, 5, 10, 20, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    #'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 50, 200, 500]
}

# The forest is trained on the raw features, without scaling.
forest_model = RandomForestClassifier()
forest_res = validate_model(forest_model, X, y, forest_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(forest_res)

In [None]:
# Histogram of the random-forest accuracies collected by validate_model.
forest_acc = forest_res['accuracies']

plot_accuracies(forest_acc)

In [None]:
# Box plots of the random-forest F1-scores, one box per label.
forest_f1 = forest_res['f1-scores']

plot_label_metrics(forest_f1, 'f1-scores')

In [None]:
# Confusion-matrix heatmaps for the random forest; the returned figure is the cell output.
forest_cms = forest_res['cms']

plot_confusion_matrix(forest_cms)

#### Regressão Logística

In [None]:
# Hyperparameter grid for logistic regression, restricted to solver/penalty pairs that
# LogisticRegression accepts. Crossing every solver with every penalty makes most fits
# fail: of these solvers only 'saga' supports 'l1', and 'elasticnet' also needs an
# l1_ratio, which this grid does not supply.
# To search elastic-net, add an entry with 'model__solver': ['saga'],
# 'model__penalty': ['elasticnet'] and a list of 'model__l1_ratio' values.
lr_c_values = [1, 0.1, 0.01, 10, 100]
lr_param_grid = [
    # L2 is available for every solver listed.
    {
        'model__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
        'model__penalty': ['l2'],
        'model__C': lr_c_values
    },
    # Without a penalty C has no effect, so one value avoids refitting identical models.
    {
        'model__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
        'model__penalty': [None],
        'model__C': [1]
    },
    # L1 is only searched with 'saga'.
    {
        'model__solver': ['saga'],
        'model__penalty': ['l1'],
        'model__C': lr_c_values
    }
]

# Standardise the features, then fit the logistic regression.
lr_model = Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression(max_iter=5000))])
lr_res = validate_model(lr_model, X, y, lr_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(lr_res)

In [None]:
# Histogram of the logistic-regression accuracies collected by validate_model.
lr_acc = lr_res['accuracies']

plot_accuracies(lr_acc)

In [None]:
# Box plots of the logistic-regression F1-scores, one box per label.
lr_f1 = lr_res['f1-scores']

plot_label_metrics(lr_f1, 'f1-scores')

In [None]:
# Confusion-matrix heatmaps for logistic regression; the returned figure is the cell output.
lr_cms = lr_res['cms']

plot_confusion_matrix(lr_cms)

#### Naive Bayes

In [None]:
# Normality check under the Naive Bayes heading: run scipy's normaltest on the feature
# in column index 1 only.
# NOTE(review): the name `p_values` is plural but a single column is tested — confirm whether
# every feature was meant to be checked (e.g. ss.normaltest(X, axis=0)).
# NOTE(review): no Naive Bayes model is trained in this section — confirm whether it is pending.
norm_test = ss.normaltest(X[:,1])
p_values = norm_test.pvalue
# True when the p-value is at least 0.01; this comparison is the cell output.
p_values >= 0.01

In [None]:
# Show the p-value from the normality test.
p_values

In [None]:
# NOTE(review): this cell repeats the previous one and can be deleted.
p_values

#### SVM

In [None]:
# Hyperparameter grid for the SVM; the 'model__' prefix targets the Pipeline step named 'model'.
# NOTE(review): gamma is presumably ignored by the 'linear' kernel, so those candidates would be
# fitted once per gamma value with identical results — confirm and consider a separate grid entry.
svc_param_grid = {
    'model__C': [1, 0.1, 0.01, 10, 100],
    'model__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'model__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}

# Standardise the features, then classify with the SVM.
svc_model = Pipeline([('scaler', StandardScaler()), ('model', SVC())])
svc_res = validate_model(svc_model, X, y, svc_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(svc_res)

In [None]:
# Histogram of the SVM accuracies collected by validate_model.
svc_acc = svc_res['accuracies']

plot_accuracies(svc_acc)

In [None]:
# Box plots of the SVM F1-scores, one box per label.
svc_f1 = svc_res['f1-scores']

plot_label_metrics(svc_f1, 'f1-scores')

In [None]:
# Confusion-matrix heatmaps for the SVM; the returned figure is the cell output.
svc_cms = svc_res['cms']

plot_confusion_matrix(svc_cms)

#### Multi Layer Perceptron

In [None]:
# Hyperparameter grid for the MLP; the 'model__' prefix targets the Pipeline step named 'model'.
# NOTE(review): learning_rate is presumably only used by the 'sgd' solver, so the 'adam'
# candidates would be fitted once per learning_rate value — confirm.
# NOTE(review): 'model__batch_size' is commented out, so it is not searched.
mlp_param_grid = {
    'model__hidden_layer_sizes': [(10, ), (50,), (10, 10), (10, 30, 10)],
    'model__activation': ['relu', 'tanh'],
    'model__solver': ['adam', 'sgd'], # TODO: justify leaving out the lbfgs solver
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__learning_rate': ['constant', 'invscaling', 'adaptive']
    #'model__batch_size': ['auto', 4, 8, 16, 32]
}

# Standardise the features, then train the MLP with max_iter=1000.
mlp_model = Pipeline([('scaler', StandardScaler()), ('model', MLPClassifier(max_iter=1000))])
mlp_res = validate_model(mlp_model, X, y, mlp_param_grid, n_iter=N_ITER, n_trainval_splits=TRAINVAL_SPLITS)
print_res(mlp_res)

In [None]:
# Histogram of the MLP accuracies collected by validate_model.
mlp_acc = mlp_res['accuracies']

plot_accuracies(mlp_acc)

In [None]:
# Box plots of the MLP F1-scores, one box per label.
mlp_f1 = mlp_res['f1-scores']

plot_label_metrics(mlp_f1, 'f1-scores')

In [None]:
# Confusion-matrix heatmaps for the MLP; the returned figure is the cell output.
mlp_cms = mlp_res['cms']

plot_confusion_matrix(mlp_cms)