# 02 - Treinamento dos Modelos

## Importação

In [41]:
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import json
import pickle

# Pre processamento
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Scaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Pipeline
from sklearn.pipeline import Pipeline

# Gráficos
import plotly.graph_objects as go
import plotly.express as px

from sklearn.metrics import confusion_matrix, roc_curve

from sklearn.model_selection import GridSearchCV

## Constantes e Sets

In [32]:
PALETTE = "RdYlGn"

In [30]:
pd.set_option('display.max_columns', None)

## Funções

In [None]:
def validate_model():
    pass

In [33]:
def plot_confusion_matrix(model, X, y):

    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)

    axis_labels = list(range(len(cm)))

    fig = go.Figure()
    fig.add_trace(go.Heatmap(x=axis_labels, y=axis_labels, z=cm, text=cm, texttemplate = "%{text}", colorscale=PALETTE))

    fig.update_layout(xaxis_title = "Predito", yaxis_title = "Real", yaxis_autorange = "reversed", title = "Matriz de Confusão")

    return fig

In [36]:
def plot_roc(model, X, y):
    y_score = model.predict_proba(X)
    # Verificar quais modelos têm predict_proba

    fpr, tpr, thresholds = roc_curve(y, y_score)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x = fpr, y = tpr, mode = "lines"))
    
    fig.update_layout(xaxis_title = "Taxa de Falso Positivo", yaxis_title = "Taxa de Verdadeiro Positivo",
                      title = "Curva ROC")

    return fig

In [38]:
def get_model_metrics(model, X, y):
    y_pred = model.predict(X)

    report_dict = classification_report(y, y_pred, output_dict = True)
    # Trazer AUC

    return report_dict

In [49]:
def get_best_params(model, X_train, y_train, param_grid, display_results = False):
    grid_search = GridSearchCV(model, param_grid=param_grid, refit=False, cv = 10) # cv = 10 -> Exigência do projeto
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_

    if display_results:
        df_res = pd.DataFrame(grid_search.cv_results_)
        df_res = df_res.sort_values("rank_test_score", ascending=True)
        display(df_res)

    return best_params

## Scripts

### Leitura

In [14]:
df = pd.read_pickle("../data/processed/steel-plates-fault.pkl")

df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,108.0,1687.0,1.0,0.0,80.0,0.0498,0.2415,0.1818,0.0047,0.4706,1.0000,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,0
1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,123.0,1687.0,1.0,0.0,80.0,0.7647,0.3793,0.2069,0.0036,0.6000,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,0
2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,125.0,1623.0,1.0,0.0,100.0,0.9710,0.3426,0.3333,0.0037,0.7500,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150,0
3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,126.0,1353.0,0.0,1.0,290.0,0.7287,0.4413,0.1556,0.0052,0.5385,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,0
4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,126.0,1353.0,0.0,1.0,185.0,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1263,221.0,242.0,3948212.0,3948253.0,519.0,33.0,41.0,48309.0,65.0,124.0,1360.0,1.0,0.0,200.0,0.3250,0.3972,0.5122,0.0154,0.6364,1.0000,1.0,2.7152,1.3222,1.6128,0.4878,-0.2728,0.9765,5
1264,1111.0,1121.0,4032298.0,4032320.0,110.0,20.0,22.0,12351.0,100.0,127.0,1354.0,1.0,0.0,200.0,0.3442,0.5000,0.4545,0.0074,0.5000,1.0000,1.0,2.0414,1.0000,1.3424,0.5454,-0.1228,0.3663,5
1265,995.0,1006.0,4085316.0,4085344.0,140.0,25.0,28.0,16076.0,103.0,132.0,1356.0,1.0,0.0,200.0,0.5162,0.5454,0.3929,0.0081,0.4400,1.0000,1.0,2.1461,1.0414,1.4472,0.6071,-0.1029,0.5096,5
1266,396.0,418.0,4116853.0,4116868.0,231.0,26.0,16.0,25096.0,56.0,141.0,1356.0,1.0,0.0,200.0,0.5841,0.3000,0.6818,0.0162,0.8461,0.9375,0.0,2.3636,1.3424,1.1761,-0.3182,-0.1512,0.5461,5


In [15]:
df['Outside_Global_Index'].value_counts()

Outside_Global_Index
1.0    683
0.0    525
0.5     60
Name: count, dtype: int64

In [16]:
df.columns

Index(['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
       'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'target'],
      dtype='object')

In [17]:
df.describe()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
count,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0
mean,508.635647,565.126183,1759791.0,1759855.0,2588.908517,143.205836,103.716088,283134.8,79.209779,130.299685,1445.250789,0.402997,0.597003,66.072555,0.310388,0.409378,0.589186,0.04052,0.597253,0.8026,0.562303,2.603358,1.388906,1.456804,0.067459,-0.135566,0.61065,2.695584
std,513.760815,481.284469,1719432.0,1719449.0,6107.39222,362.087198,524.113646,600689.8,32.625475,16.654437,137.299471,0.490694,0.490694,35.923484,0.303147,0.134019,0.264234,0.062152,0.241613,0.241219,0.484225,0.880245,0.535012,0.468135,0.483909,0.137824,0.3496,1.802939
min,0.0,6.0,7430.0,7458.0,2.0,2.0,1.0,250.0,0.0,70.0,1227.0,0.0,0.0,40.0,0.0,0.0,0.0083,0.0015,0.0144,0.105,0.0,0.301,0.301,0.0,-0.9319,-0.9989,0.119,0.0
25%,41.0,191.0,626630.0,626635.8,87.0,15.0,13.0,10110.0,46.0,124.0,1358.0,0.0,0.0,40.0,0.0585,0.31485,0.3757,0.0066,0.4,0.53985,0.0,1.9395,1.0,1.0792,-0.3648,-0.192625,0.2482,1.0
50%,283.0,330.5,1412536.0,1412546.0,200.5,28.0,28.0,21351.5,85.5,127.0,1362.0,0.0,1.0,60.0,0.18145,0.40745,0.5714,0.01015,0.6335,0.9565,1.0,2.3021,1.1761,1.38905,0.07225,-0.143,0.5708,2.0
75%,955.0,963.25,2246608.0,2246677.0,3638.5,184.25,115.0,369638.8,103.0,140.0,1624.0,1.0,1.0,70.0,0.53305,0.493725,0.837025,0.067625,0.7778,1.0,1.0,3.5609,2.01175,1.8129,0.468275,-0.08165,1.0,5.0
max,1688.0,1696.0,12987660.0,12987690.0,152655.0,10449.0,18152.0,11591410.0,196.0,252.0,1794.0,1.0,1.0,290.0,0.9923,0.9439,1.0,0.6226,1.0,1.0,1.0,5.1837,2.9385,4.2587,0.9917,0.5917,1.0,5.0


In [5]:
col = "X_Minimum"
targets = df['target'].unique()

fig = go.Figure()
for t in targets:
    df_filtered = df[df['target'] == t]

    fig.add_trace(go.Box(x = df_filtered[col], name = str(t)))
fig.show()

### Treinamento

### Divisão

In [21]:
#Vamos usar somente duas features SepalLengthCm e SepalWidthCm
X = df.drop(columns=["target"])
y = df["target"]

#Separando o conjunto de dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# Por index
#indices = X.index
#train_indices, test_indices = train_test_split(indices, test_size=0.2, stratify=y)
#X_train = X.iloc[train_indices]
#X_test = X.iloc[test_indices]
#y_train = y.iloc[train_indices]
#y_test = y.iloc[test_indices]

### PCA

In [None]:
model_pca = Pipeline([('scaler', StandardScaler()) , ('model', PCA())])
#model_pca = Pipeline([('scaler', StandardScaler()) , ('model', TSNE(n_components=2))])
# model_pca = Pipeline([('scaler', StandardScaler()) , ('model', TSNE(n_components=3))])

X_pca = model_pca.fit_transform(X)

df_pca = pd.DataFrame({'PC1': X_pca[:,0], 'PC2': X_pca[:,1], 'target': y})
# df_pca = pd.DataFrame({'PC1': X_pca[:,0], 'PC2': X_pca[:,1], 'PC2': X_pca[:,3], 'target': y})
df_pca["is_train"] = False
df_pca.loc[train_indices, "is_train"] = True
df_pca = df_pca.astype({'target': str}).sort_values("target")
df_pca.head()

In [None]:
# plotar resultados
fig = px.scatter(data_frame=df_pca, x = "PC1", y = "PC2", color = "target")

fig.update_layout(showlegend = True)

In [None]:
# plotar resultados
fig = px.scatter(data_frame=df_pca[df_pca["is_train"]], x = "PC1", y = "PC2", color = "target")

fig.update_layout(showlegend = True)

In [None]:
# plotar resultados
fig = px.scatter(data_frame=df_pca[~df_pca["is_train"]], x = "PC1", y = "PC2", color = "target")

fig.update_layout(showlegend = True)

In [None]:
# plotar resultados 3d
# px.scatter_3d(data_frame=df_pca, x = 'PC1', y = 'PC2', z = 'PC3', color='target')

### KNN

In [48]:
model = Pipeline([('scaler', StandardScaler()) , ('model', KNeighborsClassifier(n_neighbors=5))])
# vamos criar um classificador kNN com k=5
model.fit(X_train, y_train)

#fig = plot_confusion_matrix(model, X_test, y_test)

#fig.show()

# e ver a sua performance no dataset de teste
print(classification_report(y_train, model.predict(X_train)))

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.80      0.81       111
           1       0.93      0.94      0.93       133
           2       0.98      0.98      0.98       274
           3       0.96      0.96      0.96        50
           4       0.87      0.87      0.87        38
           5       0.91      0.91      0.91       281

    accuracy                           0.92       887
   macro avg       0.91      0.91      0.91       887
weighted avg       0.92      0.92      0.92       887

              precision    recall  f1-score   support

           0       0.86      0.68      0.76        47
           1       0.91      0.84      0.87        57
           2       0.95      0.97      0.96       117
           3       0.88      0.95      0.91        22
           4       0.93      0.82      0.88        17
           5       0.83      0.91      0.87       121

    accuracy                           0.89       381
   macro avg       0.89

In [53]:
model = Pipeline([('scaler', StandardScaler()) , ('model', KNeighborsClassifier(n_neighbors=5))])

param_grid = {"model__n_neighbors": [1, 3, 5, 10]}
best_params = get_best_params(model, X_train, y_train, param_grid, display_results = True)

model.set_params(**best_params)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.002304,0.000509,0.010219,0.000888,3,{'model__n_neighbors': 3},0.831461,0.921348,0.820225,0.94382,0.853933,0.853933,0.88764,0.840909,0.920455,0.863636,0.873736,0.040211,1
2,0.002206,0.000462,0.010568,0.000578,5,{'model__n_neighbors': 5},0.786517,0.910112,0.865169,0.88764,0.865169,0.853933,0.853933,0.840909,0.920455,0.875,0.865884,0.035637,2
0,0.002436,0.000813,0.012171,0.001176,1,{'model__n_neighbors': 1},0.808989,0.921348,0.842697,0.898876,0.865169,0.820225,0.876404,0.863636,0.920455,0.840909,0.865871,0.03707,3
3,0.002252,0.000516,0.010664,0.001051,10,{'model__n_neighbors': 10},0.820225,0.898876,0.865169,0.842697,0.808989,0.820225,0.853933,0.840909,0.909091,0.852273,0.851239,0.031156,4


In [54]:
model.fit(X_train, y_train)

In [None]:
get_best_params(model, X_train, y_train)

In [37]:
fig = plot_roc(model, X_test, y_test)

fig.show()

ValueError: multiclass format is not supported

In [None]:
model = Pipeline([('scaler', StandardScaler()) , ('model', SVC())])
# vamos criar um classificador kNN com k=5
model.fit(X_train, y_train)

# e ver a sua performance no dataset de teste
print(classification_report(y_train, model.predict(X_train)))

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model.predict(X_test)))

### Árvore de Decisão

In [None]:
#scikit-learn usa uma versão otimizada do algoritmo CART (similar ao C4.5)
model = DecisionTreeClassifier(max_depth=10)

#treinando o modelo
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
#Resultados do classificador
print(classification_report(y_train, y_pred_train))

#predição
y_pred = model.predict(X_test)

#Resultados do classificador
print(classification_report(y_test, y_pred))

In [None]:
#scikit-learn usa uma versão otimizada do algoritmo CART (similar ao C4.5)
#model = RandomForestClassifier(max_depth=30, n_estimators=100)
model = RandomForestClassifier(max_depth=10)

#treinando o modelo
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
#Resultados do classificador
print(classification_report(y_train, y_pred_train))

#predição
y_pred = model.predict(X_test)

#Resultados do classificador
print(classification_report(y_test, y_pred))

---

In [None]:
X_train_v2 = X_train.iloc[:, :3]
X_test_v2 = X_test.iloc[:, :3]

model = Pipeline([('scaler', StandardScaler()) , ('model', KNeighborsClassifier(n_neighbors=5))])
# vamos criar um classificador kNN com k=5
model.fit(X_train_v2, y_train)

# e ver a sua performance no dataset de teste
print(classification_report(y_train, model.predict(X_train_v2)))

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model.predict(X_test_v2)))

In [None]:
#scikit-learn usa uma versão otimizada do algoritmo CART (similar ao C4.5)
model = DecisionTreeClassifier(max_depth=20)
#treinando o modelo
model.fit(X_train_v2, y_train)

y_pred_train = model.predict(X_train_v2)
#Resultados do classificador
print(classification_report(y_train, y_pred_train))

#predição
y_pred = model.predict(X_test_v2)

#Resultados do classificador
print(classification_report(y_test, y_pred))

In [None]:
#scikit-learn usa uma versão otimizada do algoritmo CART (similar ao C4.5)
# model = DecisionTreeClassifier(max_depth=20)
model = RandomForestClassifier(n_estimators=50, max_depth=10)
#treinando o modelo
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
#Resultados do classificador
print(classification_report(y_train, y_pred_train))

#predição
y_pred = model.predict(X_test)

#Resultados do classificador
print(classification_report(y_test, y_pred))

In [None]:
feature_importances = {k: v for k, v in zip(X_train.columns, model.feature_importances_)}
feature_importances = {k: v for k, v in sorted(feature_importances.items(), key=lambda item: -item[1])}
feature_importances

In [None]:
# most_important_features = [f for f in feature_importances.keys() if feature_importances[f] > 0.05]
most_important_features = [f for f in feature_importances.keys()][:5]
most_important_features

In [None]:
X_train_v3 = X_train[most_important_features]
X_test_v3 = X_test[most_important_features]

model = Pipeline([('scaler', StandardScaler()) , ('model', KNeighborsClassifier(n_neighbors=5))])
# vamos criar um classificador kNN com k=5
model.fit(X_train_v3, y_train)

# e ver a sua performance no dataset de teste
print(classification_report(y_train, model.predict(X_train_v3)))

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model.predict(X_test_v3)))

In [None]:
X_train_v3 = X_train[most_important_features]
X_test_v3 = X_test[most_important_features]

model = DecisionTreeClassifier(max_depth=30)
# vamos criar um classificador kNN com k=5
model.fit(X_train_v3, y_train)

# e ver a sua performance no dataset de teste
print(classification_report(y_train, model.predict(X_train_v3)))

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model.predict(X_test_v3)))