# 02 - Treinamento dos Modelos

## Importação

In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import json
import pickle

# Pre processamento
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Scaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Pipeline
from sklearn.pipeline import Pipeline

# Gráficos
import plotly.graph_objects as go
import plotly.express as px

## Funções

## Scripts

### Leitura

In [3]:
# Load the steel-plates-fault dataset from the processed-data pickle.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle("../data/processed/steel-plates-fault.pkl")
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,108.0,...,0.4706,1.0000,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,0
1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,123.0,...,0.6000,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,0
2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,125.0,...,0.7500,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150,0
3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,126.0,...,0.5385,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,0
4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,126.0,...,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1263,221.0,242.0,3948212.0,3948253.0,519.0,33.0,41.0,48309.0,65.0,124.0,...,0.6364,1.0000,1.0,2.7152,1.3222,1.6128,0.4878,-0.2728,0.9765,5
1264,1111.0,1121.0,4032298.0,4032320.0,110.0,20.0,22.0,12351.0,100.0,127.0,...,0.5000,1.0000,1.0,2.0414,1.0000,1.3424,0.5454,-0.1228,0.3663,5
1265,995.0,1006.0,4085316.0,4085344.0,140.0,25.0,28.0,16076.0,103.0,132.0,...,0.4400,1.0000,1.0,2.1461,1.0414,1.4472,0.6071,-0.1029,0.5096,5
1266,396.0,418.0,4116853.0,4116868.0,231.0,26.0,16.0,25096.0,56.0,141.0,...,0.8461,0.9375,0.0,2.3636,1.3424,1.1761,-0.3182,-0.1512,0.5461,5


In [5]:
# Box plot of a single feature with one box per class, to compare how the
# feature is distributed across the target values.
col = "X_Minimum"
targets = df['target'].unique()

# One Box trace per class, in order of first appearance in the data.
fig = go.Figure(
    data=[go.Box(x=df.loc[df['target'] == t, col], name=str(t)) for t in targets]
)
fig.show()

### Treinamento

### Divisão

In [None]:
# Features are every column except the label; the label is the "target" column.
X = df.drop(columns=["target"])
y = df["target"]

# Hold out 30% of the rows for testing. The split is stratified on y so both
# partitions keep the class proportions, and random_state pins the shuffle so
# that re-running the notebook reproduces the same partition (and metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# The row labels of each partition remain available as X_train.index / X_test.index.

### PCA

In [None]:
# Standardize the features, then project them with PCA. The projection is
# fitted on the full feature matrix X and is only used for plotting below.
# Alternative explored: TSNE(n_components=2) as the 'model' step.
model_pca = Pipeline([('scaler', StandardScaler()), ('model', PCA())])

X_pca = model_pca.fit_transform(X)

# Keep the first two components next to the class label (indexed like y).
df_pca = pd.DataFrame({'PC1': X_pca[:, 0], 'PC2': X_pca[:, 1], 'target': y})

# Flag the rows that belong to the training partition. The labels come from
# X_train.index, which train_test_split preserves from X, so no separate
# index array has to be kept around.
df_pca["is_train"] = False
df_pca.loc[X_train.index, "is_train"] = True

# Cast the label to str (presumably so plotly colors it as discrete classes)
# and sort by label so the classes appear in a stable order.
df_pca = df_pca.astype({'target': str}).sort_values("target")
df_pca.head()

In [None]:
# Scatter of the first two principal components for every row, colored by class.
# update_layout returns the same figure, so the chained result can be bound directly.
fig = px.scatter(df_pca, x="PC1", y="PC2", color="target").update_layout(showlegend=True)

fig

In [None]:
# Same scatter restricted to the rows flagged as training data.
fig = px.scatter(
    df_pca.loc[df_pca["is_train"]], x="PC1", y="PC2", color="target"
).update_layout(showlegend=True)

fig

In [None]:
# Same scatter restricted to the rows NOT flagged as training data (the test rows).
fig = px.scatter(
    df_pca.loc[~df_pca["is_train"]], x="PC1", y="PC2", color="target"
).update_layout(showlegend=True)

fig

In [None]:
# 3D scatter of the first three components (disabled: df_pca has no 'PC3' column)
# px.scatter_3d(data_frame=df_pca, x = 'PC1', y = 'PC2', z = 'PC3', color='target')

### KNN

In [None]:
# kNN classifier (k=5) on standardized features; fit() returns the fitted pipeline.
knn_steps = [('scaler', StandardScaler()), ('model', KNeighborsClassifier(n_neighbors=5))]
model = Pipeline(knn_steps).fit(X_train, y_train)

# Report performance on the training partition first, then on the held-out test partition.
for features, labels in [(X_train, y_train), (X_test, y_test)]:
    print(classification_report(labels, model.predict(features)))

In [None]:
# Support vector classifier (default SVC settings) on standardized features.
svc_steps = [('scaler', StandardScaler()), ('model', SVC())]
model = Pipeline(svc_steps).fit(X_train, y_train)

# Report performance on the training partition first, then on the held-out test partition.
for features, labels in [(X_train, y_train), (X_test, y_test)]:
    print(classification_report(labels, model.predict(features)))

### Árvore de Decisão

In [None]:
# Decision tree capped at depth 10, trained on all features.
# (scikit-learn implements an optimised version of the CART algorithm.)
model = DecisionTreeClassifier(max_depth=10).fit(X_train, y_train)

# Predictions for both partitions.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

# Training-set report first, then the held-out test-set report.
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

In [None]:
# Random forest with trees capped at depth 10, trained on all features.
# Alternative explored: RandomForestClassifier(max_depth=30, n_estimators=100).
model = RandomForestClassifier(max_depth=10).fit(X_train, y_train)

# Predictions for both partitions.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

# Training-set report first, then the held-out test-set report.
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

---

In [None]:
# Reduced feature set: only the first three feature columns of each partition.
X_train_v2, X_test_v2 = X_train.iloc[:, :3], X_test.iloc[:, :3]

# kNN classifier (k=5) on the standardized reduced features.
knn_steps = [('scaler', StandardScaler()), ('model', KNeighborsClassifier(n_neighbors=5))]
model = Pipeline(knn_steps).fit(X_train_v2, y_train)

# Training-set report first, then the held-out test-set report.
for features, labels in [(X_train_v2, y_train), (X_test_v2, y_test)]:
    print(classification_report(labels, model.predict(features)))

In [None]:
# Decision tree capped at depth 20, trained on the reduced (v2) feature set.
# (scikit-learn implements an optimised version of the CART algorithm.)
model = DecisionTreeClassifier(max_depth=20).fit(X_train_v2, y_train)

# Predictions for both partitions.
y_pred_train = model.predict(X_train_v2)
y_pred = model.predict(X_test_v2)

# Training-set report first, then the held-out test-set report.
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

In [None]:
# Random forest (50 trees, depth <= 10) trained on ALL features. Its
# feature_importances_ are used further down to pick the most relevant
# columns, so the forest is seeded: without random_state the importances,
# and therefore the selected features, change from one run to the next.
model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)

# Train the model.
model.fit(X_train, y_train)

# Training-set predictions and report.
y_pred_train = model.predict(X_train)
print(classification_report(y_train, y_pred_train))

# Held-out test-set predictions and report.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Map each feature column to the importance reported by the fitted model,
# then re-order the mapping from most to least important. sorted() is stable
# with reverse=True, so equally important features keep their column order.
feature_importances = dict(zip(X_train.columns, model.feature_importances_))
feature_importances = dict(
    sorted(feature_importances.items(), key=lambda pair: pair[1], reverse=True)
)
feature_importances

In [None]:
# Keep the five highest-ranked features (the dict is already ordered by importance).
# Alternative explored: keep every feature whose importance exceeds 0.05.
most_important_features = list(feature_importances)[:5]
most_important_features

In [None]:
# Reduced feature set: only the columns selected as most important.
X_train_v3, X_test_v3 = X_train[most_important_features], X_test[most_important_features]

# kNN classifier (k=5) on the standardized selected features.
knn_steps = [('scaler', StandardScaler()), ('model', KNeighborsClassifier(n_neighbors=5))]
model = Pipeline(knn_steps).fit(X_train_v3, y_train)

# Training-set report first, then the held-out test-set report.
for features, labels in [(X_train_v3, y_train), (X_test_v3, y_test)]:
    print(classification_report(labels, model.predict(features)))

In [None]:
# Reduced feature set: only the columns selected as most important.
X_train_v3, X_test_v3 = X_train[most_important_features], X_test[most_important_features]

# Decision tree capped at depth 30, trained on the selected features.
model = DecisionTreeClassifier(max_depth=30).fit(X_train_v3, y_train)

# Training-set report first, then the held-out test-set report.
for features, labels in [(X_train_v3, y_train), (X_test_v3, y_test)]:
    print(classification_report(labels, model.predict(features)))