# 02 - Análise Exploratória Básica

Neste notebook, busca-se realizar uma análise exploratória básica dos dados para auxiliar no treinamento dos modelos.

## Importações

In [1]:
# Bibliotecas padrão
import pickle

# Bibliotecas utilitárias de terceiros
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.preprocessing import StandardScaler

# Pessoal
from useful.constants import RED, YELLOW, GREEN, PALETTE
from useful.config import set_default_configs
from useful.plotly_tools import export_fig

## Configurações padrão

In [2]:
set_default_configs()  # Per the author's note: silences warnings, raises the pandas displayed-column limit and sets the seed (helper body not visible here)

## Scripts

### Leitura

#### Base de dados

Como primeiro passo vamos realizar a leitura dos dados pré-tratados anteriormente.

In [3]:
# Load the dataset that was pre-processed earlier and saved as a pickle.
processed_data_path = '../data/processed/steel-plates-fault.pkl'
df = pd.read_pickle(processed_data_path)

df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Target
0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,108.0,1687.0,1.0,0.0,80.0,0.0498,0.2415,0.1818,0.0047,0.4706,1.0000,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,0
1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,123.0,1687.0,1.0,0.0,80.0,0.7647,0.3793,0.2069,0.0036,0.6000,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,0
2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,125.0,1623.0,1.0,0.0,100.0,0.9710,0.3426,0.3333,0.0037,0.7500,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150,0
3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,126.0,1353.0,0.0,1.0,290.0,0.7287,0.4413,0.1556,0.0052,0.5385,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,0
4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,126.0,1353.0,0.0,1.0,185.0,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1263,221.0,242.0,3948212.0,3948253.0,519.0,33.0,41.0,48309.0,65.0,124.0,1360.0,1.0,0.0,200.0,0.3250,0.3972,0.5122,0.0154,0.6364,1.0000,1.0,2.7152,1.3222,1.6128,0.4878,-0.2728,0.9765,5
1264,1111.0,1121.0,4032298.0,4032320.0,110.0,20.0,22.0,12351.0,100.0,127.0,1354.0,1.0,0.0,200.0,0.3442,0.5000,0.4545,0.0074,0.5000,1.0000,1.0,2.0414,1.0000,1.3424,0.5454,-0.1228,0.3663,5
1265,995.0,1006.0,4085316.0,4085344.0,140.0,25.0,28.0,16076.0,103.0,132.0,1356.0,1.0,0.0,200.0,0.5162,0.5454,0.3929,0.0081,0.4400,1.0000,1.0,2.1461,1.0414,1.4472,0.6071,-0.1029,0.5096,5
1266,396.0,418.0,4116853.0,4116868.0,231.0,26.0,16.0,25096.0,56.0,141.0,1356.0,1.0,0.0,200.0,0.5841,0.3000,0.6818,0.0162,0.8461,0.9375,0.0,2.3636,1.3424,1.1761,-0.3182,-0.1512,0.5461,5


#### Mapa dos targets

Em seguida, iremos realizar a importação do mapa dos targets. Isto irá facilitar a leitura dos gráficos e a validação do problema:

In [4]:
# Load the mapping from integer class ids to readable class names.
# NOTE: pickle can execute arbitrary code on load; only use it on trusted files.
target_maps_path = '../data/processed/target_maps.pkl'
with open(target_maps_path, mode='rb') as handle:
    targets_map = pickle.load(handle)

targets_map

{0: 'Pastry',
 1: 'Z_Scratch',
 2: 'K_Scatch',
 3: 'Stains',
 4: 'Dirtiness',
 5: 'Bumps'}

### Análise exploratória

#### Análise de proporção de classes

Como uma primeira análise, vamos verificar a proporção das classes. Para tal, vamos primeiro observar os valores numéricos e, em seguida, vamos trazer um gráfico de barras destes valores. Para facilitar, vamos substituir os ids das classes pelos seus nomes.

In [5]:
# Swap the integer class ids in `Target` for their names from `targets_map`.
# Values that are not keys of the map are left untouched by `replace`.
target_labels = df['Target'].replace(targets_map)
df['Target'] = target_labels

df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Target
0,42.0,50.0,270900.0,270944.0,267.0,17.0,44.0,24220.0,76.0,108.0,1687.0,1.0,0.0,80.0,0.0498,0.2415,0.1818,0.0047,0.4706,1.0000,1.0,2.4265,0.9031,1.6435,0.8182,-0.2913,0.5822,Pastry
1,645.0,651.0,2538079.0,2538108.0,108.0,10.0,30.0,11397.0,84.0,123.0,1687.0,1.0,0.0,80.0,0.7647,0.3793,0.2069,0.0036,0.6000,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,Pastry
2,829.0,835.0,1553913.0,1553931.0,71.0,8.0,19.0,7972.0,99.0,125.0,1623.0,1.0,0.0,100.0,0.9710,0.3426,0.3333,0.0037,0.7500,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.2150,Pastry
3,853.0,860.0,369370.0,369415.0,176.0,13.0,45.0,18996.0,99.0,126.0,1353.0,0.0,1.0,290.0,0.7287,0.4413,0.1556,0.0052,0.5385,1.0000,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,Pastry
4,1289.0,1306.0,498078.0,498335.0,2409.0,60.0,260.0,246930.0,37.0,126.0,1353.0,0.0,1.0,185.0,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0000,Pastry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1263,221.0,242.0,3948212.0,3948253.0,519.0,33.0,41.0,48309.0,65.0,124.0,1360.0,1.0,0.0,200.0,0.3250,0.3972,0.5122,0.0154,0.6364,1.0000,1.0,2.7152,1.3222,1.6128,0.4878,-0.2728,0.9765,Bumps
1264,1111.0,1121.0,4032298.0,4032320.0,110.0,20.0,22.0,12351.0,100.0,127.0,1354.0,1.0,0.0,200.0,0.3442,0.5000,0.4545,0.0074,0.5000,1.0000,1.0,2.0414,1.0000,1.3424,0.5454,-0.1228,0.3663,Bumps
1265,995.0,1006.0,4085316.0,4085344.0,140.0,25.0,28.0,16076.0,103.0,132.0,1356.0,1.0,0.0,200.0,0.5162,0.5454,0.3929,0.0081,0.4400,1.0000,1.0,2.1461,1.0414,1.4472,0.6071,-0.1029,0.5096,Bumps
1266,396.0,418.0,4116853.0,4116868.0,231.0,26.0,16.0,25096.0,56.0,141.0,1356.0,1.0,0.0,200.0,0.5841,0.3000,0.6818,0.0162,0.8461,0.9375,0.0,2.3636,1.3424,1.1761,-0.3182,-0.1512,0.5461,Bumps


In [6]:
# Relative frequency of each class, ordered from the rarest to the most common.
target_prop = df['Target'].value_counts(
    normalize=True,
    ascending=True,
)

target_prop

Target
Dirtiness    0.043375
Stains       0.056782
Pastry       0.124606
Z_Scratch    0.149842
K_Scatch     0.308360
Bumps        0.317035
Name: proportion, dtype: float64

In [7]:
# Class names go on the x axis and each class share (in %) on the y axis.
x_fig1 = target_prop.index
y_fig1 = target_prop.values * 100

# Text shown on each bar, plus a colour gradient driven by the bar height.
bar_labels = [f'{np.round(pct, 2)}%' for pct in y_fig1]
bar_marker = dict(color=y_fig1, colorscale=[[0, RED], [0.5, YELLOW], [1, GREEN]])

fig1 = go.Figure(data=go.Bar(x=x_fig1, y=y_fig1, text=bar_labels, marker=bar_marker))
fig1.update_layout(
    title='Proporção das classes na base de dados utilizada',
    xaxis_title='Classe',
    yaxis_title='Proporção (%)',
    height=600,
    autosize=True,
)
export_fig(fig1, 'class_prop', '../figs/eda')
fig1.show()

#### Análise de similaridade das classes

Outra análise passível de ser realizada é a de similaridade entre as classes. Para tal, pode-se calcular os centroides das classes com base na mediana de cada variável que descreve cada uma das classes (menos sensível a outliers do que a média) e medir a distância euclidiana entre os centroides. Antes de mais nada, vamos encontrar os centroides. Para que seja possível computar as distâncias, é necessário escalonar os dados, visto que as variáveis possuem escalas diferentes. Com isso, vamos realizar esta etapa:

In [8]:
# Standardise the features so that no single variable dominates the distances
# purely because of its scale. The label column is excluded from scaling.
features = df.drop(columns='Target')
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

# Keep the original index: the column assignment below aligns on index labels,
# so a fresh RangeIndex would silently produce NaN / misassigned labels whenever
# `df` does not carry a default 0..n-1 index.
df_scaled = pd.DataFrame(scaled_data, columns=features.columns, index=features.index)
df_scaled['Target'] = df['Target']

# One centroid per class. The median is used because it is less sensitive to
# outliers; swap `.median()` for `.mean()` to get mean-based centroids.
df_centroids_scaled = df_scaled.groupby('Target').median()

# Reorder the rows to follow the order of `targets_map` (better readability),
# skipping any class that is absent from the data.
class_order = [label for label in targets_map.values() if label in df_centroids_scaled.index]
df_centroids_scaled = df_centroids_scaled.loc[class_order]

df_centroids_scaled

Unnamed: 0_level_0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
Pastry,0.61213,0.563036,-0.396061,-0.396084,-0.38983,-0.3404,-0.13975,-0.435992,0.024231,-0.258272,1.477276,-0.821604,0.821604,0.52709,-0.528792,-0.661941,-0.968788,-0.545976,-0.402675,0.818666,0.90427,-0.321921,-0.727198,0.058652,1.238823,-0.315626,-0.284437
Z_Scratch,-0.784012,-0.919,-0.304646,-0.304654,-0.40015,-0.318297,-0.154066,-0.444719,0.422849,-0.378408,-0.650301,1.217131,-1.217131,0.109371,-0.521532,0.373019,-0.022284,-0.474349,-0.402675,0.532506,0.90427,-0.49893,-0.397917,-0.332949,0.5871,-0.210378,-0.439103
K_Scatch,-0.91058,-0.734006,-0.144008,-0.143933,0.604767,0.353077,0.067348,0.618239,-1.171626,0.22227,-0.606584,-0.821604,0.821604,-0.726067,-0.83091,0.353985,-0.475092,1.161794,0.089627,-1.361125,-1.161702,1.357706,1.498491,0.789179,-1.204543,-0.150495,1.11414
Stains,0.477773,0.406103,-0.364397,-0.364428,-0.421362,-0.376317,-0.189377,-0.467998,1.097435,0.582676,-0.635729,-0.821604,0.821604,-0.447587,0.923223,-0.007673,0.432037,-0.581386,1.667563,0.818666,-1.161702,-1.575248,-1.141932,-1.826484,-0.693192,0.965499,-1.351362
Dirtiness,0.226585,0.147318,0.166576,0.166589,-0.400314,-0.318297,-0.131161,-0.442772,0.852131,0.102134,-0.592011,-0.821604,0.821604,0.944809,1.937158,-0.077467,-1.497694,-0.534709,-1.270933,0.818666,0.90427,-0.502283,-0.727198,0.186551,1.514397,0.342357,-0.586473
Bumps,0.677361,0.650337,-0.126105,-0.126133,-0.404327,-0.348688,-0.167427,-0.450186,0.361523,-0.198205,1.302407,1.217131,-1.217131,0.109371,0.432673,-0.567888,0.608844,-0.521832,0.517339,0.818666,0.90427,-0.593657,-0.546664,-0.663967,0.067272,-0.05432,-0.86118


Assim, podemos calcular as distâncias entre os centroides por:

In [9]:
# Pairwise Euclidean distances between the class centroids. The centroids come
# from the standardised data, so every feature contributes on the same scale.
class_labels = df_centroids_scaled.index
euc_dist = euclidean_distances(df_centroids_scaled)

# Label both axes with the class names so the matrix can be read directly.
df_euc_dist = pd.DataFrame(euc_dist, index=class_labels, columns=class_labels)

df_euc_dist

Target,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Pastry,0.0,4.489706,6.643334,5.673374,3.78581,3.932614
Z_Scratch,4.489706,0.0,6.297665,5.678524,4.781443,3.500104
K_Scatch,6.643334,6.297665,0.0,7.471569,7.330619,7.322422
Stains,5.673374,5.678524,7.471569,0.0,5.640739,4.945116
Dirtiness,3.78581,4.781443,7.330619,5.640739,0.0,5.176034
Bumps,3.932614,3.500104,7.322422,4.945116,5.176034,0.0


In [10]:
# The matrix is square, so the same class labels are used on both axes.
x_fig2 = df_euc_dist.index
y_fig2 = x_fig2
z_fig2 = df_euc_dist.values

# Each cell is annotated with its distance, formatted to four decimal places.
heatmap = go.Heatmap(
    x=x_fig2,
    y=y_fig2,
    z=z_fig2,
    text=z_fig2,
    texttemplate='%{text:.4f}',
    colorscale=PALETTE,
    reversescale=True,
)

fig2 = go.Figure(data=heatmap)
fig2.update_layout(
    title='Distâncias entre os centroides das classes',
    yaxis_autorange='reversed',  # first class at the top, as in the table
    height=800,
    autosize=True,
)
export_fig(fig2, 'class_dist', '../figs/eda')
fig2.show()

Assim, podemos ordenar, para cada classe, as demais classes da mais semelhante para a menos semelhante, de forma que quanto menor a distância, mais similar:

In [11]:
def _rank_other_classes(distances):
    """Return the other classes ordered from nearest to farthest.

    `distances` is one row of `df_euc_dist`: its name is the class the row
    belongs to and its index holds the class labels. The row's own class is
    removed by label rather than by assuming its zero distance sorts first,
    which would drop the wrong class if two centroids coincided.
    """
    others = distances.drop(distances.name)
    return others.sort_values(kind='stable').index.to_list()


# Take element [0] of each list to keep only the single nearest class.
df_euc_dist.apply(_rank_other_classes, axis=1)

Target
Pastry        [Dirtiness, Bumps, Z_Scratch, Stains, K_Scatch]
Z_Scratch        [Bumps, Pastry, Dirtiness, Stains, K_Scatch]
K_Scatch        [Z_Scratch, Pastry, Bumps, Dirtiness, Stains]
Stains        [Bumps, Dirtiness, Pastry, Z_Scratch, K_Scatch]
Dirtiness        [Pastry, Z_Scratch, Bumps, Stains, K_Scatch]
Bumps        [Z_Scratch, Pastry, Stains, Dirtiness, K_Scatch]
dtype: object