# Aprendizagem supervisionada: classificação

In [19]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import math

In [20]:
df = pd.read_csv('data/heart_tratado.csv', sep=';', encoding='utf-8')

In [21]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
912,45,M,TA,110,264.0,0,Normal,132,N,1.2,Flat,1
913,68,M,ASY,144,193.0,1,Normal,141,N,3.4,Flat,1
914,57,M,ASY,130,131.0,0,Normal,115,Y,1.2,Flat,1
915,57,F,ATA,130,236.0,0,LVH,174,N,0.0,Flat,1


In [22]:
df.shape

(917, 12)

# Transformando as variáveis categóricas nominais em variáveis categóricas ordinais

In [23]:
df2 = pd.DataFrame.copy(df)

In [24]:
df2['Sex'].replace({'M':0, 'F':1}, inplace=True)
df2['ChestPainType'].replace({'TA': 0, 'ATA': 1, 'NAP': 2, 'ASY': 3}, inplace=True)
df2['RestingECG'].replace({'Normal': 0, 'ST': 1, 'LVH': 2}, inplace=True)
df2['ExerciseAngina'].replace({'N': 0, 'Y': 1}, inplace=True)
df2['ST_Slope'].replace({'Up': 0, 'Flat': 1, 'Down': 2}, inplace=True)

In [25]:
df2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,1,140,289.0,0,0,172,0,0.0,0,0
1,49,1,2,160,180.0,0,0,156,0,1.0,1,1
2,37,0,1,130,283.0,0,1,98,0,0.0,0,0
3,48,1,3,138,214.0,0,0,108,1,1.5,1,1
4,54,0,2,150,195.0,0,0,122,0,0.0,0,0


## Atributos previsores e alvo

In [26]:
previsores = df2.iloc[:, 0:11].values # das linhas tudo e das colunas do zero ao 11
previsores

array([[40. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       [49. ,  1. ,  2. , ...,  0. ,  1. ,  1. ],
       [37. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       ...,
       [57. ,  0. ,  3. , ...,  1. ,  1.2,  1. ],
       [57. ,  1. ,  1. , ...,  0. ,  0. ,  1. ],
       [38. ,  0. ,  2. , ...,  0. ,  0. ,  0. ]])

In [27]:
previsores.shape

(917, 11)

In [28]:
alvo = df2.iloc[: , 11].values
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [29]:
alvo.shape

(917,)

## Análise das escalas e dos atributos (Escalonamento)

In [30]:
df2.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,0.210469,2.251908,132.540894,244.635389,0.23337,0.604144,136.789531,0.40458,0.886696,0.63795,0.55289
std,9.437636,0.407864,0.931502,17.999749,53.347125,0.423206,0.806161,25.467129,0.491078,1.06696,0.60727,0.497466
min,28.0,0.0,0.0,80.0,85.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,0.0,2.0,120.0,214.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,3.0,130.0,244.635389,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,0.0,3.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


Padronização (Utiliza a média e o desvio padrão como referência). <br>
Normalização (Utiliza os valores máximo e mínimo como referência).

In [31]:
from sklearn.preprocessing import StandardScaler

In [32]:
previsores_esc = StandardScaler().fit_transform(previsores)

In [33]:
previsores_esc

array([[-1.43220634, -0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       [-0.47805725,  1.9368261 , -0.27058012, ..., -0.82431012,
         0.10625149,  0.59651863],
       [-1.75025603, -0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       ...,
       [ 0.37007527, -0.51630861,  0.80354095, ...,  1.21313565,
         0.29380223,  0.59651863],
       [ 0.37007527,  1.9368261 , -1.34470119, ..., -0.82431012,
        -0.83150225,  0.59651863],
       [-1.64423947, -0.51630861, -0.27058012, ..., -0.82431012,
        -0.83150225, -1.05109458]])

In [34]:
previsoresdf = pd.DataFrame(previsores_esc)

In [35]:
previsoresdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.432206,-0.516309,-1.344701,0.414627,0.832075,-0.551733,-0.749818,1.383339,-0.824310,-0.831502,-1.051095
1,-0.478057,1.936826,-0.270580,1.526360,-1.212261,-0.551733,-0.749818,0.754736,-0.824310,0.106251,0.596519
2,-1.750256,-0.516309,-1.344701,-0.141240,0.719543,-0.551733,0.491306,-1.523953,-0.824310,-0.831502,-1.051095
3,-0.584074,1.936826,0.803541,0.303453,-0.574578,-0.551733,-0.749818,-1.131075,1.213136,0.575128,0.596519
4,0.052026,-0.516309,-0.270580,0.970493,-0.930931,-0.551733,-0.749818,-0.581047,-0.824310,-0.831502,-1.051095
...,...,...,...,...,...,...,...,...,...,...,...
912,-0.902124,-0.516309,-2.418822,-1.252973,0.363191,-0.551733,-0.749818,-0.188170,-0.824310,0.293802,0.596519
913,1.536257,-0.516309,0.803541,0.636973,-0.968441,1.812470,-0.749818,0.165420,-0.824310,2.356860,0.596519
914,0.370075,-0.516309,0.803541,-0.141240,-2.131275,-0.551733,-0.749818,-0.856061,1.213136,0.293802,0.596519
915,0.370075,1.936826,-1.344701,-0.141240,-0.161960,-0.551733,1.732430,1.461915,-0.824310,-0.831502,0.596519


In [36]:
previsoresdf.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.859654e-16,7.748558e-18,1.046055e-16,7.767929e-16,-1.86934e-16,4.649135e-17,0.0,-5.114048e-16,-1.046055e-16,7.748558000000001e-17,-3.8742790000000005e-17
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-2.704405,-0.5163086,-2.418822,-2.920572,-2.994023,-0.5517333,-0.749818,-3.016886,-0.8243101,-3.269662,-1.051095
25%,-0.6900904,-0.5163086,-0.2705801,-0.6971063,-0.5745784,-0.5517333,-0.749818,-0.6596226,-0.8243101,-0.8315022,-1.051095
50%,0.05202558,-0.5163086,0.803541,-0.1412398,0.0,-0.5517333,-0.749818,0.04755658,-0.8243101,-0.26885,0.5965186
75%,0.688125,-0.5163086,0.803541,0.4146267,0.4194568,-0.5517333,0.491306,0.7547357,1.213136,0.5751284,0.5965186
max,2.490407,1.936826,0.803541,3.749826,6.721265,1.81247,1.73243,2.561971,1.213136,4.982571,2.244132


## Codificação de variáveis categóricas
 ### LabelEncoder: transformação de variáveis categóricas em numéricas

In [37]:
from sklearn.preprocessing import LabelEncoder

In [38]:
previsores2 = df.iloc[:, 0:11].values
previsores2

array([[40, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       [49, 'F', 'NAP', ..., 'N', 1.0, 'Flat'],
       [37, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       ...,
       [57, 'M', 'ASY', ..., 'Y', 1.2, 'Flat'],
       [57, 'F', 'ATA', ..., 'N', 0.0, 'Flat'],
       [38, 'M', 'NAP', ..., 'N', 0.0, 'Up']], dtype=object)

In [39]:
previsores2[:,1] = LabelEncoder().fit_transform(previsores[:,1])

In [40]:
previsores2

array([[40, 0, 'ATA', ..., 'N', 0.0, 'Up'],
       [49, 1, 'NAP', ..., 'N', 1.0, 'Flat'],
       [37, 0, 'ATA', ..., 'N', 0.0, 'Up'],
       ...,
       [57, 0, 'ASY', ..., 'Y', 1.2, 'Flat'],
       [57, 1, 'ATA', ..., 'N', 0.0, 'Flat'],
       [38, 0, 'NAP', ..., 'N', 0.0, 'Up']], dtype=object)

In [41]:
previsores2[:,2] = LabelEncoder().fit_transform(previsores[:,2])
previsores2[:,6] = LabelEncoder().fit_transform(previsores[:,6])
previsores2[:,8] = LabelEncoder().fit_transform(previsores[:,8])
previsores2[:,10] = LabelEncoder().fit_transform(previsores[:,10])

In [42]:
previsores2

array([[40, 0, 1, ..., 0, 0.0, 0],
       [49, 1, 2, ..., 0, 1.0, 1],
       [37, 0, 1, ..., 0, 0.0, 0],
       ...,
       [57, 0, 3, ..., 1, 1.2, 1],
       [57, 1, 1, ..., 0, 0.0, 1],
       [38, 0, 2, ..., 0, 0.0, 0]], dtype=object)

In [43]:
previsores2.shape

(917, 11)

## OneHotEncoder: Criação de variáveis dummy (fictícia)
Cuidado com a multicolinearidade (variáveis altamente correlacionadas entre si). <br>

Você faz atividade física? <br>
A = 0 | Não <br>
B = 1 | Sim, um ou dois dias por semana <br>
C = 2 | Sim, três ou quatro dias por semana <br>
D = 3 | Sim, pelo menos cinco dias por semana <br>

In [44]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [45]:
?ColumnTransformer

In [46]:
previsores3 = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1,2,6,8,10])],
                               remainder='passthrough').fit_transform(previsores2)

In [47]:
previsores3

array([[1.0, 0.0, 0.0, ..., 0, 172, 0.0],
       [0.0, 1.0, 0.0, ..., 0, 156, 1.0],
       [1.0, 0.0, 0.0, ..., 0, 98, 0.0],
       ...,
       [1.0, 0.0, 0.0, ..., 0, 115, 1.2],
       [0.0, 1.0, 0.0, ..., 0, 174, 0.0],
       [1.0, 0.0, 0.0, ..., 0, 173, 0.0]], dtype=object)

In [48]:
previsores3.shape

(917, 20)

In [49]:
previsores3df = pd.DataFrame(previsores3)

In [50]:
previsores3df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,40,140,289.0,0,172,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,49,160,180.0,0,156,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,37,130,283.0,0,98,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,48,138,214.0,0,108,1.5
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,54,150,195.0,0,122,0.0


## Escalonamento da variável previsores3

In [51]:
from sklearn.preprocessing import StandardScaler

In [52]:
previsores3_esc = StandardScaler().fit_transform(previsores3)

In [53]:
previsores3_esc

array([[ 0.51630861, -0.51630861, -0.22981048, ..., -0.55173333,
         1.38333943, -0.83150225],
       [-1.9368261 ,  1.9368261 , -0.22981048, ..., -0.55173333,
         0.75473573,  0.10625149],
       [ 0.51630861, -0.51630861, -0.22981048, ..., -0.55173333,
        -1.52395266, -0.83150225],
       ...,
       [ 0.51630861, -0.51630861, -0.22981048, ..., -0.55173333,
        -0.85606123,  0.29380223],
       [-1.9368261 ,  1.9368261 , -0.22981048, ..., -0.55173333,
         1.46191489, -0.83150225],
       [ 0.51630861, -0.51630861, -0.22981048, ..., -0.55173333,
         1.42262716, -0.83150225]])

In [54]:
previsores3df = pd.DataFrame(previsores3_esc)

In [55]:
previsores3df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.516309,-0.516309,-0.229810,2.073784,-0.531524,-1.085425,0.815013,-0.490781,-0.507826,0.824310,-0.824310,1.149573,-1.001091,-0.271607,-1.432206,0.414627,0.832075,-0.551733,1.383339,-0.831502
1,-1.936826,1.936826,-0.229810,-0.482210,1.881384,-1.085425,0.815013,-0.490781,-0.507826,0.824310,-0.824310,-0.869888,0.998910,-0.271607,-0.478057,1.526360,-1.212261,-0.551733,0.754736,0.106251
2,0.516309,-0.516309,-0.229810,2.073784,-0.531524,-1.085425,-1.226974,2.037569,-0.507826,0.824310,-0.824310,1.149573,-1.001091,-0.271607,-1.750256,-0.141240,0.719543,-0.551733,-1.523953,-0.831502
3,-1.936826,1.936826,-0.229810,-0.482210,-0.531524,0.921298,0.815013,-0.490781,-0.507826,-1.213136,1.213136,-0.869888,0.998910,-0.271607,-0.584074,0.303453,-0.574578,-0.551733,-1.131075,0.575128
4,0.516309,-0.516309,-0.229810,-0.482210,1.881384,-1.085425,0.815013,-0.490781,-0.507826,0.824310,-0.824310,1.149573,-1.001091,-0.271607,0.052026,0.970493,-0.930931,-0.551733,-0.581047,-0.831502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,0.516309,-0.516309,4.351412,-0.482210,-0.531524,-1.085425,0.815013,-0.490781,-0.507826,0.824310,-0.824310,-0.869888,0.998910,-0.271607,-0.902124,-1.252973,0.363191,-0.551733,-0.188170,0.293802
913,0.516309,-0.516309,-0.229810,-0.482210,-0.531524,0.921298,0.815013,-0.490781,-0.507826,0.824310,-0.824310,-0.869888,0.998910,-0.271607,1.536257,0.636973,-0.968441,1.812470,0.165420,2.356860
914,0.516309,-0.516309,-0.229810,-0.482210,-0.531524,0.921298,0.815013,-0.490781,-0.507826,-1.213136,1.213136,-0.869888,0.998910,-0.271607,0.370075,-0.141240,-2.131275,-0.551733,-0.856061,0.293802
915,-1.936826,1.936826,-0.229810,2.073784,-0.531524,-1.085425,-1.226974,-0.490781,1.969177,0.824310,-0.824310,-0.869888,0.998910,-0.271607,0.370075,-0.141240,-0.161960,-0.551733,1.461915,-0.831502


In [56]:
previsores3df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,-1.472226e-16,1.084798e-16,6.973702000000001e-17,-3.8742790000000005e-17,3.8742790000000005e-17,1.937139e-17,-9.298269e-17,1.549712e-17,0.0,-4.2617070000000006e-17,4.2617070000000006e-17,-3.8742790000000005e-17,0.0,8.523413e-17,1.859654e-16,7.884157e-16,3.014189e-15,-1.549712e-17,-5.114048e-16,-1.859654e-16
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-1.936826,-0.5163086,-0.2298105,-0.4822104,-0.5315237,-1.085425,-1.226974,-0.490781,-0.507826,-1.213136,-0.8243101,-0.8698879,-1.001091,-0.2716072,-2.704405,-2.920572,-2.994023,-0.5517333,-3.016886,-3.269662
25%,0.5163086,-0.5163086,-0.2298105,-0.4822104,-0.5315237,-1.085425,-1.226974,-0.490781,-0.507826,-1.213136,-0.8243101,-0.8698879,-1.001091,-0.2716072,-0.6900904,-0.6971063,-0.5745784,-0.5517333,-0.6596226,-0.8315022
50%,0.5163086,-0.5163086,-0.2298105,-0.4822104,-0.5315237,0.9212982,0.8150134,-0.490781,-0.507826,0.8243101,-0.8243101,-0.8698879,0.99891,-0.2716072,0.05202558,-0.1412398,3.19836e-15,-0.5517333,0.04755658,-0.26885
75%,0.5163086,-0.5163086,-0.2298105,-0.4822104,-0.5315237,0.9212982,0.8150134,-0.490781,-0.507826,0.8243101,1.213136,1.149573,0.99891,-0.2716072,0.688125,0.4146267,0.4194568,-0.5517333,0.7547357,0.5751284
max,0.5163086,1.936826,4.351412,2.073784,1.881384,0.9212982,0.8150134,2.037569,1.969177,0.8243101,1.213136,1.149573,0.99891,3.681787,2.490407,3.749826,6.721265,1.81247,2.561971,4.982571


## RESUMO PRÉ-PROCESSAMENTO

alvo = variável que se pretende atingir (tem ou não doença cardíaca).

previsores = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas manualmente, sem escalonar.

previsores_esc = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas, escalonada.

previsores2 = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas pelo labelencoder.

previsores3 = conjunto de variáveis previsoras transformadas pelo labelencoder e onehotencoder, sem escalonar.

previsores3_esc = conjunto de variáveis previsoras transformadas pelo labelencoder e onehotencoder escalonada.