In [89]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle

In [63]:
# Load the pre-processed dataset: semicolon-delimited, UTF-8 encoded CSV.
df = pd.read_csv(
    '../data/df_tratado.csv',
    sep=';',
    encoding='utf-8',
)

In [64]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,Age,Education,Employment,Income,Marital status,Violence
0,30,secondary,unemployed,0,married,yes
1,47,tertiary,unemployed,0,married,no
2,24,tertiary,unemployed,0,unmarred,no
3,22,tertiary,unemployed,0,unmarred,no
4,50,primary,unemployed,0,married,yes


In [65]:
# Report the (rows, columns) tuple of the loaded frame.
df.shape

(347, 6)

### Transformando as variáveis categóricas nominais em variáveis ordinais

In [66]:
# Work on an independent copy so the encoding below does not touch `df`.
df2 = df.copy()

In [67]:
# Columns holding text labels that are replaced by integer codes in `df2`.
colunas_para_codificar = ['Education', 'Employment', 'Marital status', 'Violence']

# Keep every fitted encoder so the label <-> code mapping stays available
# (codificadores[c].classes_, codificadores[c].inverse_transform) instead of
# being thrown away with the temporary LabelEncoder instance.
codificadores = {}
for c in colunas_para_codificar:
    # Fit on the untouched `df` column: re-running this cell then still sees the
    # original text labels rather than the codes already written into `df2`.
    codificadores[c] = LabelEncoder().fit(df[c])
    df2[c] = codificadores[c].transform(df[c])

In [70]:
# Show the original frame again; the encoded values were written to `df2`, not `df`.
df.head(5)

Unnamed: 0,Age,Education,Employment,Income,Marital status,Violence
0,30,secondary,unemployed,0,married,yes
1,47,tertiary,unemployed,0,married,no
2,24,tertiary,unemployed,0,unmarred,no
3,22,tertiary,unemployed,0,unmarred,no
4,50,primary,unemployed,0,married,yes


In [69]:
# Preview the first five rows of the encoded copy.
df2.head(5)

Unnamed: 0,Age,Education,Employment,Income,Marital status,Violence
0,30,2,2,0,0,1
1,47,3,2,0,0,0
2,24,3,2,0,1,0
3,22,3,2,0,1,0
4,50,1,2,0,0,1


- Education - 0 = quarta categoria (não aparece no `head()`; `describe()` mostra mínimo 0), 1 = Primary, 2 = Secondary, 3 = Tertiary
- Employment - 0 = Employed, 1 = Semi employed, 2 = Unemployed
- Marital status - 0 = Married, 1 = Unmarred
- Violence - 0 = No, 1 = Yes

### Separando atributos previsores e alvo

In [54]:
# Predictor matrix: the first five columns of the encoded frame
# (every column except the last one, which is the target).
previsores = df2.iloc[:, 0:5].to_numpy()
previsores

array([[30,  2,  2,  0,  0],
       [47,  3,  2,  0,  0],
       [24,  3,  2,  0,  1],
       ...,
       [27,  1,  2,  0,  0],
       [29,  1,  2,  0,  0],
       [24,  2,  2,  0,  0]])

In [75]:
# Target vector: the last column of the encoded frame.
alvo = df2.iloc[:, -1].to_numpy()
alvo

array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [45]:
# Print the target shape and the predictor shape, one per line.
print(alvo.shape, previsores.shape, sep='\n')

(347,)
(347, 5)


### Análise das escalas dos atributos (Escalonamento)

In [73]:
# Per-column summary statistics of the encoded frame, used to compare the
# value ranges of the columns before scaling.
df2.describe()

Unnamed: 0,Age,Education,Employment,Income,Marital status,Violence
count,347.0,347.0,347.0,347.0,347.0,347.0
mean,31.380403,1.461095,1.714697,2110.685879,0.135447,0.247839
std,9.601569,0.912699,0.595637,5743.278766,0.342694,0.432381
min,15.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,1.0,2.0,0.0,0.0,0.0
50%,30.0,1.0,2.0,0.0,0.0,0.0
75%,39.5,2.0,2.0,0.0,0.0,0.0
max,60.0,3.0,2.0,35000.0,1.0,1.0


In [78]:
# Standardize every predictor column with a scaler fitted on the full predictor matrix.
escalonador = StandardScaler()
previsores_esc = escalonador.fit_transform(previsores)

In [81]:
# Wrap the scaled array in a DataFrame only to get a readable preview.
previsoresdf = pd.DataFrame(data=previsores_esc)
previsoresdf.head(5)

Unnamed: 0,0,1,2,3,4
0,-0.143976,0.591305,0.479679,-0.368036,-0.395811
1,1.629124,1.688538,0.479679,-0.368036,-0.395811
2,-0.769776,1.688538,0.479679,-0.368036,2.526456
3,-0.978376,1.688538,0.479679,-0.368036,2.526456
4,1.942025,-0.505929,0.479679,-0.368036,-0.395811


### OneHotEncoder: Criação de variáveis Dummy

In [85]:
# Positions of the label-encoded categorical columns inside `previsores`
# (Education, Employment, Marital status, following the column order of `df2`).
colunas_categoricas = [1, 2, 4]

# One-hot encode those columns and pass the remaining columns through unchanged.
transformador = ColumnTransformer(
    transformers=[('OneHot', OneHotEncoder(), colunas_categoricas)],
    remainder='passthrough',
)
previsores2 = transformador.fit_transform(previsores)

In [86]:
# Display the array produced by the one-hot encoding step.
previsores2

array([[ 0.,  0.,  1., ...,  0., 30.,  0.],
       [ 0.,  0.,  0., ...,  0., 47.,  0.],
       [ 0.,  0.,  0., ...,  1., 24.,  0.],
       ...,
       [ 0.,  1.,  0., ...,  0., 27.,  0.],
       [ 0.,  1.,  0., ...,  0., 29.,  0.],
       [ 0.,  0.,  1., ...,  0., 24.,  0.]])

In [87]:
# Report the (rows, columns) tuple of the one-hot encoded predictor array.
previsores2.shape

(347, 11)

In [88]:
# Scaling: standardize every column of the one-hot encoded predictors.
escalonador2 = StandardScaler()
previsores2_esc = escalonador2.fit_transform(previsores2)

### Resumo pré-processamento

- alvo = variável que se pretende atingir (sofre ou não violência doméstica)

- previsores = conjunto de variáveis previsoras com variáveis categóricas transformadas em numéricas pelo LabelEncoder

- previsores_esc = conjunto de variáveis previsoras com variáveis categóricas transformadas em numéricas, escalonada

- previsores2 = conjunto de variáveis previsoras transformadas pelo LabelEncoder e OneHotEncoder

- previsores2_esc = conjunto de variáveis previsoras transformadas pelo LabelEncoder e OneHotEncoder, escalonada

### Salvando variáveis em arquivos

In [93]:
# Persist each array with pickle. Every handle is opened in a `with` block so it
# is flushed and closed even if a dump raises. The names arq1..arq5 are kept
# because the next cell calls .close() on them; closing an already-closed file
# is a no-op.
with open('../data/alvo.pkl', 'wb') as arq1:
    pickle.dump(alvo, arq1)
with open('../data/previsores.pkl', 'wb') as arq2:
    pickle.dump(previsores, arq2)
with open('../data/previsores_esc.pkl', 'wb') as arq3:
    pickle.dump(previsores_esc, arq3)
with open('../data/previsores2.pkl', 'wb') as arq4:
    pickle.dump(previsores2, arq4)
with open('../data/previsores2_esc.pkl', 'wb') as arq5:
    pickle.dump(previsores2_esc, arq5)

In [94]:
# Close the five pickle file handles, in the order they were opened.
for arquivo in (arq1, arq2, arq3, arq4, arq5):
    arquivo.close()