## Feature Scaling (Python)

In [1]:
# Importando as bibliotecas
from sklearn.model_selection import train_test_split # Biblioteca utilizada para realizar split data
import numpy as np
import pandas as pd


In [2]:
# Load the sample dataset from the local CSV file; Age and Salary each contain a gap
df = pd.read_csv(filepath_or_buffer='dados/missing_data.csv')
# Render the freshly loaded frame
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Fill in the missing values of the numeric columns.
# NOTE: `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. It always
# works column-wise (the old `axis=0` behaviour) and expects the real NaN marker
# (`np.nan`) rather than the string 'NaN'.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
# Imputer that replaces every NaN with the mean of its own column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and write the completed values back into the frame
df[['Age', 'Salary']] = imputer.fit_transform(df[['Age', 'Salary']])
# Show the frame after imputation
df



Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Turn the nominal (string) columns into integer codes.
# A single encoder object is refit for each column in turn, so once this cell
# finishes `le` only holds the mapping learned from the last column (Purchased).
le = LabelEncoder()
for column in ('Country', 'Purchased'):
    df[column] = le.fit_transform(df[column])
# Show the frame with the encoded columns
df

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63777.777778,1
5,0,35.0,58000.0,1
6,2,38.777778,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [5]:
# Split the frame into the feature matrix (every column but the last, as a
# NumPy array) and the label (the last column, kept as a pandas Series)
X, y = df.iloc[:, :-1].values, df.iloc[:, -1]

In [6]:
# Apply dummy coding (one-hot encoding) to the Country column (column 0 of X).
# NOTE: the `categorical_features` argument of OneHotEncoder was deprecated in
# scikit-learn 0.20 and removed in 0.22. Selecting the columns to encode is now
# the job of ColumnTransformer.
from sklearn.compose import ColumnTransformer
dc = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',  # keep the remaining columns, placed after the dummies
    sparse_threshold=0,       # always return a dense array (replaces `.toarray()`)
)
# Fit the encoder and replace X by [dummy columns..., remaining columns...]
X = dc.fit_transform(X)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
# Inspect the feature matrix after the one-hot encoding step
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [8]:
# Split the data: 30% is held out for testing, the class proportions of y are
# preserved in both parts (stratify), and the seed is fixed for reproducibility
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [9]:
# Inspect the training portion of the feature matrix
X_treino

array([[0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.4e+01, 7.2e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+01, 4.8e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.5e+01, 5.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 6.1e+04]])

In [10]:
# Feature scaling (standardization)
from sklearn.preprocessing import StandardScaler

# The scaler must learn its statistics from the training set only; the test set
# is then transformed with what was learned from the training data.
# See https://sebastianraschka.com/faq/docs/scale-training-test.html
# Does the dependent variable need to be scaled as well? It depends: if it is
# categorical, no; otherwise, yes.
ss_X = StandardScaler()
ss_X.fit(X_treino)

# Apply the training-set statistics to both partitions
X_treino, X_teste = ss_X.transform(X_treino), ss_X.transform(X_teste)

X_treino

array([[-0.8660254 ,  1.58113883, -0.63245553,  1.7454336 ,  1.80891264],
       [ 1.15470054, -0.63245553, -0.63245553,  0.92174584,  0.79959182],
       [-0.8660254 , -0.63245553,  1.58113883, -1.41203617, -1.4025627 ],
       [ 1.15470054, -0.63245553, -0.63245553, -0.31378582, -0.48499832],
       [-0.8660254 ,  1.58113883, -0.63245553, -1.00019229, -0.85202407],
       [ 1.15470054, -0.63245553, -0.63245553, -0.03922323,  0.34080963],
       [-0.8660254 , -0.63245553,  1.58113883,  0.09805807, -0.209729  ]])