## Utilizando a regressão linear para definir a técnica de codificação mais eficaz 

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [2]:
# Versions of the packages used in this Jupyter notebook
# NOTE(review): the recorded output lists "sklearn: 0.0", which does not look
# like a real scikit-learn release number — confirm the installed version.
%reload_ext watermark
%watermark -a "Caio Santos Martins" --iversions

Author: Caio Santos Martins

sklearn: 0.0
pandas : 1.4.4
numpy  : 1.21.5



### Carregando os dados

In [3]:
# Load the dataset from the CSV file (path is relative to the notebook's working directory)
dados = pd.read_csv(filepath_or_buffer='dados/dataset.csv')

In [4]:
# Check the dimensions (rows, columns) of the loaded data
dados.shape

(13, 4)

In [5]:
# Display the loaded dataset
dados

Unnamed: 0,Modelo,Kilometragem,Preco_Venda,Idade_Veiculo
0,Jaguar F-Type R,69000,18000,6
1,Jaguar F-Type R,35000,34000,3
2,Jaguar F-Type R,57000,26100,5
3,Jaguar F-Type R,22500,40000,2
4,Jaguar F-Type R,46000,31500,4
5,Mustang Mach 1,59000,29400,5
6,Mustang Mach 1,52000,32000,5
7,Mustang Mach 1,72000,19300,6
8,Mustang Mach 1,91000,12000,8
9,Mercedez Benz AMG GLE 53,67000,22000,6


## Aplicando One-Hot-Encoding

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [17]:
# Build the encoding object: one-hot encode the column at position 0 of the
# input and pass every other column through unchanged
OHEncoder = ColumnTransformer(
    transformers=[('Modelo', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [18]:
# Show the configured ColumnTransformer
OHEncoder

ColumnTransformer(remainder='passthrough',
                  transformers=[('Modelo', OneHotEncoder(), [0])])

In [19]:
# Independent copy of the original data source. A plain assignment would only
# create a second name for the same DataFrame object, so any later in-place
# change would also alter `dados`.
df_car = dados.copy()

In [20]:
# Inspect the working dataframe
df_car

Unnamed: 0,Modelo,Kilometragem,Preco_Venda,Idade_Veiculo
0,Jaguar F-Type R,69000,18000,6
1,Jaguar F-Type R,35000,34000,3
2,Jaguar F-Type R,57000,26100,5
3,Jaguar F-Type R,22500,40000,2
4,Jaguar F-Type R,46000,31500,4
5,Mustang Mach 1,59000,29400,5
6,Mustang Mach 1,52000,32000,5
7,Mustang Mach 1,72000,19300,6
8,Mustang Mach 1,91000,12000,8
9,Mercedez Benz AMG GLE 53,67000,22000,6


In [21]:
# Feature frame: every column except the target 'Preco_Venda'
x = df_car.drop(columns='Preco_Venda')
x

Unnamed: 0,Modelo,Kilometragem,Idade_Veiculo
0,Jaguar F-Type R,69000,6
1,Jaguar F-Type R,35000,3
2,Jaguar F-Type R,57000,5
3,Jaguar F-Type R,22500,2
4,Jaguar F-Type R,46000,4
5,Mustang Mach 1,59000,5
6,Mustang Mach 1,52000,5
7,Mustang Mach 1,72000,6
8,Mustang Mach 1,91000,8
9,Mercedez Benz AMG GLE 53,67000,6


In [22]:
# Target vector: the sale price column
y = df_car.loc[:, 'Preco_Venda']
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Preco_Venda, dtype: int64

In [23]:
# Encode the features: the transformer one-hot encodes column 0 ('Modelo') and
# passes the remaining columns through unchanged. This is categorical encoding,
# not standardization — the numeric columns are not rescaled.
X = OHEncoder.fit_transform(x)
X

array([[1.00e+00, 0.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.20e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 7.20e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.90e+04, 5.00e+00]])

In [25]:
# Create the linear regression model (not yet fitted)
modelo_v1 = linear_model.LinearRegression()

In [27]:
# Fit the model on the one-hot encoded features X and the target y
modelo_v1.fit(X, y)

LinearRegression()

In [29]:
# Compute R² (coefficient of determination). LinearRegression.score returns R²,
# not a classification accuracy. It is evaluated here on the same X, y used for
# fitting, so it measures in-sample fit only.
modelo_v1.score(X,y)

0.9417050937281082

## Aplicando Label Encoding

In [30]:
from sklearn.preprocessing import LabelEncoder

In [31]:
# Independent copy of the original dataframe. A plain assignment would only
# alias `dados`, so the in-place encoding of 'Modelo' further down would also
# overwrite the raw data (and every other name bound to it).
df_car2 = dados.copy()

In [43]:
# Create the label encoder object
le = LabelEncoder()

In [44]:
# Show the encoder object
le

LabelEncoder()

In [46]:
# Replace the 'Modelo' strings with integer codes. The input is read from
# df_car2's own column so this section does not depend on `df_car`, which
# belongs to the one-hot encoding section.
# Note: the integer codes impose an arbitrary ordering on the models, which a
# linear regression will treat as a numeric magnitude.
df_car2['Modelo'] = le.fit_transform(df_car2['Modelo'])

In [47]:
# Inspect the dataframe after label encoding 'Modelo'
df_car2

Unnamed: 0,Modelo,Kilometragem,Preco_Venda,Idade_Veiculo
0,0,69000,18000,6
1,0,35000,34000,3
2,0,57000,26100,5
3,0,22500,40000,2
4,0,46000,31500,4
5,2,59000,29400,5
6,2,52000,32000,5
7,2,72000,19300,6
8,2,91000,12000,8
9,1,67000,22000,6


In [49]:
# Build the feature array from the label-encoded model plus the numeric columns
colunas_entrada = ['Modelo', 'Kilometragem', 'Idade_Veiculo']
x = df_car2[colunas_entrada].to_numpy()
x

array([[    0, 69000,     6],
       [    0, 35000,     3],
       [    0, 57000,     5],
       [    0, 22500,     2],
       [    0, 46000,     4],
       [    2, 59000,     5],
       [    2, 52000,     5],
       [    2, 72000,     6],
       [    2, 91000,     8],
       [    1, 67000,     6],
       [    1, 83000,     7],
       [    1, 79000,     7],
       [    1, 59000,     5]], dtype=int64)

In [50]:
# Target vector: the sale price column
y = df_car2.loc[:, 'Preco_Venda']
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Preco_Venda, dtype: int64

In [51]:
# Create a second linear regression model for the label-encoded features
modelo_v2 = linear_model.LinearRegression()

In [52]:
# Fit the model on the label-encoded features x and the target y
modelo_v2.fit(x,y)

LinearRegression()

In [53]:
# Compute R² (coefficient of determination). LinearRegression.score returns R²,
# not a classification accuracy. It is evaluated here on the same x, y used for
# fitting, so it measures in-sample fit only.
modelo_v2.score(x,y)

0.8803425650559955