# Notas da Aula 3: Conceitos de teste, treino e validação

In [191]:
# manipulação
import numpy as np
import pandas as pd

# visualização
import seaborn as sns
import matplotlib.pyplot as plt

# separação em treino/testes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [192]:
df_vinho_branco = pd.read_csv('winequality-white.csv', sep=';')
df_vinho_tinto = pd.read_csv('winequality-red.csv', sep=';')

In [193]:
df_vinho_branco.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [194]:
df_vinho_tinto.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [195]:
X = df_vinho_branco.drop('quality',axis=1)
y = df_vinho_branco['quality']

In [196]:
from sklearn.preprocessing import StandardScaler

In [197]:
X_std = StandardScaler().fit_transform(X)

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.2, random_state = 10)

In [199]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [200]:
y_pred = lr.predict(X_test)

In [201]:
from sklearn.metrics import r2_score, mean_absolute_error

In [202]:
r2_score(y_test,y_pred)

0.2742868892098238

In [203]:
mean_absolute_error(y_test, y_pred)

0.5784674680133184

## Vinho Tinto

In [204]:
X2 = df_vinho_tinto.drop('quality',axis=1)
y2 = df_vinho_tinto['quality']

X2_std = StandardScaler().fit_transform(X2)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_std, y2, test_size = 0.2, random_state = 10)

lr2 = LinearRegression()
lr2.fit(X2_train, y2_train)

y2_pred = lr2.predict(X2_test)

In [205]:
r2_score(y2_test,y2_pred)

0.3707576574165461

In [206]:
mean_absolute_error(y2_test, y2_pred)

0.5298713595576603

In [207]:
lr2.coef_

array([ 0.03723437, -0.22264548, -0.05263842,  0.00156068, -0.08297405,
        0.03118181, -0.11033439, -0.0127809 , -0.05774104,  0.14003403,
        0.28659045])

In [208]:
lr2.intercept_

5.625909735031051

## Seleção de variaveis mais importantes

In [209]:
from sklearn.feature_selection import RFE

In [210]:
X = df_vinho_branco.drop('quality', axis = 1)
y = df_vinho_branco['quality']
X_std = StandardScaler().fit_transform(X)

estimator = LinearRegression()

selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)
selector.support_

array([False,  True, False, False,  True, False, False,  True, False,
        True,  True])

In [211]:
X.loc[:,selector.support_].head() # Mostra as colunas mais importantes para o conjunto de vinhos brancos

Unnamed: 0,volatile acidity,chlorides,density,sulphates,alcohol
0,0.27,0.045,1.001,0.45,8.8
1,0.3,0.049,0.994,0.49,9.5
2,0.28,0.05,0.9951,0.44,10.1
3,0.23,0.058,0.9956,0.4,9.9
4,0.23,0.058,0.9956,0.4,9.9


In [212]:
X2 = df_vinho_tinto.drop('quality', axis = 1)
y2 = df_vinho_tinto['quality']
X2_std = StandardScaler().fit_transform(X2)

estimator2 = LinearRegression()

selector2 = RFE(estimator2, n_features_to_select=5, step=1)
selector2 = selector2.fit(X2, y2)
selector2.support_

array([False,  True, False, False,  True, False, False,  True,  True,
        True, False])

In [213]:
X2.loc[:,selector2.support_].head() # Mostra as colunas mais importantes para o conjunto de vinhos tintos

Unnamed: 0,volatile acidity,chlorides,density,pH,sulphates
0,0.7,0.076,0.9978,3.51,0.56
1,0.88,0.098,0.9968,3.2,0.68
2,0.76,0.092,0.997,3.26,0.65
3,0.28,0.075,0.998,3.16,0.58
4,0.7,0.076,0.9978,3.51,0.56


## Novo modelo com caracteristicas mais significativas

In [214]:
# Vinho Branco
X_ajustado = X.loc[:,selector.support_]

X_ajustado_std = StandardScaler().fit_transform(X_ajustado)
X_ajustado_train, X_ajustado_test, y_train, y_test = train_test_split(X_ajustado_std, y, test_size=0.2, random_state=10)

modelo = LinearRegression()
modelo = modelo.fit(X_ajustado_train, y_train)
y_pred = modelo.predict(X_ajustado_test)

In [215]:
r2_score(y_test,y_pred)

0.24693424536163644

In [216]:
mean_absolute_error(y_test, y_pred)

0.5882461316332739

In [217]:
# Vinho Tinto
X2_ajustado = X2.loc[:,selector2.support_]

X2_ajustado_std = StandardScaler().fit_transform(X2_ajustado)
X2_ajustado_train, X2_ajustado_test, y2_train, y2_test = train_test_split(X2_ajustado_std, y2, test_size=0.2, random_state=10)

modelo = LinearRegression()
modelo = modelo.fit(X2_ajustado_train, y2_train)
y2_pred = modelo.predict(X2_ajustado_test)

In [218]:
r2_score(y2_test,y2_pred)

0.23533750534139275

In [219]:
mean_absolute_error(y2_test, y2_pred)

0.5909146708368256