# Vinhos 

### A partir de 'wineanalysis.csv' treinar um algoritimo para classificar os vinhos no arquivo 'winequality.csv'.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
tqdm.pandas(desc="Operation Progress")

In [2]:
wine_a = pd.read_csv('dados/wineanalysis.csv')
wine_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 14 columns):
Unnamed: 0              6497 non-null int64
fixed.acidity           6497 non-null float64
volatile.acidity        6497 non-null float64
citric.acid             6497 non-null float64
residual.sugar          6497 non-null float64
chlorides               6497 non-null float64
free.sulfur.dioxide     6497 non-null float64
total.sulfur.dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
type                    6497 non-null object
dtypes: float64(11), int64(2), object(1)
memory usage: 710.7+ KB


In [3]:
#tratando os nomes da variáveis que vieram com pontos
del wine_a['Unnamed: 0']
wine_a.rename(
    columns={
        'fixed.acidity':'fixed_acidity',
        'volatile.acidity':'volatile_acidity',
        'citric.acid':'citric_acid',
        'residual.sugar':'residual_sugar',
        'free.sulfur.dioxide':'free_sulfur_dioxide',
        'total.sulfur.dioxide':'total_sulfur_dioxide'},
    inplace=True)

In [4]:
wine_a.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [5]:
#verificando as classificações, ou as notas
wine_a['quality'].value_counts()

6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64

In [6]:
wine_a.groupby('type').alcohol.count()

type
red      1599
white    4898
Name: alcohol, dtype: int64

### Criando Dummies

In [7]:
n_dummies = wine_a["type"].str.get_dummies()

In [8]:
n_dummies.head()

Unnamed: 0,red,white
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [9]:
wine_a = wine_a.join(n_dummies) #concatenando
wine_a.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,type,red,white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,1,0


In [10]:
wine_a.shape

(6497, 15)

# 2 - Treinando o Modelo de Machine Learning

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

### 2.1 Instanciar os modelos

In [12]:
modelo_MQO = LinearRegression()
modelo_RF = RandomForestRegressor()
modelo_gradient = GradientBoostingRegressor()

### 2.2 Separar os modelos em explicativas X e target Y

In [13]:
wine_a_numeric = wine_a.select_dtypes(include=[np.number]) 
numericas = list(wine_a_numeric)

In [14]:
numericas

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality',
 'red',
 'white']

In [15]:
wine_a.corr()['quality']

fixed_acidity          -0.076743
volatile_acidity       -0.265699
citric_acid             0.085532
residual_sugar         -0.036980
chlorides              -0.200666
free_sulfur_dioxide     0.055463
total_sulfur_dioxide   -0.041385
density                -0.305858
pH                      0.019506
sulphates               0.038485
alcohol                 0.444319
quality                 1.000000
red                    -0.119323
white                   0.119323
Name: quality, dtype: float64

In [16]:
wine_a.shape

(6497, 15)

In [17]:
#criando o x e o y
X = wine_a[numericas].drop(['quality'], axis = 1)
y = wine_a['quality']

### 2.3 - Separar o arquivo em treino e teste

In [18]:
y.shape

(6497,)

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [20]:
X_train.shape

(5197, 13)

In [21]:
y_train.shape

(5197,)

### 2.4 - Treinar com o modelo .fit

__Modelo Linear__

In [22]:
modelo_MQO.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [23]:
modelo_MQO.coef_

array([ 1.02373712e-01, -1.40001548e+00, -5.10448525e-02,  6.80199609e-02,
       -7.22322153e-01,  4.62593568e-03, -1.47817508e-03, -1.20165381e+02,
        5.78856956e-01,  6.87440333e-01,  2.05267738e-01,  1.75334213e-01,
       -1.75334213e-01])

__Modelo Gradient__

In [24]:
modelo_gradient = GradientBoostingRegressor(max_depth=3)
from sklearn.model_selection import cross_val_score

In [25]:
modelo_gradient.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

__Modelo Randon Forest__

In [26]:
modelo_RF.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### 2.5 - Analisando as Métricas

In [27]:
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error

In [28]:
print("Score MQO Treino: " , modelo_MQO.score(X_train, y_train))
print("Score MQO Teste: " , modelo_MQO.score(X_test, y_test))

Score MQO Treino:  0.29712004282162124
Score MQO Teste:  0.29162651381471905


In [29]:
print("Score Gradient Treino: " , modelo_gradient.score(X_train, y_train))
print("Score Grqdient Teste: " , modelo_gradient.score(X_test, y_test))

Score Gradient Treino:  0.4703022928588888
Score Grqdient Teste:  0.3710523398330712


In [30]:
print("Score Randon Forest Treino: " , modelo_RF.score(X_train, y_train))
print("Score Randon Forest Teste: " , modelo_RF.score(X_test, y_test))

Score Randon Forest Treino:  0.9021610629130329
Score Randon Forest Teste:  0.46518224986336265


In [31]:
# primeiro modelo - regressão linear
yhat_train = modelo_MQO.predict(X_train) # previsao dos dados de treino para calcular as métricas
yhat_test = modelo_MQO.predict(X_test) # previsao dos dados de teste para calcular métricas

print('TREINO - MSE:', mean_squared_error(y_train, yhat_train), 'MAE:', median_absolute_error(y_train, yhat_train),'R2:', r2_score(y_train, yhat_train))
print('TESTE - MSE:', mean_squared_error(y_test, yhat_test), 'MAE:', median_absolute_error(y_test, yhat_test), 'R2:', r2_score(y_train, yhat_train))

TREINO - MSE: 0.5369169561369598 MAE: 0.4635267716653857 R2: 0.2971200428216212
TESTE - MSE: 0.5360714858345726 MAE: 0.4663600366284655 R2: 0.2971200428216212


In [32]:
# segundo modelo -  gradient
yhat_train = modelo_gradient.predict(X_train) # previsao dos dados de treino para calcular as métricas
yhat_test = modelo_gradient.predict(X_test) # previsao dos dados de teste para calcular métricas

print('TREINO - MSE:', mean_squared_error(y_train, yhat_train), 'MAE:', median_absolute_error(y_train, yhat_train), 'R2:', r2_score(y_train, yhat_train))
print('TESTE - MSE - test', mean_squared_error(y_test, yhat_test), 'MAE - test', median_absolute_error(y_test, yhat_test),'R2 - test',r2_score(y_test, yhat_test))


TREINO - MSE: 0.4046262490292569 MAE: 0.4084681891262969 R2: 0.4703022928588888
TESTE - MSE - test 0.47596488755322525 MAE - test 0.4347838295245894 R2 - test 0.3710523398330712


In [33]:
# terceiro modelo -  random forest
yhat_train = modelo_RF.predict(X_train) # previsao dos dados de treino para calcular as métricas
yhat_test = modelo_RF.predict(X_test) # previsao dos dados de teste para calcular métricas

print('TREINO - MSE:', mean_squared_error(y_train, yhat_train), 'MAE:', median_absolute_error(y_train, yhat_train), 'R2:', r2_score(y_train, yhat_train))
print('TESTE - MSE - test', mean_squared_error(y_test, yhat_test), 'MAE - test', median_absolute_error(y_test, yhat_test),'R2 - test',r2_score(y_test, yhat_test))


TREINO - MSE: 0.0747373484702713 MAE: 0.09999999999999964 R2: 0.9021610629130329
TESTE - MSE - test 0.4047307692307691 MAE - test 0.2999999999999998 R2 - test 0.46518224986336265


In [34]:
cross_val_score(modelo_gradient, X, y, cv=10, )

array([0.2640259 , 0.3755655 , 0.34188007, 0.26204913, 0.29362839,
       0.31904184, 0.35643917, 0.29200506, 0.35872288, 0.29695268])

In [35]:
np.array([0.26292378, 0.3755655 , 0.34278168, 0.26204913, 0.29348888,
       0.31704058, 0.35628847, 0.29026625, 0.36054368, 0.29726546]).mean()

0.31582134100000003

In [36]:
cross_val_score(modelo_MQO, X, y, cv=10, )

array([0.21270177, 0.36292991, 0.24462452, 0.20259809, 0.27032523,
       0.21862335, 0.33865399, 0.29197893, 0.27207201, 0.14940061])

In [37]:
np.array([0.21270177, 0.36292991, 0.24462452, 0.20259809, 0.27032523,
       0.21862335, 0.33865399, 0.29197893, 0.27207201, 0.14940061]).mean()

0.25639084100000004

In [38]:
cross_val_score(modelo_RF, X, y, cv=10, )

array([0.19090162, 0.27094616, 0.26344678, 0.22722029, 0.18067271,
       0.19524827, 0.34228525, 0.23376795, 0.31467042, 0.22335129])

In [39]:
np.array([0.21754613, 0.22062441, 0.27075503, 0.17431776, 0.16564617,
       0.21467876, 0.3471465 , 0.21983501, 0.32103004, 0.20276366]).mean()

0.235434347

# 3 - Treinando o  modelo escolhido

In [40]:
X = wine_a[numericas].drop(['quality'], axis = 1)
Y = wine_a['quality']

modelo_final_RF = RandomForestRegressor().fit(X,Y)

print(modelo_final_RF.score(X_train, y_train))
print(modelo_final_RF.score(X_test, y_test))

0.9093879600192483
0.9164865031811724


# 4 - Rodar as previsões

In [41]:
wine = pd.read_csv('dados/winequality_test.csv')
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
type                    6497 non-null object
fixed_acidity           6497 non-null float64
volatile_acidity        6497 non-null float64
citric_acid             6497 non-null float64
residual_sugar          6497 non-null float64
chlorides               6497 non-null float64
free_sulfur_dioxide     6497 non-null float64
total_sulfur_dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
dtypes: float64(11), int64(1), object(1)
memory usage: 659.9+ KB


In [42]:
n_dummies = wine["type"].str.get_dummies()

In [43]:
wine = wine.join(n_dummies) #concatenando
wine.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,red,white
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0,1
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,0,1
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,0,1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0,1
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0,1


In [44]:
wine_numeric = wine.select_dtypes(include=[np.number]) 
numericas = list(wine)

In [45]:
X_Real = wine[list(X)] #determinando o X

In [46]:
Y_Real = wine['quality'] #determinando o Y

In [47]:
yhat_Real = modelo_final_RF.predict(X_Real) #

In [48]:
yhat_Real

array([6. , 6. , 6. , ..., 6. , 5.1, 6.1])

In [49]:
wine['quality']= yhat_Real
wine.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,red,white
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0,0,1
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0,0,1
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0,0,1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.1,0,1
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.1,0,1


In [51]:
#verificando as classificações, ou as notas
wine['quality'].value_counts()

6.0    1042
5.0     772
5.1     481
5.9     478
7.0     359
6.1     341
5.2     307
5.8     306
5.3     223
6.2     206
6.9     204
5.7     170
6.8     164
5.4     119
6.3     116
5.6     116
6.7     115
4.9     106
6.6      99
5.5      97
6.5      85
6.4      75
7.1      53
4.8      39
4.2      37
4.3      37
8.0      33
4.5      31
4.4      29
7.8      24
4.7      23
7.5      23
7.6      22
4.1      21
7.3      19
7.9      19
4.0      19
7.7      17
7.4      17
7.2      16
4.6      15
3.9       4
3.6       3
3.8       3
3.7       2
8.1       2
3.5       2
8.4       2
3.2       1
8.3       1
8.2       1
3.3       1
Name: quality, dtype: int64

In [50]:
wine.to_csv('dados/resultado_winequality_RF2.csv', index=False)