# Setup

In [25]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure into the IMAGES_PATH directory.

    Args:
        fig_id: Base file name (without extension) for the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: DPI value passed to ``savefig``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [26]:
import numpy as np
import pandas as pd

# Read the red-wine quality table from a CSV file in the working directory.
wine = pd.read_csv(filepath_or_buffer="winequality-red.csv")

In [27]:
# Show the loaded frame; the rendered output is truncated to the first and last rows.
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [28]:
# Print per-column non-null counts and dtypes to check for missing values.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [29]:
# Engineer an extra column (sulphates minus chlorides), then re-inspect the
# schema to confirm the column was added.
wine['new_feature'] = wine['sulphates'].sub(wine['chlorides'])
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
 12  new_feature           1599 non-null   float64
dtypes: float64(12), int64(1)
memory usage: 162.5 KB


In [30]:
# Pairwise correlation matrix of all columns, then rank every column by its
# correlation with the target, strongest positive first.
corr = wine.corr()
corr.loc[:, 'quality'].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.476166
new_feature             0.307735
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64

In [31]:
from sklearn.utils import shuffle

# Shuffle the rows reproducibly.
# - random_state pins the permutation, so it no longer depends on how much of
#   the global NumPy RNG stream earlier cells have consumed.
# - sort_index() first restores the original row order (the frame is loaded
#   with a RangeIndex), so re-running this cell yields the same order instead
#   of shuffling an already-shuffled frame.
wine = shuffle(wine.sort_index(), random_state=42)

In [32]:
# Preview the first rows to confirm the row order has been shuffled.
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,new_feature
803,7.7,0.56,0.08,2.5,0.114,14.0,46.0,0.9971,3.24,0.66,9.6,6,0.546
124,7.8,0.5,0.17,1.6,0.082,21.0,102.0,0.996,3.39,0.48,9.5,5,0.398
350,10.7,0.67,0.22,2.7,0.107,17.0,34.0,1.0004,3.28,0.98,9.9,6,0.873
682,8.5,0.46,0.31,2.25,0.078,32.0,58.0,0.998,3.33,0.54,9.8,5,0.462
1326,6.7,0.46,0.24,1.7,0.077,18.0,34.0,0.9948,3.39,0.6,10.6,6,0.523


In [33]:
# Feature matrix: every column except the target ('quality') and 'fixed acidity'.
# NOTE(review): the reason for excluding 'fixed acidity' is not stated here — confirm.
wine_X = wine.drop(columns=['quality', 'fixed acidity'])
wine_X

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,new_feature
803,0.560,0.08,2.50,0.114,14.0,46.0,0.99710,3.24,0.66,9.6,0.546
124,0.500,0.17,1.60,0.082,21.0,102.0,0.99600,3.39,0.48,9.5,0.398
350,0.670,0.22,2.70,0.107,17.0,34.0,1.00040,3.28,0.98,9.9,0.873
682,0.460,0.31,2.25,0.078,32.0,58.0,0.99800,3.33,0.54,9.8,0.462
1326,0.460,0.24,1.70,0.077,18.0,34.0,0.99480,3.39,0.60,10.6,0.523
...,...,...,...,...,...,...,...,...,...,...,...
1130,0.600,0.00,1.90,0.058,5.0,10.0,0.99770,3.18,0.63,10.4,0.572
1294,0.635,0.10,2.10,0.073,25.0,60.0,0.99638,3.29,0.75,10.9,0.677
860,0.620,0.06,2.70,0.077,15.0,85.0,0.99746,3.51,0.54,9.5,0.463
1459,0.200,0.35,1.70,0.054,7.0,15.0,0.99458,3.32,0.80,11.9,0.746


In [34]:
# Target vector: the integer quality score of each wine.
wine_y = wine.loc[:, 'quality']
wine_y

803     6
124     5
350     6
682     5
1326    6
       ..
1130    6
1294    6
860     5
1459    7
1126    6
Name: quality, Length: 1599, dtype: int64

# ExtraTreesRegressor

In [35]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees ensemble with 400 estimators and a fixed seed so fits are repeatable.
wine_reg = ExtraTreesRegressor(
    random_state=42,
    n_estimators=400,
)

In [36]:
# Fit the ensemble on the full feature matrix and target.
wine_reg.fit(X=wine_X, y=wine_y)

ExtraTreesRegressor(n_estimators=400, random_state=42)

In [37]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation. The scorer reports negated MSE values, so flip the
# sign before taking the square root to obtain one RMSE per fold.
wine_scores = cross_val_score(
    estimator=wine_reg,
    X=wine_X,
    y=wine_y,
    scoring="neg_mean_squared_error",
    cv=10,
)
Extra_rmse_scores = np.sqrt(np.negative(wine_scores))
Extra_rmse_scores

array([0.5363476 , 0.53391391, 0.58652043, 0.54068078, 0.55958582,
       0.5846795 , 0.54314118, 0.57317022, 0.51560322, 0.51536128])

In [38]:
# Average the per-fold RMSE values into a single summary score.
np.mean(Extra_rmse_scores)

0.5489003935342096

# ExtraTreesRegressor performs better
# RMSE (10-fold CV mean): 0.549