In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import gaussian_kde, yeojohnson, boxcox, skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [32]:
# White-wine quality dataset, read as CSV straight from a GitHub raw URL.
csv_url = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/winequality-white.csv'
wine_quality = pd.read_csv(csv_url)

In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
wine_quality.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [4]:
# The prediction target is the "quality" column; every other column is a feature.
# (DataFrame.drop with columns=[...] is the same as passing the labels with axis=1;
# axis=0 would drop rows instead.)
y = wine_quality["quality"]
X = wine_quality.drop(columns=["quality"])

<img src="dataset.png" alt="dataset" style="width: 600px">

Training set : 訓練集，在學校上課念書小考

Validation set : 驗證集，模擬考，測試成績，考不好再回學校唸書

Test set : 測試集，正式考試，驗證讀書學習的成效

In [10]:
""" 
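X_train : features of the training set
X_test  : features of the held-out test set
y_train : target (quality) of the training set
y_test  : target (quality) of the held-out test set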
"""
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split( 
    X, y, test_size=0.2, random_state=42)

In [11]:
# Fit a linear regression on the training split.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [17]:
# Predict quality for the held-out rows.
predict_result = lm.predict(X_test)

In [22]:
# Mean squared error: the average of the squared residuals on the held-out rows.
residuals = np.asarray(y_test) - predict_result
mse = np.square(residuals).mean()
print("預測結果的均方誤差為:", mse)

預測結果的均方誤差為: 0.5690247717229266


### 將偏態較大的欄位做處理後再訓練一次

### yeojohnson

In [69]:
# Read the white-wine quality CSV again, so `wine_quality` holds the unmodified data.
csv_url = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/winequality-white.csv'
wine_quality = pd.read_csv(csv_url)

# Skewness of every column, shown as the cell output.
wine_quality.skew()

fixed acidity           0.647751
volatile acidity        1.576980
citric acid             1.281920
residual sugar          1.077094
chlorides               5.023331
free sulfur dioxide     1.406745
total sulfur dioxide    0.390710
density                 0.977773
pH                      0.457783
sulphates               0.977194
alcohol                 0.487342
quality                 0.155796
dtype: float64

In [70]:
# Yeo-Johnson power transform on the most skewed column (chlorides, skew ~ 5.02).
# With no lambda supplied, scipy fits lambda by maximising the log-likelihood and
# returns (transformed values, fitted lambda).
chlorides_transformed, lambda_value = yeojohnson(wine_quality["chlorides"])
wine_quality["chlorides"] = chlorides_transformed

# Only chlorides is transformed in this run. The same call could be applied to the
# other right-skewed columns: "volatile acidity", "free sulfur dioxide",
# "citric acid" and "residual sugar".
# NOTE(review): lambda is fitted on all rows before the train/test split, so the
# held-out rows influence the transform — confirm this is acceptable.

# Skewness after the transform, shown as the cell output.
wine_quality.skew()

fixed acidity           0.647751
volatile acidity        1.576980
citric acid             1.281920
residual sugar          1.077094
chlorides              -0.066480
free sulfur dioxide     1.406745
total sulfur dioxide    0.390710
density                 0.977773
pH                      0.457783
sulphates               0.977194
alcohol                 0.487342
quality                 0.155796
dtype: float64

In [71]:
# Target is "quality"; every remaining column is a feature.
y = wine_quality["quality"]
X = wine_quality.drop(columns=["quality"])

# Same 80/20 split and seed as the first (untransformed) run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [72]:
# Refit a fresh linear regression on the training split of the transformed data.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [73]:
# Predict on the held-out rows, then report the mean squared error
# (the average of the squared residuals).
predict_result = lm.predict(X_test)

residuals = np.asarray(y_test) - predict_result
mse = np.square(residuals).mean()
print("預測結果的均方誤差為:", mse)

預測結果的均方誤差為: 0.568409377105998


## 去除離異值

In [91]:
# Read the white-wine quality CSV again, so `wine_quality` holds the unmodified data.
csv_url = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/winequality-white.csv'
wine_quality = pd.read_csv(csv_url)

# Skewness of every column, shown as the cell output.
wine_quality.skew()

fixed acidity           0.647751
volatile acidity        1.576980
citric acid             1.281920
residual sugar          1.077094
chlorides               5.023331
free sulfur dioxide     1.406745
total sulfur dioxide    0.390710
density                 0.977773
pH                      0.457783
sulphates               0.977194
alcohol                 0.487342
quality                 0.155796
dtype: float64

In [92]:
# Outlier fences for chlorides using the 1.5 * IQR rule.
chlorides_col = wine_quality['chlorides']
Q1, Q3 = chlorides_col.quantile(0.25), chlorides_col.quantile(0.75)
IQR = Q3 - Q1

# Lower / upper fence. The names keep their original spelling because the
# filtering cell that follows reads `minimun` and `maximun`.
minimun = Q1 - 1.5 * IQR
maximun = Q3 + 1.5 * IQR

In [93]:
# Keep only the rows whose chlorides value lies inside the fences (both ends inclusive),
# then report how many rows were dropped.
rows_before = len(wine_quality)
mask = wine_quality['chlorides'].between(minimun, maximun)
wine_quality = wine_quality[mask]
print("總共去除", rows_before - len(wine_quality), "個離異值")

總共去除 208 個離異值


In [94]:
# Display the filtered frame (the rendered output elides the middle rows).
wine_quality

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [95]:
# Target is "quality"; every remaining column is a feature.
y = wine_quality["quality"]
X = wine_quality.drop(columns=["quality"])

# 80/20 split with the same seed as the earlier runs.
# NOTE(review): rows were dropped before this split, so the held-out rows are not
# the same ones as in the earlier runs — MSE comparisons across sections are only approximate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [96]:
# Refit a fresh linear regression on the training split of the outlier-filtered data.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [97]:
# Predict on the held-out rows, then report the mean squared error
# (the average of the squared residuals).
predict_result = lm.predict(X_test)

residuals = np.asarray(y_test) - predict_result
mse = np.square(residuals).mean()
print("預測結果的均方誤差為:", mse)

預測結果的均方誤差為: 0.5723566913519271
