In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset from the CSV file in the working directory; the code below
# reads its 'quality' column as the prediction target.
df = pd.read_csv("winequality-red.csv")

def cal(name, data=None):
    """Return the covariance and correlation between a column and 'quality'.

    Args:
        name: Label of the feature column to compare against 'quality'.
        data: Optional DataFrame containing both the ``name`` column and a
            'quality' column. When omitted, the module-level ``df`` is used,
            which keeps existing ``cal(name)`` calls working unchanged.

    Returns:
        A ``(covar, corr)`` tuple. ``covar`` is the population covariance
        (sum of the products of deviations divided by n, not n - 1) and
        ``corr`` is the coefficient returned by ``pandas.Series.corr``
        (Pearson by default).
    """
    frame = df if data is None else data

    # Flat 1-D arrays are enough for the elementwise arithmetic below.
    x = np.asarray(frame[name])
    y = np.asarray(frame['quality'])

    covar = ((x - x.mean()) * (y - y.mean())).sum() / len(x)

    # Build fresh Series from the raw values so they are paired by position
    # rather than by whatever index the source frame carries.
    corr = pd.Series(x).corr(pd.Series(y))

    return covar, corr

# Find the feature most strongly correlated with quality.
covarr = []
corrr = []
# Every column except the target itself; cal() always compares against the
# 'quality' column, so exclude it by name instead of by position.
namee = df.columns.drop('quality')

for name in namee:
    covar, corr = cal(name)
    covarr.append(covar)
    corrr.append(corr)

df_print = pd.DataFrame({'特徵': namee, '共變數': covarr, '相關係數': corrr})
print(df_print)

# Rank by |r|: a strong negative correlation is as informative as a strong
# positive one. nanargmax skips NaN coefficients (pandas yields NaN when a
# correlation is undefined) instead of letting them disturb the comparison.
feature = namee[int(np.nanargmax(np.abs(corrr)))]
print("\n最相關的特徵:", feature)

# Linear regression prediction using the single selected feature.
# scikit-learn estimators expect a 2-D feature matrix, hence reshape(-1, 1).
x = df[feature].values.reshape(-1, 1)
y = df['quality'].values

# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
XTrain, XTest, yTrain, yTest = train_test_split(x, y, test_size=0.33, random_state=10)

lm = LinearRegression()
lm.fit(XTrain, yTrain)

yPred = lm.predict(XTest)

# Evaluate on the held-out rows only.
mse = mean_squared_error(yTest, yPred)
r2 = r2_score(yTest, yPred)
# RMSE is the square root of MSE; reuse the value instead of recomputing it.
rmse = sqrt(mse)

print("\n線性回歸的預測：")
print("MSE均方誤差:", mse)
print("RMSE均方根誤差:", rmse)
print("R-squared:", r2)

                      特徵       共變數      相關係數
0          fixed acidity  0.174315  0.124052
1       volatile acidity -0.056441 -0.390558
2            citric acid  0.035590  0.226373
3         residual sugar  0.015625  0.013732
4              chlorides -0.004896 -0.128907
5    free sulfur dioxide -0.427639 -0.050656
6   total sulfur dioxide -4.914162 -0.185100
7                density -0.000266 -0.174919
8                     pH -0.007193 -0.057731
9              sulphates  0.034392  0.251397
10               alcohol  0.409533  0.476166

最相關的特徵: alcohol

線性回歸的預測：
MSE均方誤差: 0.4925622432686191
RMSE均方根誤差: 0.701827787472553
R-squared: 0.28192733210237453


---

## 結論

我把所有11種特徵利用cal()這個自訂函式算出共變數及相關係數  

其中，相關係數使用了pandas的corr()計算，其絕對值越接近1代表線性相關越強（正負號只表示相關的方向）。alcohol的相關係數約為<font color=Red>0.476</font>，絕對值是所有特徵中最大的，因此最終找出最相關的特徵為<font color=Red>alcohol</font>

接著使用線性回歸LinearRegression()進行預測，程式使用RMSE均方根誤差、MSE均方誤差以及R-Squared。  

1. MSE（Mean Square Error）是「誤差」的平方的期望值，越小越好，算出來的結果約為0.493左右，誤差算小。
2. RMSE（Root Mean Square Error）是MSE的平方根，同樣也是越小越好，算出來的結果約為0.702左右，誤差算小。
3. R-Squared通常介於0~1之間（在測試資料上，若模型比直接預測平均值還差，也可能為負值），R-Squared越大，表示模型擬合效果越好；算出來的結果約為0.282，表示單一特徵alcohol只能解釋約28%的品質變異，擬合效果有限。

### Results:

- <font color=blue>MSE均方誤差: 0.4925622432686191</font>
- <font color=blue>RMSE均方根誤差: 0.701827787472553</font>
- <font color=blue>R-squared: 0.28192733210237453</font>