### 波士頓房價預測
- 計算linear regression training data & testing data 的 MSE&RMSE。

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
# 導入數據集
boston = pd.read_csv('HousingData.csv')
# 处理缺失值（这里假设我们选择删除含有缺失值的行）
boston = boston.dropna()
boston = pd.get_dummies(boston)
# 分割數據集為訓練資料跟測試資料
X = boston.drop('MEDV', axis=1)  # 特征变量
y = boston['MEDV']  # 目标变量（房價）
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42, 
    shuffle=True, 
)

#標準化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#多元線性回歸模型
model = LinearRegression()
model.fit(X_train_scaled, y_train)
#印出各項參數及截距項
w_0 = model.intercept_
w_1 = model.coef_
print('Interception : ', w_0)
print('Coeficient : ', w_1)
# 用mse及rmse評估模型效能
mse_train = mean_squared_error(y_train, model.predict(X_train_scaled))
mse_test = mean_squared_error(y_test, model.predict(X_test_scaled))
rmse_train = root_mean_squared_error(y_train, model.predict(X_train_scaled))
rmse_test = root_mean_squared_error(y_test, model.predict(X_test_scaled))
print(f'Mean Squared Error in training data: {mse_train}')
print(f'Mean Squared Error in testing data: {mse_test}')
print(f'Root Mean Squared Error in training data: {rmse_train}')
print(f'Root Mean Squared Error in testing data: {rmse_test}')

Interception :  21.993333333333336
Coeficient :  [-1.09299765  0.97431655  0.17589472  0.51612013 -1.98125482  2.99665496
 -0.61026711 -2.94464351  2.06984891 -2.07443678 -2.0915885   0.89475279
 -2.94995322]
Mean Squared Error in training data: 16.69221271088402
Mean Squared Error in testing data: 31.454047664950842
Root Mean Squared Error in training data: 4.085610445317079
Root Mean Squared Error in testing data: 5.608390826694484


- 用Cross validation 找出最好的 $\lambda$ 

In [3]:
from sklearn.linear_model import RidgeCV
import numpy as np

alphas = np.logspace(-6, 6, 100)
ridge_model = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_model.fit(X_train_scaled, y_train)
best_alpha = ridge_model.alpha_
print(f"best alpha : {best_alpha}")


best alpha : 6.135907273413176


- 計算ridge regression training data & testing data 的 MSE&RMSE($\lambda$ = best_alpha)。

In [4]:
from sklearn.linear_model import Ridge
# ridge regression 模型
Ridge_model = Ridge(alpha=best_alpha)
Ridge_model.fit(X_train_scaled, y_train)
w_0 = Ridge_model.intercept_
w_1 =Ridge_model.coef_
print('Interception : ', w_0)
print('Coeficient : ', w_1)
# 用mse及rmse評估模型效能
mse_train = mean_squared_error(y_train, Ridge_model.predict(X_train_scaled))
mse_test = mean_squared_error(y_test, Ridge_model.predict(X_test_scaled))
rmse_train = root_mean_squared_error(y_train, Ridge_model.predict(X_train_scaled))
rmse_test = root_mean_squared_error(y_test, Ridge_model.predict(X_test_scaled))
print(f'Mean Squared Error in training data: {mse_train}')
print(f'Mean Squared Error in testing data: {mse_test}')
print(f'Root Mean Squared Error in training data: {rmse_train}')
print(f'Root Mean Squared Error in testing data: {rmse_test}')

Interception :  21.993333333333336
Coeficient :  [-1.02098346  0.86101439 -0.03053942  0.54594324 -1.73092966  3.02100604
 -0.59223441 -2.66643297  1.56560399 -1.59366338 -2.03451539  0.88146107
 -2.88941807]
Mean Squared Error in training data: 16.75297019676133
Mean Squared Error in testing data: 31.940995362225927
Root Mean Squared Error in training data: 4.093039237139235
Root Mean Squared Error in testing data: 5.651636520710256


In [5]:
from sklearn.metrics import mean_squared_error, r2_score
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

# 5. 訓練回歸模型
ridge_model = Ridge(alpha=best_alpha) 
ridge_model.fit(X_train_scaled, y_train)

# 6. 预测
y_pred_linear = linear_model.predict(X_test_scaled)
y_pred_ridge = ridge_model.predict(X_test_scaled)

# 7. 計算均方誤差
mse_linear = mean_squared_error(y_test, y_pred_linear)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
score_linear = r2_score(y_test, y_pred_linear)
score_ridge = r2_score(y_test, y_pred_ridge)

# 8. 输出结果
print(f"The MSE of linear regression: {mse_linear}")
print(f"The MSE of ridge regression: {mse_ridge}")
print(f"The score of linear regression (R^2): {score_linear}")
print(f"The score of ridge regression  (R^2): {score_ridge}")
print(f"best alpha: {best_alpha}")
# 9. 對比模型
if mse_linear < mse_ridge:
    print("The MSE of linear regression is lower than ridge regression。")
else:
    print("The MSE of ridge regression is lower than linear regression。")

The MSE of linear regression: 31.454047664950842
The MSE of ridge regression: 31.940995362225927
The score of linear regression (R^2): 0.6270849941673194
The score of ridge regression  (R^2): 0.6213118070308387
best alpha: 6.135907273413176
The MSE of linear regression is lower than ridge regression。


- 在高次方做CROSS VALIDATION

- 在一次回歸當中看不出什麼區別，將資料運用到高次方回歸當中如下所示:

In [None]:
from sklearn.linear_model import RidgeCV
import numpy as np
alphas = np.logspace(-6, 6, 100)
ridge_model = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_model.fit(X_train_scaled, y_train)
best_alpha = ridge_model.alpha_
print(f"best alpha : {best_alpha}")

In [6]:
## 導入多項式套件，建構多項式迴歸模型所需的套件
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
regressor = make_pipeline(PolynomialFeatures(4), LinearRegression())
regressor.fit(X_train_scaled, y_train)
mse_train = mean_squared_error(y_train, regressor.predict(X_train_scaled))
mse_test = mean_squared_error(y_test, regressor.predict(X_test_scaled))
rmse_train = root_mean_squared_error(y_train, regressor.predict(X_train_scaled))
rmse_test = root_mean_squared_error(y_test, regressor.predict(X_test_scaled))
print(f'Mean Squared Error in training data: {mse_train}')
print(f'Mean Squared Error in testing data: {mse_test}')
print(f'Root Mean Squared Error in training data: {rmse_train}')
print(f'Root Mean Squared Error in testing data: {rmse_test}')

Mean Squared Error in training data: 7.794822005268475e-26
Mean Squared Error in testing data: 441.6247125109677
Root Mean Squared Error in training data: 2.7919208450936564e-13
Root Mean Squared Error in testing data: 21.014868843534753


In [7]:
from sklearn.linear_model import Ridge
# ridge regression 模型
ridge_regressor = make_pipeline(PolynomialFeatures(4), Ridge(alpha=best_alpha))
ridge_regressor.fit(X_train_scaled, y_train)
# 用mse及rmse評估模型效能
mse_train = mean_squared_error(y_train, ridge_regressor.predict(X_train_scaled))
mse_test = mean_squared_error(y_test, ridge_regressor.predict(X_test_scaled))
rmse_train = root_mean_squared_error(y_train, ridge_regressor.predict(X_train_scaled))
rmse_test = root_mean_squared_error(y_test, ridge_regressor.predict(X_test_scaled))
print(f'Mean Squared Error in training data: {mse_train}')
print(f'Mean Squared Error in testing data: {mse_test}')
print(f'Root Mean Squared Error in training data: {rmse_train}')
print(f'Root Mean Squared Error in testing data: {rmse_test}')

Mean Squared Error in training data: 0.49615676149933885
Mean Squared Error in testing data: 42.95440057130164
Root Mean Squared Error in training data: 0.7043839588600374
Root Mean Squared Error in testing data: 6.553960678193121


In [9]:
from sklearn.metrics import mean_squared_error, r2_score
regressor = make_pipeline(PolynomialFeatures(4), LinearRegression())
regressor.fit(X_train_scaled, y_train)

# 訓練回歸模型
ridge_regressor = make_pipeline(PolynomialFeatures(4), Ridge(alpha=best_alpha))
ridge_regressor.fit(X_train_scaled, y_train)

# 6. 预测
y_pred_linear = regressor.predict(X_test_scaled)
y_pred_ridge = ridge_regressor.predict(X_test_scaled)

# 7. 計算均方誤差
mse_linear = mean_squared_error(y_test, y_pred_linear)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
score_linear = r2_score(y_test, y_pred_linear)
score_ridge = r2_score(y_test, y_pred_ridge)
# 8. 输出结果
print(f"The MSE of linear regression: {mse_linear}")
print(f"The MSE of ridge regression: {mse_ridge}")
print(f"The score of linear regression (R^2): {score_linear}")
print(f"The score of ridge regression  (R^2): {score_ridge}")
print(f"best alpha: {best_alpha}")
# 9. 對比模型
if mse_linear < mse_ridge:
    print("The MSE of linear regression is lower than ridge regression。")
else:
    print("The MSE of ridge regression is lower than linear regression。")

The MSE of linear regression: 441.6247125109677
The MSE of ridge regression: 42.95440057130164
The score of linear regression (R^2): -4.235843856922596
The score of ridge regression  (R^2): 0.4907383396180378
best alpha: 6.135907273413176
The MSE of ridge regression is lower than linear regression。


### 當多項式回歸次方項越高就越能發現ridge regression 對測試資料的mse下降越多。