## LOOCV，K-fold
#### 生成不同阶数的多项式（polynomial）特征，用 LOOCV 和 K-fold 计算 MSE，再根据 MSE 从不同阶数的多项式模型中选出最合适的一个
#### 适用于样本比较小的数据集

In [93]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report
from numpy import mean
from numpy import absolute
from numpy import sqrt
import numpy as np
import pandas as pd


In [94]:
# Read the Auto workbook from the current working directory into a DataFrame.
dataset = pd.read_excel("Auto.xlsx")
# Preview the first rows; head() returns five rows by default.
dataset.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


### Split Training and Testing dataset

In [95]:
# Hold out half of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset["horsepower"],
    dataset["mpg"],
    test_size=0.5,
    random_state=77,
)

# The single predictor is turned into a one-column 2-D array for the
# feature transformer and estimator used below; the response stays 1-D.
X_train = np.array(X_train).reshape(-1, 1)
y_train = np.array(y_train)

### Fit models with different degrees

In [96]:
##### Polynomial Function
# Fit polynomial regressions of degree 1 to 3 on the training split and, for
# each degree, print the training R^2, the fitted parameters and the MSE on
# the held-out test split.

# The test predictor does not depend on the degree, so reshape it into a
# single column once instead of on every iteration.
X_test_2d = np.array(X_test).reshape(-1, 1)

for i in range(1, 4):
    # Fit the feature expansion on the training data and reuse that same
    # fitted transformer for the test data, so both go through one mapping.
    # include_bias=True adds a constant column; because the regression also
    # fits its own intercept, that column's coefficient is reported as 0.
    transformer = PolynomialFeatures(degree=i, include_bias=True)
    x_ = transformer.fit_transform(X_train)
    model = LinearRegression(fit_intercept=True).fit(x_, y_train)

    x_2 = transformer.transform(X_test_2d)
    predicted = model.predict(x_2)

    # R^2 is measured on the training data, the MSE on the test data.
    r_sq = model.score(x_, y_train)
    intercept, coefficients = model.intercept_, model.coef_
    mse = mean_squared_error(y_test, predicted)

    print(f"R^2: {r_sq}")
    print(f"intercept: {intercept}")
    print(f"coefficients:{coefficients}")
    print(f"MSE:{mse}")
    print("       ")
    print("       ")

R^2: 0.6009749692355413
intercept: 41.03108069621042
coefficients:[ 0.         -0.16991311]
MSE:22.801891362837296
       
       
R^2: 0.6473049445337324
intercept: 55.46316417446556
coefficients:[ 0.         -0.43887587  0.001112  ]
MSE:15.482153729976943
       
       
R^2: 0.6480096140317292
intercept: 59.904366554112535
coefficients:[ 0.00000000e+00 -5.63523882e-01  2.18698431e-03 -2.86031927e-06]
MSE:15.603234678327231
       
       


### Leave-one-out Cross Validation

In [97]:
# Leave-one-out cross-validation of polynomial regressions (degree 1 to 3)
# on the full data set, reporting the mean absolute and mean squared error.

# LogisticRegression is not used in this cell; the import is kept in case
# other cells rely on it.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Define predictor and response variables: the whole data set is used here,
# since cross-validation does its own train/test splitting.
X_train = np.array(dataset["horsepower"]).reshape(-1, 1)
y_train = np.array(dataset["mpg"])
cv = LeaveOneOut()

for i in range(1, 4):
    x_ = PolynomialFeatures(degree=i, include_bias=True).fit_transform(X_train)
    model = LinearRegression()

    # Evaluate both metrics in a single cross-validation pass. Calling
    # cross_val_score once per metric would refit the model on every
    # leave-one-out split twice for the same per-split scores.
    cv_results = cross_validate(
        model,
        x_,
        y_train,
        scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
        cv=cv,
        n_jobs=-1,
    )
    # The scorers return negated errors, hence the absolute values below.
    scores = cv_results["test_neg_mean_absolute_error"]
    scores_1 = cv_results["test_neg_mean_squared_error"]

    print("阶数:", i)
    print("MAE(mean absolute error):", round(mean(absolute(scores)), 2))
    print("MSE(mean squared error):", abs(mean(scores_1)))
    print("       ")
    print("       ")

阶数: 1
MAE(mean absolute error): 3.85
MSE(mean squared error): 24.231513517929226
       
       
阶数: 2
MAE(mean absolute error): 3.27
MSE(mean squared error): 19.248213124489745
       
       
阶数: 3
MAE(mean absolute error): 3.28
MSE(mean squared error): 19.33498406411498
       
       


### K-fold

In [98]:
# 10-fold cross-validation of polynomial regressions (degree 1 to 3) on the
# full data set, reporting the mean absolute and mean squared error.

# LogisticRegression and LeavePOut are not used in this cell; the imports
# are kept in case other cells rely on them.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, LeavePOut
from sklearn.model_selection import cross_validate

# Define predictor and response variables: the whole data set is used here,
# since cross-validation does its own train/test splitting.
X_train = np.array(dataset["horsepower"]).reshape(-1, 1)
y_train = np.array(dataset["mpg"])

# Setting n_splits=len(X_train) would make this equivalent to LOOCV.
# NOTE(review): KFold does not shuffle by default, so each fold is a
# contiguous run of rows in file order. If the rows are ordered in a way
# that relates to mpg, consider shuffle=True with a fixed random_state;
# confirm before changing, as it alters the reported numbers.
cv = KFold(n_splits=10)

for i in range(1, 4):
    x_ = PolynomialFeatures(degree=i, include_bias=True).fit_transform(X_train)
    model = LinearRegression()

    # Evaluate both metrics in a single cross-validation pass instead of
    # refitting every fold once per metric.
    cv_results = cross_validate(
        model,
        x_,
        y_train,
        scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
        cv=cv,
        n_jobs=-1,
    )
    # The scorers return negated errors, hence the absolute values below.
    scores = cv_results["test_neg_mean_absolute_error"]
    scores_1 = cv_results["test_neg_mean_squared_error"]

    print("阶数:", i)
    print("MAE(mean absolute error):", round(mean(absolute(scores)), 2))
    print("MSE(mean squared error):", round(abs(mean(scores_1)), 2))
    print("       ")
    print("       ")

阶数: 1
MAE(mean absolute error): 4.11
MSE(mean squared error): 27.44
       
       
阶数: 2
MAE(mean absolute error): 3.46
MSE(mean squared error): 21.24
       
       
阶数: 3
MAE(mean absolute error): 3.47
MSE(mean squared error): 21.34
       
       
