In [7]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import PolynomialFeatures


In [8]:
# Fetch the diabetes dataset bundle, then unpack the feature matrix and the
# target vector in a single step. `data` itself is kept for later inspection.
data = datasets.load_diabetes()
X, y = data.data, data.target


In [9]:
# Accumulator: receives one summary dict per polynomial degree evaluated.
results = list()

def mape_error(y_true, y_pred):
    """Mean Absolute Percentage Error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    Inputs are coerced with ``np.asarray`` so plain Python sequences work.
    The denominator is floored at machine epsilon, so a zero target yields a
    finite (possibly very large) contribution instead of turning the whole
    mean into ``nan``/``inf``. For non-zero targets the result is unchanged.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Guard against division by zero when a target value is exactly 0.
    denominator = np.maximum(np.abs(y_true), np.finfo(np.float64).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

# Build the scorers once instead of on every iteration.
# make_scorer(..., greater_is_better=False) negates the metric so that a
# larger score is always better; the MAE and MAPE values recorded below are
# therefore negative (closer to zero means a smaller error).
scoring = {
    'r2': 'r2',
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'mape': make_scorer(mape_error, greater_is_better=False),
}

for degree in range(9):
    # Expand the raw features into all polynomial terms up to `degree`.
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)

    model = LinearRegression()

    # A single cross_validate call fits the model once per fold and evaluates
    # all three metrics on that fit, rather than refitting for each metric.
    cv_scores = cross_validate(model, X_poly, y, cv=5, scoring=scoring)
    scores_r2 = cv_scores['test_r2']
    scores_mae = cv_scores['test_mae']
    scores_mape = cv_scores['test_mape']

    # Summarise the five fold scores per metric as mean and standard deviation.
    results.append({
        'degree': degree,
        'mean_r2': np.mean(scores_r2),
        'std_r2': np.std(scores_r2),
        'mean_mae': np.mean(scores_mae),
        'std_mae': np.std(scores_mae),
        'mean_mape': np.mean(scores_mape),
        'std_mape': np.std(scores_mape)
    })


In [10]:
# Tabulate the per-degree cross-validation summaries (one row per degree).
df = pd.DataFrame(data=results)
df


Unnamed: 0,degree,mean_r2,std_r2,mean_mae,std_mae,mean_mape,std_mape
0,0,-0.027506,0.036772,-66.045624,3.47466,-62.362224,6.560955
1,1,0.482316,0.049269,-44.276499,2.10011,-39.486032,2.960173
2,2,0.391502,0.120519,-46.612882,2.192124,-40.266898,2.959421
3,3,-182.489194,229.392982,-344.40157,140.379665,-233.742231,82.981982
4,4,-70.667516,53.160839,-303.158461,39.359835,-245.368514,37.51481
5,5,-67.387407,50.797106,-295.686026,37.255902,-240.52327,35.25937
6,6,-67.447482,50.95711,-295.631865,37.280642,-240.495378,35.300436
7,7,-67.448566,50.959858,-295.630167,37.281559,-240.495019,35.301504
8,8,-67.447305,50.961013,-295.616356,37.27826,-240.484394,35.29165


In [11]:
# Pick the row whose mean cross-validated R^2 is highest.
best_label = df['mean_r2'].idxmax()
bestModel = df.loc[best_label]
bestModel


degree        1.000000
mean_r2       0.482316
std_r2        0.049269
mean_mae    -44.276499
std_mae       2.100110
mean_mape   -39.486032
std_mape      2.960173
Name: 1, dtype: float64