In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error


X, y = datasets.load_diabetes(as_frame=True, scaled=False, return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
def cross_val(X,y):
    cv_scores, r2_scores, mse, mae_scores = [], [], [], []
    # Perform cross-validation for each polynomial degree
    for degree in range(9):
        # Create a pipeline with PolynomialFeatures and LinearRegression
        model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
        cv_scores.append(np.mean(scores))
        polymodel_features = PolynomialFeatures(degree=degree)
        x_poly = polymodel_features.fit_transform(X)

        poly_model = LinearRegression()
        model = make_pipeline(polymodel_features, poly_model)
        print('############################################### \n')
        print('The Degree is :', degree)
        print('MSE :', scores.mean())
        y_pred = cross_val_predict(model, X, y, cv=5)
        r2 = r2_score(y, y_pred)
        print('R2 :', r2)
        mae = mean_absolute_error(y, y_pred)
        print('Mean Absolute Error :', mae)
        print('##############################################')
        r2_scores.append(r2)
        mse.append(scores[0])
        mae_scores.append(mae)
        df = pd.DataFrame({'R2' : r2_scores, 'MSE' : mse, 'MAE' : mae_scores})
        
    return r2_scores, mse, mae_scores, df


In [51]:
r2, mse, mae, df = cross_val(X,y)

############################################### 

The Degree is : 0
MSE : -5982.413413836098
R2 : -0.008823715634149076
Mean Absolute Error : 66.03925023071267
##############################################
############################################### 

The Degree is : 1
MSE : -2993.0813104693307
R2 : 0.49532242216821853
Mean Absolute Error : 44.274855902209154
##############################################
############################################### 

The Degree is : 2
MSE : -14085.06094927718
R2 : -1.37135768554197
Mean Absolute Error : 79.93420430869539
##############################################
############################################### 

The Degree is : 3
MSE : -1165681.9931776212
R2 : -194.78186311204618
Mean Absolute Error : 339.12528379416574
##############################################
############################################### 

The Degree is : 4
MSE : -3402771.88099344
R2 : -572.3618961395722
Mean Absolute Error : 656.6292837930541
####################

Construct a table summarizing the cross-validation results. Each model should have a separate row in the table. Include the R-Squared and Mean Absolute Error (MAE) metrics for each model. Calculate the mean value and standard deviation of these metrics from the cross-validation. Include both values. (2 points)


In [53]:
df

Unnamed: 0,R2,MSE,MAE
0,-0.008824,-5353.026,66.03925
1,0.495322,-2779.923,44.274856
2,-1.371358,-3175.957,79.934204
3,-194.781863,-68781.38,339.125284
4,-572.361896,-840098.0,656.629284
5,-446.482486,-1784896.0,563.019974
6,-1791.212017,-2423193.0,743.14878
7,-5894.999471,-3737809.0,1033.741782
8,-17172.387611,-6206812.0,1477.455476


Identification of the Best Model: Identify the model that exhibits the highest performance based on the R-Squared and MAE metrics. Provide an explanation for choosing this specific model. (2 points)


The Model with degree 1 is the best model with the lowest r2 and mae value