In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]

In [49]:
df = pd.read_csv("../data/auto-mpg.data", sep = r"\s+", header = None, names = column_names, na_values = "?")

In [50]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [52]:
df.dropna(inplace = True)

In [53]:
numeric_columns = ["displacement", "horsepower", "weight", "acceleration"]
from sklearn.preprocessing import StandardScaler
df.drop("car_name", axis =1 , inplace = True)   
scaler = StandardScaler()

In [54]:
df.head(1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1


In [55]:
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [56]:
df.head(1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,1.07729,0.664133,0.62054,-1.285258,70,1


In [57]:
df.dropna()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,1.077290,0.664133,0.620540,-1.285258,70,1
1,15.0,8,1.488732,1.574594,0.843334,-1.466724,70,1
2,18.0,8,1.182542,1.184397,0.540382,-1.648189,70,1
3,16.0,8,1.048584,1.184397,0.536845,-1.285258,70,1
4,17.0,8,1.029447,0.924265,0.555706,-1.829655,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,-0.520637,-0.480448,-0.221125,0.021294,82,1
394,44.0,4,-0.932079,-1.364896,-0.999134,3.287676,82,2
395,32.0,4,-0.568479,-0.532474,-0.804632,-1.430430,82,1
396,28.0,4,-0.712005,-0.662540,-0.415627,1.110088,82,1


In [58]:
X = df.drop(["mpg"], axis = 1)
y = df['mpg']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 ,random_state = 42)

In [59]:
X_train.shape , y_train.shape, X_test.shape , y_test.shape

((313, 7), (313,), (79, 7), (79,))

In [60]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

## Before Hyper-Parameter Tuning

In [70]:
from sklearn.metrics import r2_score

In [71]:
regressors = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    SVR(),
    RandomForestRegressor(),
    DecisionTreeRegressor()
]

for regressor in regressors:
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"{regressor.__class__.__name__}: R² Score = {r2:.4f}")

LinearRegression: R² Score = 0.7902
Ridge: R² Score = 0.7891
Lasso: R² Score = 0.7953
ElasticNet: R² Score = 0.7736
SVR: R² Score = 0.2258
RandomForestRegressor: R² Score = 0.8896
DecisionTreeRegressor: R² Score = 0.7850


In [64]:
from sklearn.model_selection import GridSearchCV

In [76]:
regressors = [
    {'model': LinearRegression(), 'params': {}},  # No hyperparameters to tune
    {'model': Ridge(), 'params': {'alpha': [0.1, 1.0, 10.0, 100.0]}},
    {'model': Lasso(), 'params': {'alpha': [0.1, 1.0, 10.0], 'max_iter': [1000, 2000]}},
    {'model': ElasticNet(), 'params': {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.2, 0.5, 0.8]}},
    {'model': SVR(), 'params': {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear'], 'gamma': ['scale', 'auto']}},
    {'model': RandomForestRegressor(), 'params': {'n_estimators': [50, 100, 200, 300], 'max_depth': [None, 10, 20,30, 40]}},
    {'model': DecisionTreeRegressor(), 'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 3]}}
]

In [85]:
model_names= []
r2_scores = []
fold_scores = []
for reg in regressors:
    model = reg['model']
    param_grid = reg['params']
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, return_train_score=False)
    
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    y_pred = best_model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    
    best_index = grid_search.best_index_
    fold_r2_scores = [grid_search.cv_results_[f'split{i}_test_score'][best_index] for i in range(5)]
    
    model_names.append(model.__class__.__name__)
    r2_scores.append(r2)
    fold_scores.append(fold_r2_scores)
    
    print(f"{model.__class__.__name__}:")
    print(f"Best Parameters: {best_params}")
    print(f"R² Score (Test): {r2:.4f}")
    print(f"Cross-Validated R² Score: {grid_search.best_score_:.4f}")
    print("R² Scores per Fold:", [f"{s:.4f}" for s in fold_r2_scores])
    print("-" * 50)

LinearRegression:
Best Parameters: {}
R² Score (Test): 0.7902
Cross-Validated R² Score: 0.8099
R² Scores per Fold: ['0.8161', '0.8124', '0.7733', '0.7969', '0.8507']
--------------------------------------------------
Ridge:
Best Parameters: {'alpha': 1.0}
R² Score (Test): 0.7891
Cross-Validated R² Score: 0.8103
R² Scores per Fold: ['0.8166', '0.8117', '0.7726', '0.7999', '0.8505']
--------------------------------------------------
Lasso:
Best Parameters: {'alpha': 0.1, 'max_iter': 1000}
R² Score (Test): 0.7864
Cross-Validated R² Score: 0.8130
R² Scores per Fold: ['0.8228', '0.8080', '0.7721', '0.8123', '0.8497']
--------------------------------------------------
ElasticNet:
Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.8}
R² Score (Test): 0.7855
Cross-Validated R² Score: 0.8119
R² Scores per Fold: ['0.8192', '0.8107', '0.7710', '0.8149', '0.8437']
--------------------------------------------------
SVR:
Best Parameters: {'C': 10.0, 'gamma': 'auto', 'kernel': 'rbf'}
R² Score (Test): 0.87

### Random Forest and SVR Performed the Best , so lets check which one is actually best amongst them
#### So we will use pairwise test to check the r2_score across each fold of RandomForest and SVR

In [100]:
data = pd.DataFrame({
    'Model': model_names,
    'Fold_Scores': fold_scores
})

In [101]:
data

Unnamed: 0,Model,Fold_Scores
0,LinearRegression,"[0.8161206375806028, 0.812396591304834, 0.7733..."
1,Ridge,"[0.8166252523781788, 0.811741200042034, 0.7725..."
2,Lasso,"[0.8228489409120633, 0.8079863666614412, 0.772..."
3,ElasticNet,"[0.8191800576545027, 0.8107342051707799, 0.771..."
4,SVR,"[0.8266452424928654, 0.870322748362936, 0.8024..."
5,RandomForestRegressor,"[0.8169794258564641, 0.8477493315851458, 0.856..."
6,DecisionTreeRegressor,"[0.8267052401982293, 0.7714781459458833, 0.776..."


In [91]:
import pingouin as pg

In [104]:
svr_scores = data.loc[data['Model'] == 'SVR', 'Fold_Scores'].values[0]
rf_scores = data.loc[data['Model'] == 'RandomForestRegressor', 'Fold_Scores'].values[0]

In [112]:
from scipy.stats import shapiro

In [115]:
def compare_models(scores_a, scores_b, model_a="Model_A", model_b="Model_B"):
    diffs = np.array(scores_a) - np.array(scores_b)
    normality_p = shapiro(diffs).pvalue
    ttest_res = pg.ttest(scores_a, scores_b, paired=True)
    wilcoxon_res = pg.wilcoxon(scores_a, scores_b)
    if normality_p > 0.05 and len(scores_a) >= 10:
        decision = "Normality OK → Use Paired T-Test"
    elif normality_p > 0.05 and len(scores_a) < 10:
        decision = "Normality OK but small sample → Report both tests"
    else:
        decision = "Normality violated → Prefer Wilcoxon"
    t_p = ttest_res['p-val'].values[0]
    w_p = wilcoxon_res['p-val'].values[0]
    t_interpret = "Significant difference ✅" if t_p < 0.05 else "No significant difference ❌"
    w_interpret = "Significant difference ✅" if w_p < 0.05 else "No significant difference ❌"
    print(f"\n📊 Comparison: {model_a} vs {model_b}")
    print(f"Normality p-value: {normality_p:.4f}")
    print(f"T-Test: p={t_p:.4f}, T={ttest_res['T'].values[0]:.4f} → {t_interpret}")
    print(f"Wilcoxon: p={w_p:.4f}, W={wilcoxon_res['W-val'].values[0]:.4f} → {w_interpret}")
    print(f"Decision: {decision}")

In [116]:
compare_models(svr_scores, rf_scores, "SVR", "RandomForest")


📊 Comparison: SVR vs RandomForest
Normality p-value: 0.5354
T-Test: p=0.5699, T=-0.6182 → No significant difference ❌
Wilcoxon: p=0.8125, W=6.0000 → No significant difference ❌
Decision: Normality OK but small sample → Report both tests


### We can use both but will go with RandomForest