# Conclusion in this notebook
### Removing all other transformations except decade 70 and 80

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]

In [3]:
df = pd.read_csv("../data/auto-mpg.data", sep = r"\s+", header = None, names = column_names, na_values = "?")

In [4]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [6]:
df.dropna(inplace = True)

In [7]:
df.duplicated().sum()

np.int64(0)

## Feature Engineering

In [8]:
# convert the year to categorical 
# Convert to decade group (e.g., 70s, 80s)
df['year_category'] = df['model_year'].apply(lambda x: '70s' if x < 80 else '80s')
year_dummies = pd.get_dummies(df['year_category'], prefix='decade')
df = pd.concat([df, year_dummies], axis=1)

In [9]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name,year_category,decade_70s,decade_80s
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,70s,True,False
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,70s,True,False
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,70s,True,False
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,70s,True,False
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,70s,True,False


#### we will play with car names in 3rd experiement

In [10]:
np.unique([car_name.split(" ")[0] for car_name in df['car_name'].values ])

array(['amc', 'audi', 'bmw', 'buick', 'cadillac', 'capri', 'chevroelt',
       'chevrolet', 'chevy', 'chrysler', 'datsun', 'dodge', 'fiat',
       'ford', 'hi', 'honda', 'maxda', 'mazda', 'mercedes',
       'mercedes-benz', 'mercury', 'nissan', 'oldsmobile', 'opel',
       'peugeot', 'plymouth', 'pontiac', 'renault', 'saab', 'subaru',
       'toyota', 'toyouta', 'triumph', 'vokswagen', 'volkswagen', 'volvo',
       'vw'], dtype='<U13')

In [11]:
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'car_name', 'year_category',
       'decade_70s', 'decade_80s'],
      dtype='object')

In [12]:
numeric_columns = ["displacement", "horsepower", "weight", "acceleration" ]
from sklearn.preprocessing import StandardScaler, Normalizer
df.drop("car_name", axis =1 , inplace = True)   
# scaler = StandardScaler()
scaler = Normalizer()

In [13]:
df.head(1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,year_category,decade_70s,decade_80s
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,70s,True,False


In [14]:
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [15]:
df.head(1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,year_category,decade_70s,decade_80s
0,18.0,8,0.08722,0.036933,0.995498,0.003409,70,1,70s,True,False


In [16]:
df.dropna()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,year_category,decade_70s,decade_80s
0,18.0,8,0.087220,0.036933,0.995498,0.003409,70,1,70s,True,False
1,15.0,8,0.094257,0.044436,0.994551,0.003097,70,1,70s,True,False
2,18.0,8,0.092068,0.043428,0.994800,0.003185,70,1,70s,True,False
3,16.0,8,0.088123,0.043482,0.995154,0.003479,70,1,70s,True,False
4,17.0,8,0.087156,0.040404,0.995370,0.003030,70,1,70s,True,False
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,0.050092,0.030771,0.998255,0.005582,82,1,80s,False,True
394,44.0,4,0.045476,0.024379,0.998601,0.011533,82,2,80s,False,True
395,32.0,4,0.058682,0.036513,0.997596,0.005042,82,1,80s,False,True
396,28.0,4,0.045645,0.030050,0.998481,0.007075,82,1,80s,False,True


In [17]:
X = df.drop(["mpg","model_year", "acceleration", "weight","year_category"], axis = 1)
y = df['mpg']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 ,random_state = 42)

In [18]:
X_train.shape , y_train.shape, X_test.shape , y_test.shape

((313, 6), (313,), (79, 6), (79,))

In [19]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

## Before Hyper-Parameter Tuning

In [20]:
from sklearn.metrics import r2_score

In [21]:
regressors = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    SVR(),
    RandomForestRegressor(),
    DecisionTreeRegressor()
]

for regressor in regressors:
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"{regressor.__class__.__name__}: R² Score = {r2:.4f}")

LinearRegression: R² Score = 0.7124
Ridge: R² Score = 0.7082
Lasso: R² Score = 0.6029
ElasticNet: R² Score = 0.6400
SVR: R² Score = 0.6951
RandomForestRegressor: R² Score = 0.7217
DecisionTreeRegressor: R² Score = 0.3815


In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
regressors = [
    {'model': LinearRegression(), 'params': {}},  # No hyperparameters to tune
    {'model': Ridge(), 'params': {'alpha': [0.1, 1.0, 10.0, 100.0]}},
    {'model': Lasso(), 'params': {'alpha': [0.1, 1.0, 10.0], 'max_iter': [1000, 2000]}},
    {'model': ElasticNet(), 'params': {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.2, 0.5, 0.8]}},
    {'model': SVR(), 'params': {'C': [0.1, 1.0, 10.0], 'kernel': ['rbf', 'linear'], 'gamma': ['scale', 'auto']}},
    {'model': RandomForestRegressor(), 'params': {'n_estimators': [50, 100, 200, 300], 'max_depth': [None, 10, 20,30, 40]}},
    {'model': DecisionTreeRegressor(), 'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 3]}}
]

In [24]:
model_names= []
r2_scores = []
fold_scores = []
for reg in regressors:
    model = reg['model']
    param_grid = reg['params']
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, return_train_score=False)
    
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    y_pred = best_model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    
    best_index = grid_search.best_index_
    fold_r2_scores = [grid_search.cv_results_[f'split{i}_test_score'][best_index] for i in range(5)]
    
    model_names.append(model.__class__.__name__)
    r2_scores.append(r2)
    fold_scores.append(fold_r2_scores)
    
    print(f"{model.__class__.__name__}:")
    print(f"Best Parameters: {best_params}")
    print(f"R² Score (Test): {r2:.4f}")
    print(f"Cross-Validated R² Score: {grid_search.best_score_:.4f}")
    print("R² Scores per Fold:", [f"{s:.4f}" for s in fold_r2_scores])
    print("-" * 50)

LinearRegression:
Best Parameters: {}
R² Score (Test): 0.7124
Cross-Validated R² Score: 0.7039
R² Scores per Fold: ['0.7188', '0.7157', '0.6893', '0.6962', '0.6995']
--------------------------------------------------
Ridge:
Best Parameters: {'alpha': 10.0}
R² Score (Test): 0.7062
Cross-Validated R² Score: 0.7124
R² Scores per Fold: ['0.7213', '0.7085', '0.6789', '0.7336', '0.7195']
--------------------------------------------------
Lasso:
Best Parameters: {'alpha': 0.1, 'max_iter': 1000}
R² Score (Test): 0.7070
Cross-Validated R² Score: 0.7111
R² Scores per Fold: ['0.7214', '0.7038', '0.6772', '0.7322', '0.7208']
--------------------------------------------------
ElasticNet:
Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.5}
R² Score (Test): 0.7034
Cross-Validated R² Score: 0.7114
R² Scores per Fold: ['0.7138', '0.7085', '0.6755', '0.7431', '0.7159']
--------------------------------------------------
SVR:
Best Parameters: {'C': 10.0, 'gamma': 'auto', 'kernel': 'rbf'}
R² Score (Test): 0.7

### Random Forest and SVR Performed the Best , so lets check which one is actually best amongst them
#### So we will use pairwise test to check the r2_score across each fold of RandomForest and SVR

In [25]:
data = pd.DataFrame({
    'Model': model_names,
    'Fold_Scores': fold_scores
})

In [26]:
data

Unnamed: 0,Model,Fold_Scores
0,LinearRegression,"[0.7188291216088702, 0.7156905664136294, 0.689..."
1,Ridge,"[0.7212999122869723, 0.7084599620605856, 0.678..."
2,Lasso,"[0.7213517116960353, 0.7037734832354656, 0.677..."
3,ElasticNet,"[0.7138161052958443, 0.7085369724699443, 0.675..."
4,SVR,"[0.7054830540656725, 0.7365614913445082, 0.707..."
5,RandomForestRegressor,"[0.7149747354314375, 0.649435252998557, 0.7242..."
6,DecisionTreeRegressor,"[0.7029430916197263, 0.5327107469132301, 0.688..."


In [27]:
import pingouin as pg

In [28]:
svr_scores = data.loc[data['Model'] == 'SVR', 'Fold_Scores'].values[0]
rf_scores = data.loc[data['Model'] == 'RandomForestRegressor', 'Fold_Scores'].values[0]

In [29]:
from scipy.stats import shapiro

In [30]:
def compare_models(scores_a, scores_b, model_a="Model_A", model_b="Model_B"):
    diffs = np.array(scores_a) - np.array(scores_b)
    normality_p = shapiro(diffs).pvalue
    ttest_res = pg.ttest(scores_a, scores_b, paired=True)
    wilcoxon_res = pg.wilcoxon(scores_a, scores_b)
    if normality_p > 0.05 and len(scores_a) >= 10:
        decision = "Normality OK → Use Paired T-Test"
    elif normality_p > 0.05 and len(scores_a) < 10:
        decision = "Normality OK but small sample → Report both tests"
    else:
        decision = "Normality violated → Prefer Wilcoxon"
    t_p = ttest_res['p-val'].values[0]
    w_p = wilcoxon_res['p-val'].values[0]
    t_interpret = "Significant difference ✅" if t_p < 0.05 else "No significant difference ❌"
    w_interpret = "Significant difference ✅" if w_p < 0.05 else "No significant difference ❌"
    print(f"\n📊 Comparison: {model_a} vs {model_b}")
    print(f"Normality p-value: {normality_p:.4f}")
    print(f"T-Test: p={t_p:.4f}, T={ttest_res['T'].values[0]:.4f} → {t_interpret}")
    print(f"Wilcoxon: p={w_p:.4f}, W={wilcoxon_res['W-val'].values[0]:.4f} → {w_interpret}")
    print(f"Decision: {decision}")

In [31]:
compare_models(svr_scores, rf_scores, "SVR", "RandomForest")


📊 Comparison: SVR vs RandomForest
Normality p-value: 0.4847
T-Test: p=0.6357, T=0.5119 → No significant difference ❌
Wilcoxon: p=0.8125, W=6.0000 → No significant difference ❌
Decision: Normality OK but small sample → Report both tests


### We can use both but will go with RandomForest