In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr

In [2]:
file = "automobile/imports-85.data"

column_names = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base", 
    "length", "width", "height", "curb-weight", "engine-type", 
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", 
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"
]

df = pd.read_csv(file, names=column_names, na_values="?", sep=",")

In [3]:
features = ["wheel-base", "compression-ratio", "engine-size", "length", "width"]
target = "city-mpg"

data = df[features + [target]]
data = data.dropna()
print(data[features])
print(data[target])

     wheel-base  compression-ratio  engine-size  length  width
0          88.6                9.0          130   168.8   64.1
1          88.6                9.0          130   168.8   64.1
2          94.5                9.0          152   171.2   65.5
3          99.8               10.0          109   176.6   66.2
4          99.4                8.0          136   176.6   66.4
..          ...                ...          ...     ...    ...
200       109.1                9.5          141   188.8   68.9
201       109.1                8.7          141   188.8   68.8
202       109.1                8.8          173   188.8   68.9
203       109.1               23.0          145   188.8   68.9
204       109.1                9.5          141   188.8   68.9

[205 rows x 5 columns]
0      21
1      21
2      19
3      24
4      18
       ..
200    23
201    19
202    18
203    26
204    19
Name: city-mpg, Length: 205, dtype: int64


In [4]:
train_part = int(0.6 * len(data))
val_part = int(0.8 * len(data))

train_data = data[:train_part]
val_data = data[train_part:val_part]
test_data = data[val_part:]

x_train = train_data[features]
x_val = val_data[features]
x_test = test_data[features]

y_train = train_data[target]
y_val = val_data[target]
y_test = test_data[target]


In [5]:
# Linear
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

# Ridge
ridge_model = Ridge()
ridge_model.fit(x_train, y_train)

# Lasso
lasso_model = Lasso()
lasso_model.fit(x_train, y_train)

def evaluate_model(model, x, y):
    predictions = model.predict(x)
    
    mse = mean_squared_error(y, predictions)
    r2 = r2_score(y, predictions)
    pcc, _ = pearsonr(y, predictions)
    
    return mse, r2, pcc


In [6]:
linear_mse_val, linear_r2_val, linear_pcc_val = evaluate_model(linear_model, x_val, y_val)
ridge_mse_val, ridge_r2_val, ridge_pcc_val = evaluate_model(ridge_model, x_val, y_val)
lasso_mse_val, lasso_r2_val, lasso_pcc_val = evaluate_model(lasso_model, x_val, y_val)

linear_mse_test, linear_r2_test, linear_pcc_test = evaluate_model(linear_model, x_test, y_test)
ridge_mse_test, ridge_r2_test, ridge_pcc_test = evaluate_model(ridge_model, x_test, y_test)
lasso_mse_test, lasso_r2_test, lasso_pcc_test = evaluate_model(lasso_model, x_test, y_test)

# Display Results
print("Linear Model Metrics:")
print("Validation -> MSE: ", linear_mse_val, " R2: ", linear_r2_val, " PCC: ", linear_pcc_val)
print("Test -> MSE: ", linear_mse_test, " R2: ", linear_r2_test, " PCC: ", linear_pcc_test)
print()
print("Ridge Model Metrics:")
print("Validation -> MSE: ", ridge_mse_val, " R2: ", ridge_r2_val, " PCC: ", ridge_pcc_val)
print("Test -> MSE: ", ridge_mse_test, " R2: ", ridge_r2_test, " PCC: ", ridge_pcc_test)
print()
print("Lasso Model Metrics:")
print("Validation -> MSE: ", lasso_mse_val, " R2: ", lasso_r2_val, " PCC: ", lasso_pcc_val)
print("Test -> MSE: ", lasso_mse_test, " R2: ", lasso_r2_test, " PCC: ", lasso_pcc_test)


Linear Model Metrics:
Validation -> MSE:  10.786733256258685  R2:  0.6575672570673279  PCC:  0.8137650755178988
Test -> MSE:  6.427188600353539  R2:  0.738867307072212  PCC:  0.8781180406480414

Ridge Model Metrics:
Validation -> MSE:  10.783145476534955  R2:  0.657681153760854  PCC:  0.8137715058592373
Test -> MSE:  6.42508341977005  R2:  0.7389528392557294  PCC:  0.8781658099560556

Lasso Model Metrics:
Validation -> MSE:  11.321814574312052  R2:  0.6405807089549298  PCC:  0.8022903093258729
Test -> MSE:  6.067300012625358  R2:  0.7534893575379894  PCC:  0.8855888965279801


In [7]:
alphas = [0., 0.25, 0.5, 1., 1000.]

print("Ridge Model")
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train, y_train)
    ridge_mse_val, _, _ = evaluate_model(ridge, x_val, y_val)
    print("Validation with alpha  ", alpha, "-> MSE: ", ridge_mse_val)

print()
print("Lasso Model")
for alpha in alphas:    
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    lasso_mse_val, _, _ = evaluate_model(lasso, x_val, y_val)
    print("Validation with alpha  ", alpha, " -> MSE: ", lasso_mse_val)


Ridge Model
Validation with alpha   0.0 -> MSE:  10.786733256258652
Validation with alpha   0.25 -> MSE:  10.78582565089782
Validation with alpha   0.5 -> MSE:  10.78492519563879
Validation with alpha   1.0 -> MSE:  10.783145476534955
Validation with alpha   1000.0 -> MSE:  10.793083377756865

Lasso Model
Validation with alpha   0.0  -> MSE:  10.786733256258678
Validation with alpha   0.25  -> MSE:  10.717383983194823
Validation with alpha   0.5  -> MSE:  10.783613886978287
Validation with alpha   1.0  -> MSE:  11.321814574312052
Validation with alpha   1000.0  -> MSE:  31.685967347478353


  lasso.fit(x_train, y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [8]:
best_alpha_ridge = Ridge(alpha=1.0)
best_alpha_lasso = Lasso(alpha=0.25, max_iter=10000)

best_alpha_ridge.fit(x_train, y_train)
best_alpha_lasso.fit(x_train, y_train)

ridge_mse_test, ridge_r2_test, ridge_pcc_test = evaluate_model(best_alpha_ridge, x_test, y_test)
lasso_mse_test, lasso_r2_test, lasso_pcc_test = evaluate_model(best_alpha_lasso, x_test, y_test)

print("Ridge Model with best Alpha -> Test MSE: ", ridge_mse_test, " R2: ", ridge_r2_test, " PCC: ", ridge_pcc_test)
print("Lasso Model with best Alpha -> Test MSE: ", lasso_mse_test, " R2: ", lasso_r2_test, " PCC: ", lasso_pcc_test)


Ridge Model with best Alpha -> Test MSE:  6.42508341977005  R2:  0.7389528392557294  PCC:  0.8781658099560556
Lasso Model with best Alpha -> Test MSE:  6.30405180129232  R2:  0.7438702789681348  PCC:  0.8804354381564511


In [9]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=5)

x_poly = poly.fit_transform(data[features])
x_poly_frame = pd.DataFrame(x_poly, columns=poly.get_feature_names_out(features))

x_train_poly = x_poly_frame[:train_part]
x_val_poly = x_poly_frame[train_part:val_part]
x_test_poly = x_poly_frame[val_part:]


In [10]:
alphas = [0., 0.25, 0.5, 1., 1000.]

print("Ridge Model")
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train_poly, y_train)
    ridge_mse_val, _, _ = evaluate_model(ridge, x_val_poly, y_val)
    print("Ridge MSE for Alpha ", alpha, "is -> ", ridge_mse_val)

print()
print("Lasso Model")
for alpha in alphas:    
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(x_train_poly, y_train)
    lasso_mse_val, _, _ = evaluate_model(lasso, x_val_poly, y_val)
    print("Lasso MSE for Alpha ", alpha, "is -> ", lasso_mse_val)

Ridge Model
Ridge MSE for Alpha  0.0 is ->  5473.913251874896
Ridge MSE for Alpha  0.25 is ->  5473.913251874896
Ridge MSE for Alpha  0.5 is ->  5473.913251874896
Ridge MSE for Alpha  1.0 is ->  5473.913251874896
Ridge MSE for Alpha  1000.0 is ->  5473.913251874896

Lasso Model


  lasso.fit(x_train_poly, y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso MSE for Alpha  0.0 is ->  14.146638175985025


  model = cd_fast.enet_coordinate_descent(


Lasso MSE for Alpha  0.25 is ->  11.794645672717435


  model = cd_fast.enet_coordinate_descent(


Lasso MSE for Alpha  0.5 is ->  11.667939885151082


  model = cd_fast.enet_coordinate_descent(


Lasso MSE for Alpha  1.0 is ->  11.537829841567731
Lasso MSE for Alpha  1000.0 is ->  9.251022839244406


  model = cd_fast.enet_coordinate_descent(


In [11]:
best_alpha_ridge = Ridge(alpha=1.0)
best_alpha_lasso = Lasso(alpha=1000.0, max_iter=10000)

best_alpha_ridge.fit(x_train_poly, y_train)
best_alpha_lasso.fit(x_train_poly, y_train)

ridge_mse_test, ridge_r2_test, ridge_pcc_test = evaluate_model(best_alpha_ridge, x_test_poly, y_test)
lasso_mse_test, lasso_r2_test, lasso_pcc_test = evaluate_model(best_alpha_lasso, x_test_poly, y_test)

print("Ridge Model with best Alpha -> Test MSE: ", ridge_mse_test, " R2: ", ridge_r2_test, " PCC: ", ridge_pcc_test)
print("Lasso Model with best Alpha -> Test MSE: ", lasso_mse_test, " R2: ", lasso_r2_test, " PCC: ", lasso_pcc_test)




Ridge Model with best Alpha -> Test MSE:  4885.2576503763285  R2:  -197.48499323929542  PCC:  -0.5203894576354618
Lasso Model with best Alpha -> Test MSE:  12.201479269991854  R2:  0.5042614527757455  PCC:  0.8612871959035014


  model = cd_fast.enet_coordinate_descent(


In [12]:
ridge_coeffs = best_alpha_ridge.coef_
lasso_coeffs = best_alpha_lasso.coef_

print(ridge_coeffs)
print(lasso_coeffs)

most_important_ridge_index = np.argmax(np.abs(ridge_coeffs))
most_important_lasso_index = np.argmax(np.abs(lasso_coeffs))

print(most_important_ridge_index)
print(most_important_lasso_index)

print(f"Most Important Feature in Ridge: {x_poly_frame.columns[most_important_ridge_index]} with coefficient {ridge_coeffs[most_important_ridge_index]}")
print(f"Most Important Feature in Lasso: {x_poly_frame.columns[most_important_lasso_index]} with coefficient {lasso_coeffs[most_important_lasso_index]}")


[ 0.00000000e+00  6.00477169e-13 -1.41799841e-12 -2.19263685e-11
 -5.98397017e-12  2.69913316e-12  1.52133670e-10 -1.14183453e-10
 -1.19992432e-09 -1.88860100e-10  2.69046824e-10 -1.16030596e-11
 -2.75987711e-10 -2.38034378e-10 -4.61380286e-11 -2.46944256e-09
 -2.35705190e-09 -6.80099104e-10 -1.06988414e-09  1.72356799e-10
  2.90248498e-10  2.09680659e-08 -7.48103091e-09 -4.50601976e-08
  1.10654871e-08  2.50977478e-08 -8.19791114e-10 -1.59494251e-08
 -1.52090152e-08 -3.29256122e-09 -1.00841059e-07 -9.06137672e-08
 -2.30211612e-08 -2.16075569e-08  2.74197691e-08  2.39420039e-08
 -3.58820462e-10 -1.16342230e-09 -1.51817032e-09 -9.35739423e-11
 -2.05424698e-08 -2.83498130e-08 -9.43031076e-09 -2.98665127e-08
 -7.01044869e-09 -6.35179385e-10 -1.30048949e-07 -1.60671340e-07
 -6.42528459e-08 -1.71719730e-07 -4.71782263e-08 -9.92458493e-09
 -1.00626015e-07  2.13468269e-08  3.09971983e-08  2.06598689e-08
  1.75571860e-06 -3.28411398e-07 -2.65473446e-07  2.01989193e-06
  1.72588753e-06 -4.23618