In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score


# --- Load ---------------------------------------------------------------
# Read the Hitters data from the current working directory.
df = pd.read_csv("Hitters.csv")
print("Before cleaning:", df.shape)

# --- Clean --------------------------------------------------------------
# Salary is the regression target, so rows where it is missing cannot be used
# for fitting or scoring; keep only the rows that have a value.
has_salary = df['Salary'].notna()
df = df.loc[has_salary]
print("After removing NaN Salary:", df.shape)

# One-hot encode the non-numeric columns. drop_first=True drops one level per
# encoded column so the resulting indicators are not perfectly collinear.
df = pd.get_dummies(df, drop_first=True)

# --- Features / target --------------------------------------------------
y = df['Salary']
X = df.drop(columns='Salary')

# --- Hold-out split -----------------------------------------------------
# 80/20 split; the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Standardise --------------------------------------------------------
# The scaler's statistics are learned from the training rows only and then
# reused unchanged on the test rows, so no test information reaches the fit.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)


# Regularisation strengths for the two penalised models.
# NOTE(review): both use the same literal and nothing here shows how 0.5748
# was chosen -- confirm its origin (e.g. an earlier tuning step or the task
# statement) before relying on the comparison.
ridge_alpha = lasso_alpha = 0.5748

# Build the three estimators first, then fit them all on the same
# standardised training matrix so the comparison is like-for-like.
lin_reg = LinearRegression()
ridge_reg = Ridge(alpha=ridge_alpha)
lasso_reg = Lasso(alpha=lasso_alpha, max_iter=10000)

fitted_estimators = (lin_reg, ridge_reg, lasso_reg)
for estimator in fitted_estimators:
    estimator.fit(X_train_s, y_train)

# Predictions on the standardised hold-out rows, in the same order as above.
y_pred_lin, y_pred_ridge, y_pred_lasso = [
    estimator.predict(X_test_s) for estimator in fitted_estimators
]


# Display name -> [hold-out predictions, fitted estimator].
models = {
    "Linear Regression": [y_pred_lin, lin_reg],
    "Ridge Regression": [y_pred_ridge, ridge_reg],
    "Lasso Regression": [y_pred_lasso, lasso_reg]
}

# One record per model with its hold-out R² and mean squared error.
# Only the predictions are needed for scoring; the estimator is ignored here.
results = [
    {
        "Model": label,
        "R²": r2_score(y_test, predictions),
        "MSE": mean_squared_error(y_test, predictions),
    }
    for label, (predictions, _estimator) in models.items()
]

results_df = pd.DataFrame(results)
print("\n Q2: Model Comparison on Hitters Dataset")
print(results_df)

# The winner is the row with the highest R²; idxmax returns the first such
# row label if several rows tie.
best_row_label = results_df["R²"].idxmax()
best_model = results_df.loc[best_row_label]
print(f"\nBest Model: {best_model['Model']}")

# Explanation printed for each regularised winner; any other winner falls
# through to the plain linear-regression message.
explanations = {
    "Ridge Regression": "Ridge performs best, it handles multicollinearity by adding L2 regularization.",
    "Lasso Regression": "Lasso performs best, it performs feature selection by driving some coefficients to zero.",
}
linear_explanation = "Linear performs best, suggests features are not heavily correlated."
print(explanations.get(best_model["Model"], linear_explanation))




Before cleaning: (322, 20)
After removing NaN Salary: (263, 20)

 Q2: Model Comparison on Hitters Dataset
               Model        R²            MSE
0  Linear Regression  0.290745  128284.345497
1   Ridge Regression  0.300036  126603.902644
2   Lasso Regression  0.299626  126678.116040

Best Model: Ridge Regression
Ridge performs best, it handles multicollinearity by adding L2 regularization.
