In [1]:
# Step 1: Import required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Step 2: Build a synthetic dataset whose first two features are nearly collinear
n_samples = 100
np.random.seed(42)  # fixed seed so every run draws the same numbers

# Draw order matters for reproducibility: X1, jitter for X2, X3, then target noise.
X1 = np.random.rand(n_samples)
collinear_jitter = np.random.normal(0, 0.01, n_samples)
X2 = X1 + collinear_jitter  # X1 plus tiny Gaussian jitter -> almost a copy of X1
X3 = np.random.rand(n_samples)
target_noise = np.random.normal(0, 0.1, n_samples)

# Target is a linear combination of the three features plus Gaussian noise.
y = 3*X1 + 2*X2 + 5*X3 + target_noise

# Collect features and target into a single DataFrame
df = pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, y=y))

# Step 3: Separate features from target, then hold out 20% of the rows for testing
feature_columns = ['X1', 'X2', 'X3']
target_column = 'y'
X, y = df[feature_columns], df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed so the split is the same on every run
)

# Step 4: Fit ordinary least squares on the training rows and score it on the test rows
lin_reg = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_lin = lin_reg.predict(X_test)
mse_lin = mean_squared_error(y_test, y_pred_lin)

# Step 5: Fit Ridge (L2 penalty strength alpha=1.0) on the same split and score it
ridge_reg = Ridge(alpha=1.0).fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_ridge = ridge_reg.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

# Step 6: Report coefficients and test-set MSE for both models, one line per item
report_rows = [
    ("Linear Regression Coefficients:", lin_reg.coef_),
    ("Ridge Regression Coefficients:", ridge_reg.coef_),
    ("Linear Regression MSE:", mse_lin),
    ("Ridge Regression MSE:", mse_ridge),
]
for label, value in report_rows:
    print(label, value)


Linear Regression Coefficients: [4.60714716 0.37693213 5.02847578]
Ridge Regression Coefficients: [2.34758564 2.33297239 4.36420609]
Linear Regression MSE: 0.006742878758661827
Ridge Regression MSE: 0.045100919357462514
