# In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Silence every warning for the rest of the run to keep the demo output tidy.
# NOTE(review): a blanket filter like this would also hide any convergence
# warnings raised by the models fitted further down.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so the noise columns drawn below are reproducible
np.random.seed(42)

print("=== Ridge and Lasso Regression Demonstration ===\n")

# 1. Generate synthetic dataset with correlated features
print("1. Generating synthetic dataset...")
# Base regression problem: 100 samples, 20 features
X, y = make_regression(n_samples=100, n_features=20, noise=10, random_state=42)

# Three near-duplicate columns introduce multicollinearity. They are drawn in
# this fixed order because every call advances the seeded global RNG.
n_rows = X.shape[0]
near_copy_of_f0 = X[:, 0] + np.random.normal(0, 0.1, n_rows)  # feature 0 plus small noise
near_copy_of_f1 = X[:, 1] + np.random.normal(0, 0.1, n_rows)  # feature 1 plus small noise
scaled_copy_of_f2 = X[:, 2] * 2 + np.random.normal(0, 0.2, n_rows)  # 2x feature 2 plus noise

# Append the derived columns after the 20 original ones
X_corr = np.column_stack([X, near_copy_of_f0, near_copy_of_f1, scaled_copy_of_f2])

print(f"Dataset shape: {X_corr.shape}")
print(f"Number of samples: {X_corr.shape[0]}")
print(f"Number of features: {X_corr.shape[1]}")

# 2. Split the data
print("\n2. Splitting data into train and test sets...")

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition identical between runs.
split_parts = train_test_split(X_corr, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# 3. Standardize the data (crucial for regularization!)
for message in (
    "\n3. Standardizing features...",
    "Note: Standardization is crucial for Ridge and Lasso because:",
    "- Regularization penalties are applied equally to all coefficients",
    "- Features with larger scales would be penalized less without standardization",
):
    print(message)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits so the test set never influences them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train standard Linear Regression for comparison
print("\n4. Training Standard Linear Regression...")

# Unregularized baseline; fit() returns the estimator, so it can be chained
lr = LinearRegression().fit(X_train_scaled, y_train)

# Score the baseline on the held-out test split
y_pred_lr = lr.predict(X_test_scaled)
mse_lr, r2_lr = (
    mean_squared_error(y_test, y_pred_lr),
    r2_score(y_test, y_pred_lr),
)

print(f"Linear Regression - Test MSE: {mse_lr:.2f}, R²: {r2_lr:.3f}")

# 5. Train Ridge Regression with different alpha values
print("\n5. Training Ridge Regression...")
alphas_ridge = [0.1, 1.0, 10.0, 100.0]
ridge_results = {}  # alpha -> fitted model, test metrics and coefficient vector

for alpha in alphas_ridge:
    # Fit one Ridge model per regularization strength
    ridge = Ridge(alpha=alpha).fit(X_train_scaled, y_train)
    y_pred_ridge = ridge.predict(X_test_scaled)

    # Test-set metrics for this alpha
    mse_ridge = mean_squared_error(y_test, y_pred_ridge)
    r2_ridge = r2_score(y_test, y_pred_ridge)

    # Keep everything the later plots need, keyed by the alpha value itself
    ridge_results[alpha] = dict(
        model=ridge,
        mse=mse_ridge,
        r2=r2_ridge,
        coefficients=ridge.coef_,
    )

    print(f"Ridge (α={alpha}) - Test MSE: {mse_ridge:.2f}, R²: {r2_ridge:.3f}")

# 6. Train Lasso Regression with different alpha values
print("\n6. Training Lasso Regression...")
alphas_lasso = [0.1, 1.0, 10.0, 50.0]
lasso_results = {}  # alpha -> fitted model, test metrics, coefficients, sparsity

for alpha in alphas_lasso:
    # Fit one Lasso model per regularization strength
    lasso = Lasso(alpha=alpha, max_iter=2000).fit(X_train_scaled, y_train)
    y_pred_lasso = lasso.predict(X_test_scaled)

    # Test-set metrics for this alpha
    mse_lasso = mean_squared_error(y_test, y_pred_lasso)
    r2_lasso = r2_score(y_test, y_pred_lasso)

    # A coefficient counts as "kept" when its magnitude exceeds a small tolerance
    is_kept = np.abs(lasso.coef_) > 1e-5
    non_zero_coef = is_kept.sum()

    lasso_results[alpha] = dict(
        model=lasso,
        mse=mse_lasso,
        r2=r2_lasso,
        coefficients=lasso.coef_,
        non_zero_features=non_zero_coef,
    )

    print(f"Lasso (α={alpha}) - Test MSE: {mse_lasso:.2f}, R²: {r2_lasso:.3f}, "
          f"Non-zero features: {non_zero_coef}/{X_train.shape[1]}")

# 7. Visualize coefficient magnitudes
print("\n7. Creating visualizations...")


def _plot_coefficient_paths(ax, alphas, results, title):
    """Draw each of the first (up to) 10 coefficients as a line over alpha.

    ``results`` maps every alpha in ``alphas`` to a dict whose
    'coefficients' entry is that model's coefficient vector.
    """
    for i in range(min(10, X_train.shape[1])):  # Show first 10 features
        path = [results[a]['coefficients'][i] for a in alphas]
        ax.plot(alphas, path, 'o-', alpha=0.7, label=f'Feature {i}')

    ax.set_xscale('log')
    ax.set_xlabel('Regularization Parameter (α)')
    ax.set_ylabel('Coefficient Value')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    # Legend sits outside the axes so it does not cover the lines
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')


fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Row-major unpacking: top-left, top-right, bottom-left, bottom-right
ax1, ax2, ax3, ax4 = axes.ravel()

# Plot 1: Coefficient comparison for different models
feature_indices = range(len(lr.coef_))
coefficient_sets = [
    (lr.coef_, 'Linear Regression'),
    (ridge_results[1.0]['coefficients'], 'Ridge (α=1.0)'),
    (lasso_results[1.0]['coefficients'], 'Lasso (α=1.0)'),
]
for coefs, series_label in coefficient_sets:
    ax1.scatter(feature_indices, coefs, alpha=0.7, label=series_label, s=50)

ax1.set_xlabel('Feature Index')
ax1.set_ylabel('Coefficient Value')
ax1.set_title('Coefficient Comparison: Linear vs Ridge vs Lasso')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Ridge coefficients vs alpha
_plot_coefficient_paths(ax2, alphas_ridge, ridge_results,
                        'Ridge: Coefficient Shrinkage vs α')

# Plot 3: Lasso coefficients vs alpha
_plot_coefficient_paths(ax3, alphas_lasso, lasso_results,
                        'Lasso: Coefficient Shrinkage vs α')

# Plot 4: Model performance comparison
# Bars are ordered baseline first, then every Ridge alpha, then every Lasso alpha
models = ['Linear Reg']
models += [f'Ridge α={a}' for a in alphas_ridge]
models += [f'Lasso α={a}' for a in alphas_lasso]

mse_values = [mse_lr]
mse_values += [ridge_results[a]['mse'] for a in alphas_ridge]
mse_values += [lasso_results[a]['mse'] for a in alphas_lasso]

# One color per model family, in the same order as the bars
colors = ['blue'] + ['green'] * len(alphas_ridge) + ['red'] * len(alphas_lasso)
bar_positions = range(len(models))
bars = ax4.bar(bar_positions, mse_values, color=colors, alpha=0.7)

ax4.set_xlabel('Model')
ax4.set_ylabel('Test MSE')
ax4.set_title('Model Performance Comparison')
ax4.set_xticks(bar_positions)
ax4.set_xticklabels(models, rotation=45, ha='right')
ax4.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, mse in zip(bars, mse_values):
    label_x = bar.get_x() + bar.get_width()/2.  # horizontal center of the bar
    label_y = bar.get_height() + 0.5  # just above the top of the bar
    ax4.text(label_x, label_y, f'{mse:.1f}', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

# 8. Cross-validation for optimal alpha selection
print("\n8. Finding optimal alpha using cross-validation...")


def _cv_mse_per_alpha(make_model, alphas, X, y, cv=5):
    """Return the mean cross-validated MSE for every alpha in ``alphas``.

    ``make_model`` is a callable taking one alpha and returning an unfitted
    regressor. Each candidate is wrapped in a pipeline with its own
    StandardScaler, so the scaling statistics are re-learned from the
    training part of every fold. Cross-validating data that was scaled once
    up front would let the validation rows influence the preprocessing and
    bias the scores.

    ``X`` must therefore be the raw (unscaled) training features.
    """
    cv_mse = []
    for alpha in alphas:
        pipeline = make_pipeline(StandardScaler(), make_model(alpha))
        scores = cross_val_score(pipeline, X, y, cv=cv,
                                 scoring='neg_mean_squared_error')
        # The scorer reports negated MSE (higher is better); flip the sign back
        cv_mse.append(-scores.mean())
    return cv_mse


alpha_range = np.logspace(-2, 2, 20)  # From 0.01 to 100

# Ridge cross-validation
ridge_cv_scores = _cv_mse_per_alpha(lambda a: Ridge(alpha=a),
                                    alpha_range, X_train, y_train)
best_ridge_alpha = alpha_range[np.argmin(ridge_cv_scores)]

# Lasso cross-validation
lasso_cv_scores = _cv_mse_per_alpha(lambda a: Lasso(alpha=a, max_iter=2000),
                                    alpha_range, X_train, y_train)
best_lasso_alpha = alpha_range[np.argmin(lasso_cv_scores)]

print(f"Best Ridge alpha: {best_ridge_alpha:.3f}")
print(f"Best Lasso alpha: {best_lasso_alpha:.3f}")

# Plot cross-validation results: one panel per model, with the selected alpha
# marked by a dashed vertical line.
plt.figure(figsize=(12, 4))

cv_panels = [
    (1, ridge_cv_scores, best_ridge_alpha, 'g-o', 'Ridge: Cross-Validation Score vs α'),
    (2, lasso_cv_scores, best_lasso_alpha, 'r-o', 'Lasso: Cross-Validation Score vs α'),
]
for position, cv_scores, best_alpha, line_style, panel_title in cv_panels:
    plt.subplot(1, 2, position)
    plt.plot(alpha_range, cv_scores, line_style, alpha=0.7)
    plt.axvline(best_alpha, color='red', linestyle='--', alpha=0.7,
                label=f'Best α={best_alpha:.3f}')
    plt.xscale('log')
    plt.xlabel('Regularization Parameter (α)')
    plt.ylabel('CV Mean Squared Error')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 9. Final model evaluation with optimal alpha
print("\n9. Final evaluation with optimal alpha values...")

# Refit each model on the full (scaled) training split using the alpha that
# cross-validation selected.
final_ridge = Ridge(alpha=best_ridge_alpha).fit(X_train_scaled, y_train)
final_lasso = Lasso(alpha=best_lasso_alpha, max_iter=2000).fit(X_train_scaled, y_train)

# Predict on the held-out test split
y_pred_final_ridge = final_ridge.predict(X_test_scaled)
y_pred_final_lasso = final_lasso.predict(X_test_scaled)

# Test-set metrics for the tuned Ridge model
final_ridge_mse = mean_squared_error(y_test, y_pred_final_ridge)
final_ridge_r2 = r2_score(y_test, y_pred_final_ridge)

# Test-set metrics for the tuned Lasso model, plus how many coefficients
# survived (magnitude above a small tolerance)
final_lasso_mse = mean_squared_error(y_test, y_pred_final_lasso)
final_lasso_r2 = r2_score(y_test, y_pred_final_lasso)
final_lasso_features = (np.abs(final_lasso.coef_) > 1e-5).sum()

print("\n=== FINAL RESULTS ===")
print(f"Linear Regression  - MSE: {mse_lr:.2f}, R²: {r2_lr:.3f}")
print(f"Ridge (α={best_ridge_alpha:.3f}) - MSE: {final_ridge_mse:.2f}, R²: {final_ridge_r2:.3f}")
print(f"Lasso (α={best_lasso_alpha:.3f}) - MSE: {final_lasso_mse:.2f}, R²: {final_lasso_r2:.3f}, Features: {final_lasso_features}/{X_train.shape[1]}")

# 10. Key takeaways
print("\n=== KEY TAKEAWAYS ===")

# Summary points, printed one per line in this order
key_takeaways = (
    "1. Regularization typically improves generalization performance",
    "2. Ridge shrinks coefficients but keeps all features",
    "3. Lasso can shrink coefficients to exactly zero, performing feature selection",
    "4. Cross-validation is essential for selecting optimal regularization strength",
    "5. Feature standardization is crucial for fair regularization",
    "6. The optimal alpha balances bias and variance for best generalization",
)
for takeaway in key_takeaways:
    print(takeaway)