<a href="https://colab.research.google.com/github/cedamusk/AI-N-ML/blob/main/Ridge_and_Lasso_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
def prepare_data(df):
  """Split `df` into scaled train/test feature matrices and target vectors.

  The 'Y' column is used as the target; 'Y' and 'Year' are both removed from
  the feature set. 20% of the rows are held out for testing with a fixed
  random_state of 42.

  Args:
    df: DataFrame containing at least the columns 'Y' and 'Year'.

  Returns:
    A tuple (X_train_scaled, X_test_scaled, y_train, y_test, scaler) where the
    two feature matrices have been standardised with `scaler`, and `scaler` is
    the fitted StandardScaler.
  """
  target=df['Y']
  features=df.drop(columns=['Y', 'Year'])

  features_train, features_test, target_train, target_test=train_test_split(
      features, target, test_size=0.2, random_state=42
  )

  # Fit on the training rows only so the held-out rows do not influence the
  # scaling statistics; the same fitted scaler is then applied to both splits.
  scaler=StandardScaler().fit(features_train)

  return (
      scaler.transform(features_train),
      scaler.transform(features_test),
      target_train,
      target_test,
      scaler,
  )

In [None]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
  """Fit Linear, Ridge and Lasso regressors and collect their metrics.

  Args:
    X_train: Training feature matrix.
    X_test: Test feature matrix.
    y_train: Training target values.
    y_test: Test target values.

  Returns:
    Dict keyed by model display name. Each value is a dict with:
      'model': the estimator fitted on the full training split,
      'train_predictions' / 'test_predictions': predictions on each split,
      'train_rmse' / 'test_rmse': root mean squared error on each split,
      'train_r2' / 'test_r2': R^2 score on each split,
      'cv_scores_mean' / 'cv_scores_std': mean and standard deviation of the
        5-fold cross-validated R^2 on the training split.
  """
  models={
      'Linear Regression': LinearRegression(),
      'Ridge Regression': Ridge(alpha=1.0),
      "Lasso Regression": Lasso(alpha=1.0)
  }

  results={}

  for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    train_mse=mean_squared_error(y_train, y_train_pred)
    test_mse=mean_squared_error(y_test, y_test_pred)
    train_r2=r2_score(y_train, y_train_pred)
    test_r2=r2_score(y_test, y_test_pred)

    # Cross-validation only ever sees the training split, so the test split
    # stays untouched for the final evaluation above.
    cv_scores=cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

    results[name]={
        'model':model,
        'train_predictions': y_train_pred,
        'test_predictions': y_test_pred,
        'train_rmse': np.sqrt(train_mse),
        # Previously test_mse was computed but never reported.
        'test_rmse': np.sqrt(test_mse),
        'train_r2': train_r2,
        'test_r2': test_r2,
        'cv_scores_mean':cv_scores.mean(),
        'cv_scores_std': cv_scores.std()
    }

  return results

In [None]:
def plot_feature_importance(model, feature_names):
  """Draw a bar chart of absolute model coefficients, largest first.

  Args:
    model: Fitted estimator exposing a `coef_` attribute.
    feature_names: Labels for the coefficients, in the same order as `coef_`.
  """
  coefficient_magnitudes=np.abs(model.coef_)
  ranked=pd.DataFrame({
      'feature': feature_names,
      'coefficient': coefficient_magnitudes
  }).sort_values('coefficient', ascending=False)

  plt.figure(figsize=(12, 6))
  plt.bar(ranked['feature'], ranked['coefficient'])
  plt.title('Feature Importance')
  plt.xlabel('Features')
  plt.ylabel("Absolute Coefficient Value")
  # Tilt the tick labels so longer feature names do not overlap.
  plt.xticks(rotation=45)
  plt.tight_layout()
  plt.show()

In [None]:
def plot_predictions_vs_actual(results, y_train, y_test):
  """Scatter predicted against actual values for every model in `results`.

  One row of subplots is drawn per model: training split on the left, test
  split on the right. A dashed red diagonal marks perfect prediction.

  Args:
    results: Dict mapping model name to a dict holding 'train_predictions'
      and 'test_predictions'.
    y_train: Actual target values for the training split.
    y_test: Actual target values for the test split.
  """
  # squeeze=False keeps `axes` two-dimensional even when `results` holds a
  # single model; otherwise axes[i, 0] raises IndexError on a 1-D array.
  fig, axes=plt.subplots(len(results), 2, figsize=(15, 5*len(results)), squeeze=False)
  fig.suptitle('Predicted Vs Actual Values for All Models', y=1.02, fontsize=16)

  for i, (name, metrics) in enumerate(results.items()):
    axes[i, 0].scatter(y_train, metrics['train_predictions'], alpha=0.5)
    axes[i, 0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--', lw=2)
    axes[i, 0].set_title(f'{name}-Training Set')
    axes[i, 0].set_xlabel("Actual Values")
    axes[i, 0].set_ylabel('Predicted Values')

    axes[i, 1].scatter(y_test, metrics['test_predictions'], alpha=0.5)
    axes[i, 1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    axes[i, 1].set_title(f'{name}-Test Set')
    axes[i, 1].set_xlabel("Actual Values")
    axes[i, 1].set_ylabel("Predicted Values")

  plt.tight_layout()
  plt.show()


In [None]:
def plot_residuals(results, y_train, y_test):
  """Plot residuals (actual minus predicted) against predictions per model.

  One row of subplots is drawn per model: training split on the left, test
  split on the right. A dashed red line marks zero residual.

  Args:
    results: Dict mapping model name to a dict holding 'train_predictions'
      and 'test_predictions'.
    y_train: Actual target values for the training split.
    y_test: Actual target values for the test split.
  """
  # squeeze=False keeps `axes` two-dimensional even when `results` holds a
  # single model; otherwise axes[i, 0] raises IndexError on a 1-D array.
  fig, axes=plt.subplots(len(results), 2, figsize=(15, 5*len(results)), squeeze=False)
  fig.suptitle('residual Plots for All models', y=1.02, fontsize=16)

  for i, (name, metrics) in enumerate(results.items()):

    train_residuals=y_train-metrics['train_predictions']
    axes[i, 0].scatter(metrics['train_predictions'], train_residuals, alpha=0.5)
    axes[i, 0].axhline(y=0, color='r', linestyle='--')
    # Title typo fixed: was "Trainig".
    axes[i, 0].set_title(f'{name}-Training Set residuals')
    axes[i, 0].set_xlabel('Predicted Values')
    axes[i, 0].set_ylabel("Residuals")

    test_residuals=y_test-metrics['test_predictions']
    axes[i, 1].scatter(metrics['test_predictions'], test_residuals, alpha=0.5)
    axes[i, 1].axhline(y=0, color='r', linestyle='--')
    axes[i, 1].set_title(f'{name}-Test set Residuals')
    axes[i, 1].set_xlabel('Predicted Values')
    axes[i, 1].set_ylabel('Residuals')

  plt.tight_layout()
  plt.show()

In [None]:
def plot_error_distribution(results, y_train, y_test):
  """Plot histograms (with KDE overlay) of prediction errors per model.

  One row of subplots is drawn per model: training split on the left, test
  split on the right. Errors are computed as actual minus predicted.

  Args:
    results: Dict mapping model name to a dict holding 'train_predictions'
      and 'test_predictions'.
    y_train: Actual target values for the training split.
    y_test: Actual target values for the test split.
  """
  # squeeze=False keeps `axes` two-dimensional even when `results` holds a
  # single model; otherwise axes[i, 0] raises IndexError on a 1-D array.
  fig, axes=plt.subplots(len(results), 2, figsize=(15, 5*len(results)), squeeze=False)
  fig.suptitle('Error Distribution for All models', y=1.02, fontsize=16)

  for i, (name, metrics) in enumerate(results.items()):
    train_errors=y_train-metrics['train_predictions']
    sns.histplot(train_errors, kde=True, ax=axes[i, 0])
    axes[i, 0].set_title(f'{name}- Training Error distribution')
    axes[i, 0].set_xlabel('Prediction Error')

    test_errors=y_test-metrics['test_predictions']
    sns.histplot(test_errors, kde=True, ax=axes[i, 1])
    axes[i, 1].set_title(f'{name}- Test Error Distribution')
    axes[i, 1].set_xlabel('Prediction Error')

  plt.tight_layout()
  plt.show()

In [None]:
# Load the dataset, split/scale it, and fit all three models.
df=pd.read_csv('synthetic_ridge_lasso_data.csv')
X_train, X_test, y_train, y_test, scaler=prepare_data(df)

results=train_and_evaluate_models(X_train, X_test, y_train, y_test)

# Per-model metric summary.
for name, metrics in results.items():
  print(f"\n{name} Results:")
  print(f"Training RMSE: {metrics['train_rmse']:.4f}")
  print(f"Training R2 Score: {metrics['train_r2']:.4f}")
  print(f"Test R2 Score: {metrics['test_r2']:.4f}")
  print(f"Cross-validation R2(mean +/- std): {metrics['cv_scores_mean']:.4f}+/-{metrics['cv_scores_std']:.4f}")

# The plots are drawn once, after the summary loop. They were previously
# indented inside the loop, which re-rendered every figure once per model even
# though the multi-model plots already iterate over all of `results`.
feature_names=df.drop(["Y", "Year"], axis=1).columns
plot_feature_importance(results['Linear Regression']['model'], feature_names)
plot_predictions_vs_actual(results, y_train, y_test)
plot_residuals(results, y_train, y_test)
plot_error_distribution(results, y_train, y_test)