# Regularization Analysis for LC Duration Prediction

This notebook implements regularized regression models with proper hyperparameter tuning using scikit-learn's cross-validation tools.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [8]:
def load_and_prepare_data(data_path='../../data/LC_engineered.csv', target='Duration_In_Min'):
    """Load the engineered dataset and return standardized features and the target.

    Parameters
    ----------
    data_path : str or path-like, default '../../data/LC_engineered.csv'
        Location of the engineered CSV file.
    target : str, default 'Duration_In_Min'
        Name of the column to predict.

    Returns
    -------
    X_scaled : pandas.DataFrame
        Dummy-encoded, standardized feature matrix. It keeps the row index of
        the loaded frame so that it stays aligned with ``y``.
    y : pandas.Series
        The target column, unscaled.

    Notes
    -----
    NOTE(review): the scaler is fit on every row of the dataset. A caller that
    splits into train/test afterwards evaluates on rows whose statistics
    already influenced the scaling. Consider fitting the scaler on the
    training portion only.
    """
    df = pd.read_csv(data_path)

    # 'Check_Out_Time' is removed together with the target; presumably it
    # would reveal the duration directly -- TODO confirm with the data owner.
    features_to_drop = ['Check_Out_Time', target]

    X = df.drop(columns=features_to_drop)
    y = df[target]

    # Encode non-numeric columns as indicator columns, dropping the first
    # level of each.
    X = pd.get_dummies(X, drop_first=True)

    # Standardize the features. The resulting array is wrapped back into a
    # frame that carries the original column names AND the original index, so
    # X and y remain aligned by label as well as by position.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X),
        columns=X.columns,
        index=X.index,
    )

    return X_scaled, y

In [9]:
def tune_models():
    """Grid-search Ridge, Lasso and ElasticNet and return the best of each.

    The data comes from ``load_and_prepare_data()``; 20% of the rows are held
    out as a test set (``random_state=3``). Each model is tuned on the
    training portion with a shuffled 5-fold ``GridSearchCV`` that scores by
    negative mean squared error. The best parameters and the corresponding
    cross-validated RMSE are printed per model.

    Returns
    -------
    tuple
        ``(best_models, X_train, X_test, y_train, y_test)`` where
        ``best_models`` maps the model name to the refitted best estimator.
    """
    features, target = load_and_prepare_data()
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=3
    )

    # Regularization strengths 1e-3 ... 1e3, one per decade.
    alpha_grid = np.logspace(-3, 3, 7)
    search_space = {
        'Ridge': (Ridge(), {'alpha': alpha_grid}),
        'Lasso': (Lasso(), {'alpha': alpha_grid}),
        'ElasticNet': (
            ElasticNet(),
            {'alpha': alpha_grid, 'l1_ratio': np.linspace(0.1, 0.9, 5)},
        ),
    }

    # The same fold assignment is used for every model.
    folds = KFold(n_splits=5, shuffle=True, random_state=3)

    tuned = {}
    for name, (estimator, grid) in search_space.items():
        print(f"\nTuning {name}...")

        search = GridSearchCV(
            estimator,
            grid,
            cv=folds,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        search.fit(X_train, y_train)

        # best_score_ is a negated MSE; flip the sign and take the root for RMSE.
        cv_rmse = np.sqrt(-search.best_score_)
        print(f"Best parameters: {search.best_params_}")
        print(f"Best CV score: {cv_rmse:.2f} (RMSE)")

        tuned[name] = search.best_estimator_

    return tuned, X_train, X_test, y_train, y_test

In [10]:
def plot_feature_importance(model, feature_names, title, top_n=10):
    """Bar-plot the features with the largest absolute coefficients.

    Parameters
    ----------
    model : fitted estimator
        Must expose a ``coef_`` attribute with one coefficient per feature.
    feature_names : sequence of str
        Feature names, in the same order as the coefficients.
    title : str
        Label appended to the plot title (typically the model name).
    top_n : int, default 10
        Number of highest-ranked features to display.
    """
    # Flatten so that a (1, n_features) coefficient array, as produced when
    # the target was passed as a single column, is handled like a 1-D one.
    importance = np.abs(np.ravel(model.coef_))

    feat_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=False)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=feat_importance.head(top_n), x='importance', y='feature', ax=ax)
    ax.set_title(f'Top {top_n} Feature Importance - {title}')
    fig.tight_layout()
    plt.show()

In [11]:
def plot_predictions(y_true, predictions, titles):
    """Draw one actual-vs-predicted scatter panel per model, side by side.

    Parameters
    ----------
    y_true : pandas.Series or numpy.ndarray
        Observed target values (must support ``.min()`` / ``.max()``).
    predictions : iterable of array-like
        One array of predicted values per model.
    titles : iterable of str
        Panel titles, in the same order as ``predictions``.

    Raises
    ------
    ValueError
        If ``predictions`` is empty.
    """
    # Materialize so that views such as dict.values() and one-shot iterators
    # can be both counted and iterated.
    predictions = list(predictions)
    n_models = len(predictions)
    if n_models == 0:
        raise ValueError("plot_predictions needs at least one set of predictions")

    # squeeze=False makes subplots return a 2-D array of Axes even for a
    # single panel; without it a lone Axes comes back and cannot be zipped.
    fig, axes = plt.subplots(1, n_models, figsize=(15, 5), squeeze=False)

    # End points of the red dashed y = x reference line.
    lo, hi = y_true.min(), y_true.max()

    for ax, y_pred, title in zip(axes.ravel(), predictions, titles):
        ax.scatter(y_true, y_pred, alpha=0.5)
        ax.plot([lo, hi], [lo, hi], 'r--')
        ax.set_xlabel('Actual Duration')
        ax.set_ylabel('Predicted Duration')
        ax.set_title(f'{title}: Actual vs Predicted')

    fig.tight_layout()
    plt.show()

: 

In [12]:
# Tune the three regularized models; keep the split that was used for tuning
best_models, X_train, X_test, y_train, y_test = tune_models()

# Held-out predictions, keyed by model name
predictions = {}
for name, model in best_models.items():
    predictions[name] = model.predict(X_test)

# Report test-set RMSE and R2 for every model
for name in predictions:
    y_pred = predictions[name]
    print(f"\n{name} Performance Metrics:")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
    print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")

# One coefficient-magnitude chart per tuned model
for name in best_models:
    model = best_models[name]
    plot_feature_importance(model, X_train.columns, name)

# Actual-vs-predicted scatter panels for all models
plot_predictions(y_test, predictions.values(), predictions.keys())


Tuning Ridge...
