In [3]:
# MACHINE LEARNING MODELS WITH SKLEARN PIPELINES
# Rossmann Sales Forecasting - Task 2: Machine Learning Pipeline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import logging
import pickle
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import sys
import mlflow
import mlflow.sklearn
from scipy import stats
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
import time
# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Detach whatever handlers are already attached to the root logger.
root_logger = logging.root
for existing_handler in list(root_logger.handlers):
    root_logger.removeHandler(existing_handler)

# Send INFO-and-above records to stdout. force=True makes basicConfig replace
# any configuration that was installed before this cell ran.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[stdout_handler],
    force=True,
)




In [2]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentele

In [6]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Location of the cleaned training CSV inside the mounted Google Drive.
# NOTE(review): absolute, Colab-specific path; adjust it when running elsewhere.
file_path = '/content/drive/MyDrive/Colab_Notebooks/processed_data_set/train_cleaned.csv'
# Read the whole CSV into memory with pandas' default dtype inference.
train_df = pd.read_csv(file_path)
logging.info("Train data loaded successfully.")

2025-06-25 11:29:43,328 - INFO - Train data loaded successfully.


In [8]:
# Report the (rows, columns) shape of the frame that was just loaded.
logging.info(f"Loaded training data with shape: {train_df.shape}")

2025-06-25 11:29:44,638 - INFO - Loaded training data with shape: (1017155, 27)


In [9]:
# 2.1 PREPROCESSING
# Progress marker only; the preprocessing itself happens in the following cells.
logging.info("Starting preprocessing pipeline")

2025-06-25 11:29:45,897 - INFO - Starting preprocessing pipeline


In [10]:
# Feature Engineering
# Progress marker only; the derived columns are created in the following cells.
logging.info("Creating new features...")


2025-06-25 11:29:46,878 - INFO - Creating new features...


In [11]:
# Convert Date column to datetime format first
train_df['Date'] = pd.to_datetime(train_df['Date'])

# Date features
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['DayOfWeek'] = train_df['Date'].dt.dayofweek  # 0 = Monday ... 6 = Sunday
# isocalendar().week is a nullable UInt32 column in pandas.
train_df['WeekOfYear'] = train_df['Date'].dt.isocalendar().week
train_df['Quarter'] = train_df['Date'].dt.quarter
train_df['IsWeekend'] = train_df['DayOfWeek'].isin([5, 6]).astype(int)

# Holiday features
# Compare as strings (the label-encoding cell casts StateHoliday the same way) so
# that an integer 0 in a mixed-type column is not counted as a holiday.
train_df['IsStateHoliday'] = (train_df['StateHoliday'].astype(str) != '0').astype(int)
train_df['IsSchoolHoliday'] = train_df['SchoolHoliday'].astype(int)

# Create a reference date for holiday calculations
reference_date = train_df['Date'].min()


def _capped_holiday_distances(dates, holiday_dates, cap=30):
    """Return (days_to_next, days_since_previous) holiday for every date, capped at `cap`.

    Only holidays strictly after / strictly before a date are considered, so a
    date that is itself a holiday looks past itself. Dates with no holiday on
    the relevant side, and missing dates (NaT), receive `cap`.

    Parameters
    ----------
    dates : array-like of datetime64
        Dates to evaluate (one result per element).
    holiday_dates : array-like of datetime64
        Holiday dates; duplicates and NaT are ignored.
    cap : int
        Upper bound applied to both distances.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Two float arrays shaped like `dates`.
    """
    dates = np.asarray(dates, dtype='datetime64[ns]')
    holidays = np.unique(np.asarray(holiday_dates, dtype='datetime64[ns]'))
    holidays = holidays[~np.isnat(holidays)]

    days_to = np.full(dates.shape, float(cap))
    days_after = np.full(dates.shape, float(cap))
    if holidays.size == 0:
        return days_to, days_after

    one_day = np.timedelta64(1, 'D')
    valid = ~np.isnat(dates)

    # side='right' -> index of the first holiday strictly greater than the date.
    next_pos = np.searchsorted(holidays, dates, side='right')
    has_next = valid & (next_pos < holidays.size)
    gap_to = (holidays[next_pos[has_next]] - dates[has_next]) // one_day
    days_to[has_next] = np.minimum(gap_to, cap)

    # side='left' minus one -> index of the last holiday strictly less than the date.
    prev_pos = np.searchsorted(holidays, dates, side='left') - 1
    has_prev = valid & (prev_pos >= 0)
    gap_after = (dates[has_prev] - holidays[prev_pos[has_prev]]) // one_day
    days_after[has_prev] = np.minimum(gap_after, cap)

    return days_to, days_after


# Days to/from holidays calculation.
# Vectorised with a sorted lookup instead of iterating over every row with
# iterrows(), which is prohibitively slow on a frame of this size.
state_holidays = train_df[train_df['IsStateHoliday'] == 1]['Date'].unique()
days_to_holiday, days_after_holiday = _capped_holiday_distances(
    train_df['Date'].to_numpy(), state_holidays, cap=30  # Cap at 30 days
)
train_df['DaysToHoliday'] = days_to_holiday
train_df['DaysAfterHoliday'] = days_after_holiday

In [12]:
# Month position features: three mutually exclusive flags splitting the month by day.
train_df['IsBeginningOfMonth'] = (train_df['Day'] <= 10).astype(int)
train_df['IsMidMonth'] = ((train_df['Day'] > 10) & (train_df['Day'] <= 20)).astype(int)
train_df['IsEndOfMonth'] = (train_df['Day'] > 20).astype(int)

# Competition features
# HasCompetition must be derived before CompetitionDistance is imputed later on.
train_df['HasCompetition'] = (train_df['CompetitionDistance'].notna()).astype(int)
# Years since the competitor opened; negative ages are clipped to 0. Rows whose
# opening year is missing would otherwise stay NaN (this column is not covered by
# the later missing-value cell), so they are set to 0 here.
train_df['CompetitionAge'] = (
    (train_df['Year'] - train_df['CompetitionOpenSinceYear'])
    .clip(lower=0)
    .fillna(0)
)

# Promo features
# WeekOfYear comes from isocalendar() as a nullable UInt32; cast to plain float so
# the arithmetic and np.where below yield an ordinary float64 column.
week_of_year = train_df['WeekOfYear'].astype('float64')
weeks_in_promo2 = (
    (train_df['Year'] - train_df['Promo2SinceYear']) * 52
    + (week_of_year - train_df['Promo2SinceWeek'])
)
# Weeks since Promo2 started (52 weeks per year), 0 for stores not in Promo2.
train_df['PromoAge'] = np.where(train_df['Promo2'] == 1, weeks_in_promo2, 0)
train_df['PromoAge'] = train_df['PromoAge'].clip(lower=0).fillna(0)


In [13]:
# Seasonal features: one 0/1 flag per meteorological season, keyed by month number.
season_months = {
    'IsSummer': [6, 7, 8],
    'IsWinter': [12, 1, 2],
    'IsSpring': [3, 4, 5],
    'IsAutumn': [9, 10, 11],
}
for season_flag, months in season_months.items():
    train_df[season_flag] = train_df['Month'].isin(months).astype(int)

# Sales per customer (for stores that are open); rows without customers get 0.
had_customers = train_df['Customers'] > 0
sales_ratio = train_df['Sales'] / train_df['Customers']
train_df['SalesPerCustomer'] = np.where(had_customers, sales_ratio, 0)

logging.info("Feature engineering completed")

2025-06-25 11:54:45,105 - INFO - Feature engineering completed


In [14]:
# Encode categorical variables
logging.info("Encoding categorical variables")

# One LabelEncoder per categorical column; the fitted encoders are kept under
# their own names so they can be stored alongside the model later.
le_store_type, le_assortment, le_state_holiday, le_promo_interval = (
    LabelEncoder() for _ in range(4)
)

# (target column, encoder, raw values). StateHoliday is cast to str before encoding.
encoding_plan = [
    ('StoreType_encoded', le_store_type, train_df['StoreType']),
    ('Assortment_encoded', le_assortment, train_df['Assortment']),
    ('StateHoliday_encoded', le_state_holiday, train_df['StateHoliday'].astype(str)),
    ('PromoInterval_encoded', le_promo_interval, train_df['PromoInterval']),
]
for encoded_column, encoder, raw_values in encoding_plan:
    train_df[encoded_column] = encoder.fit_transform(raw_values)

2025-06-25 11:54:46,329 - INFO - Encoding categorical variables


In [15]:
# Handle missing values
# Each column is assigned back explicitly: `df[col].fillna(..., inplace=True)` is
# chained assignment, which pandas deprecates and which does not modify the
# frame under Copy-on-Write.
missing_value_defaults = {
    'CompetitionDistance': train_df['CompetitionDistance'].median(),
    'CompetitionOpenSinceMonth': 0,
    'CompetitionOpenSinceYear': 1900,
    'Promo2SinceWeek': 0,
    'Promo2SinceYear': 1900,
    'DaysToHoliday': 30,
    'DaysAfterHoliday': 30,
}
for column_name, fill_value in missing_value_defaults.items():
    train_df[column_name] = train_df[column_name].fillna(fill_value)


In [16]:
# Select features for modeling.
# The list is assembled from themed groups; the concatenation order is the column
# order handed to the models, so it must not be rearranged.
_store_and_promo_features = [
    'Store', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday',
    'StoreType_encoded', 'Assortment_encoded',
    'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
    'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval_encoded',
]
_calendar_features = ['Year', 'Month', 'Day', 'WeekOfYear', 'Quarter', 'IsWeekend']
_holiday_features = ['IsStateHoliday', 'IsSchoolHoliday', 'DaysToHoliday', 'DaysAfterHoliday']
_month_position_features = ['IsBeginningOfMonth', 'IsMidMonth', 'IsEndOfMonth']
_derived_age_features = ['CompetitionAge', 'HasCompetition', 'PromoAge']
_season_features = ['IsSummer', 'IsWinter', 'IsSpring', 'IsAutumn']

feature_columns = [
    *_store_and_promo_features,
    *_calendar_features,
    *_holiday_features,
    *_month_position_features,
    *_derived_age_features,
    *_season_features,
]

In [17]:
# Keep only the rows where the store was open (Open == 1); closed days are dropped.
is_open = train_df['Open'] == 1
train_df_filtered = train_df.loc[is_open].copy()
logging.info(f"Training on {len(train_df_filtered)} open store records")

# Feature matrix and target.
X = train_df_filtered[feature_columns]
y = train_df_filtered['Sales']

# Split data: 80/20 shuffled hold-out with a fixed seed.
# NOTE(review): the rows are dated; a shuffled split mixes past and future days
# across train and test. Confirm this is intended for a forecasting task.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
logging.info(f"Train set size: {X_train.shape}, Test set size: {X_test.shape}")


2025-06-25 11:54:51,624 - INFO - Training on 844338 open store records
2025-06-25 11:54:52,020 - INFO - Train set size: (675470, 34), Test set size: (168868, 34)


In [18]:
# 2.2 BUILDING MODELS WITH SKLEARN PIPELINES
logging.info("Building ML models with sklearn pipelines")

# Define preprocessing pipeline (template; each model pipeline gets its own copy)
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Define models to try
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression()
}

# Create pipelines for each model.
# Pipeline.fit fits its steps in place, so sharing one preprocessing instance would
# let whichever pipeline is fitted last overwrite the scaler state seen by the
# others. clone() gives every pipeline an independent, unfitted copy.
pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline([
        ('preprocessing', clone(preprocessing_pipeline)),
        ('model', model)
    ])


2025-06-25 11:54:54,200 - INFO - Building ML models with sklearn pipelines


In [19]:
# 2.3 LOSS FUNCTION CHOICE
# Progress marker for the evaluation section.
logging.info("Evaluating models with multiple metrics")

# Custom percentage-error metrics; both return values already scaled to percent.
def mean_absolute_percentage_error(y_true, y_pred):
    """Calculate MAPE, in percent, over the observations with a non-zero actual.

    Observations whose actual value is 0 are excluded: their percentage error is
    undefined and a single one would otherwise turn the whole metric into inf.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, matched to `y_true` by position.

    Returns
    -------
    float
        mean(|y_true - y_pred| / |y_true|) * 100 over the non-zero actuals, or
        NaN when no actual is non-zero.

    Note: this definition shadows the scikit-learn function of the same name
    imported at the top of the notebook, which returns a fraction, not percent.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    nonzero = actual != 0
    if not nonzero.any():
        return np.nan
    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100

def root_mean_squared_percentage_error(y_true, y_pred):
    """Calculate RMSPE, in percent, over the observations with a non-zero actual.

    Observations whose actual value is 0 are excluded: their percentage error is
    undefined and a single one would otherwise turn the whole metric into inf.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, matched to `y_true` by position.

    Returns
    -------
    float
        sqrt(mean(((y_true - y_pred) / y_true) ** 2)) * 100 over the non-zero
        actuals, or NaN when no actual is non-zero.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    nonzero = actual != 0
    if not nonzero.any():
        return np.nan
    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.sqrt(np.mean(relative_error ** 2)) * 100


2025-06-25 11:54:56,136 - INFO - Evaluating models with multiple metrics


In [None]:
# Evaluate models: fit every pipeline, then score it on both the train and test split.
model_results = {}
logging.info("Training and evaluating models...")

# Metric name -> callable(actual, predicted). The order here, combined with
# train-before-test below, fixes the key order of each results dict.
metric_functions = {
    'mse': mean_squared_error,
    'mae': mean_absolute_error,
    'r2': r2_score,
    'rmse': lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted)),
    'mape': mean_absolute_percentage_error,
    'rmspe': root_mean_squared_percentage_error,
}

for name, pipeline in pipelines.items():
    logging.info(f"Training {name}...")

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Predictions
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)

    # Calculate metrics as '<split>_<metric>' entries
    scored_splits = (
        ('train', y_train, y_pred_train),
        ('test', y_test, y_pred_test),
    )
    results = {
        f'{split}_{metric}': metric_fn(actual, predicted)
        for metric, metric_fn in metric_functions.items()
        for split, actual, predicted in scored_splits
    }

    model_results[name] = results

    logging.info(f"{name} - Test RMSE: {results['test_rmse']:.2f}, Test RMSPE: {results['test_rmspe']:.2f}%")


2025-06-25 11:54:57,435 - INFO - Training and evaluating models...
2025-06-25 11:54:57,437 - INFO - Training Random Forest...


In [None]:
# Display results: one row per model, one column per metric.
results_df = pd.DataFrame(model_results).T
headline_metrics = ['test_rmse', 'test_rmspe', 'test_r2', 'test_mape']
comparison_table = results_df[headline_metrics].round(3)
print("\n=== MODEL COMPARISON ===")
print(comparison_table)


In [None]:
# Select best model: the one with the lowest test-set RMSPE
# (Root Mean Squared Percentage Error).
best_model_name = results_df['test_rmspe'].idxmin()
best_pipeline = pipelines[best_model_name]
best_test_rmspe = results_df.loc[best_model_name, 'test_rmspe']
logging.info(f"Best model: {best_model_name} with RMSPE: {best_test_rmspe:.3f}%")

In [None]:


def _run_grid_stage(pipeline, param_grid, X_train, y_train):
    """Fit one 3-fold GridSearchCV stage and return (fitted search, elapsed seconds)."""
    logging.info(f"Testing parameters: {param_grid}")

    stage_start = time.time()
    grid = GridSearchCV(
        pipeline,
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=0
    )
    grid.fit(X_train, y_train)
    return grid, time.time() - stage_start


def _scores_by_params(grid):
    """Pair every evaluated parameter dict with its mean CV score, as reported by the search."""
    return list(zip(grid.cv_results_['params'], grid.cv_results_['mean_test_score']))


def progressive_tuning(pipeline, X_train, y_train, best_model_name):
    """Tune hyperparameters progressively to save time with detailed logging.

    Runs two successive grid searches: the first tunes a small set of parameters,
    the second fixes the winners and tunes the remaining ones. Both stages use
    3-fold CV scored with negative MSE, so higher (closer to 0) is better.

    Parameters
    ----------
    pipeline : sklearn Pipeline
        Pipeline whose final step is named 'model'.
    X_train, y_train
        Training features and target passed to GridSearchCV.fit.
    best_model_name : str
        'Random Forest' or 'Gradient Boosting' selects the matching search plan.

    Returns
    -------
    The best estimator of the second stage for the two supported model names;
    for any other name the input `pipeline` is returned untouched.
    """

    logging.info(f"Starting progressive hyperparameter tuning for {best_model_name}")
    start_time = time.time()

    if best_model_name == 'Random Forest':
        logging.info("=== RANDOM FOREST PROGRESSIVE TUNING ===")

        #  Tune n_estimators first
        logging.info(" Tuning n_estimators...")
        param_grid_1 = {'model__n_estimators': [50, 100, 200]}
        grid_1, stage1_time = _run_grid_stage(pipeline, param_grid_1, X_train, y_train)

        best_n_est = grid_1.best_params_['model__n_estimators']
        best_score_1 = grid_1.best_score_

        logging.info(f" completed  {stage1_time:.2f} seconds")
        logging.info(f"Best n_estimators: {best_n_est}")
        logging.info(f"Best CV score: {best_score_1:.4f}")
        # Read each score's parameters from cv_results_ rather than assuming the
        # result order matches the order of the grid definition.
        n_estimator_scores = {
            str(params['model__n_estimators']): score
            for params, score in _scores_by_params(grid_1)
        }
        logging.info(f"All  scores: {n_estimator_scores}")

        #  Fix best n_estimators, tune other params
        logging.info("\n Tuning max_depth and min_samples_split...")
        param_grid_2 = {
            'model__n_estimators': [best_n_est],
            'model__max_depth': [10, 20, None],
            'model__min_samples_split': [2, 5]
        }
        grid_2, stage2_time = _run_grid_stage(pipeline, param_grid_2, X_train, y_train)

        best_score_2 = grid_2.best_score_

        logging.info(f" completed  {stage2_time:.2f} seconds")
        logging.info(f"Final best parameters: {grid_2.best_params_}")
        logging.info(f"Final best CV score: {best_score_2:.4f}")
        logging.info(f"Score improvement : {best_score_2 - best_score_1:.4f}")

        total_time = time.time() - start_time
        logging.info(f"Random Forest progressive tuning completed in {total_time:.2f} seconds")

        return grid_2.best_estimator_

    elif best_model_name == 'Gradient Boosting':
        logging.info("=== GRADIENT BOOSTING PROGRESSIVE TUNING ===")

        # Tune learning_rate and n_estimators
        logging.info(" Tuning learning_rate and n_estimators...")
        param_grid_1 = {
            'model__learning_rate': [0.05, 0.1, 0.2],
            'model__n_estimators': [100, 200]
        }
        grid_1, stage1_time = _run_grid_stage(pipeline, param_grid_1, X_train, y_train)

        best_lr = grid_1.best_params_['model__learning_rate']
        best_n_est = grid_1.best_params_['model__n_estimators']
        best_score_1 = grid_1.best_score_

        logging.info(f"Stage 1 completed in {stage1_time:.2f} seconds")
        logging.info(f"Best learning_rate: {best_lr}")
        logging.info(f"Best n_estimators: {best_n_est}")
        logging.info(f"Best CV score: {best_score_1:.4f}")

        # Log all combinations tested, labelled from the parameters the search
        # actually recorded for each score.
        results_1 = [
            f"lr={params['model__learning_rate']}, n_est={params['model__n_estimators']}: {score:.4f}"
            for params, score in _scores_by_params(grid_1)
        ]
        logging.info(f"All Stage 1 results: {results_1}")

        #  Fix best lr and n_est, tune depth
        logging.info("\n Tuning max_depth...")
        param_grid_2 = {
            'model__learning_rate': [best_lr],
            'model__n_estimators': [best_n_est],
            'model__max_depth': [3, 6, 10]
        }
        grid_2, stage2_time = _run_grid_stage(pipeline, param_grid_2, X_train, y_train)

        best_score_2 = grid_2.best_score_

        logging.info(f"Stage 2 completed in {stage2_time:.2f} seconds")
        logging.info(f"Final best parameters: {grid_2.best_params_}")
        logging.info(f"Final best CV score: {best_score_2:.4f}")
        logging.info(f"Score improvement from Stage 1: {best_score_2 - best_score_1:.4f}")

        # Log all depth results
        depth_results = {
            params['model__max_depth']: score
            for params, score in _scores_by_params(grid_2)
        }
        logging.info(f"All depth results: {depth_results}")

        total_time = time.time() - start_time
        logging.info(f"Gradient Boosting progressive tuning completed in {total_time:.2f} seconds")

        return grid_2.best_estimator_

    else:
        # No search plan for this model: hand the pipeline back unchanged.
        logging.warning(f"Progressive tuning not implemented for {best_model_name}")
        logging.info("Returning original pipeline without tuning")
        return pipeline

In [None]:
# Final model evaluation on the held-out test split.
# NOTE(review): best_pipeline here is the pipeline fitted during model comparison;
# progressive_tuning() is defined above but no visible cell calls it - confirm
# whether the tuned estimator was meant to be evaluated instead.
y_pred_final = best_pipeline.predict(X_test)

final_mse = mean_squared_error(y_test, y_pred_final)
final_rmse = np.sqrt(final_mse)
final_r2 = r2_score(y_test, y_pred_final)
final_rmspe = root_mean_squared_percentage_error(y_test, y_pred_final)

logging.info(f"Final model performance - RMSE: {final_rmse:.2f}, RMSPE: {final_rmspe:.3f}%, R²: {final_r2:.3f}")


In [None]:
# 2.4 POST PREDICTION ANALYSIS
logging.info("Performing post-prediction analysis...")

# Feature importance (only estimators exposing feature_importances_, i.e. tree-based models)
final_estimator = best_pipeline.named_steps['model']
if hasattr(final_estimator, 'feature_importances_'):
    importance_table = pd.DataFrame({
        'feature': feature_columns,
        'importance': final_estimator.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)

    # Horizontal bars for the 15 most important features, largest at the top.
    top_features = feature_importance.head(15)
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(top_features['feature'], top_features['importance'])
    ax.set_title('Top 15 Feature Importances')
    ax.set_xlabel('Importance')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

    print("\n=== TOP 10 FEATURE IMPORTANCES ===")
    print(feature_importance.head(10))


In [None]:
# Residual analysis: six diagnostic panels on the test-set residuals (actual - predicted).
residuals = y_test - y_pred_final

plt.figure(figsize=(15, 10))

# Panel 1: residuals against predictions (look for funnel shapes / bias).
plt.subplot(2, 3, 1)
plt.scatter(y_pred_final, residuals, alpha=0.5)
plt.xlabel('Predicted Sales')
plt.ylabel('Residuals')
plt.title('Residuals vs Predicted')
plt.axhline(y=0, color='r', linestyle='--')

# Panel 2: histogram of residuals.
plt.subplot(2, 3, 2)
plt.hist(residuals, bins=50, edgecolor='black')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residuals Distribution')

# Panel 3: normal Q-Q plot of residuals.
plt.subplot(2, 3, 3)
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot')

# Panel 4: actual against predicted with the identity line for reference.
plt.subplot(2, 3, 4)
plt.scatter(y_test, y_pred_final, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs Predicted')

# Panel 5: residuals in test-set row order.
plt.subplot(2, 3, 5)
plt.scatter(range(len(residuals)), residuals, alpha=0.5)
plt.xlabel('Index')
plt.ylabel('Residuals')
plt.title('Residuals vs Index')
plt.axhline(y=0, color='r', linestyle='--')

plt.subplot(2, 3, 6)
# Error by day of week
error_by_dow = pd.DataFrame({
    'DayOfWeek': X_test['DayOfWeek'],
    'Error': np.abs(residuals)
}).groupby('DayOfWeek')['Error'].mean()
# Position each bar at its own weekday value: a hard-coded range(7) raises a
# shape error whenever a weekday is absent from the test set.
plt.bar(error_by_dow.index, error_by_dow.values)
plt.xticks(range(7))
plt.xlabel('Day of Week (0=Mon, 6=Sun)')
plt.ylabel('Mean Absolute Error')
plt.title('Prediction Error by Day of Week')

plt.tight_layout()
plt.show()


In [None]:
def fast_bootstrap_ci(best_pipeline, X_train, y_train, X_test, n_bootstrap=20, random_state=None):
    """Refit the pipeline on bootstrap resamples and collect test-set predictions.

    A small number of resamples is used by default to keep the run time down.
    Each iteration draws len(X_train) rows with replacement, fits a fresh clone
    of `best_pipeline` on them and predicts `X_test`. The spread across rows of
    the result therefore reflects how much the fitted model varies between
    resamples.

    Parameters
    ----------
    best_pipeline : estimator
        Estimator/pipeline to clone for every resample; it is not modified.
    X_train, y_train : pandas objects
        Training data, indexed positionally via `.iloc`.
    X_test
        Data to predict with every refitted clone.
    n_bootstrap : int, default 20
        Number of resamples.
    random_state : int, numpy Generator or None, default None
        Seed for the resampling; None keeps the draws non-deterministic.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_bootstrap, len(X_test)), one row per resample.
    """

    logging.info(f"Estimating confidence intervals with {n_bootstrap} bootstrap samples...")
    start_time = time.time()

    rng = np.random.default_rng(random_state)
    n_train = len(X_train)
    bootstrap_predictions = []

    for i in range(n_bootstrap):
        if i % 5 == 0:
            logging.info(f"Bootstrap iteration {i+1}/{n_bootstrap}")

        # Sample row positions with replacement.
        bootstrap_indices = rng.integers(0, n_train, size=n_train)
        X_boot = X_train.iloc[bootstrap_indices]
        y_boot = y_train.iloc[bootstrap_indices]

        # Clone the pipeline to avoid interference (clone is imported at the top
        # of the notebook, so it is not re-imported on every iteration).
        boot_pipeline = clone(best_pipeline)
        boot_pipeline.fit(X_boot, y_boot)

        # Predict on test set
        bootstrap_predictions.append(boot_pipeline.predict(X_test))

    if bootstrap_predictions:
        bootstrap_predictions = np.array(bootstrap_predictions)
    else:
        # Keep the result 2-D even when no resample was requested.
        bootstrap_predictions = np.empty((0, len(X_test)))
    execution_time = time.time() - start_time

    logging.info(f"Bootstrap confidence intervals completed in {execution_time:.2f} seconds")
    return bootstrap_predictions


In [None]:
# Calculate confidence intervals
# fast_bootstrap_ci only defines the function; the predictions have to be computed
# here, otherwise bootstrap_predictions is undefined on a fresh kernel.
bootstrap_predictions = fast_bootstrap_ci(best_pipeline, X_train, y_train, X_test)

confidence_level = 0.95
alpha = 1 - confidence_level
lower_percentile = (alpha/2) * 100
upper_percentile = (1 - alpha/2) * 100

# Percentiles are taken across resamples (axis 0), giving one bound per test row.
lower_bound = np.percentile(bootstrap_predictions, lower_percentile, axis=0)
upper_bound = np.percentile(bootstrap_predictions, upper_percentile, axis=0)

# Plot confidence intervals for a sample of predictions.
# The sample is seeded so the figure is reproducible, and its size is clamped so
# a test set smaller than 100 rows does not raise.
sample_size = min(100, len(y_test))
sample_indices = np.random.default_rng(42).choice(len(y_test), size=sample_size, replace=False)
sample_indices = np.sort(sample_indices)

plt.figure(figsize=(12, 6))
plt.scatter(range(len(sample_indices)), y_test.iloc[sample_indices], alpha=0.7, label='Actual', color='blue')
plt.scatter(range(len(sample_indices)), y_pred_final[sample_indices], alpha=0.7, label='Predicted', color='red')
plt.fill_between(range(len(sample_indices)),
                 lower_bound[sample_indices],
                 upper_bound[sample_indices],
                 alpha=0.3, color='gray', label=f'{confidence_level*100}% Confidence Interval')
plt.xlabel('Sample Index')
plt.ylabel('Sales')
plt.title('Predictions with Confidence Intervals (Sample)')
plt.legend()
plt.show()

In [None]:
# Calculate coverage percentage: share of test actuals lying inside [lower_bound, upper_bound].
inside_interval = (y_test >= lower_bound) & (y_test <= upper_bound)
coverage = np.mean(inside_interval)
logging.info(f"Confidence interval coverage: {coverage*100:.1f}%")

# 2.5 SERIALIZE MODELS
logging.info("Serializing the best model...")

# Timestamp for model versioning (day-month-year-hour-minute-second-milliseconds).
timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S-%f")[:-3]
model_filename = f"rossmann_model_{timestamp}.pkl"

# Fitted encoders, keyed by the raw column each one was fitted on.
fitted_label_encoders = {
    'StoreType': le_store_type,
    'Assortment': le_assortment,
    'StateHoliday': le_state_holiday,
    'PromoInterval': le_promo_interval
}
held_out_metrics = {
    'rmse': final_rmse,
    'rmspe': final_rmspe,
    'r2': final_r2
}

# Save model with preprocessing pipeline and encoders
model_package = {
    'model': best_pipeline,
    'feature_columns': feature_columns,
    'label_encoders': fitted_label_encoders,
    'model_metrics': held_out_metrics,
    'training_date': timestamp
}

with open(model_filename, 'wb') as model_file:
    pickle.dump(model_package, model_file)

logging.info(f"Model saved as {model_filename}")


In [None]:
# MLFlow logging
logging.info("Logging to MLFlow...")

mlflow.set_experiment("Rossmann Sales Forecasting")

# Values recorded on the run, grouped by kind.
run_params = {
    "model_type": best_model_name,
    "features_count": len(feature_columns),
    "training_samples": len(X_train),
}
run_metrics = {
    "rmse": final_rmse,
    "rmspe": final_rmspe,
    "r2_score": final_r2,
    "confidence_coverage": coverage,
}

with mlflow.start_run():
    # Log parameters
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Log metrics
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Log model
    mlflow.sklearn.log_model(best_pipeline, "model")

    # Log artifacts (the pickle written by the serialization step)
    mlflow.log_artifact(model_filename)

logging.info("MLFlow logging completed")

# Save predictions for analysis: actuals, point predictions and interval bounds,
# plus the store and weekday of each test row.
predictions_path = f'predictions_{timestamp}.csv'
predictions_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred_final,
    'Lower_CI': lower_bound,
    'Upper_CI': upper_bound,
    'Store': X_test['Store'],
    'DayOfWeek': X_test['DayOfWeek']
})

predictions_df.to_csv(predictions_path, index=False)
logging.info(f"Predictions saved to {predictions_path}")


In [None]:
# Performance summary: print the headline numbers of the selected model in one block.
summary_lines = [
    "\n=== FINAL MODEL PERFORMANCE SUMMARY ===",
    f"Best Model: {best_model_name}",
    f"RMSE: {final_rmse:.2f}",
    f"RMSPE: {final_rmspe:.3f}%",
    f"R² Score: {final_r2:.3f}",
    f"Confidence Interval Coverage: {coverage*100:.1f}%",
    f"Model saved as: {model_filename}",
]
print("\n".join(summary_lines))

logging.info("Machine Learning pipeline completed successfully")