# Retail Profit & Revenue Forecasting Demo

This notebook demonstrates the retail forecasting system with comprehensive model comparison and interpretability analysis.


In [None]:
# Import required libraries (standard library, then third-party, then project code)
import os
import sys
import warnings

# Make the parent directory importable so that `main` and `src` resolve.
# The path is relative to the kernel's working directory. The membership check
# keeps re-running this cell from appending the same entry again.
if '..' not in sys.path:
    sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import our forecasting pipeline
from main import RetailForecastPipeline
from src.data.data_processor import DataProcessor

# Set up plotting.
# The 'seaborn-v0_8' style name was introduced in matplotlib 3.6; earlier
# releases expose the same style sheet as 'seaborn'. Pick whichever exists so
# the notebook does not fail on an older matplotlib.
PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

# Suppress every warning for the rest of the session to keep the demo output
# compact. NOTE(review): this also hides deprecation and convergence warnings
# from the modelling libraries - consider narrowing the filter.
warnings.filterwarnings('ignore')

## 1. Initialize the Forecasting Pipeline


In [None]:
# Build the forecasting pipeline from the project's YAML configuration
config_path = "../config/config.yaml"
pipeline = RetailForecastPipeline(config_path)

# Confirm construction, then list the model names reported by the model factory
print("Pipeline initialized successfully!")
available_models = pipeline.model_factory.get_available_models()
print(f"Available models: {available_models}")

## 2. Load and Explore Sample Data


In [None]:
# There is no real data file for this demo, so a synthetic dataset is generated.
# `_create_sample_data` is a private DataProcessor helper, called directly here
# for demonstration purposes only.
data_processor = DataProcessor(pipeline.config)
sample_data = data_processor._create_sample_data()

# Dimensions first, then a preview of the leading rows
print(f"Sample data shape: {sample_data.shape}")
print("\nFirst few rows:", sample_data.head(), sep="\n")

# Descriptive statistics as computed by DataFrame.describe()
print("\nData summary:", sample_data.describe(), sep="\n")

In [None]:
# Four-panel overview of the sample data
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_revenue, ax_category), (ax_region, ax_promo) = axes

# Top-left: revenue summed per date
daily_revenue = sample_data.groupby('date')['revenue'].sum()
ax_revenue.plot(daily_revenue.index, daily_revenue.values)
ax_revenue.set(title='Daily Revenue Over Time', ylabel='Revenue')

# Top-right: mean profit per product category
category_profit = sample_data.groupby('product_category')['profit'].mean()
ax_category.bar(category_profit.index, category_profit.values)
ax_category.set(title='Average Profit by Category', ylabel='Profit')
ax_category.tick_params(axis='x', rotation=45)

# Bottom-left: share of total revenue per region
region_revenue = sample_data.groupby('region')['revenue'].sum()
ax_region.pie(
    region_revenue.values,
    labels=region_revenue.index,
    autopct='%1.1f%%',
)
ax_region.set_title('Revenue Distribution by Region')

# Bottom-right: mean profit and revenue grouped by the promotion flag
promo_impact = sample_data.groupby('promotion')[['profit', 'revenue']].mean()
promo_impact.plot.bar(ax=ax_promo)
ax_promo.set_title('Promotion Impact')
ax_promo.set_xlabel('Promotion (0=No, 1=Yes)')
ax_promo.legend()

plt.tight_layout()
plt.show()

## 3. Data Preprocessing


In [None]:
# Run the pipeline's preprocessing for the 'profit' target on the sample data
profit_data = pipeline.data_processor.preprocess_for_profit_forecast(
    sample_data, 'profit'
)
print(f"Processed profit data shape: {profit_data.shape}")
print(f"Features: {[*profit_data.columns]}")

# Summary statistics restricted to the numeric columns, rounded to 2 decimals
print("\nFeature statistics:")
numeric_cols = profit_data.select_dtypes(include='number').columns
numeric_summary = profit_data[numeric_cols].describe().round(2)
print(numeric_summary)

## 4. Model Training and Comparison


In [None]:
# Train every model and collect the results for the 'profit' target.
# NOTE(review): "dummy_path" is a placeholder rather than a real file;
# presumably the pipeline falls back to generated sample data when the path
# cannot be read - confirm against run_profit_forecast.
print("Training models for profit forecasting...")
profit_results = pipeline.run_profit_forecast("dummy_path", 'profit')

# The 'comparison' entry holds the per-model metric table used by later cells
comparison = profit_results['comparison']
print("\nModel Performance Comparison:", comparison.round(4), sep="\n")

In [None]:
# Visualize model comparison: one bar chart per evaluation metric
metrics = ['rmse', 'mae', 'r2', 'mape']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for ax, metric in zip(axes.flatten(), metrics):
    if metric not in comparison.columns:
        # Hide the panel instead of leaving an empty, unlabeled set of axes
        ax.set_visible(False)
        continue

    bars = ax.bar(comparison['model'], comparison[metric])
    ax.set_title(f'{metric.upper()} by Model')
    ax.set_ylabel(metric.upper())
    ax.tick_params(axis='x', rotation=45)

    # Add value labels at the end of each bar. A negative bar (possible for
    # R2) extends downward, so its label is anchored below the bar end to
    # avoid being drawn on top of the bar itself.
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.,
            height,
            f'{height:.3f}',
            ha='center',
            va='bottom' if height >= 0 else 'top',
        )

plt.tight_layout()
plt.show()

## 5. Interpretability Analysis


In [None]:
# Print the interpretability output of every model in turn
interpretability = profit_results['interpretability']

# (heading, key) pairs for the two driver tables printed per model
driver_sections = (
    ("\nTop Profit Drivers:", 'top_profit_drivers'),
    ("\nTop Loss Drivers:", 'top_loss_drivers'),
)

for model_name, results in interpretability.items():
    model_label = model_name.upper()

    # A model whose analysis failed carries an 'error' entry; report and move on
    if 'error' in results:
        print(f"\n{model_label} - Error: {results['error']}")
        continue

    print(f"\n=== {model_label} INTERPRETABILITY ===")

    # SHAP importance: first five rows of the importance table
    if 'shap_importance' in results:
        print("\nTop 5 Most Important Features (SHAP):")
        print(results['shap_importance'].head().to_string(index=False))

    # Profit vs loss drivers: only shown when the drivers entry also contains
    # a 'feature_impact' key
    driver_info = results['profit_loss_drivers'] if 'profit_loss_drivers' in results else ()
    if 'feature_impact' in driver_info:
        for heading, key in driver_sections:
            if key in driver_info:
                print(heading)
                table = driver_info[key][['feature', 'difference']]
                print(table.head().to_string(index=False))

    # Seasonal impact: at most the first three entries
    seasonal = results['seasonal_impact'] if 'seasonal_impact' in results else None
    if seasonal:
        print("\nSeasonal Impact:")
        for feature, impact in list(seasonal.items())[:3]:
            print(f"  {feature}: Mean Impact = {impact['mean_impact']:.3f}")

## 6. Feature Importance Visualization


In [None]:
# Plot feature importance for tree-based models
tree_models = ['xgboost', 'lightgbm']

# squeeze=False always returns a 2-D array of axes, so the same indexing works
# whether the list above holds one model or several.
fig, axes = plt.subplots(1, len(tree_models), figsize=(15, 8), squeeze=False)

for ax, model_name in zip(axes[0], tree_models):
    results = interpretability[model_name] if model_name in interpretability else {}

    if 'shap_importance' not in results:
        # The model is absent, failed, or produced no SHAP table: say so
        # explicitly rather than leaving an empty, untitled panel.
        ax.set_title(f'{model_name.upper()} Feature Importance')
        ax.text(0.5, 0.5, 'SHAP importance not available',
                ha='center', va='center', transform=ax.transAxes)
        ax.set_axis_off()
        continue

    # Ten highest-ranked rows of the importance table
    importance = results['shap_importance'].head(10)
    positions = range(len(importance))

    bars = ax.barh(positions, importance['shap_importance'])
    ax.set_yticks(positions)
    ax.set_yticklabels(importance['feature'])
    ax.set_xlabel('SHAP Importance')
    ax.set_title(f'{model_name.upper()} Feature Importance')
    ax.invert_yaxis()  # first row of the table at the top of the chart

    # Add value labels just past the end of each bar
    for bar in bars:
        width = bar.get_width()
        ax.text(width + width * 0.01, bar.get_y() + bar.get_height() / 2,
                f'{width:.3f}', ha='left', va='center')

plt.tight_layout()
plt.show()

## 7. Prediction Visualization


In [None]:
# Get actual vs predicted values
from sklearn.metrics import r2_score  # imported once, not on every loop iteration

predictions = profit_results['predictions']

# For demonstration, we'll use the test data from the last model training
# In practice, you'd want to pass the actual test values
# NOTE(review): this re-splits `profit_data` (built from `sample_data` in this
# notebook), while the predictions come from run_profit_forecast("dummy_path", ...).
# Whether both refer to the same rows is not visible from here - confirm. The
# trimming below only equalizes lengths; it does not guarantee alignment.
test_data = pipeline.data_processor.split_data(profit_data)[2]  # Get test split
actual_values = test_data['profit'].values

# Plot predictions vs actual
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# Only as many models as there are panels (the first 4) are plotted
plotted_models = list(predictions.items())[:len(axes)]

for ax, (model_name, preds) in zip(axes, plotted_models):
    # Ensure same length
    min_length = min(len(preds), len(actual_values))
    ax.set_title(f'{model_name.upper()}')

    if min_length == 0:
        # np.min / np.max raise on empty input; show a placeholder instead
        ax.text(0.5, 0.5, 'No predictions available',
                ha='center', va='center', transform=ax.transAxes)
        continue

    preds_trimmed = preds[:min_length]
    actual_trimmed = actual_values[:min_length]

    # Scatter plot
    ax.scatter(actual_trimmed, preds_trimmed, alpha=0.6)

    # Perfect prediction line (y = x) spanning the combined value range
    min_val = min(np.min(actual_trimmed), np.min(preds_trimmed))
    max_val = max(np.max(actual_trimmed), np.max(preds_trimmed))
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8)

    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.grid(True, alpha=0.3)

    # Calculate and display R²; anchor the label by its top edge so the box
    # stays inside the axes
    r2 = r2_score(actual_trimmed, preds_trimmed)
    ax.text(0.05, 0.95, f'R² = {r2:.3f}', transform=ax.transAxes,
            verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Hide panels that received no model so the figure has no blank axes
for unused_ax in axes[len(plotted_models):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 8. Time Series Forecast Visualization


In [None]:
# Time series plot of predictions
fig, ax = plt.subplots(figsize=(15, 8))

# Plot actual values
ax.plot(range(len(actual_values)), actual_values,
        label='Actual', linewidth=2, color='black')

# Plot predictions for each model. Colors are reused cyclically so that a
# sixth or later model is still drawn (zipping against the 5-color list would
# silently drop it).
colors = ['red', 'blue', 'green', 'orange', 'purple']
for idx, (model_name, preds) in enumerate(predictions.items()):
    # Ensure same length
    min_length = min(len(preds), len(actual_values))
    preds_trimmed = preds[:min_length]

    ax.plot(range(min_length), preds_trimmed,
            label=f'{model_name.title()} Prediction',
            linewidth=1.5, alpha=0.8, color=colors[idx % len(colors)])

ax.set_title('Model Predictions vs Actual Values Over Time', fontsize=14)
ax.set_xlabel('Time Period')
ax.set_ylabel('Profit')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 9. Business Insights Summary


In [None]:
# Text summary of the results: best model, its key drivers, and the ranking
print("=== BUSINESS INSIGHTS SUMMARY ===")
print()

# Best performing model = row with the lowest RMSE (looked up once)
best_row = comparison.loc[comparison['rmse'].idxmin()]
best_model = best_row['model']
best_rmse = best_row['rmse']
print(f"🏆 BEST PERFORMING MODEL: {best_model.upper()}")
print(f"   RMSE: {best_rmse:.3f}")
print()

# Key findings from interpretability
print("📊 KEY FINDINGS:")
print()

if best_model in interpretability:
    best_results = interpretability[best_model]

    # Highest-ranked feature = first row of the SHAP importance table
    if 'shap_importance' in best_results:
        top_feature = best_results['shap_importance'].iloc[0]
        print(f"💡 Most Important Feature: {top_feature['feature']}")
        print(f"   Impact Score: {top_feature['shap_importance']:.3f}")
        print()

    # First row of each non-empty driver table
    if 'profit_loss_drivers' in best_results:
        drivers = best_results['profit_loss_drivers']
        driver_sections = (
            ("📈 Top Profit Driver", 'top_profit_drivers'),
            ("📉 Top Loss Driver", 'top_loss_drivers'),
        )
        for heading, key in driver_sections:
            if key in drivers and len(drivers[key]) > 0:
                top_driver = drivers[key].iloc[0]
                print(f"{heading}: {top_driver['feature']}")
                print(f"   Impact: {top_driver['difference']:.3f}")
                print()

# Model performance summary, listed in rank order. When the comparison table
# has no 'rank' column, fall back to ordering by RMSE and number the rows
# by position instead of raising a KeyError.
print("🎯 MODEL PERFORMANCE RANKING:")
has_rank = 'rank' in comparison.columns
ranking = comparison.sort_values('rank' if has_rank else 'rmse')
for position, (_, row) in enumerate(ranking.iterrows(), start=1):
    rank_label = row['rank'] if has_rank else position
    print(f"   {rank_label}. {row['model'].upper()} - RMSE: {row['rmse']:.3f}")
print()

# Business recommendations
print("💼 BUSINESS RECOMMENDATIONS:")
print("   1. Deploy the best performing model in production")
print("   2. Focus on top profit drivers to maximize returns")
print("   3. Address top loss drivers to minimize losses")
print("   4. Monitor seasonal patterns for inventory planning")
print("   5. Optimize promotional strategies based on impact analysis")
print("   6. Consider regional differences in forecasting models")

## 10. Export Results


In [None]:
# Create output directory (`os` is already imported in the notebook's first cell)
OUTPUT_DIR = 'notebook_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save model comparison results
comparison_path = f'{OUTPUT_DIR}/model_comparison.csv'
comparison.to_csv(comparison_path, index=False)
print(f"✅ Model comparison results saved to '{comparison_path}'")

# Save feature importance for best model (only when a SHAP table exists for it)
if best_model in interpretability and 'shap_importance' in interpretability[best_model]:
    importance_path = f'{OUTPUT_DIR}/{best_model}_feature_importance.csv'
    interpretability[best_model]['shap_importance'].to_csv(importance_path, index=False)
    print(f"✅ Feature importance saved to '{importance_path}'")

# Generate final visualizations. This step is deliberately best-effort: a
# plotting failure is reported but must not discard the CSV exports above.
try:
    pipeline.generate_visualizations(profit_results, f"{OUTPUT_DIR}/")
    print(f"✅ Visualizations saved to '{OUTPUT_DIR}/' directory")
except Exception as e:
    print(f"⚠️  Could not generate all visualizations: {e}")

print(f"\n🎉 Analysis complete! Check the '{OUTPUT_DIR}' directory for all results.")