# 📈 Sales Forecasting Model

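This notebook builds a monthly sales forecasting model: it trains Random Forest and Gradient Boosting regressors on lagged and rolling-average sales features, then projects sales for the next six months.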

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from pathlib import Path
import warnings
# Suppress every warning category so library warnings do not clutter cell output.
warnings.filterwarnings('ignore')

# Locations are relative to the working directory: CSV inputs are read from
# ../DataSource and the forecast is written under ../outputs/predictions.
DATA_PATH = Path('..') / 'DataSource'
OUTPUT_PATH = Path('..') / 'outputs' / 'predictions'

## 1. Data Preparation

In [None]:
# Load the two CSV inputs from DATA_PATH.
sales = pd.read_csv(DATA_PATH / 'Candy_Sales.csv')
products = pd.read_csv(DATA_PATH / 'Candy_Products.csv')

# Parse the order date first; every calendar feature below is derived from it.
sales['Order Date'] = pd.to_datetime(sales['Order Date'])

# Derive all calendar features in a single assign; keyword order fixes the
# resulting column order.
sales = sales.assign(
    Year=lambda frame: frame['Order Date'].dt.year,
    Month=lambda frame: frame['Order Date'].dt.month,
    Quarter=lambda frame: frame['Order Date'].dt.quarter,
    DayOfWeek=lambda frame: frame['Order Date'].dt.dayofweek,
    WeekOfYear=lambda frame: frame['Order Date'].dt.isocalendar().week,
    DayOfMonth=lambda frame: frame['Order Date'].dt.day,
)

# Quick sanity report: table size and the period covered by the orders.
print(f"Data shape: {sales.shape}")
print("Date range: {} to {}".format(sales['Order Date'].min(), sales['Order Date'].max()))

In [None]:
# Collapse the order-level table into one row per (Year, Month).
# Named aggregation yields the final column names directly.
monthly_sales = (
    sales
    .groupby(['Year', 'Month'])
    .agg(
        Sales=('Sales', 'sum'),
        Units=('Units', 'sum'),
        Profit=('Gross Profit', 'sum'),
        Orders=('Order ID', 'nunique'),
        Customers=('Customer ID', 'nunique'),
    )
    .reset_index()
)

# First-of-month timestamp, used as the x-axis when plotting.
monthly_sales['Date'] = pd.to_datetime(monthly_sales.assign(day=1)[['Year', 'Month', 'day']])

# Lagged sales and trailing means.
# shift() is positional, so "Lag k" means "k rows back"; that equals
# "k months back" only when no month is missing from the data.
# The rolling means include the current row's own Sales value.
monthly_sales = monthly_sales.assign(
    **{f'Sales_Lag{k}': monthly_sales['Sales'].shift(k) for k in (1, 2, 3, 6, 12)},
    **{f'Sales_MA{w}': monthly_sales['Sales'].rolling(window=w).mean() for w in (3, 6)},
)

# Year-over-year growth relative to the same row 12 positions earlier.
monthly_sales['Sales_YoY'] = monthly_sales['Sales'].div(monthly_sales['Sales_Lag12']).sub(1)

print(monthly_sales.tail(10))

## 2. Feature Engineering

In [None]:
# Prepare features
features = ['Year', 'Month', 'Sales_Lag1', 'Sales_Lag2', 'Sales_Lag3',
            'Sales_Lag6', 'Sales_Lag12', 'Sales_MA3', 'Sales_MA6']

# Sales_MA3 / Sales_MA6 in monthly_sales are trailing means that include the
# current month's Sales, which is the prediction target. Training on them
# as-is leaks the target into the features (Sales can be recovered from MA3,
# Lag1 and Lag2). Shift them one row so each training row only sees months
# that precede it. This also matches forecasting time, where the latest known
# moving average is used to predict the following month.
#
# Rows are dropped only when a model input or the target is missing, so
# NaNs in unrelated columns do not shrink the training set.
df = monthly_sales.assign(
    Sales_MA3=monthly_sales['Sales_MA3'].shift(1),
    Sales_MA6=monthly_sales['Sales_MA6'].shift(1),
).dropna(subset=features + ['Sales'])

X = df[features]
y = df['Sales']

print(f"Features: {features}")
# X still holds every usable month; the train/test split happens later.
print(f"Total samples: {len(X)}")

In [None]:
# Train/test split (last 6 months for testing)
# The split is chronological: the most recent TEST_MONTHS rows are held out
# so the models are evaluated on months that follow the training window.
TEST_MONTHS = 6

# Without this guard a short history makes train_size zero or negative, and
# the slices below would silently produce an empty or wrong training set.
if len(X) <= TEST_MONTHS:
    raise ValueError(
        f"Need more than {TEST_MONTHS} usable monthly samples to hold out "
        f"{TEST_MONTHS} for testing; got {len(X)}."
    )

train_size = len(X) - TEST_MONTHS

# .iloc makes the positional intent explicit regardless of the index labels.
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

print(f"Training: {len(X_train)} samples")
print(f"Testing: {len(X_test)} samples")

## 3. Model Training

In [None]:
# Random Forest: fit on the training window, then predict the held-out months.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestRegressor(
    max_depth=5,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Gradient Boosting: same data, shallower trees.
gb_model = GradientBoostingRegressor(
    max_depth=3,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

print("Models trained successfully!")

In [None]:
# Model evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print and return regression error metrics for one model.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same length as ``y_true``.
        model_name: Label printed above the metrics.

    Returns:
        dict with keys ``'MAE'``, ``'RMSE'``, ``'R2'`` and ``'MAPE'``.
        ``'MAPE'`` is a percentage computed over the entries whose actual
        value is non-zero; it is ``nan`` when every actual value is zero.
    """
    # Work on plain float arrays so element-wise arithmetic is positional and
    # boolean masking below behaves the same for lists, arrays and Series.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Percentage error is undefined where the actual value is 0; dividing
    # anyway would turn the whole MAPE into inf/nan, so skip those entries.
    nonzero = y_true != 0
    if nonzero.any():
        pct_errors = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
        mape = np.mean(pct_errors) * 100
    else:
        mape = float('nan')

    print(f"\n{model_name}:")
    print(f"  MAE: ${mae:,.2f}")
    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  R²: {r2:.3f}")
    print(f"  MAPE: {mape:.1f}%")
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'MAPE': mape}

# Score each model's held-out predictions against the same actual values.
# Random Forest is evaluated (and therefore printed) first, then Gradient Boosting.
rf_metrics, gb_metrics = (
    evaluate_model(y_test.values, predictions, title)
    for predictions, title in (
        (rf_pred, "Random Forest"),
        (gb_pred, "Gradient Boosting"),
    )
)

In [None]:
# Pair every feature name with the fitted Random Forest's importance score
# and rank them from most to least important.
importance = (
    pd.DataFrame(
        list(zip(features, rf_model.feature_importances_)),
        columns=['Feature', 'Importance'],
    )
    .sort_values(by='Importance', ascending=False)
)

print("\nFeature Importance (Random Forest):")
print(importance.to_string(index=False))

## 4. Visualization

In [None]:
# Dates of the held-out months, i.e. the rows after the training window.
test_dates = df['Date'].iloc[train_size:].to_numpy()

fig, ax = plt.subplots(figsize=(12, 6))

# Full actual series, with each model's hold-out predictions drawn on top.
ax.plot(df['Date'], df['Sales'], 'b-', label='Actual', alpha=0.7)
ax.plot(test_dates, rf_pred, 'r--', label='RF Prediction', linewidth=2)
ax.plot(test_dates, gb_pred, 'g--', label='GB Prediction', linewidth=2)

# Vertical marker at the first held-out month.
ax.axvline(test_dates[0], color='gray', linestyle=':', alpha=0.5, label='Train/Test Split')

ax.set(
    title='Sales Forecast: Actual vs Predicted',
    xlabel='Date',
    ylabel='Sales ($)',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Future Forecasting

In [None]:
# Forecast next 6 months
# Recursive multi-step forecast: each prediction is appended to the sales
# history so that later steps can use it as a lag / moving-average input.
FORECAST_HORIZON = 6
MIN_HISTORY = 12  # Sales_Lag12 needs a full year of monthly values

sales_history = monthly_sales['Sales'].tolist()
if len(sales_history) < MIN_HISTORY:
    raise ValueError(
        f"Need at least {MIN_HISTORY} months of history to build the lag "
        f"features; got {len(sales_history)}."
    )

# Calendar position of the last observed month; advanced inside the loop.
next_year = int(monthly_sales['Year'].iloc[-1])
next_month = int(monthly_sales['Month'].iloc[-1])
forecasts = []

for _ in range(FORECAST_HORIZON):
    # Step to the following month, rolling into the next year after December.
    next_month = next_month % 12 + 1
    if next_month == 1:
        next_year += 1

    # Every lag is read relative to the END of the running history, so it
    # advances with each step. Index -k is "k months before the month being
    # forecast" and may be an actual value or an earlier forecast.
    # The moving averages cover the most recent 3 / 6 months available
    # before the forecast month.
    # Column order matches the order the model was trained with.
    X_next = pd.DataFrame({
        'Year': [next_year],
        'Month': [next_month],
        'Sales_Lag1': [sales_history[-1]],
        'Sales_Lag2': [sales_history[-2]],
        'Sales_Lag3': [sales_history[-3]],
        'Sales_Lag6': [sales_history[-6]],
        'Sales_Lag12': [sales_history[-12]],
        'Sales_MA3': [np.mean(sales_history[-3:])],
        'Sales_MA6': [np.mean(sales_history[-6:])],
    })

    pred_sales = gb_model.predict(X_next)[0]

    forecasts.append({
        'Year': next_year,
        'Month': next_month,
        'Sales': pred_sales,
        'Date': pd.Timestamp(year=next_year, month=next_month, day=1),
    })

    # Feed the prediction back in as the newest "observation".
    sales_history.append(pred_sales)

forecast_df = pd.DataFrame(forecasts)
print("\nSales Forecast (Next 6 Months):")
print(forecast_df[['Year', 'Month', 'Sales']].to_string(index=False))

In [None]:
# Write the forecast table to OUTPUT_PATH, creating the folder chain if needed.
forecast_file = OUTPUT_PATH / 'sales_forecast.csv'
forecast_file.parent.mkdir(parents=True, exist_ok=True)
forecast_df.to_csv(forecast_file, index=False)
print(f"\nForecast saved to: {forecast_file}")

In [None]:
# Plot with forecast
fig, ax = plt.subplots(figsize=(12, 6))

# Observed monthly sales followed by the projected months.
ax.plot(monthly_sales['Date'], monthly_sales['Sales'], 'b-', label='Historical', linewidth=2)
ax.plot(forecast_df['Date'], forecast_df['Sales'], 'r--', marker='o', label='Forecast', linewidth=2)

# Shaded band is a fixed +/-10% of each forecast value. It is a visual guide,
# not a statistical confidence interval derived from the model.
ax.fill_between(forecast_df['Date'],
                forecast_df['Sales'] * 0.9,
                forecast_df['Sales'] * 1.1,
                alpha=0.2, color='red', label='±10% Range')

ax.set_title('Sales Forecast - Next 6 Months')
ax.set_xlabel('Date')
ax.set_ylabel('Sales ($)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Model Summary

In [None]:
# Final report. The headline model and the top-predictor insight are derived
# from the computed metrics and importances rather than hard-coded, so the
# summary cannot contradict the numbers printed earlier.
FORECAST_MODEL_NAME = 'Gradient Boosting Regressor'  # model used for the forecast

# Gradient Boosting is listed first so it wins ties (min returns the first
# minimal entry).
model_results = {
    FORECAST_MODEL_NAME: gb_metrics,
    'Random Forest Regressor': rf_metrics,
}
best_model_name = min(model_results, key=lambda name: model_results[name]['MAPE'])
best_metrics = model_results[best_model_name]

# importance is sorted by descending importance, so row 0 is the top feature.
top_feature = importance.iloc[0]['Feature']

print("="*60)
print("MODEL SUMMARY")
print("="*60)
print(f"\n📊 Best Model: {best_model_name}")
print(f"   - MAPE: {best_metrics['MAPE']:.1f}%")
print(f"   - R²: {best_metrics['R2']:.3f}")
if best_model_name != FORECAST_MODEL_NAME:
    # Make the mismatch visible instead of implying the best model forecast.
    print(f"   - Note: the forecast was produced with {FORECAST_MODEL_NAME}")
print(f"\n🔮 {len(forecast_df)}-Month Forecast:")
print(f"   - Total Projected Sales: ${forecast_df['Sales'].sum():,.0f}")
print(f"   - Average Monthly: ${forecast_df['Sales'].mean():,.0f}")
print("\n📋 Key Drivers (Random Forest Feature Importance):")
for _, row in importance.head(3).iterrows():
    print(f"   - {row['Feature']}: {row['Importance']:.3f}")
print("\n💡 Insights:")
print(f"   - Strongest predictor by importance: {top_feature}")
print("   - Model captures seasonality through month feature")
print("   - Moving averages smooth out noise")