# Corporate Hybrid Forecast Notebook (Prophet + ARIMA) – v2


## 01 - Imports & Settings

In [None]:
"""
Hybrid 3-Way (Prophet vs ARIMA vs TBATS/ETS) with 12-month horizon,
capacity report, and daily capacity plan with language split.
"""

import warnings
warnings.filterwarnings("ignore")

import os
import math
import numpy as np
import pandas as pd
from datetime import datetime

# Forecasting libs
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

try:
    from prophet import Prophet
except Exception:
    Prophet = None

try:
    from tbats import TBATS
except Exception:
    TBATS = None

# Pandas engines
# Engine names intended for pandas Excel read/write calls.
# NOTE(review): the loader functions visible in this file pass engine="openpyxl"
# literally instead of referencing these constants - confirm they are still used.
PD_READ_XLSX_ENGINE = 'openpyxl'
PD_WRITE_XLSX_ENGINE = 'openpyxl'

# === Project paths (adapt to your local if needed) ===
# Absolute, user-specific Windows paths; raw strings keep the backslashes literal.
# NOTE(review): OUTPUT_XLSX (Configuration section below) points at a folder named
# "output" (singular) while OUTPUT_FOLDER ends in "outputs" - confirm which is intended.
INPUT_FOLDER  = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model"
OUTPUT_FOLDER = r"C:\Users\pt3canro\Desktop\CAPACITY\outputs"

# Source files (as in your original pipeline)
# Every path below is INPUT_FOLDER joined with a fixed workbook name.
INCOMING_PATH   = os.path.join(INPUT_FOLDER, "Incoming_new.xlsx")
CALL_PATH       = os.path.join(INPUT_FOLDER, "call_performance.xlsx")
AGENTS_PATH     = os.path.join(INPUT_FOLDER, "agent_language_n_target.xlsx")
PROD_PATH       = os.path.join(INPUT_FOLDER, "productivity_agents.xlsx")
EINSTEIN_PATH   = os.path.join(INPUT_FOLDER, "einstein.xlsx")
INVENTORY_PATH  = os.path.join(INPUT_FOLDER, "inventory_month.xlsx")
DEPT_PATH  = os.path.join(INPUT_FOLDER, "department.xlsx")   # official mapping source
# NOTE(review): pandas.read_excel(sheet_name=None) returns a dict of all sheets
# rather than a single DataFrame - confirm how this value is consumed.
DEPT_SHEET = None

# Output files
# Bare file names without a directory; presumably joined with OUTPUT_FOLDER by the
# code that writes them - TODO confirm.
OUT_XLSX = "capacity_forecast_hybrid.xlsx"
OUT_MAPE = "mape_by_department.xlsx"

# Forecast horizon
# NOTE(review): H_MONTHS = 12 is defined in the Configuration section below and the
# notebook docstring states a 12-month horizon - confirm which constant the
# downstream code actually reads.
HORIZON_MONTHS = 6

# Language shares (fixed)
# Expressed in percent; the six values sum to 100. LANGUAGE_SHARES (Configuration
# section below) holds the same split as fractions of 1.
LANG_SHARE = {
    'English': 64.35,
    'French' : 7.41,
    'German' : 8.60,
    'Italian': 6.67,
    'Portuguese': 1.62,
    'Spanish': 11.35,
}


# ---------------------------
# Configuration
# ---------------------------
# NOTE(review): several settings here overlap with constants defined earlier in
# this file (HORIZON_MONTHS, LANG_SHARE, PROD_PATH, OUTPUT_FOLDER) - confirm which
# set the downstream cells read before changing either.

H_MONTHS = 12                        # 12-month horizon for monthly report
DAILY_HORIZON_DAYS = 90              # Daily forecast horizon, in days
USE_DAILY_MODELLING = True           # If False, disaggregate monthly -> daily
WEEKLY_START_THU = True              # Your organization uses Thu-Wed weeks

# Language shares (fixed)
# Fractions of 1 (they sum to 1.0); same split as LANG_SHARE above, which is in percent.
LANGUAGE_SHARES = {
    'English': 0.6435,
    'French': 0.0741,
    'German': 0.0860,
    'Italian': 0.0667,
    'Portuguese': 0.0162,
    'Spanish': 0.1135
}

# Inputs
# Incoming daily volumes (ensure at least these columns): Date, department_id, ticket_total
INCOMING_SOURCE_PATH = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\Incoming_daily.csv"  # Example CSV
# Optional mapping file to fix Vertical & department names (sheet or CSV)
DEPT_MAP_PATH = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\departments_map.xlsx"       # Example
DEPT_MAP_SHEET = "map"  # columns: department_id, department_name, vertical

# Productivity file (as you indicated)
# Same file location as PROD_PATH above, written out as a literal path.
PRODUCTIVITY_PATH = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\productivity_agents.xlsx"  # columns: Date, agent_id, agent_name, department_id, department_name, prod_total_model

# Output
# NOTE(review): this path uses the folder "output" (singular) whereas OUTPUT_FOLDER
# above ends in "outputs" - confirm the intended destination.
OUTPUT_XLSX = r"C:\Users\pt3canro\Desktop\CAPACITY\output\capacity_forecast_hybrid.xlsx"


## 02 - Data loading and cleaning

In [2]:
def load_incoming(path: str) -> pd.DataFrame:
    """Load daily incoming ticket volumes from a CSV or Excel file.

    Paths ending in ``.csv`` (case-insensitive) are read with ``pd.read_csv``;
    any other path is read as an Excel workbook through openpyxl.

    Normalization applied to the returned frame:
      * ``Date`` is parsed with ``pd.to_datetime``.
      * ``department_id`` is cast to ``str`` and stripped of surrounding whitespace.
      * ``department_name`` is stripped where a value is present; missing values
        stay missing (they are NOT turned into the strings "nan"/"None"), so a
        later ``isna()`` check can still detect them. If the column is absent
        it is created and filled with ``None``.
      * ``ticket_total`` is coerced to float; unparseable or missing values become 0.

    Raises:
        KeyError: if ``Date``, ``department_id`` or ``ticket_total`` is absent.
    """
    if path.lower().endswith(".csv"):
        df = pd.read_csv(path)
    else:
        df = pd.read_excel(path, engine="openpyxl")

    # Fail early with a message that names the file instead of a bare column KeyError.
    missing = [c for c in ('Date', 'department_id', 'ticket_total') if c not in df.columns]
    if missing:
        raise KeyError(f"Incoming file {path!r} is missing required column(s): {missing}")

    df['Date'] = pd.to_datetime(df['Date'])
    df['department_id'] = df['department_id'].astype(str).str.strip()

    if 'department_name' in df.columns:
        names = df['department_name']
        # astype(str) alone would turn NaN into the literal "nan"; keep NaN as NaN
        # and only normalize the cells that actually hold a value.
        df['department_name'] = names.where(names.isna(), names.astype(str).str.strip())
    else:
        df['department_name'] = None

    df['ticket_total'] = pd.to_numeric(df['ticket_total'], errors='coerce').fillna(0).astype(float)
    return df

def load_dept_map(path: str, sheet: str) -> pd.DataFrame:
    """Load the department mapping (department_id -> department_name, vertical).

    Args:
        path: Mapping file. ``.xlsx``/``.xlsm`` (case-insensitive) are read as
            Excel through openpyxl; anything else is read as CSV.
        sheet: Excel sheet name. ``None`` selects the first sheet. Ignored for CSV.

    Returns:
        A frame with exactly the columns ``department_id``, ``department_name``
        and ``vertical``, one row per ``department_id`` (first occurrence wins).
        If ``path`` does not exist an empty frame with those columns is returned.
        ``department_name`` / ``vertical`` are stripped where present; missing
        values stay missing, and a column absent from the file is filled with ``None``.

    Raises:
        KeyError: if the file has no ``department_id`` column.
    """
    columns = ['department_id', 'department_name', 'vertical']
    if not os.path.exists(path):
        # Gracefully handle a missing map file: callers get an empty mapping.
        return pd.DataFrame(columns=columns)

    if path.lower().endswith((".xlsx", ".xlsm")):
        # read_excel(sheet_name=None) returns a dict of every sheet, which the
        # code below cannot handle, so fall back to the first sheet instead.
        mp = pd.read_excel(path, sheet_name=0 if sheet is None else sheet, engine="openpyxl")
    else:
        mp = pd.read_csv(path)

    # Normalize the join key.
    mp['department_id'] = mp['department_id'].astype(str).str.strip()

    for col in ('department_name', 'vertical'):
        if col in mp.columns:
            values = mp[col]
            # Strip real values only; astype(str) on its own would turn NaN into "nan".
            mp[col] = values.where(values.isna(), values.astype(str).str.strip())
        else:
            mp[col] = None

    return mp[columns].drop_duplicates('department_id')

def load_productivity(path: str) -> pd.DataFrame:
    """Load agent productivity rows and average them per department.

    Paths ending in ``.csv`` (case-insensitive) are read with ``pd.read_csv``,
    matching the other loaders in this file; any other path is read as an
    Excel workbook through openpyxl.

    ``prod_total_model`` is coerced to numeric (unparseable cells become NaN and
    are ignored by the mean) and averaged over every row of each
    ``department_id``, i.e. across all agents and all dates together.

    Returns:
        One row per department with the columns ``department_id`` (stripped
        string) and ``avg_tickets_per_agent_day``.

    Raises:
        KeyError: if ``Date``, ``department_id`` or ``prod_total_model`` is absent.
    """
    if path.lower().endswith(".csv"):
        df = pd.read_csv(path)
    else:
        df = pd.read_excel(path, engine="openpyxl")

    missing = [c for c in ('Date', 'department_id', 'prod_total_model') if c not in df.columns]
    if missing:
        raise KeyError(f"Productivity file {path!r} is missing required column(s): {missing}")

    # Date is not part of the result; parsing it still rejects malformed dates early.
    df['Date'] = pd.to_datetime(df['Date'])
    df['department_id'] = df['department_id'].astype(str).str.strip()
    df['prod_total_model'] = pd.to_numeric(df['prod_total_model'], errors='coerce')

    per_department = df.groupby('department_id', as_index=False)['prod_total_model'].mean()
    return per_department.rename(columns={'prod_total_model': 'avg_tickets_per_agent_day'})

def apply_mapping(incoming: pd.DataFrame, mapping: pd.DataFrame) -> pd.DataFrame:
    """Attach ``department_name`` and ``vertical`` to ``incoming`` via ``department_id``.

    A left merge keeps every incoming row. The incoming ``department_name`` is
    kept when it holds a real value; the mapped name is used only as a fallback
    when the incoming one is missing. Rows without a usable ``vertical`` are
    labelled ``'Unmapped'``.

    A cell counts as missing when it is NaN/None or, after stripping, equals
    ``''``, ``'None'`` or ``'nan'`` - the strings that ``astype(str)`` produces
    for missing values.

    Args:
        incoming: Frame with at least ``department_id``.
        mapping: Frame with ``department_id`` and optionally ``department_name``
            and ``vertical``.

    Returns:
        A new merged frame with no ``*_map`` helper columns; the inputs are not modified.
    """
    placeholders = ('', 'None', 'nan')

    def _is_missing(col: pd.Series) -> pd.Series:
        # True for real nulls and for stringified nulls / blank cells.
        return col.isna() | col.astype(str).str.strip().isin(placeholders)

    merged = incoming.merge(mapping, on='department_id', how='left', suffixes=('', '_map'))

    # Either side may lack department_name; make sure the column exists.
    if 'department_name' not in merged.columns:
        merged['department_name'] = None
    if 'department_name_map' in merged.columns:
        own_name = merged['department_name'].astype(object)
        mapped_name = merged['department_name_map'].astype(object)
        mapped_name = mapped_name.mask(_is_missing(mapped_name))
        # Fall back to the mapped name only where the incoming name is unusable.
        merged['department_name'] = own_name.mask(_is_missing(own_name), mapped_name)

    merged = merged.drop(columns=[c for c in merged.columns if c.endswith('_map')])

    # Flag rows whose vertical could not be resolved.
    if 'vertical' not in merged.columns:
        merged['vertical'] = None
    vertical = merged['vertical'].astype(object)
    merged['vertical'] = vertical.mask(_is_missing(vertical), 'Unmapped')
    return merged

## 03 - Helper functions

In [3]:
def month_floor(dt: pd.Timestamp) -> pd.Timestamp:
    """Return midnight on the first day of the month that contains ``dt``.

    Only ``dt.year`` and ``dt.month`` are read, so the time of day and any
    timezone on ``dt`` are not carried over to the result.
    """
    year, month = dt.year, dt.month
    return pd.Timestamp(year, month, 1)

def business_days_in_month(year: int, month: int) -> int:
    """Count the Monday-Friday days in the given calendar month.

    This is an approximation of working days: public holidays are not excluded.

    Args:
        year: Calendar year.
        month: Calendar month, 1-12.

    Returns:
        The number of weekdays as a built-in ``int`` (not a NumPy scalar), so
        the value matches the annotation and serializes cleanly.
    """
    first_day = pd.Timestamp(year=year, month=month, day=1)
    # MonthEnd(0) rolls the first day forward to the last day of the same month.
    days = pd.date_range(start=first_day, end=first_day + pd.offsets.MonthEnd(0), freq='D')
    # weekday: Monday=0 ... Sunday=6, so < 5 selects Monday-Friday.
    return int((days.weekday < 5).sum())

def smape(y_true, y_pred):
    """Symmetric MAPE, in percent, between actuals and predictions.

    Each point contributes ``2 * |pred - true| / (|true| + |pred|)``. Where both
    values are zero the denominator is replaced by 1, so that point contributes
    0 instead of dividing by zero. The result is the mean contribution times 100.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    scale = np.abs(actual) + np.abs(forecast)
    # Guard against 0/0 when actual and forecast are both zero.
    np.putmask(scale, scale == 0, 1.0)

    ratios = 2.0 * np.abs(forecast - actual) / scale
    return np.mean(ratios) * 100.0