# Corporate Hybrid Forecast Notebook (Prophet + ARIMA) – v2


## 01 - Imports & Settings

In [1]:

# All comments in English (as requested)
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from prophet import Prophet
import pmdarima as pm
from sklearn.metrics import mean_absolute_percentage_error
from IPython.display import display

PD_READ_XLSX_ENGINE = 'openpyxl'
PD_WRITE_XLSX_ENGINE = 'openpyxl'

INCOMING_PATH  = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\Incoming_new.xlsx"
CALL_PATH      = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\call_performance.xlsx"
AGENTS_PATH    = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\agent_language_n_target.xlsx"
PROD_PATH      = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\productivity_agents.xlsx"
EINSTEIN_PATH  = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\einstein.xlsx"
INVENTORY_PATH = r"C:\Users\pt3canro\Desktop\CAPACITY\input_model\inventory_month.xlsx"

OUT_XLSX = "capacity_forecast_hybrid.xlsx"
OUT_MAPE = "mape_by_department.xlsx"
HORIZON_MONTHS = 6

LANG_SHARE = {
    'English': 64.35,
    'French': 7.41,
    'German': 8.60,
    'Italian': 6.67,
    'Portuguese': 1.62,
    'Spanish': 11.35,
}


## 02 - Helper functions

In [2]:

from typing import Optional, List

def find_first(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    low = {c.lower(): c for c in df.columns}
    for cand in candidates:
        if cand.lower() in low:
            return low[cand.lower()]
    for c in df.columns:
        cl = c.lower().replace(' ', '').replace('_', '')
        for cand in candidates:
            if cand.lower().replace(' ', '').replace('_', '') == cl:
                return c
    return None


def mape_safe(y_true: pd.Series, y_pred: pd.Series) -> Optional[float]:
    y_true = y_true.astype(float)
    y_pred = y_pred.astype(float)
    mask = y_true > 0
    if mask.sum() == 0:
        return None
    return float(mean_absolute_percentage_error(y_true[mask], y_pred[mask]) * 100)


def month_start(dt: pd.Timestamp) -> pd.Timestamp:
    return pd.Timestamp(dt.year, dt.month, 1)


def forecast_prophet(df_m: pd.DataFrame, dep_col="Department", date_col="Month", y_col="tickets", horizon=6) -> pd.DataFrame:
    rows = []
    for dep, g in df_m.groupby(dep_col):
        g = g[[date_col, y_col]].dropna().sort_values(date_col)
        g = g.rename(columns={date_col: 'ds', y_col: 'y'})
        if len(g) < 6:
            fc_vals = [g['y'].iloc[-1] if len(g)>0 else 0] * horizon
        else:
            try:
                m = Prophet(interval_width=0.90, yearly_seasonality=False)
                m.fit(g)
                future = m.make_future_dataframe(periods=horizon, freq='MS')
                pred = m.predict(future).tail(horizon)['yhat']
                fc_vals = np.maximum(0, pred.round()).astype(int).tolist()
            except Exception:
                fc_vals = [g['y'].iloc[-1]] * horizon
        start_fc = month_start(pd.Timestamp.today())
        future_dates = pd.date_range(start_fc, periods=horizon, freq='MS')
        for d, v in zip(future_dates, fc_vals):
            rows.append({dep_col: dep, date_col: d, 'Forecast_Prophet': int(v)})
    return pd.DataFrame(rows)


def forecast_arima(df_m: pd.DataFrame, dep_col="Department", date_col="Month", y_col="tickets", horizon=6) -> pd.DataFrame:
    rows = []
    for dep, g in df_m.groupby(dep_col):
        g = g[[date_col, y_col]].dropna().sort_values(date_col)
        y = g[y_col].astype(float)
        if len(y) >= 6:
            try:
                model = pm.auto_arima(y, seasonal=False, error_action='ignore', suppress_warnings=True)
                fc = model.predict(horizon)
                fc_vals = np.maximum(0, np.round(fc)).astype(int).tolist()
            except Exception:
                fc_vals = [int(y.iloc[-1])] * horizon
        else:
            fc_vals = [int(y.iloc[-1]) if len(y)>0 else 0] * horizon
        start_fc = month_start(pd.Timestamp.today())
        future_dates = pd.date_range(start_fc, periods=horizon, freq='MS')
        for d, v in zip(future_dates, fc_vals):
            rows.append({dep_col: dep, date_col: d, 'Forecast_ARIMA': int(v)})
    return pd.DataFrame(rows)


def build_hybrid_table(kpi: pd.DataFrame, dep_col='Department') -> pd.DataFrame:
    out = []
    for dep, g in kpi.groupby(dep_col):
        real = g['Actual Volume'].astype(float)
        p = g.get('Forecast_Prophet', pd.Series(index=g.index, dtype=float)).astype(float).fillna(0)
        a = g.get('Forecast_ARIMA', pd.Series(index=g.index, dtype=float)).astype(float).fillna(0)
        m_p = mape_safe(real, p)
        m_a = mape_safe(real, a)
        if m_p is None and m_a is None:
            best = 'Prophet'
        elif m_p is None:
            best = 'ARIMA'
        elif m_a is None:
            best = 'Prophet'
        else:
            best = 'Prophet' if m_p <= m_a else 'ARIMA'
        out.append({dep_col: dep, 'MAPE_Prophet': m_p, 'MAPE_ARIMA': m_a, 'Best_Model': best})
    return pd.DataFrame(out)


def apply_hybrid(kpi: pd.DataFrame, table: pd.DataFrame, dep_col='Department') -> pd.DataFrame:
    kpi = kpi.copy()
    kpi['Forecast_Hybrid'] = 0
    best_map = table.set_index(dep_col)['Best_Model'].to_dict()
    for dep, best in best_map.items():
        src = 'Forecast_Prophet' if best == 'Prophet' else 'Forecast_ARIMA'
        kpi.loc[kpi[dep_col]==dep, 'Forecast_Hybrid'] = kpi.loc[kpi[dep_col]==dep, src].values
    return kpi


def language_split(df_fc: pd.DataFrame, dep_col='Department', date_col='Month', fc_col='Forecast_Hybrid', lang_share: dict = LANG_SHARE) -> pd.DataFrame:
    rows = []
    shares = {k: float(v)/100.0 for k,v in lang_share.items()}
    for _, r in df_fc[[dep_col, date_col, fc_col]].iterrows():
        for lang, s in shares.items():
            rows.append({dep_col: r[dep_col], date_col: r[date_col], 'Language': lang, 'Forecast_Hybrid_Lang': int(round(r[fc_col]*s))})
    return pd.DataFrame(rows)

# ---- Ensure 'Actual Volume' exists in kpi (robust to variant names) ----

def ensure_actuals_column(kpi_df, train_df,
                          dep_col='Department', month_col='Month',
                          train_value_candidates=('Actual Volume','total_incoming','ticket_total','Tickets','tickets','volume','count')):
    kpi_df = kpi_df.copy()
    # If already present under any alias in kpi, standardize name
    lower = {c.lower(): c for c in kpi_df.columns}
    for alias in ('actual volume','actual','actuals','tickets','ticket_total','volume','total_incoming'):
        if alias in lower:
            col = lower[alias]
            if col != 'Actual Volume':
                kpi_df = kpi_df.rename(columns={col: 'Actual Volume'})
            break
    if 'Actual Volume' in kpi_df.columns:
        kpi_df['Actual Volume'] = pd.to_numeric(kpi_df['Actual Volume'], errors='coerce').fillna(0)
        return kpi_df

    # else, pull from train_df
    t_lower = {c.lower(): c for c in train_df.columns}
    val_col = None
    for cand in train_value_candidates:
        if cand.lower() in t_lower:
            val_col = t_lower[cand.lower()]
            break
    if val_col is None:
        raise KeyError("No value column found in train_m to use as Actual Volume")

    def to_month_start(s):
        s = pd.to_datetime(s, errors='coerce')
        return s.values.astype('datetime64[M]')

    # Normalize month types to avoid merge misses
    kpi_df[month_col] = to_month_start(kpi_df[month_col])
    tmp_train = train_df[[dep_col, month_col, val_col]].copy()
    tmp_train[month_col] = to_month_start(tmp_train[month_col])
    tmp_train = tmp_train.rename(columns={val_col: 'Actual Volume'})

    kpi_df = kpi_df.merge(tmp_train, on=[dep_col, month_col], how='left')
    kpi_df['Actual Volume'] = pd.to_numeric(kpi_df['Actual Volume'], errors='coerce').fillna(0)
    return kpi_df


## 03 - Load Data

In [3]:

incoming = pd.read_excel(INCOMING_PATH, engine=PD_READ_XLSX_ENGINE)
call = pd.read_excel(CALL_PATH, engine=PD_READ_XLSX_ENGINE)
agents = pd.read_excel(AGENTS_PATH, engine=PD_READ_XLSX_ENGINE)
productivity = pd.read_excel(PROD_PATH, engine=PD_READ_XLSX_ENGINE)
einstein = pd.read_excel(EINSTEIN_PATH, engine=PD_READ_XLSX_ENGINE)
inventory = pd.read_excel(INVENTORY_PATH, engine=PD_READ_XLSX_ENGINE)

print('Files loaded:', len(incoming), len(call), len(agents), len(productivity), len(einstein), len(inventory))


Files loaded: 12745 36983 476 25161 3760 57


## 04 - Preprocessing – Build train_m (Department, Month, tickets)

In [4]:

# Detect columns in INCOMING
cand_date = ['Date', 'date', 'Created Date', 'Ticket Date']
cand_dep  = ['Department', 'department', 'Department Name', 'dept_name', 'department_id']
cand_val  = ['total_incoming', 'ticket_total', 'volume', 'count', 'Tickets', 'tickets']

col_date = find_first(incoming, cand_date)
col_dep  = find_first(incoming, cand_dep)
col_val  = find_first(incoming, cand_val)

if col_date is None or col_dep is None or col_val is None:
    raise ValueError(f"Incoming file must contain date/department/tickets-like columns. Found: {col_date}, {col_dep}, {col_val}")

inc = incoming[[col_date, col_dep, col_val]].copy()
inc.columns = ['Date', 'Department', 'tickets']
inc['Date'] = pd.to_datetime(inc['Date'])
inc['Month'] = inc['Date'].values.astype('datetime64[M]')

train_m = (
    inc.groupby(['Department', 'Month'], as_index=False)['tickets']
       .sum()
       .sort_values(['Department', 'Month'])
)

print('train_m built:', train_m.shape)
train_m.head()


train_m built: (633, 3)


Unnamed: 0,Department,Month,tickets
0,1,2025-01-01,8761
1,1,2025-02-01,8792
2,1,2025-03-01,7972
3,1,2025-04-01,9535
4,1,2025-05-01,9808


## 05 - KPI skeleton – join actuals and future months

In [5]:

actuals = train_m.rename(columns={'tickets': 'Actual Volume'})[['Department','Month','Actual Volume']]
start_fc = pd.Timestamp.today().to_period('M').to_timestamp()
future_months = pd.date_range(start_fc, periods=HORIZON_MONTHS, freq='MS')
last_n_months = 18
hist_keep = (actuals.groupby('Department').tail(last_n_months))

kpi = pd.concat([
    hist_keep,
    pd.MultiIndex.from_product([actuals['Department'].unique(), future_months], names=['Department','Month']).to_frame(index=False)
], ignore_index=True).drop_duplicates(['Department','Month'])

kpi = kpi.merge(actuals, on=['Department','Month'], how='left')

# Optional inventory merge
if not inventory.empty:
    inv_dep = find_first(inventory, ['Department','department'])
    inv_month = find_first(inventory, ['Month','month','Date'])
    inv_val = find_first(inventory, ['Inventory','inventory','Backlog'])
    if inv_dep and inv_month and inv_val:
        inv = inventory[[inv_dep, inv_month, inv_val]].copy()
        inv.columns = ['Department','Month','Inventory']
        inv['Month'] = pd.to_datetime(inv['Month']).values.astype('datetime64[M]')
        kpi = kpi.merge(inv, on=['Department','Month'], how='left')

kpi.head()


Unnamed: 0,Department,Month,Actual Volume_x,Actual Volume_y
0,1,2025-01-01,8761.0,8761.0
1,1,2025-02-01,8792.0,8792.0
2,1,2025-03-01,7972.0,7972.0
3,1,2025-04-01,9535.0,9535.0
4,1,2025-05-01,9808.0,9808.0


## 06 - Forecasts – Prophet and AutoARIMA

In [6]:

fc_prophet = forecast_prophet(train_m, horizon=HORIZON_MONTHS)
fc_arima   = forecast_arima(train_m, horizon=HORIZON_MONTHS)

kpi = kpi.merge(fc_prophet, on=['Department','Month'], how='left')
kpi = kpi.merge(fc_arima,   on=['Department','Month'], how='left')

for c in ['Forecast_Prophet','Forecast_ARIMA']:
    if c in kpi.columns:
        kpi[c] = kpi[c].fillna(0).astype(int)

kpi.head()


17:22:28 - cmdstanpy - INFO - Chain [1] start processing
17:22:28 - cmdstanpy - INFO - Chain [1] done processing
17:22:28 - cmdstanpy - INFO - Chain [1] start processing
17:22:28 - cmdstanpy - INFO - Chain [1] done processing
17:22:29 - cmdstanpy - INFO - Chain [1] start processing
17:22:29 - cmdstanpy - INFO - Chain [1] done processing
17:22:29 - cmdstanpy - INFO - Chain [1] start processing
17:22:29 - cmdstanpy - INFO - Chain [1] done processing
17:22:29 - cmdstanpy - INFO - Chain [1] start processing
17:22:29 - cmdstanpy - INFO - Chain [1] done processing
17:22:29 - cmdstanpy - INFO - Chain [1] start processing
17:22:30 - cmdstanpy - INFO - Chain [1] done processing
17:22:30 - cmdstanpy - INFO - Chain [1] start processing
17:22:30 - cmdstanpy - INFO - Chain [1] done processing
17:22:30 - cmdstanpy - INFO - Chain [1] start processing
17:22:30 - cmdstanpy - INFO - Chain [1] done processing
17:22:30 - cmdstanpy - INFO - Chain [1] start processing
17:22:31 - cmdstanpy - INFO - Chain [1]

Unnamed: 0,Department,Month,Actual Volume_x,Actual Volume_y,Forecast_Prophet,Forecast_ARIMA
0,1,2025-01-01,8761.0,8761.0,0,0
1,1,2025-02-01,8792.0,8792.0,0,0
2,1,2025-03-01,7972.0,7972.0,0,0
3,1,2025-04-01,9535.0,9535.0,0,0
4,1,2025-05-01,9808.0,9808.0,0,0


## 07 - Hybrid selection & MAPE

In [7]:

# Ensure Actual Volume exists (pull from train_m if needed)
kpi = ensure_actuals_column(kpi, train_m)

# Build table and apply
mape_table = build_hybrid_table(kpi)
kpi = apply_hybrid(kpi, mape_table)

display(mape_table.sort_values('Best_Model'))


Unnamed: 0,Department,MAPE_Prophet,MAPE_ARIMA,Best_Model
0,1,94.395966,93.689336,ARIMA
36,53,92.899408,92.307692,ARIMA
44,63,96.526055,95.037221,ARIMA
48,71,100.000000,98.290598,ARIMA
28,41,95.238095,92.673993,ARIMA
...,...,...,...,...
34,49,100.000000,100.000000,Prophet
35,51,100.000000,110.000000,Prophet
37,56,100.000000,100.000000,Prophet
39,58,583.750000,598.750000,Prophet


## 08 - Capacity & Productivity – optional enrichment

In [8]:

kpi['Capacity'] = 0
kpi['Productivity'] = 0

prod_dep = find_first(productivity, ['Department','department'])
prod_month = find_first(productivity, ['Month','month','Date'])
prod_val = find_first(productivity, ['Productivity','productivity','Tickets per Agent','TPA'])
if prod_dep and prod_month and prod_val:
    prod = productivity[[prod_dep, prod_month, prod_val]].copy()
    prod.columns = ['Department','Month','Productivity']
    prod['Month'] = pd.to_datetime(prod['Month']).values.astype('datetime64[M]')
    kpi = kpi.drop(columns=['Productivity']).merge(prod, on=['Department','Month'], how='left').fillna({'Productivity':0})

cap_dep = find_first(agents, ['Department','department']) or find_first(call, ['Department','department'])
cap_month = find_first(agents, ['Month','month','Date']) or find_first(call, ['Month','month','Date'])
cap_val = find_first(agents, ['Capacity','capacity','HC Capacity']) or find_first(call, ['Capacity','capacity'])
if cap_dep and cap_month and cap_val:
    cap_df = (agents if find_first(agents, ['Capacity','capacity','HC Capacity']) else call)
    cap = cap_df[[cap_dep, cap_month, cap_val]].copy()
    cap.columns = ['Department','Month','Capacity']
    cap['Month'] = pd.to_datetime(cap['Month']).values.astype('datetime64[M]')
    kpi = kpi.drop(columns=['Capacity']).merge(cap, on=['Department','Month'], how='left').fillna({'Capacity':0})

kpi['Difference Capacity vs Productivity'] = kpi['Capacity'] - kpi['Productivity']
kpi['Expected Forecast vs Capacity'] = kpi['Forecast_Hybrid'] - kpi['Capacity']
kpi['Actual Volume vs Productivity'] = kpi.get('Actual Volume', pd.Series(0, index=kpi.index)).fillna(0) - kpi['Productivity']

if 'Inventory' not in kpi.columns:
    kpi = kpi.sort_values(['Department','Month'])
    kpi['Inventory'] = kpi.groupby('Department')['Actual Volume vs Productivity'].cumsum()


## 09 - Language split (fixed shares)

In [9]:

lang_df = language_split(kpi, fc_col='Forecast_Hybrid')
lang_pivot = lang_df.pivot_table(index=['Department','Month'], columns='Language', values='Forecast_Hybrid_Lang', aggfunc='sum').reset_index()


## 10 - Export results

In [12]:
# === Output folder ===
OUTPUT_FOLDER = r"C:\Users\pt3canro\Desktop\CAPACITY\outputs"

# Ensure the folder exists (optional but recommended)
import os
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Build output full paths
out_xlsx = os.path.join(OUTPUT_FOLDER, OUT_XLSX)
out_mape = os.path.join(OUTPUT_FOLDER, OUT_MAPE)

with pd.ExcelWriter(out_xlsx, engine=PD_WRITE_XLSX_ENGINE) as writer:
    kpi.to_excel(writer, sheet_name='kpi_final', index=False)
    mape_table.to_excel(writer, sheet_name='mape_table', index=False)
    lang_df.to_excel(writer, sheet_name='forecast_by_language', index=False)
    lang_pivot.to_excel(writer, sheet_name='language_pivot', index=False)

# Separate MAPE file
mape_table.to_excel(out_mape, index=False)

print("Exported:")
print(" -", out_xlsx)
print(" -", out_mape)

Exported:
 - C:\Users\pt3canro\Desktop\CAPACITY\outputs\capacity_forecast_hybrid.xlsx
 - C:\Users\pt3canro\Desktop\CAPACITY\outputs\mape_by_department.xlsx


## 11 - Summary

In [11]:

print('Rows in KPI:', len(kpi))
print('Departments:', kpi['Department'].nunique())
print('Actuals column present?', 'Actual Volume' in kpi.columns)


Rows in KPI: 997
Departments: 68
Actuals column present? True
