<a href="https://colab.research.google.com/github/brianpenrod/CORPORATE-AUTOMATED-PREDICTION-ENGINE-C.A.P.E.-/blob/main/CORPORATE_AUTOMATED_PREDICTION_ENGINE_(C_A_P_E_)_v2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -----------------------------------------------------------------------------
# PROJECT: CORPORATE AUTOMATED PREDICTION ENGINE (C.A.P.E.) v2.0
# PATCH: STATIONARITY IMPLEMENTATION (Fixing "Tree Extrapolation" Issue)
# -----------------------------------------------------------------------------

!pip install -q pandas polars lightgbm xgboost numpy scikit-learn

import pandas as pd
import polars as pl
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

# --- CONSTANTS ---
SEED = 42

print("\n--- SYSTEM START: C.A.P.E. v2.0 (STATIONARY MODE) ---")

# [1] INGEST DATA (Unchanged)
dates = pd.date_range(start='2020-01-01', end='2025-12-31', freq='ME') # Fixed Deprecation Warning
n = len(dates)

df = pd.DataFrame({'Date': dates})
df['Headcount'] = np.linspace(500, 2500, n) + np.random.normal(0, 50, n)
df['Marketing_Spend'] = np.linspace(50, 200, n) * 1000 + np.random.normal(0, 5000, n)
df['Seasonality'] = np.sin(np.linspace(0, 3.14 * 8, n)) * 50000

# Revenue with Stronger Trend
df['Revenue_Actual'] = (
    (df['Headcount'] * 2000) +
    (df['Marketing_Spend'] * 1.5) +
    df['Seasonality'] +
    np.random.normal(0, 50000, n) # Reduced noise slightly for clarity
)

# [2] FEATURE ENGINEERING: THE FIX (DIFFERENCING)
# We convert RAW numbers to % CHANGE (Returns). This makes data STATIONARY.
print("[2] APPLYING STATIONARITY TRANSFORM (The 'Lopez de Prado' Method)...")

# Calculate % Change for Target
df['Target_Pct_Change'] = df['Revenue_Actual'].pct_change()

# Calculate % Change for Features
df['Headcount_Pct'] = df['Headcount'].pct_change()
df['Marketing_Pct'] = df['Marketing_Spend'].pct_change()

# Lag Features based on % Change
df['Lag_1M_Pct'] = df['Target_Pct_Change'].shift(1)
df['Rolling_Avg_3M_Pct'] = df['Target_Pct_Change'].rolling(window=3).mean()

# Drop NaN values created by shifting
df_clean = df.dropna().reset_index(drop=True)

# [3] TRAINING
train_size = int(len(df_clean) * 0.8)
train = df_clean.iloc[:train_size]
test = df_clean.iloc[train_size:].copy() # Use .copy() to avoid SettingWithCopyWarning

features = ['Headcount_Pct', 'Marketing_Pct', 'Lag_1M_Pct', 'Rolling_Avg_3M_Pct']
target = 'Target_Pct_Change'

print(f"[3] TRAINING ON {len(train)} SAMPLES...")
model_lgb = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.03, verbose=-1, random_state=SEED)
model_lgb.fit(train[features], train[target])

model_xgb = xgb.XGBRegressor(n_estimators=500, learning_rate=0.03, random_state=SEED)
model_xgb.fit(train[features], train[target])

# [4] PREDICTION & RECONSTRUCTION
# We predict the % Change, then "Reconstruct" the Dollar Amount
pred_pct_lgb = model_lgb.predict(test[features])
pred_pct_xgb = model_xgb.predict(test[features])
pred_pct_ensemble = (pred_pct_lgb * 0.5) + (pred_pct_xgb * 0.5)

# RECONSTRUCTION LOGIC: Previous Month Revenue * (1 + Predicted Change)
# We need the 'Revenue_Actual' from the PREVIOUS row to calculate current row
# For the first test row, we use the last train row.
last_revenue = train['Revenue_Actual'].iloc[-1]
reconstructed_forecast = []

# Iterative reconstruction
current_val = last_revenue
for pct in pred_pct_ensemble:
    next_val = current_val * (1 + pct)
    reconstructed_forecast.append(next_val)
    current_val = next_val # Update for next step if creating a pure forecast path

# Assign to dataframe
test['Forecast_Revenue'] = reconstructed_forecast

# [5] SCORING
mae = mean_absolute_error(test['Revenue_Actual'], test['Forecast_Revenue'])
variance_pct = (mae / test['Revenue_Actual'].mean()) * 100

print("-" * 60)
print(f"    >> MODEL ACCURACY REPORT (STATIONARY):")
print(f"    >> Mean Absolute Error: ${mae:,.2f}")
print(f"    >> Forecast Variance:   {variance_pct:.2f}%")

if variance_pct < 5.0:
    print("    >> STATUS: GREEN (High Confidence). Ready for Board Deck.")
else:
    print("    >> STATUS: YELLOW/RED. Variance persists (Check Volatility).")
print("-" * 60)

# Executive Dashboard
test['Variance_Dollars'] = test['Forecast_Revenue'] - test['Revenue_Actual']
display_cols = ['Date', 'Revenue_Actual', 'Forecast_Revenue', 'Variance_Dollars']
print("\n[5] EXECUTIVE DASHBOARD (RECONSTRUCTED VALUES):")
print(test[display_cols].tail(3).to_string(index=False))


--- SYSTEM START: C.A.P.E. v2.0 (STATIONARY MODE) ---
[2] APPLYING STATIONARITY TRANSFORM (The 'Lopez de Prado' Method)...
[3] TRAINING ON 55 SAMPLES...
------------------------------------------------------------
    >> MODEL ACCURACY REPORT (STATIONARY):
    >> Mean Absolute Error: $331,674.95
    >> Forecast Variance:   6.79%
    >> STATUS: YELLOW/RED. Variance persists (Check Volatility).
------------------------------------------------------------

[5] EXECUTIVE DASHBOARD (RECONSTRUCTED VALUES):
      Date  Revenue_Actual  Forecast_Revenue  Variance_Dollars
2025-10-31    5.198942e+06      4.619417e+06    -579524.889578
2025-11-30    5.093625e+06      4.427077e+06    -666547.984352
2025-12-31    5.125114e+06      4.331072e+06    -794041.923259
