In [40]:
# --- 0. Imports ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# --- 1. Load and preprocess ---
df = pd.read_csv("../data/clean/calls_by_district.csv")
df['CallDateTime'] = pd.to_datetime(df['CallDateTime'])
df = df[(df['CallDateTime'] >= '2017-01-01') & (df['CallDateTime'] < '2025-01-01')]

def get_time_segment(hour):
    if 7 <= hour < 10: return '07_10'
    elif 10 <= hour < 13: return '10_13'
    elif 13 <= hour < 16: return '13_16'
    elif 16 <= hour < 19: return '16_19'
    elif 19 <= hour < 22: return '19_22'
    elif 22 <= hour or hour < 1: return '22_01'
    elif 1 <= hour < 4: return '01_04'
    else: return '04_07'

df['hour'] = df['CallDateTime'].dt.hour
df['segment'] = df['hour'].apply(get_time_segment)
df['date'] = df['CallDateTime'].dt.date

# --- 2. Aggregate ---
agg = df.groupby(['date', 'NAME', 'segment']).size().reset_index(name='call_volume')
agg['date'] = pd.to_datetime(agg['date'])

# --- 3. Calendar Features ---
agg['day_of_week'] = agg['date'].dt.dayofweek
agg['week_of_year'] = agg['date'].dt.isocalendar().week.astype(int)
agg['month'] = agg['date'].dt.month
agg['year'] = agg['date'].dt.year
agg['day_of_year'] = agg['date'].dt.dayofyear
agg['sin_doy'] = np.sin(2 * np.pi * agg['day_of_year'] / 365.25)
agg['cos_doy'] = np.cos(2 * np.pi * agg['day_of_year'] / 365.25)

# --- 4. Lag and Rolling Features (Zero Leakage) ---
agg = agg.sort_values(['NAME', 'segment', 'date'])
group = agg.groupby(['NAME', 'segment'])

for lag in [1, 2, 3, 7, 14]:
    agg[f'lag_{lag}'] = group['call_volume'].shift(lag)

agg['rolling_3d'] = group['call_volume'].shift(1).rolling(3).mean().reset_index(level=0, drop=True)
agg['rolling_7d'] = group['call_volume'].shift(1).rolling(7).mean().reset_index(level=0, drop=True)
agg['rolling_30d'] = group['call_volume'].shift(1).rolling(30).mean().reset_index(level=0, drop=True)
agg['trend_3d'] = agg['rolling_3d'] - agg['rolling_7d']

# --- 5. Holiday and weekend flags ---
calendar = USFederalHolidayCalendar()
holidays = calendar.holidays(start=agg['date'].min(), end=agg['date'].max())
agg['is_holiday'] = agg['date'].isin(holidays).astype(int)
agg['is_weekend'] = agg['day_of_week'].isin([5, 6]).astype(int)

# --- 6. Drop missing values ---
agg.dropna(inplace=True)

# --- 7. Encode categorical features ---
agg['district_id'] = LabelEncoder().fit_transform(agg['NAME'])
agg['segment_id'] = LabelEncoder().fit_transform(agg['segment'])

# --- 8. Feature list ---
features = [
    'district_id', 'segment_id',
    'day_of_week', 'week_of_year', 'month', 'year',
    'lag_1', 'lag_2', 'lag_3', 'lag_7', 'lag_14',
    'rolling_3d', 'rolling_7d', 'rolling_30d', 'trend_3d',
    'is_holiday', 'is_weekend', 'sin_doy', 'cos_doy'
]

# --- 9. Train/Test split ---
train = agg[agg['date'] < '2023-01-01']
test = agg[agg['date'] >= '2023-01-01']

X_train_num = train[features[2:]]
X_test_num = test[features[2:]]
y_train = train['call_volume']
y_test = test['call_volume']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_num)
X_test_scaled = scaler.transform(X_test_num)

X_train_dist = train['district_id'].values
X_train_seg = train['segment_id'].values
X_test_dist = test['district_id'].values
X_test_seg = test['segment_id'].values

y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# --- 10. Build and train Keras model ---
input_num = Input(shape=(X_train_scaled.shape[1],))
input_dist = Input(shape=(1,), dtype='int32')
input_seg = Input(shape=(1,), dtype='int32')

embed_dist = Embedding(input_dim=agg['district_id'].nunique(), output_dim=4)(input_dist)
embed_seg = Embedding(input_dim=agg['segment_id'].nunique(), output_dim=3)(input_seg)

flat_dist = Flatten()(embed_dist)
flat_seg = Flatten()(embed_seg)

x = Concatenate()([input_num, flat_dist, flat_seg])
x = Dense(128, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(64, activation='relu')(x)
output = Dense(1)(x)

model = Model(inputs=[input_num, input_dist, input_seg], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003), loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

model.fit(
    [X_train_scaled, X_train_dist, X_train_seg],
    y_train_log,
    validation_split=0.1,
    epochs=300,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# --- 11. Evaluate ---
preds_log = model.predict([X_test_scaled, X_test_dist, X_test_seg]).flatten()
preds = np.expm1(preds_log).round().astype(int)
y_test_actual = y_test.round().astype(int)

rmse = np.sqrt(mean_squared_error(y_test_actual, preds))
print(f"3-Hour Segment MLP RMSE (actual calls): {rmse:.2f}")

# --- 12. Output results ---
results_df = test[['date', 'NAME', 'segment']].copy()
results_df['predicted'] = preds
results_df['actual'] = y_test_actual.values
results_df_sorted = results_df.sort_values(['date', 'segment', 'NAME']).reset_index(drop=True)
print(results_df_sorted.head(20))

Epoch 1/300
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 799us/step - loss: 0.0886 - root_mean_squared_error: 0.2959 - val_loss: 0.0472 - val_root_mean_squared_error: 0.2172
Epoch 2/300
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 662us/step - loss: 0.0575 - root_mean_squared_error: 0.2398 - val_loss: 0.0440 - val_root_mean_squared_error: 0.2098
Epoch 3/300
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 674us/step - loss: 0.0537 - root_mean_squared_error: 0.2317 - val_loss: 0.0432 - val_root_mean_squared_error: 0.2079
Epoch 4/300
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 791us/step - loss: 0.0515 - root_mean_squared_error: 0.2268 - val_loss: 0.0421 - val_root_mean_squared_error: 0.2053
Epoch 5/300
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 670us/step - loss: 0.0504 - root_mean_squared_error: 0.2244 - val_loss: 0.0420 - val_root_mean_squared_error: 0.2049
Epoch 6/300
[1m620/620[