In [1]:
# --- 1. Load Data ---
df = pd.read_csv("../data/clean/calls_by_district.csv")
df['CallDateTime'] = pd.to_datetime(df['CallDateTime'])
df = df[(df['CallDateTime'] >= '2017-01-01') & (df['CallDateTime'] < '2025-01-01')]
df['date'] = df['CallDateTime'].dt.date

# --- 2. Aggregate per day per district ---
daily = df.groupby(['date', 'NAME']).size().reset_index(name='call_volume')
daily['date'] = pd.to_datetime(daily['date'])

# --- 3. Calendar Features ---
daily['day_of_week'] = daily['date'].dt.dayofweek
daily['week_of_year'] = daily['date'].dt.isocalendar().week.astype(int)
daily['month'] = daily['date'].dt.month
daily['year'] = daily['date'].dt.year
daily['day_of_year'] = daily['date'].dt.dayofyear
daily['sin_doy'] = np.sin(2 * np.pi * daily['day_of_year'] / 365.25)
daily['cos_doy'] = np.cos(2 * np.pi * daily['day_of_year'] / 365.25)

# --- 4. Lag Features (per district) ---
daily = daily.sort_values(['NAME', 'date'])
for lag in [1, 2, 3, 7, 14]:
    daily[f'lag_{lag}'] = daily.groupby('NAME')['call_volume'].shift(lag)

daily['rolling_3d'] = daily.groupby('NAME')['call_volume'].rolling(3).mean().reset_index(level=0, drop=True)
daily['rolling_7d'] = daily.groupby('NAME')['call_volume'].rolling(7).mean().reset_index(level=0, drop=True)
daily['rolling_30d'] = daily.groupby('NAME')['call_volume'].rolling(30).mean().reset_index(level=0, drop=True)
daily['trend_3d'] = daily['rolling_3d'] - daily['rolling_7d']

# --- 5. Holiday/Weekend Flags ---
calendar = USFederalHolidayCalendar()
holidays = calendar.holidays(start=daily['date'].min(), end=daily['date'].max())
daily['is_holiday'] = daily['date'].isin(holidays).astype(int)
daily['is_weekend'] = daily['day_of_week'].isin([5, 6]).astype(int)

# --- 6. Drop missing ---
daily.dropna(inplace=True)

# --- 7. Encode District (NAME) ---
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
daily['district_id'] = le.fit_transform(daily['NAME'])

# --- 8. Features ---
features = [
    'district_id',  # for embedding
    'day_of_week', 'week_of_year', 'month', 'year',
    'lag_1', 'lag_2', 'lag_3', 'lag_7', 'lag_14',
    'rolling_3d', 'rolling_7d', 'rolling_30d',
    'trend_3d', 'is_holiday', 'is_weekend',
    'sin_doy', 'cos_doy'
]

# Split
train = daily[daily['date'] < '2023-01-01']
test = daily[daily['date'] >= '2023-01-01']

X_train_num = train[features[1:]]
X_test_num = test[features[1:]]
y_train = train['call_volume']
y_test = test['call_volume']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_num)
X_test_scaled = scaler.transform(X_test_num)

# Keep district IDs separate for embeddings
X_train_dist = train['district_id'].values
X_test_dist = test['district_id'].values

# Log transform target
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# --- 9. Keras Model with Embedding for district ---
from tensorflow.keras.layers import Input, Embedding, Concatenate, Flatten
from tensorflow.keras.models import Model

n_districts = daily['district_id'].nunique()
input_num = Input(shape=(X_train_scaled.shape[1],))
input_cat = Input(shape=(1,), dtype='int32')

embed = Embedding(input_dim=n_districts, output_dim=4)(input_cat)
embed_flat = Flatten()(embed)

x = Concatenate()([input_num, embed_flat])
x = Dense(128, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(64, activation='relu')(x)
output = Dense(1)(x)

model = Model(inputs=[input_num, input_cat], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003), loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

history = model.fit(
    [X_train_scaled, X_train_dist],
    y_train_log,
    validation_split=0.1,
    epochs=300,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# --- 10. Predictions ---
preds_log = model.predict([X_test_scaled, X_test_dist]).flatten()
preds = np.expm1(preds_log).round().astype(int)
y_test_actual = y_test.round().astype(int)

rmse = np.sqrt(mean_squared_error(y_test_actual, preds))
print(f"District MLP RMSE (actual calls): {rmse:.2f}")

# --- 11. Plot: Show one district at a time (example: SHAWNEE) ---
plot_df = test.copy()
plot_df['preds'] = preds
plot_df['actual'] = y_test_actual

district = "SHAWNEE"
sub = plot_df[plot_df['NAME'] == district]

plt.figure(figsize=(15, 5))
plt.plot(sub['date'], sub['actual'], label='Actual')
plt.plot(sub['date'], sub['preds'], label='Predicted', alpha=0.7)
plt.title(f'Daily EMS Calls – {district}')
plt.legend()
plt.tight_layout()
plt.show()


NameError: name 'USFederalHolidayCalendar' is not defined