In [8]:
import pandas as pd
import numpy as np

In [28]:
# Read the raw surveillance table; the steps below use its "epiweek",
# "region" and "wili" columns.
df = pd.read_csv("data/influenza_data.csv")

# epiweek is an integer of the form YYYYWW: the leading digits are the
# calendar year, the last two digits are the week number.
df["year"] = df["epiweek"].floordiv(100)
df["week"] = df["epiweek"].mod(100)

# A season is labelled by the calendar year it starts in: weeks before 30
# are assigned to the season that began the previous year.
df["season"] = df["year"] - df["week"].lt(30).astype(int)

# Split by region code: two-character codes (presumably state
# abbreviations -- confirm against the data dictionary) vs. the national
# aggregate "nat".
df_states = df.loc[df["region"].str.len().eq(2)].copy()
df_nat = df.loc[df["region"].eq("nat")].copy()

# compute peak target
def _season_peaks(frame):
    """Return the maximum wili observed in each (season, region) group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Weekly rows with at least "season", "region" and "wili" columns.

    Returns
    -------
    pandas.DataFrame
        One row per (season, region) with columns "season", "region" and
        "peak_value", where peak_value is the group-wise max of "wili".
    """
    return (
        frame.groupby(["season", "region"])["wili"]
        .max()
        .reset_index()
        .rename(columns={"wili": "peak_value"})
    )


# Same computation for both geographies; kept as separate frames so that
# either can still be inspected on its own.
peaks_states = _season_peaks(df_states)
peaks_nat = _season_peaks(df_nat)

# Stack both sets of targets and keep only seasons before 2025
# (presumably because later seasons are incomplete -- TODO confirm).
# The feature rows are filtered with the same cutoff further down.
all_peaks = (
    pd.concat([peaks_states, peaks_nat], ignore_index=True)
    .loc[lambda peaks: peaks["season"] < 2025]
    .copy()
)

all_peaks.to_csv("data/target_peaks_wili.csv", index=False)

# Build the frame used for feature engineering in one pass:
#   1. stack state and national rows,
#   2. keep only seasons before 2025 (the same cutoff used for the targets),
#   3. order rows by region, then season, then epiweek so that the
#      cumulative and lag features computed later see weeks in time order.
df_use = (
    pd.concat([df_states, df_nat], ignore_index=True)
    .loc[lambda frame: frame["season"] < 2025]
    .sort_values(["region", "season", "epiweek"])
    .copy()
)


### Minimal set of features

In [29]:
# Minimal feature set, computed separately for every (season, region) curve.
# NOTE: cumsum() and shift() follow row order, so df_use must already be
# sorted by epiweek within each group. The shifts step back by rows, not by
# calendar weeks, so a missing week would not be detected here.
df_use["y_t"] = df_use["wili"]

# One grouped view of the signal, reused by every per-curve transform below.
by_curve = df_use.groupby(["season", "region"])["y_t"]

# season-to-date running total
df_use["cum_y"] = by_curve.cumsum()

# previous one and two observations within the same curve
df_use["y_t_minus_1"] = by_curve.shift(1)
df_use["y_t_minus_2"] = by_curve.shift(2)

# first difference (slope) and second difference (change in slope)
df_use["delta_1"] = df_use["y_t"].sub(df_use["y_t_minus_1"])
df_use["delta_2"] = df_use["delta_1"] - (
    df_use["y_t_minus_1"] - df_use["y_t_minus_2"]
)

# interactions capturing "high and rising": level x slope, cumulative x slope
df_use["y_t_x_delta_1"] = df_use["y_t"].mul(df_use["delta_1"])
df_use["cum_y_x_delta_1"] = df_use["cum_y"].mul(df_use["delta_1"])

In [30]:
# Attach each (season, region) peak to every weekly row.
#
# Any "peak_value" column left over from a previous run of this cell is
# dropped first. Without that, re-running the cell merges onto a frame that
# already has the column, pandas emits peak_value_x / peak_value_y instead,
# and the dropna below fails with a KeyError.
#
# validate="many_to_one" raises if all_peaks ever contains duplicate
# (season, region) keys, which would otherwise silently duplicate rows.
df_use = df_use.drop(columns=["peak_value"], errors="ignore").merge(
    all_peaks,
    on=["season", "region"],
    how="left",
    validate="many_to_one",
)

# Columns fed to the model. The first rows of every curve have no lag
# history, so their lag/difference features are NaN.
feature_cols = [
    "y_t", "cum_y", "y_t_minus_1", "y_t_minus_2",
    "delta_1", "delta_2",
    "y_t_x_delta_1", "cum_y_x_delta_1"
]

# Model dataframe: drop rows with any missing feature or missing target.
model_df = df_use.dropna(subset=feature_cols + ["peak_value"]).copy()

# Keep only identifiers, features and the target.
keep_cols = ["season", "region", "epiweek", "week"] + feature_cols + ["peak_value"]
model_df = model_df[keep_cols]

model_df.to_csv("data/model_df_wili.csv", index=False)

print(model_df.head())

   season region  epiweek  week       y_t     cum_y  y_t_minus_1  y_t_minus_2  \
2    2010     ak   201042    42  0.586042  2.589458     1.128270     0.875146   
3    2010     ak   201043    43  0.967742  3.557200     0.586042     1.128270   
4    2010     ak   201044    44  0.683851  4.241051     0.967742     0.586042   
5    2010     ak   201045    45  0.951904  5.192955     0.683851     0.967742   
6    2010     ak   201046    46  0.962567  6.155522     0.951904     0.683851   

    delta_1   delta_2  y_t_x_delta_1  cum_y_x_delta_1  peak_value  
2 -0.542228 -0.795352      -0.317768        -1.404077      4.8801  
3  0.381700  0.923928       0.369387         1.357783      4.8801  
4 -0.283891 -0.665591      -0.194139        -1.203996      4.8801  
5  0.268053  0.551944       0.255161         1.391987      4.8801  
6  0.010663 -0.257390       0.010264         0.065636      4.8801  


In [31]:
# Sanity check 1: the modelling table exposes exactly the expected columns.
print(model_df.columns)

# Sanity check 2: count the distinct peak_value entries per (season, region)
# group. If the target merge was consistent, every group has exactly one,
# so the tally below shows a single row for the value 1.
peaks_per_group = model_df.groupby(["season", "region"])["peak_value"].nunique()
print(peaks_per_group.value_counts().head())

Index(['season', 'region', 'epiweek', 'week', 'y_t', 'cum_y', 'y_t_minus_1',
       'y_t_minus_2', 'delta_1', 'delta_2', 'y_t_x_delta_1', 'cum_y_x_delta_1',
       'peak_value'],
      dtype='object')
peak_value
1    763
Name: count, dtype: int64
