In [1]:
import pandas as pd
import numpy as np

In [2]:
# 1. Load the raw surveillance data and the per-season peak targets
#    (the "answers" produced in Phase 1).
df = pd.read_csv('influenza_data.csv')
targets = pd.read_csv('target_peaks_wili.csv')

# 2. Restrict to national rows and put them in chronological order.
#    The lag/rolling features built later are positional, so the rows
#    must be sorted by (year, week) before those features are computed.
df_nat = (
    df.loc[df['region'].eq('nat')]
    .copy()
    .sort_values(['year', 'week'])
)

# --- DEFINING THE FEATURE FUNCTION ---
# We wrap this in a function so you can reuse it on the State data later
def create_features(df_in, group_col=None):
    """Return a copy of ``df_in`` with time-series features added.

    The input must contain a ``week`` column and a ``wili`` column and must
    already be sorted chronologically (within each group when ``group_col``
    is given): every lag and rolling feature is positional.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Source rows. It is not modified; a copy is returned.
    group_col : str, optional
        Name of a column identifying independent series (e.g. ``'region'``
        for multi-region data). When given, lags and rolling means are
        computed within each group so values never leak from one series
        into the next. When ``None`` (default) the whole frame is treated
        as a single series.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_in`` with these extra columns: ``week_sin``,
        ``week_cos``, ``wili_lag1``, ``wili_lag2``, ``wili_lag4``,
        ``delta_1w``, ``delta_4w``, ``accel_1w``, ``rolling_mean_3w``.
        Leading rows of each series hold NaN where no history exists.
    """
    # Work on a copy so the caller's frame is untouched (and to avoid
    # SettingWithCopy warnings when the caller passed a slice).
    df_feat = df_in.copy()

    def _lag(column, periods):
        # Shift `column` back by `periods` rows, per group if requested.
        series = df_feat[column]
        if group_col is None:
            return series.shift(periods)
        return series.groupby(df_feat[group_col], sort=False).shift(periods)

    # A. BASIC TEMPORAL FEATURES
    # Encode week-of-year on a circle (period 53) so the last week of one
    # year sits next to the first week of the following year.
    angle = 2 * np.pi * df_feat['week'] / 53
    df_feat['week_sin'] = np.sin(angle)
    df_feat['week_cos'] = np.cos(angle)

    # B. LAG FEATURES (what happened recently?)
    df_feat['wili_lag1'] = _lag('wili', 1)
    df_feat['wili_lag2'] = _lag('wili', 2)
    df_feat['wili_lag4'] = _lag('wili', 4)  # four rows back (~one month)

    # C. TREND FEATURES (first differences)
    df_feat['delta_1w'] = df_feat['wili'] - df_feat['wili_lag1']
    df_feat['delta_4w'] = df_feat['wili'] - df_feat['wili_lag4']

    # D. ACCELERATION (second difference): current 1-week change minus the
    # previous 1-week change -- is the curve bending up or flattening?
    df_feat['accel_1w'] = df_feat['delta_1w'] - _lag('delta_1w', 1)

    # E. ROLLING AVERAGE (noise reduction): mean of the current row and the
    # two rows before it.
    if group_col is None:
        df_feat['rolling_mean_3w'] = df_feat['wili'].rolling(window=3).mean()
    else:
        df_feat['rolling_mean_3w'] = (
            df_feat['wili']
            .groupby(df_feat[group_col], sort=False)
            .transform(lambda s: s.rolling(window=3).mean())
        )

    return df_feat

# 3. Apply the function
print("Generating features...")
df_training = create_features(df_nat)

# 4. Merge with Targets (The "Labels")
# Attach the season's true peak to every week of that season.
# Example: Every row in 2018 gets the 2018 Peak value attached.
df_final = pd.merge(
    df_training,
    targets[['season', 'region', 'peak_value', 'peak_week']],
    on=['season', 'region'],
    how='inner'  # Only keep rows where we have a known answer (drops 2025)
)

# 5. Select Final Columns for Machine Learning
# X = Features, Y = peak_value
cols_to_keep = [
    'season', 'epiweek', 'region',           # Metadata
    'wili', 'wili_lag1', 'wili_lag2',        # Recent Values
    'delta_1w', 'delta_4w', 'accel_1w',      # Trends
    'week_sin', 'week_cos',                  # Seasonality
    'peak_value'                             # TARGET (Y)
]

# 6. Clean up (Drop NaNs created by lagging)
# The first rows of the series have no history, so their lag/delta columns
# are NaN (delta_4w needs four prior rows). Restrict the check to the columns
# we actually export: a bare dropna() would also discard otherwise-valid rows
# just because some unrelated raw column happens to be missing.
df_final = df_final.dropna(subset=cols_to_keep)

df_ml = df_final[cols_to_keep]

print("Training Data Ready!")
print(f"Rows: {len(df_ml)}")
print(df_ml.head())

# Save it
df_ml.to_csv('training_data_nat.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'target_peaks_wili.csv'