In [1]:
import pandas as pd
import numpy as np

In [3]:
import pandas as pd

# Build the training-target table: the peak (maximum) weekly wILI value seen in
# each flu season, for every two-character region code plus the national
# ('nat') series, and write it to 'target_peaks_wili.csv'.

# Tunable constants (previously inline magic numbers).
SEASON_START_WEEK = 30    # weeks >= this count toward the season named after their own year
UNFINISHED_SEASON = 2025  # season still in progress; its true peak is not known yet


def season_peaks(frame):
    """Return the maximum wILI per (season, region) found in ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'season', 'region' and 'wili'.

    Returns
    -------
    pandas.DataFrame
        One row per (season, region) pair with columns 'season', 'region'
        and 'peak_value' (the group's maximum 'wili').
    """
    return (
        frame.groupby(['season', 'region'])['wili']
        .max()
        .reset_index()
        .rename(columns={'wili': 'peak_value'})
    )


# 1. Load the existing master data
df = pd.read_csv('influenza_data.csv')

# 2. Define "seasons"
# 'epiweek' is decoded as year * 100 + week. A week at or after
# SEASON_START_WEEK belongs to that year's season; an earlier week belongs to
# the season that started in the previous year.
df['year'] = df['epiweek'] // 100
df['week'] = df['epiweek'] % 100
# Vectorized replacement for a row-wise apply(): keep 'year' where the week
# qualifies, otherwise fall back to the previous year.
df['season'] = df['year'].where(df['week'] >= SEASON_START_WEEK, df['year'] - 1)

# 3. Filter for states only (exclude National/HHS regions for now)
# Assumes state codes are exactly 2 characters (e.g. 'ny', 'ca'); any other
# 2-character region code present in the file is kept as well.
df_states = df[df['region'].str.len() == 2].copy()

# 4. Peak wILI for each season + state
peaks = season_peaks(df_states)

# 5. Same calculation for the national series ('nat'), used to train the
#    national model
df_nat = df[df['region'] == 'nat'].copy()
peaks_nat = season_peaks(df_nat)

# 6. Combine them (state rows first, then national rows)
all_peaks = pd.concat([peaks, peaks_nat])

# 7. Cleanup
# Drop the unfinished season and anything after it: the true peak is unknown.
all_peaks = all_peaks[all_peaks['season'] < UNFINISHED_SEASON]

# 8. Save first, so the success message below is only printed once the file
#    has actually been written.
all_peaks.to_csv('target_peaks_wili.csv', index=False)

print("Target file created successfully!")
print(all_peaks.head())

Target file created successfully!
   season region  peak_value
0    2010     ak     4.88010
1    2010     al    14.73760
2    2010     ar     8.18854
3    2010     az     2.77627
4    2010     ca     5.17402
