# ML Analyses (SARIMA)

## 1st SARIMA Attempt_Jun

In [None]:
# --- PARAMETERS ---
ISR_YEAR = 2006  # Year that separates the "pre" and "post" periods; change it here to move the split.
forecast_start_date = "2024-11-01"
pollutants = ['PM10 Total 0-10um STP', 'PM2.5 - Local Conditions', 'Ozone', 'Nitrogen dioxide (NO2)']
counties = ['San Joaquin', 'Stanislaus', 'Merced', 'Fresno', 'Kings', 'Tulare', 'Kern']

# --- LOAD AND CLEAN ---
df = pd.read_csv("SJV_AQI_1980_2025.csv")

# Keep only the selected counties/pollutants and the 'Daily Mean' metric.
# .copy() detaches the result from the raw frame so the column assignments
# below act on an independent frame and do not raise SettingWithCopyWarning.
keep = (
    df['county'].isin(counties)
    & df['parameter'].isin(pollutants)
    & (df['metric_used'] == 'Daily Mean')
)
df = df.loc[keep].copy()

# Fix datetime and group
# Unparseable timestamps become NaT (errors='coerce') and are then dropped.
df['datetime'] = pd.to_datetime(df['first_max_datetime'], errors='coerce')
df = df.dropna(subset=['datetime']).copy()

# Derived time keys used by the per-scale aggregations:
#   date  -> calendar date (python date objects)
#   month -> timestamp of the first day of the observation's month
#   year  -> integer year
df['date'] = df['datetime'].dt.date
df['month'] = pd.to_datetime(df['datetime'].dt.to_period('M').astype(str))
df['year'] = df['datetime'].dt.year
df = df.rename(columns={'parameter': 'pollutant', 'arithmetic_mean': 'value'})


# --- FUNCTION: Forecast by time unit ---
def forecast_by_timescale(grouped, freq, periods, label, seasonal_period=12):
    """Fit one SARIMA model per (county, pollutant) series and forecast ahead.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Long-format frame with 'county', 'pollutant', 'date' and 'value'
        columns. 'date' is used as the time index of each series.
    freq : str
        pandas offset alias ('D', 'MS', 'YS', ...) the series is
        re-sampled to; gaps are forward-filled.
    periods : int
        Number of future steps to predict for every series.
    label : str
        Value written to the 'scale' column of the output.
    seasonal_period : int, default 12
        Length of the seasonal cycle in steps of ``freq``. A value of 0,
        1 or None disables the seasonal component.

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'predicted_value', 'county', 'pollutant', 'scale'.
        Empty (but with those columns) when no series could be modelled.
    """
    import warnings  # local import so the function does not rely on file-level imports

    columns = ['date', 'predicted_value', 'county', 'pollutant', 'scale']
    if seasonal_period and seasonal_period > 1:
        seasonal_order = (1, 1, 1, seasonal_period)
    else:
        seasonal_order = (0, 0, 0, 0)
    step = pd.tseries.frequencies.to_offset(freq)

    results = []
    for (county, pollutant), group in grouped.groupby(['county', 'pollutant']):
        # Regular grid at `freq`; holes are filled with the last known value.
        ts = group.set_index('date')['value'].asfreq(freq).ffill()
        if len(ts.dropna()) < 36:
            # Too few observations to fit the model.
            continue
        try:
            model = SARIMAX(
                ts,
                order=(1, 1, 1),
                seasonal_order=seasonal_order,
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
            fit = model.fit(disp=False)
            forecast = fit.forecast(steps=periods)
        except Exception as exc:
            # Best effort: one failing series must not abort the whole run,
            # but the failure is reported instead of being silently dropped.
            warnings.warn(
                f"SARIMA forecast skipped for {county} / {pollutant} ({label}): {exc}"
            )
            continue

        forecast_index = pd.date_range(ts.index[-1] + step, periods=periods, freq=freq)
        results.append(pd.DataFrame({
            'date': forecast_index,
            'predicted_value': forecast.values,
            'county': county,
            'pollutant': pollutant,
            'scale': label,
        }))

    if not results:
        # pd.concat([]) raises ValueError; return a typed empty frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(results, ignore_index=True)



# --- FORECAST EXECUTION ---
# Build one mean-value series per (county, pollutant) at each timescale.

# Daily: collapse every timestamp to midnight of its calendar day before
# grouping. Grouping on the raw timestamps would keep several rows per day
# whenever the time of day differs, and a later asfreq('D') would then drop
# the rows that do not line up with the first timestamp's time of day.
daily = (
    df.assign(date=df['datetime'].dt.normalize())
    .groupby(['county', 'pollutant', 'date'])['value']
    .mean()
    .reset_index()
)

# Monthly: 'month' already holds the first-of-month timestamp.
monthly = df.groupby(['county', 'pollutant', 'month'])['value'].mean().reset_index()
monthly = monthly.rename(columns={'month': 'date'})

# Yearly: represent each year by its January 1st timestamp.
yearly = df.groupby(['county', 'pollutant', 'year'])['value'].mean().reset_index()
yearly['date'] = pd.to_datetime(yearly['year'].astype(str) + "-01-01")

# Ten steps ahead at every scale.
daily_forecast = forecast_by_timescale(daily, 'D', 10, 'daily')
monthly_forecast = forecast_by_timescale(monthly, 'MS', 10, 'monthly')
yearly_forecast = forecast_by_timescale(yearly, 'YS', 10, 'yearly')

# ignore_index gives the combined frame unique row labels; the per-series
# frames all carry the same labels, which can break label-based operations
# on the combined frame.
forecast_df = pd.concat(
    [daily_forecast, monthly_forecast, yearly_forecast],
    ignore_index=True,
)
forecast_df.to_csv("SJV_AQI_Predictions_AllScales.csv", index=False)




# --- PLOT: Per pollutant per timescale ---
# One figure for every (timescale, pollutant) pair; each county is its own line.
for scale in ('daily', 'monthly', 'yearly'):
    rows_at_scale = forecast_df[forecast_df['scale'] == scale]
    pollutant_names = rows_at_scale['pollutant'].unique()
    for pollutant in pollutant_names:
        pollutant_rows = rows_at_scale[rows_at_scale['pollutant'] == pollutant]
        plt.figure(figsize=(12, 6))
        sns.lineplot(
            data=pollutant_rows,
            x='date',
            y='predicted_value',
            hue='county',
            marker='o',
        )
        plt.title(f"Forecast for {pollutant} ({scale.capitalize()})")
        plt.xlabel("Date")
        plt.ylabel("Predicted Value (µg/m³)")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# --- PLOT: Combined plot per scale, averaged across counties ---
# One figure per timescale; each pollutant's line is the mean over counties.
for scale in ('daily', 'monthly', 'yearly'):
    rows_at_scale = forecast_df[forecast_df['scale'] == scale]
    per_date_pollutant = rows_at_scale.groupby(['date', 'pollutant'])['predicted_value']
    county_mean = per_date_pollutant.mean().reset_index()

    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=county_mean,
        x='date',
        y='predicted_value',
        hue='pollutant',
        marker='o',
    )
    plt.title(f"Average Forecast per Pollutant ({scale.capitalize()})")
    plt.xlabel("Date")
    plt.ylabel("Average Predicted Value (µg/m³)")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 2nd SARIMA Attempt_Jun

In [None]:
# Read the prepared file and index it chronologically by 'date'.
merged_df = pd.read_csv("ready_pm25_fresno_with_Date.csv")
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df = merged_df.sort_values('date').set_index('date')

# Series to model: the 'aqi_smoothed' column with missing values removed.
series = merged_df['aqi_smoothed'].dropna()

# Chronological hold-out: the first 80% of observations train the model,
# the remaining 20% are kept for evaluation.
split = int(0.8 * len(series))
train = series.iloc[:split]
test = series.iloc[split:]

from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Requires `train` and `test` from the split above.
# SARIMA(1,1,1)x(1,1,1,7) fitted on the training portion only.
sarima_options = {
    'order': (1, 1, 1),
    'seasonal_order': (1, 1, 1, 7),
    'enforce_stationarity': False,
    'enforce_invertibility': False,
}
model = SARIMAX(train, **sarima_options)
results = model.fit(disp=False)

# Predict as many steps as there are held-out observations.
horizon = len(test)
forecast = results.forecast(steps=horizon)

# Error metrics of the forecast against the held-out values.
mae = mean_absolute_error(test, forecast)
rmse = np.sqrt(mean_squared_error(test, forecast))
r2 = r2_score(test, forecast)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Overlay training data, held-out data and the forecast on one axis.
plt.figure(figsize=(14, 6))
curves = (
    (train.index, train, {'label': 'Train'}),
    (test.index, test, {'label': 'Test', 'color': 'orange'}),
    (test.index, forecast, {'label': 'Forecast', 'color': 'green'}),
)
for x_values, y_values, line_style in curves:
    plt.plot(x_values, y_values, **line_style)
plt.title('SARIMA Forecast vs Actual AQI')
plt.xlabel('Date')
plt.ylabel('AQI (Smoothed)')
plt.legend()
plt.grid(True)
plt.show()
