# 04_exploratory_analysis
Exploratory Data Analysis (EDA) on the enriched dataset (`master_with_topics.csv`).

**What this notebook does**
1. Load `master_with_topics.csv`
2. Basic summary (shape, date range, missing values)
3. Time-series plots for rainfall, discharge, reservoir, load
4. Overlay/inspect text-derived features (keyword flags, LDA topics)
5. Correlations (including simple lagged correlation vs. load)
6. Seasonality views (monthly/weekday averages; optional decomposition)
7. Quick comparisons (flagged vs. non-flagged days)


### Cell 1 — Imports & config

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Default size applied to every figure that does not set its own.
plt.rcParams.update({'figure.figsize': (10, 4)})

# Input file this notebook expects; a relative path resolves against the
# current working directory.
DATA_PATH = "master_with_topics.csv"
print("Expecting:", Path(DATA_PATH).resolve())

### Cell 2 — Load dataset

In [None]:
# Read the CSV with `date` parsed as datetimes, then order rows chronologically
# and renumber the index from 0.
df = pd.read_csv(DATA_PATH, parse_dates=['date'])
df = df.sort_values('date').reset_index(drop=True)

# Quick sanity output: (rows, columns) and the first/last date covered.
print(df.shape)
print(df[['date']].agg(['min', 'max']))
df.head()

### Cell 3 — Schema & missing values

In [None]:
# Column count plus the first 40 column names.
print("Columns:", len(df.columns))
print(df.columns[:40].tolist())

# Missing values per column, worst offenders first.
na_counts = df.isna().sum()
na_counts = na_counts.sort_values(ascending=False)
print("Top 20 columns by missing values:")
na_counts.head(20)

### Cell 4 — Helper functions

In [None]:
def first_existing(cols, columns=None):
    """Return the first name in *cols* that is an available column.

    Parameters
    ----------
    cols : iterable of str
        Candidate column names, in priority order.
    columns : collection of str, optional
        Column names to search. When omitted, the columns of the
        notebook-level ``df`` are used (looked up at call time), which is
        the original behaviour.

    Returns
    -------
    str or None
        The first candidate found, or ``None`` when no candidate is present.
    """
    available = df.columns if columns is None else columns
    return next((c for c in cols if c in available), None)

# For each quantity, candidate column names are tried in priority order; a
# quantity whose candidates are all absent resolves to None.
COL_LOAD, COL_RAIN, COL_Q, COL_RES = (
    first_existing(candidates)
    for candidates in (
        ['load_MW', 'peak_load_mw', 'avg_load_mw'],
        ['rainfall_mm', 'rain_mm'],
        ['discharge_m3s', 'discharge_cms', 'Q_cms'],
        ['reservoir_m', 'gauge_m'],
    )
)

print(
    "Detected columns ->",
    "load:", COL_LOAD,
    "| rain:", COL_RAIN,
    "| discharge:", COL_Q,
    "| reservoir:", COL_RES,
)

### Cell 5 — Time-series: rainfall, discharge, reservoir, load

In [None]:
def plot_series(col, title):
    """Draw ``df[col]`` against ``df['date']`` as a line plot titled *title*.

    Does nothing when *col* is falsy (e.g. ``None``) or is not a column of
    the notebook-level ``df``.
    """
    if not col or col not in df.columns:
        return

    plt.figure()
    plt.plot(df['date'], df[col])
    plt.xlabel('date')
    plt.ylabel(col)
    plt.title(title)
    plt.tight_layout()
    plt.show()

# One line plot per detected quantity; plot_series skips undetected ones.
for _col, _title in (
    (COL_RAIN, 'Daily Rainfall'),
    (COL_Q, 'River Discharge'),
    (COL_RES, 'Reservoir / Gauge Level'),
    (COL_LOAD, 'Load (MW)'),
):
    plot_series(_col, _title)

### Cell 6 — Text-derived keyword flags overview

In [None]:
# Flag columns are identified purely by their `_flag` name suffix.
flag_cols = list(filter(lambda name: name.endswith('_flag'), df.columns))
print("Flag columns:", flag_cols)

# Column-wise sum of each flag, largest first; an empty Series when there
# are no flag columns.
if flag_cols:
    flag_summary = df[flag_cols].sum().sort_values(ascending=False)
else:
    flag_summary = pd.Series(dtype=int)
flag_summary

### Cell 7 — Load with flagged days overlay

In [None]:
if not COL_LOAD or not flag_cols:
    print("Skip overlay (no load or no flags).")
else:
    # A day counts as flagged when the sum over its flag columns is positive.
    any_flag = df[flag_cols].sum(axis=1).gt(0)
    flagged_rows = df.loc[any_flag]

    plt.figure(figsize=(12,4))
    plt.plot(df['date'], df[COL_LOAD], label=COL_LOAD)
    plt.scatter(
        flagged_rows['date'],
        flagged_rows[COL_LOAD],
        s=12,
        marker='o',
        label='flagged day',
    )
    plt.xlabel('date')
    plt.ylabel(COL_LOAD)
    plt.title('Load with flagged days')
    plt.legend()
    plt.tight_layout()
    plt.show()

### Cell 8 — Correlation matrix & heatmap

In [None]:
# Pairwise Pearson correlation across every numeric column of df.
num_df = df.select_dtypes(include=[np.number]).copy()
# Undefined correlations (NaN, e.g. for a column with no variance) are shown
# as 0 so the heatmap has no holes.
corr = num_df.corr().fillna(0.0)
print("Numeric columns:", num_df.columns.tolist()[:30])
print("corr shape:", corr.shape)

if corr.empty:
    print("Skip heatmap (no numeric columns).")
else:
    # Basic heatmap using matplotlib (no seaborn)
    plt.figure(figsize=(8,6))
    # Pin the colour scale to the full range of r and use a diverging map:
    # without vmin/vmax imshow stretches colours to the data's own min/max,
    # so the same colour would mean a different r from one run to the next
    # and 0 would not sit at the neutral midpoint.
    plt.imshow(corr, aspect='auto', cmap='coolwarm', vmin=-1.0, vmax=1.0)
    plt.colorbar(label='Pearson r')
    plt.title('Correlation heatmap')
    ticks = np.arange(len(corr.columns))
    plt.xticks(ticks=ticks, labels=corr.columns, rotation=90, fontsize=7)
    plt.yticks(ticks=ticks, labels=corr.columns, fontsize=7)
    plt.tight_layout()
    plt.show()

### Cell 9 — Lagged correlation vs. load (rain, discharge, reservoir)

In [None]:
if COL_LOAD:
    # Rows with a missing date or load cannot contribute to a correlation.
    base = df[['date', COL_LOAD]].copy().dropna()
    # Lags are row shifts on the date-sorted frame: 0..30 rows back.
    # NOTE(review): these equal 0..30 days only if `date` has exactly one row
    # per calendar day with no gaps; confirm for this dataset.
    lags = range(0, 31)
    results = []

    for col in [COL_RAIN, COL_Q, COL_RES]:
        if not col:
            continue
        tmp = df[['date', col]].copy()
        for L in lags:
            tmp[f'{col}_lag{L}'] = tmp[col].shift(L)
        merged = base.merge(tmp.drop(columns=[col]), on='date', how='left')
        for L in lags:
            r = merged[COL_LOAD].corr(merged[f'{col}_lag{L}'])
            results.append({'feature': col, 'lag': L, 'r': r})

    # Naming the columns explicitly keeps the sort below valid even when no
    # driver column was detected and `results` is empty.
    res = (
        pd.DataFrame(results, columns=['feature', 'lag', 'r'])
        .dropna()
        .sort_values(['feature', 'lag'])
        .reset_index(drop=True)
    )

    if res.empty:
        print("No lagged correlations computed (no rain/discharge/reservoir column, or all r undefined).")
    else:
        # Report, per feature, the lag with the strongest correlation in
        # absolute value, so a strong negative relationship is not hidden
        # and the lag itself is visible.
        strongest = res.loc[res['r'].abs().groupby(res['feature']).idxmax()]
        print(strongest.set_index('feature')[['lag', 'r']])
        # An expression inside an `if` body is not auto-displayed by the
        # notebook, so print the preview explicitly.
        print(res.head())
else:
    print("Skip lagged correlation (no load column detected).")

### Cell 10 — Seasonality (monthly & weekday averages)

In [None]:
# Calendar keys derived from `date`, stored as new columns on df.
df['dow'] = df['date'].dt.dayofweek  # Monday is 0
df['month'] = df['date'].dt.month

def plot_group_mean(col, by, title):
    """Plot the mean of ``df[col]`` for each value of the ``df[by]`` grouping.

    Does nothing when *col* is not a column of the notebook-level ``df``.
    """
    if col not in df.columns:
        return

    group_means = df.groupby(by)[col].mean()
    plt.figure()
    plt.plot(group_means.index, group_means.values, marker='o')
    plt.title(title)
    plt.xlabel(by)
    plt.ylabel(col)
    plt.tight_layout()
    plt.show()

# Monthly and weekday profiles for every quantity that was detected
# (undetected ones are None and are filtered out).
for col in filter(None, (COL_LOAD, COL_RAIN, COL_Q, COL_RES)):
    plot_group_mean(col, 'month', f'{col}: monthly mean')
    plot_group_mean(col, 'dow',   f'{col}: weekday mean')

### Cell 11 — Seasonal decomposition of load (optional)

In [None]:
# Optional cell: decompose the load series into trend / seasonal / residual
# with STL. statsmodels is an optional dependency, so a missing install is
# reported rather than raised.
try:
    from statsmodels.tsa.seasonal import STL
except ImportError as e:
    print("statsmodels not installed:", e)
else:
    if COL_LOAD:
        try:
            s = df.set_index('date')[COL_LOAD].astype(float)
            # Put the series on a daily grid BEFORE interpolating: asfreq
            # inserts NaN for every date absent from the file, and doing it
            # after interpolation would hand those fresh NaNs to STL.
            s = s.asfreq('D').interpolate(limit=7)
            n_missing = int(s.isna().sum())
            if n_missing:
                # interpolate(limit=7) fills at most 7 consecutive values,
                # so leading gaps and long gaps stay NaN.
                # NOTE(review): STL is not expected to handle missing values;
                # skip with a clear message rather than let it fail.
                print(f"Skip STL: {n_missing} daily values still missing after interpolation (limit=7).")
            else:
                # period=365 models a yearly cycle on daily data.
                stl = STL(s, period=365, robust=True).fit()
                fig = stl.plot()
                fig.set_size_inches(10,6)
                plt.tight_layout()
                plt.show()
        except Exception as e:
            # Best-effort cell: report the failure and let the notebook continue.
            print("STL decomposition failed:", e)
    else:
        print("No load column to decompose.")

### Cell 12 — Compare load on flagged vs. normal days

In [None]:
# Every `*_flag` column counts, matching the skip message below. The helper
# column `_any_flag` is excluded by name: it also ends in `_flag`, so on a
# re-run of this cell it would otherwise be treated as a flag itself.
_flag_cols = [c for c in df.columns if c.endswith('_flag') and c != '_any_flag']

if COL_LOAD and _flag_cols:
    # 1 when the sum over the day's flag columns is positive, else 0.
    df['_any_flag'] = (df[_flag_cols].sum(axis=1) > 0).astype(int)
    # Load statistics for normal (0) vs. flagged (1) days.
    grp = df.groupby('_any_flag')[COL_LOAD].agg(['count','mean','std','median'])
    print(grp)
else:
    print("Skip comparison (need load and at least one *_flag column).")

### Cell 13 — Notes / Findings (fill in)
- Summarize which features correlate with load (and at what lags).
- Highlight months/seasons with different behavior.
- Comment on how keyword flags and topics align with spikes/dips.
- Note any outliers or missing data patterns to handle before modeling.
