In [None]:
# =========================
# 0) Imports & folders
# =========================
import pandas as pd, numpy as np, warnings, re
from pathlib import Path
warnings.filterwarnings('ignore')

# Input CSVs are looked up under RAW; derived artefacts are written to PROC.
RAW = Path('.')            # adjust if files live elsewhere
PROC = Path('./processed')
PROC.mkdir(exist_ok=True)  # created on first run, reused afterwards

# Filenames you uploaded (adjust names if your casing differs)
f_rain = RAW.joinpath('AnnualRainfall.csv')
f_cpi = RAW.joinpath('India_CPIMonthly_Jan_2013_to_Jun_2025.csv')
f_gdp = RAW.joinpath('india_gdp_quarterly_2010_2025_imf_ifs.csv')
f_n50 = RAW.joinpath('Nifty50.csv')
f_mid = RAW.joinpath('NIFTYMidcap100.csv')
f_repo = RAW.joinpath('RepoRate.csv')       # OPTIONAL: only if you later add one

# =========================
# 1) Helpers
# =========================
def _first_col(df, candidates):
    """Return the first column of ``df`` that matches one of ``candidates``.

    Matching happens in two passes:

    1. Exact match, tried in the order the candidates are given.
    2. Case-insensitive substring match, scanning the columns in frame order
       and returning the first column whose label contains any candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    candidates : iterable of str
        Acceptable column names, most preferred first.

    Returns
    -------
    The matching column label exactly as it appears in ``df.columns``, or
    ``None`` when nothing matches.
    """
    candidates = list(candidates)
    for name in candidates:
        if name in df.columns:
            return name

    # Fuzzy pass. Labels are stringified first so that non-string labels
    # (e.g. an unnamed integer column) cannot raise AttributeError.
    lowered = [str(name).lower() for name in candidates]
    for col in df.columns:
        label = str(col).lower()
        if any(key in label for key in lowered):
            return col
    return None

def _ensure_datetime(df, col):
    """Make sure ``df[col]`` holds datetimes, converting it in place if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the column is overwritten on this same object.
    col : label
        Column expected to contain dates.

    Returns
    -------
    The same ``df`` object, for call chaining. Values that cannot be parsed
    become ``NaT`` (``errors='coerce'``).
    """
    # pandas' own dtype probe is used instead of np.issubdtype, which raises
    # TypeError for extension dtypes such as tz-aware datetimes or the pandas
    # string dtype.
    if not pd.api.types.is_datetime64_any_dtype(df[col]):
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

def daily_to_quarter_ret(df, date_col, price_col):
    """Collapse a daily price table into quarter-over-quarter returns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_col`` and ``price_col``. The caller's frame is
        not modified.
    date_col : str
        Column with the observation dates (datetimes or parseable strings).
    price_col : str
        Column with prices. Values may be numbers or strings that use
        thousands separators (``1,234.5``) or accounting parentheses
        (``(12.5)`` is read as ``-12.5``).

    Returns
    -------
    pandas.Series
        Named ``price_col + '_qret'`` and indexed by calendar quarter-end.
        Each value is the fractional change of the last available price in
        the quarter versus the previous quarter, so the first entry is NaN.
    """
    px = df[[date_col, price_col]].dropna().copy()

    # Parse dates directly (a no-op for datetime columns) and drop rows whose
    # date could not be parsed, so NaT never reaches the resampler.
    px[date_col] = pd.to_datetime(px[date_col], errors='coerce')
    px = px.dropna(subset=[date_col]).sort_values(date_col)

    px[price_col] = (
        px[price_col].astype(str)
        .str.replace(',', '', regex=False)   # thousands separator
        .str.replace('(', '-', regex=False)  # accounting negative: (x) -> -x
        .str.replace(')', '', regex=False)
        .str.strip()
        .replace({'': np.nan, 'None': np.nan})
        .astype(float)
    )

    # An explicit offset object is the same rule as the 'Q' string alias
    # (quarters ending in Mar/Jun/Sep/Dec); the alias is deprecated since
    # pandas 2.2.
    quarter_end = pd.offsets.QuarterEnd(startingMonth=12)
    q_price = px.set_index(date_col)[price_col].resample(quarter_end).last()
    return q_price.pct_change().rename(price_col + '_qret')

def monthly_to_quarter(series_monthly, how='mean'):
    """Aggregate a datetime-indexed monthly series to calendar quarters.

    Parameters
    ----------
    series_monthly : pandas.Series
        Series with a DatetimeIndex.
    how : str, default 'mean'
        ``'mean'`` averages the observations in each quarter; any other
        value takes the last observation of the quarter.

    Returns
    -------
    pandas.Series
        Indexed by quarter-end dates (Mar/Jun/Sep/Dec).
    """
    # Offset object instead of the 'Q' string alias, which is deprecated
    # since pandas 2.2; the resampling rule itself is unchanged.
    by_quarter = series_monthly.resample(pd.offsets.QuarterEnd(startingMonth=12))
    if how == 'mean':
        return by_quarter.mean()
    return by_quarter.last()

def yoy_from_monthly(series):
    """Year-over-year percentage change of a monthly series.

    Each value is compared with the one 12 rows earlier, so the first 12
    entries of the result are NaN. The result is expressed in percent.
    """
    fractional_change = series.pct_change(periods=12)
    return fractional_change * 100.0

# =========================
# 2) Index prices -> quarterly returns
# =========================
# NIFTY 50: find the date/price columns, then reduce daily closes to
# quarterly returns.
n50 = pd.read_csv(f_n50)
c_date = _first_col(n50, ['Date','date','Time'])
c_px   = _first_col(n50, ['Close','Adj Close','Price','Close Price','Value'])
if not (c_date is not None and c_px is not None):
    raise ValueError("Nifty50.csv columns not recognized. Expected Date & Close-like column.")
n50_qret = daily_to_quarter_ret(n50, c_date, c_px)
n50_qret = n50_qret.rename('nifty_qret')

# MIDCAP 100: same treatment as the NIFTY 50 file.
mid = pd.read_csv(f_mid)
c_date_m = _first_col(mid, ['Date','date','Time'])
c_px_m   = _first_col(mid, ['Close','Adj Close','Price','Close Price','Value'])
if not (c_date_m is not None and c_px_m is not None):
    raise ValueError("NIFTYMidcap100.csv columns not recognized. Expected Date & Close-like column.")
mid_qret = daily_to_quarter_ret(mid, c_date_m, c_px_m)
mid_qret = mid_qret.rename('midcap_qret')

# Excess return for quarter t: midcap return minus NIFTY 50 return,
# aligned on the quarter-end index.
excess_q = mid_qret.sub(n50_qret).rename('excess_ret')

# =========================
# 3) Rainfall anomaly (annual -> quarterly)
# =========================
rain = pd.read_csv(f_rain)
# try to find year & anomaly columns
c_year = _first_col(rain, ['Year','year','YYYY'])
c_anom = _first_col(rain, ['Anomaly','Anomaly_%','Anomaly %','Rainfall Anomaly'])
if c_year is None or c_anom is None:
    raise ValueError("AnnualRainfall.csv must contain Year & Anomaly columns.")
rain = rain[[c_year, c_anom]].dropna()
rain.columns = ['Year','rain_anom']
# Stamp each year's anomaly on Sept 30 of that year (end of Q3). Upsampling to
# quarter-ends with ffill then carries the value through the following Q4, Q1
# and Q2, i.e. until the next year's Sept 30 observation replaces it. Nothing
# is filled beyond the last year in the file.
rain_idx = pd.to_datetime(rain['Year'].astype(int).astype(str) + '-09-30')
rain_q = (
    # float coercion keeps this series consistent with the CPI/GDP sections
    pd.Series(rain['rain_anom'].astype(float).values, index=rain_idx)
    # offset object == the 'Q' alias (deprecated since pandas 2.2)
    .resample(pd.offsets.QuarterEnd(startingMonth=12))
    .ffill()
)
rain_q = rain_q.rename('rain_anom')

# =========================
# 4) CPI monthly -> CPI YoY -> quarterly
# =========================
# Load the monthly CPI index and locate its date and value columns.
cpi = pd.read_csv(f_cpi)
c_date = _first_col(cpi, ['Date','date','Month','month','Period'])
c_val  = _first_col(cpi, ['Index','CPI','Value','CPI Index','cpi'])
if not (c_date is not None and c_val is not None):
    raise ValueError("India_CPIMonthly*.csv must have Date & CPI value columns.")

# Chronological order is required because YoY compares each row with the
# one 12 rows earlier.
cpi = _ensure_datetime(cpi, c_date)
cpi = cpi.sort_values(c_date)

# Index level -> monthly YoY % -> quarterly average of the monthly YoY.
cpi_m = cpi.set_index(c_date)[c_val].astype(float)
cpi_m = cpi_m.rename('cpi_index')
cpi_yoy_m = yoy_from_monthly(cpi_m)
cpi_yoy_m = cpi_yoy_m.rename('cpi_yoy_m')
cpi_yoy_q = monthly_to_quarter(cpi_yoy_m, how='mean')
cpi_yoy_q = cpi_yoy_q.rename('cpi_yoy')

# =========================
# 5) GDP quarterly (YoY % preferred)
# =========================
gdp = pd.read_csv(f_gdp)
c_date = _first_col(gdp, ['Date','date','Quarter','quarter','Period'])
c_yoy  = _first_col(gdp, ['YoY','GDP_YoY','gdp_yoy','Growth','Growth YoY'])
c_lvl  = _first_col(gdp, ['GDP','Value','gdp_sa','gdp'])
if c_date is None:
    raise ValueError("GDP csv must have a Date/Quarter column.")
gdp = _ensure_datetime(gdp, c_date).sort_values(c_date)
# Rows whose date could not be parsed cannot be placed on the quarterly grid.
gdp = gdp[gdp[c_date].notna()]

if c_yoy is not None:
    # A ready-made YoY % column takes precedence over levels.
    gdp_q = gdp.set_index(c_date)[c_yoy].astype(float).rename('gdp_yoy')
elif c_lvl is not None:
    # compute yoy if levels provided (assumes quarterly level series, so
    # 4 rows back is the same quarter of the previous year)
    gdp_q = (gdp.set_index(c_date)[c_lvl].astype(float)
             .pct_change(4) * 100.0).rename('gdp_yoy')
else:
    raise ValueError("GDP file must have either a YoY% column or a level column to compute YoY.")

# The other series are indexed by quarter-END dates (they come out of
# resample). Quarterly files are often stamped with the quarter START
# (e.g. 2010-01-01 or '2010Q1'), which would never match on the join and
# would empty the merged frame. Rolling every date forward to its quarter-end
# fixes that and is a no-op for dates already on a quarter-end.
_to_quarter_end = pd.offsets.QuarterEnd(0).rollforward
gdp_q.index = pd.DatetimeIndex(gdp_q.index.map(_to_quarter_end), name=gdp_q.index.name)
# If several rows land in the same quarter, keep the latest one so the index
# stays unique for the concat.
gdp_q = gdp_q[~gdp_q.index.duplicated(keep='last')]

# =========================
# 6) OPTIONAL: Repo rate daily/point-in-time -> quarterly Δ
# =========================
if f_repo.exists():
    repo = pd.read_csv(f_repo)
    c_date = _first_col(repo, ['Date','date','Period'])
    c_rate = _first_col(repo, ['Repo','Rate','Repo Rate','Policy Rate'])
    # Fail with the same kind of message as the other loaders instead of an
    # opaque KeyError on a None column name.
    if c_date is None or c_rate is None:
        raise ValueError("RepoRate.csv must have Date & Repo rate columns.")
    repo = _ensure_datetime(repo, c_date).sort_values(c_date)
    repo_q = (
        repo.set_index(c_date)[c_rate].astype(float)
        # offset object == the 'Q' alias (deprecated since pandas 2.2)
        .resample(pd.offsets.QuarterEnd(startingMonth=12)).last()
        # A point-in-time file only has rows on the dates it records; a
        # quarter without a row keeps the previous rate rather than NaN.
        .ffill()
        .rename('repo')
    )
    repo_chg_q = (repo_q.diff()*100).rename('repo_chg_bps')  # if rate in %, convert Δ to bps
else:
    # Empty placeholder: tells downstream code that no repo data is available.
    repo_chg_q = pd.Series(dtype=float, name='repo_chg_bps')

# =========================
# 7) Align to common quarterly index & build features
# =========================
# Common quarterly index = intersection across main series
_series = [
    excess_q.rename('excess_ret'),
    mid_qret.rename('midcap_ret'),
    n50_qret.rename('nifty_ret'),
    rain_q, cpi_yoy_q, gdp_q,
]
# The repo series is optional. When no repo file was supplied it is an empty
# placeholder; concatenating it would add an all-NaN column and the
# dropna(how='any') below would then discard every row.
if repo_chg_q.notna().any():
    _series.append(repo_chg_q)

# Keep only quarters for which every included series has a value.
qdf = pd.concat(_series, axis=1).dropna(how='any')

# Features (lagged 1Q): row t holds the value observed in the previous row.
qdf['ret_prev_q']    = qdf['midcap_ret'].shift(1)
qdf['rain_anom_lag'] = qdf['rain_anom'].shift(1)
qdf['cpi_yoy_lag']   = qdf['cpi_yoy'].shift(1)
qdf['gdp_yoy_lag']   = qdf['gdp_yoy'].shift(1)
if 'repo_chg_bps' in qdf.columns:
    qdf['repo_chg_lag'] = qdf['repo_chg_bps'].shift(1)

# Target (excess return next quarter): row t holds the value of row t+1.
qdf['excess_next_q'] = qdf['excess_ret'].shift(-1)

# Policy-amplifier interaction (if repo present)
if 'repo_chg_lag' in qdf.columns:
    qdf['rain_repo_int'] = qdf['rain_anom_lag'] * qdf['repo_chg_lag']

# Clean NA rows created by shifting (first row for the lags, last row for
# the target).
qdf = qdf.dropna().copy()

# =========================
# 8) Sanity checks & save
# =========================
# Report the size and date span of the final sample, then preview the
# target and the main lagged features.
_first_q = qdf.index.min().date()
_last_q = qdf.index.max().date()
print(f"Quarterly rows: {len(qdf)} | Range: {_first_q} → {_last_q}")
_preview_cols = ['excess_ret','excess_next_q','ret_prev_q','rain_anom_lag','cpi_yoy_lag','gdp_yoy_lag']
print(qdf.filter(items=_preview_cols).head(8))

# Persist the feature table in both formats (parquet first, then CSV).
_parquet_path = PROC/'quarterly_features.parquet'
qdf.to_parquet(_parquet_path)
qdf.to_csv(PROC/'quarterly_features.csv')

# Small dictionary describing columns (handy for your slide)
data_dict = dict(
    excess_ret='Midcap100_qret - Nifty50_qret (quarter t)',
    excess_next_q='Excess return in quarter t+1 (prediction target)',
    ret_prev_q='Midcap100 return in t-1',
    rain_anom_lag='All-India monsoon rainfall anomaly (t-1, %)',
    cpi_yoy_lag='CPI YoY (t-1, %)',
    gdp_yoy_lag='Real GDP YoY (t-1, %)',
    repo_chg_lag='Repo change in bps (t-1) [if provided]',
    rain_repo_int='Interaction rain_anom_lag × repo_chg_lag [if provided]',
)
pd.Series(data_dict).to_csv(PROC/'data_dictionary.csv')
print(f"\nSaved: {_parquet_path.resolve()}")


TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
# --- Baseline vs Enriched (RQ-1) ---
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score, mean_absolute_error
from lightgbm import LGBMRegressor

X_base = qdf[['ret_prev_q']]
y      = qdf['excess_next_q']
# Expanding-window splits: each fold trains on earlier rows and tests on the
# rows that follow them.
tscv = TimeSeriesSplit(n_splits=5)

def cv_pred(maker, X):
    """Collect out-of-sample predictions over the time-series folds.

    Parameters
    ----------
    maker : callable
        Zero-argument factory returning a fresh, unfitted estimator with
        ``fit``/``predict``; a new model is built for every fold.
    X : pandas.DataFrame
        Feature matrix, row-aligned by position with the module-level ``y``.

    Returns
    -------
    (y_true, y_pred) : tuple of numpy arrays
        Actual and predicted targets for all test folds, concatenated in
        fold order. Uses the module-level ``tscv`` splitter and ``y`` target.
    """
    y_true, y_pred = [], []
    for tr, te in tscv.split(X):
        mdl = maker()
        mdl.fit(X.iloc[tr], y.iloc[tr])
        y_pred.extend(mdl.predict(X.iloc[te]))
        y_true.extend(y.iloc[te])
    return np.array(y_true), np.array(y_pred)

def make_lgbm():
    """Build the gradient-boosting model used for the enriched feature sets.

    ``min_child_samples`` is lowered from LightGBM's default of 20: a split
    needs that many rows on each side, so on a training fold of fewer than
    40 quarters the default cannot split at all and the model returns a
    constant. ``random_state`` pins the seed so reruns are comparable.
    """
    return LGBMRegressor(n_estimators=300, learning_rate=0.05, max_depth=3,
                         min_child_samples=3, random_state=0)

y_t, y_p = cv_pred(lambda: ElasticNetCV(cv=3), X_base)
print("Baseline  R²:", r2_score(y_t, y_p), "MAE:", mean_absolute_error(y_t, y_p))

feat_cols = ['ret_prev_q','rain_anom_lag','cpi_yoy_lag','gdp_yoy_lag']
if 'repo_chg_lag' in qdf.columns: feat_cols.append('repo_chg_lag')
X_en = qdf[feat_cols]

y_t2, y_p2 = cv_pred(make_lgbm, X_en)
print("Enriched  R²:", r2_score(y_t2, y_p2), "MAE:", mean_absolute_error(y_t2, y_p2))


In [None]:
# --- RQ-2 Good vs Poor monsoon ---
from scipy.stats import ttest_ind, ks_2samp

# Split next-quarter excess returns by the lagged rainfall anomaly:
# "good" is an anomaly of at least +4, "poor" of at most -4 (same units as
# the rainfall file); quarters in between are left out.
good = qdf.loc[qdf['rain_anom_lag'] >= 4,  'excess_next_q']
poor = qdf.loc[qdf['rain_anom_lag'] <= -4, 'excess_next_q']

# ks_2samp raises on an empty sample and Welch's t-test yields NaN with fewer
# than two observations per group, so report the shortfall instead.
if len(good) < 2 or len(poor) < 2:
    print(f"RQ-2 skipped: need at least 2 quarters per regime (good={len(good)}, poor={len(poor)}).")
else:
    # NOTE(review): the annual anomaly is forward-filled across quarters, so
    # observations within a regime are probably not independent; read the
    # p-values with that in mind.
    print("t-test:", ttest_ind(good, poor, equal_var=False))
    print("KS    :", ks_2samp(good, poor))


In [None]:
# --- RQ-3 Rain → GDP lead-lag ---
import statsmodels.api as sm
# align GDP on same index
# gdp_yoy_lag is GDP growth at t-1, so shifting it back by one row gives GDP
# growth in quarter t, i.e. one quarter AFTER the rainfall observation used
# as regressor (rain_anom_lag is rain at t-1). The last row becomes NaN.
gdp_y = qdf['gdp_yoy_lag'].shift(-1)
X = sm.add_constant(qdf['rain_anom_lag'])
_gdp_obs = gdp_y.dropna()
res = sm.OLS(_gdp_obs, X.loc[_gdp_obs.index]).fit()
print(res.summary())

# engineered feature & re-run enriched model with gdp_pred_from_rain
# NOTE(review): the coefficients are estimated on the full sample, so inside
# the time-series CV below this feature carries look-ahead information. It is
# also an affine function of rain_anom_lag, which is already in feat_cols.
qdf['gdp_pred_from_rain'] = (res.params['const'] + res.params['rain_anom_lag']*qdf['rain_anom_lag'])
feat_cols2 = feat_cols + ['gdp_pred_from_rain']
# min_child_samples is lowered from LightGBM's default of 20: a split needs
# that many rows on each side, so a training fold of fewer than 40 quarters
# could never split and the model would predict a constant. random_state
# pins the seed.
y_t3, y_p3 = cv_pred(
    lambda: LGBMRegressor(n_estimators=300, learning_rate=0.05, max_depth=3,
                          min_child_samples=3, random_state=0),
    qdf[feat_cols2],
)
print("Enriched+Rain→GDP  R²:", r2_score(y_t3, y_p3), "MAE:", mean_absolute_error(y_t3, y_p3))


In [None]:
# --- RQ-4 Policy Amplifier (if repo available) ---
if 'repo_chg_lag' in qdf.columns:
    # Imported here so this cell does not depend on the RQ-3 cell having
    # been run first.
    import statsmodels.api as sm

    # OLS of next-quarter excess return on rain, repo change and their
    # product; the interaction coefficient is the "amplifier" term.
    qdf['rain_repo_int'] = qdf['rain_anom_lag'] * qdf['repo_chg_lag']
    Xint = sm.add_constant(qdf[['rain_anom_lag','repo_chg_lag','rain_repo_int']])
    res_int = sm.OLS(qdf['excess_next_q'], Xint).fit()
    print(res_int.summary())

    # Build on the RQ-3 feature list when that cell has added its column,
    # otherwise on the plain enriched list.
    if 'gdp_pred_from_rain' in qdf.columns:
        feat_cols3 = feat_cols2 + ['rain_repo_int']
    else:
        feat_cols3 = feat_cols + ['rain_repo_int']
    # min_child_samples is lowered from LightGBM's default of 20: a split
    # needs that many rows on each side, so a training fold of fewer than 40
    # quarters could never split and the model would predict a constant.
    # random_state pins the seed.
    y_t4, y_p4 = cv_pred(
        lambda: LGBMRegressor(n_estimators=300, learning_rate=0.05, max_depth=3,
                              min_child_samples=3, random_state=0),
        qdf[feat_cols3],
    )
    print("Enriched + interaction  R²:", r2_score(y_t4, y_p4), "MAE:", mean_absolute_error(y_t4, y_p4))
else:
    print("Repo rate file not provided; skipping RQ-4.")
