In [1]:
! pip -q install lightgbm statsmodels

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings, re
from pathlib import Path

warnings.filterwarnings('ignore')

RAW  = Path('./raw')            
PROC = Path('./processed'); PROC.mkdir(exist_ok=True)

In [3]:
def pick(df, candidates):
    """Return the first column name that matches any in candidates (flexible, case-insensitive)."""
    for c in candidates:
        if c in df.columns: return c
    for c in df.columns:
        for k in candidates:
            if k.lower() in c.lower(): return c
    return None

def ensure_dt(df, col):
    """Make sure a column is datetime."""
    df = df.copy()
    df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

def _parse_investing_prices(df, date_col='Date', price_col='Price'):
    """
    Cleans Investing.com CSV:
    - day-first dates (e.g. 08/01/2025 = 8-Jan-2025)
    - strips thousands separators from Price
    - returns sorted DataFrame with Date & numeric Price
    """
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], dayfirst=True, errors='coerce')
    out[price_col] = pd.to_numeric(
        out[price_col].astype(str).str.replace(',', '', regex=False),
        errors='coerce'
    )
    out = out.dropna(subset=[date_col, price_col]).sort_values(date_col)
    return out[[date_col, price_col]]

def investing_to_quarter_ret(df, date_col='Date', price_col='Price'):
    """
    Resamples (daily or monthly) prices to quarter-end and computes quarterly % returns.
    Works for either frequency as long as Date is parsed.
    """
    df2 = _parse_investing_prices(df, date_col, price_col)
    q_end_price = df2.set_index(date_col)[price_col].resample('Q').last()
    q_ret = q_end_price.pct_change()
    return q_ret

# ---------- coverage validator (so you catch incomplete exports) ----------
def check_monthly_coverage(df, date_col='Date', label='series'):
    """
    Prints which months exist per year and flags empty quarters.
    Works for daily or monthly files (Investing.com).
    """
    dates = pd.to_datetime(df[date_col], dayfirst=True, errors='coerce').dropna()
    if dates.empty:
        print(f"[{label}] No parsable dates.")
        return

    # Build a 1-count series indexed by date so we can resample safely
    ser = pd.Series(1, index=dates).sort_index()

    # Months present per year (use unique month numbers)
    months_per_year = ser.groupby(ser.index.year).apply(lambda s: sorted(pd.Index(s.index.month).unique()))
    print(f"[{label}] Months present per year:")
    for y, months in months_per_year.items():
        print(f"  {y}: {months}")

    # Quarter coverage
    q_counts = ser.resample('Q').size()
    missing_quarters = q_counts[q_counts == 0]
    if len(missing_quarters) > 0:
        print(f"[{label}] ⚠ Missing {len(missing_quarters)} quarter(s) with zero rows — resample will yield NaNs.")
    else:
        print(f"[{label}] ✅ At least one row in every quarter.")

def monthly_to_q(series, how='mean'):
    """Monthly series -> quarterly series (mean or last)."""
    return series.resample('Q').mean() if how=='mean' else series.resample('Q').last()

def yoy_from_monthly(series):
    """12-month % change (YoY) for monthly series."""
    return series.pct_change(12) * 100.0

In [4]:
# ---------- use RAW folder & run ----------
n50_raw = pd.read_csv(RAW/'Nifty50.csv')
mid_raw = pd.read_csv(RAW/'NIFTYMidcap100.csv')

check_monthly_coverage(n50_raw, 'Date', 'NIFTY 50')
check_monthly_coverage(mid_raw, 'Date', 'NIFTY Midcap 100')

nifty_qret  = investing_to_quarter_ret(n50_raw).rename('nifty_qret')
midcap_qret = investing_to_quarter_ret(mid_raw).rename('midcap_qret')

excess_ret = (midcap_qret - nifty_qret).rename('excess_ret')

# quick sanity: you want ~62 quarters in 2010Q1–2025Q2 (minus the first diff)
valid_quarters = excess_ret.dropna()
print("Quarterly points:", len(valid_quarters))
assert len(valid_quarters) >= 55, "Too few quarterly points — re-export full history (Monthly or Daily) from 2010-01-01 to 2025-06-30."

[NIFTY 50] Months present per year:
  2010: [1]
  2011: [1]
  2012: [1]
  2013: [1]
  2014: [1]
  2015: [1]
  2016: [1]
  2017: [1]
  2018: [1]
  2019: [1]
  2020: [1]
  2021: [1]
  2022: [1]
  2023: [1]
  2024: [1]
  2025: [1]
[NIFTY 50] ⚠ Missing 45 quarter(s) with zero rows — resample will yield NaNs.
[NIFTY Midcap 100] Months present per year:
  2010: [1]
  2011: [1]
  2012: [1]
  2013: [1]
  2014: [1]
  2015: [1]
  2016: [1]
  2017: [1]
  2018: [1]
  2019: [1]
  2020: [1]
  2021: [1]
  2022: [1]
  2023: [1]
  2024: [1]
  2025: [1]
[NIFTY Midcap 100] ⚠ Missing 45 quarter(s) with zero rows — resample will yield NaNs.
Quarterly points: 60


In [5]:
cpi = pd.read_csv('CPI_Monthly_Jan_2013_to_Jun_2025.csv')  # INFO_CPI_*.txt
c_date = pick(cpi, ['DATE','Date','Month'])
c_val  = pick(cpi, ['CPI_COMBINED_BASE2012_100','CPI','Index','Value'])

cpi = ensure_dt(cpi, c_date).sort_values(c_date)
cpi_m = cpi.set_index(c_date)[c_val].astype(float).rename('cpi_index')
cpi_yoy_m = yoy_from_monthly(cpi_m).rename('cpi_yoy_m')
cpi_yoy_q = monthly_to_q(cpi_yoy_m, how='mean').rename('cpi_yoy')


FileNotFoundError: [Errno 2] No such file or directory: 'CPI_Monthly_Jan_2013_to_Jun_2025.csv'

In [None]:
gdp = pd.read_csv('GDP_Quarterly_2010_2025.csv')  # INFO_GDP_*.txt
# Convert '2012-Q1' to quarter-end timestamps
g_col = pick(gdp, ['quarter','Quarter'])
v_col = pick(gdp, ['gdp','GDP','Value'])

q = pd.PeriodIndex(gdp[g_col].astype(str), freq='Q').to_timestamp(how='end')
gdp_q = (pd.Series(gdp[v_col].astype(float).values, index=q)
         .pct_change(4) * 100.0).rename('gdp_yoy')


In [None]:
repo = pd.read_csv('Repo_Rate_Monthly_2010_2025.csv')  # INFO_Repo_*.txt
r_date = pick(repo, ['DATE','Date','Month'])
r_val  = pick(repo, ['REPO_RATE_PERCENT','Repo','Rate'])

repo = ensure_dt(repo, r_date).sort_values(r_date)
repo_q = repo.set_index(r_date)[r_val].astype(float).resample('Q').last().rename('repo')
repo_chg_bps = (repo_q.diff() * 100).rename('repo_chg_bps')  # % to basis points


In [None]:
rain = pd.read_csv('AnnualRainfall_with_Good_and_Anomaly_2012_2025.csv')

# Normalize month names to numbers
mmap = {m[:3].lower():i for i,m in enumerate(
    ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'], start=1)}
rain['mon'] = rain['month'].astype(str).str[:3].str.lower().map(mmap).astype(int)

# Keep the SW monsoon season: Jun(6)–Sep(9)
mons = rain[rain['mon'].between(6,9)].copy()

# Aggregate to seasonal totals per year
grp = mons.groupby('Year', as_index=False).agg(
    obs_mm   = ('rainfall_mm','sum'),
    norm_mm  = ('good_rainfall_mm','sum'),
    anom_mm  = ('anomaly_mm','sum')
)

# Convert to anomaly (%) where positive means "above normal" (IMD convention)
# IMD anomaly% ≈ (observed - normal) / normal * 100
grp['rain_anom_pct'] = (grp['obs_mm'] - grp['norm_mm'])/grp['norm_mm'] * 100.0

# Place each year's seasonal anomaly at Sep 30 (end of monsoon)
rain_idx = pd.to_datetime(grp['Year'].astype(int).astype(str) + '-09-30')
rain_q = pd.Series(grp['rain_anom_pct'].values, index=rain_idx).resample('Q').ffill().rename('rain_anom')


In [None]:
# Align everything on the common quarterly index (inner join drops incomplete edges)
qdf = pd.concat(
    [excess_ret, midcap_qret, nifty_qret, rain_q, cpi_yoy_q, gdp_q, repo_chg_bps],
    axis=1
).dropna(how='any').copy()

# Lags (all predictors are t-1) and target (t+1)
qdf['ret_prev_q']    = qdf['midcap_qret'].shift(1)
qdf['rain_anom_lag'] = qdf['rain_anom'].shift(1)
qdf['cpi_yoy_lag']   = qdf['cpi_yoy'].shift(1)
qdf['gdp_yoy_lag']   = qdf['gdp_yoy'].shift(1)
qdf['repo_chg_lag']  = qdf['repo_chg_bps'].shift(1)

qdf['excess_next_q'] = qdf['excess_ret'].shift(-1)

# Clean rows created by shifting
qdf = qdf.dropna().copy()

print("Quarterly rows:", len(qdf), "| Range:", qdf.index.min().date(), "→", qdf.index.max().date())
qdf.head()


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score, mean_absolute_error
from lightgbm import LGBMRegressor

tscv = TimeSeriesSplit(n_splits=5)

def cv_scores(model_maker, X, y):
    y_true, y_pred = [], []
    for tr, te in tscv.split(X):
        mdl = model_maker()
        mdl.fit(X.iloc[tr], y.iloc[tr])
        y_pred.extend(mdl.predict(X.iloc[te]))
        y_true.extend(y.iloc[te])
    return r2_score(y_true, y_pred), mean_absolute_error(y_true, y_pred)

y = qdf['excess_next_q']

# Baseline: last quarter's midcap return only
X_base = qdf[['ret_prev_q']]
r2_b, mae_b = cv_scores(lambda: ElasticNetCV(cv=3, l1_ratio=[0.1,0.5,0.9]), X_base, y)

# Enriched: lag-return + rain + macro lags
feat_cols = ['ret_prev_q','rain_anom_lag','gdp_yoy_lag','cpi_yoy_lag','repo_chg_lag']
X_en = qdf[feat_cols]
r2_e, mae_e = cv_scores(lambda: LGBMRegressor(n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42),
                        X_en, y)

print(f"Baseline : R²={r2_b:.3f}, MAE={mae_b:.3f}")
print(f"Enriched : R²={r2_e:.3f}, MAE={mae_e:.3f}")
print(f"ΔR²={r2_e-r2_b:.3f},  ΔMAE={(mae_b-mae_e):.3f}")


In [None]:
gbt = LGBMRegressor(n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42)
gbt.fit(X_en, y)
imp = pd.Series(gbt.feature_importances_, index=feat_cols).sort_values(ascending=False)
print(imp)
imp.plot(kind='bar'); plt.title('LightGBM feature importance'); plt.show()


In [None]:
from scipy.stats import ttest_ind, ks_2samp

good = qdf.loc[qdf['rain_anom_lag'] >= 4,  'excess_next_q']
poor = qdf.loc[qdf['rain_anom_lag'] <= -4, 'excess_next_q']

print('t-test:', ttest_ind(good, poor, equal_var=False))
print('KS    :', ks_2samp(good, poor))


In [None]:
import statsmodels.api as sm
gdp_fwd1 = qdf['gdp_yoy_lag'].shift(-1)   # GDP_{t+1}
X = sm.add_constant(qdf['rain_anom_lag'])
res = sm.OLS(gdp_fwd1.dropna(), X.loc[gdp_fwd1.dropna().index]).fit()
print(res.summary())


In [None]:
qdf.to_csv(PROC/'quarterly_features.csv')
print("Saved:", (PROC/'quarterly_features.csv').resolve())
