# NIFTY Midcap 100 – 7‑Day Capstone Starter

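This notebook loads public macro and weather data from CSV files, constructs quarterly features, and compares a **baseline ElasticNet** model with an **enriched LightGBM** model for predicting 1‑quarter‑ahead returns. Note that the target built in Section 2 is the raw next‑quarter return of the index series; to model true excess returns, subtract a risk‑free or benchmark return before fitting.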

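_Replace the placeholder CSV paths in Section 1 with your own files. Each CSV must contain a `date` column, and its second column is taken as the value series._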

## 0. Imports & setup

In [None]:
import pandas as pd, numpy as np, json, warnings, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Working folder for notebook outputs, relative to the current directory;
# it is created on first run and left alone if it already exists.
DATA_DIR = Path('data')
DATA_DIR.mkdir(exist_ok=True)

## 1. Load raw series

In [None]:
# TODO: Replace the placeholder CSV paths with your own
# Every file is read the same way (its 'date' column parsed to datetimes);
# the files are read in the order listed and bound to the matching names.
nifty, gdp, cpi, pmi, repo, rain = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (
        'path_to_nifty_midcap_total_return.csv',
        'path_to_gdp_yoy.csv',
        'path_to_cpi_yoy.csv',
        'path_to_pmi.csv',
        'path_to_repo_change.csv',
        'path_to_rainfall_anomaly.csv',
    )
)

## 2. Resample to quarterly & merge

In [None]:
def to_q(df, col):
    """Return column *col* of *df* as a quarter-end series.

    The frame is indexed by its 'date' column and the last observation in
    each calendar quarter is kept.

    Parameters
    ----------
    df : DataFrame containing a datetime 'date' column.
    col : label of the column to resample.

    Returns
    -------
    Series indexed by quarter-end timestamps.
    """
    series = df.set_index('date')[col]
    try:
        # 'QE' is the quarter-end alias from pandas 2.2 onward, where the
        # bare 'Q' alias is deprecated.
        quarterly = series.resample('QE')
    except ValueError:
        # Older pandas rejects 'QE' as an invalid frequency; 'Q' is the
        # same quarter-end rule there.
        quarterly = series.resample('Q')
    return quarterly.last()

# Align all series on the quarterly calendar. Incomplete rows are NOT
# dropped at this point: pct_change() and shift() below work on row
# position, so removing rows first would turn "previous quarter" into
# "previous surviving row" and silently produce multi-quarter returns and
# misaligned lags whenever any input has a gap.
df = pd.DataFrame({
    'nifty_tr': to_q(nifty, nifty.columns[1]),
    'gdp_yoy':  to_q(gdp, gdp.columns[1]),
    'cpi_yoy':  to_q(cpi, cpi.columns[1]),
    'pmi':      to_q(pmi, pmi.columns[1]),
    'repo_chg': to_q(repo, repo.columns[1]),
    # Quarters without a rainfall reading inherit the latest earlier value.
    'rain_anom':to_q(rain, rain.columns[1]).ffill()
})

# Return over the quarter ending at each row. fill_method=None keeps a
# missing index level as NaN instead of padding it into a spurious 0% return.
df['ret_prev_q'] = df['nifty_tr'].pct_change(fill_method=None)
# Prediction target: the return realised over the following quarter.
df['ret_next_q'] = df['ret_prev_q'].shift(-1)
# Each macro/weather input enters the models with a one-quarter lag.
for col in ['gdp_yoy','cpi_yoy','pmi','repo_chg','rain_anom']:
    df[f'{col}_lag'] = df[col].shift(1)

# Single clean-up pass: keep only quarters where every raw input, lag,
# feature and target is available.
df = df.dropna()
df.head()

## 3. Baseline ElasticNet (lagged return only)

In [None]:
# Baseline: predict next-quarter return from the most recent quarterly
# return alone, scored with an expanding-window walk-forward split.
X_base = df[['ret_prev_q']]
y = df['ret_next_q']
tscv = TimeSeriesSplit(n_splits=5)
preds_base, true_base = [], []
for train_idx, test_idx in tscv.split(X_base):
    X_fit, y_fit = X_base.iloc[train_idx], y.iloc[train_idx]
    X_hold, y_hold = X_base.iloc[test_idx], y.iloc[test_idx]
    # A fresh model per fold; regularisation is chosen by 3-fold CV on the
    # training window only.
    model = ElasticNetCV(cv=3).fit(X_fit, y_fit)
    preds_base += list(model.predict(X_hold))
    true_base += list(y_hold)
# Metrics are computed once over the pooled out-of-sample predictions.
r2_base = r2_score(true_base, preds_base)
mae_base = mean_absolute_error(true_base, preds_base)
print('Baseline R2', r2_base, 'MAE', mae_base)

## 4. Enriched LightGBM (macro + rain)

In [None]:
# Enriched model: the baseline's return feature plus the one-quarter-lagged
# macro and rainfall inputs.
feat_cols = ['ret_prev_q','gdp_yoy_lag','cpi_yoy_lag','pmi_lag','repo_chg_lag','rain_anom_lag']
X_en = df[feat_cols]
preds_en, true_en = [], []
# The same tscv splitter is applied to a frame with the same rows as the
# baseline, so both models are scored on identical out-of-sample quarters.
for train, test in tscv.split(X_en):
    # NOTE(review): LightGBM's default min_child_samples is 20; with only a
    # handful of quarterly rows in the early folds the trees may be unable
    # to split at all. Consider lowering it if predictions look constant.
    mdl = LGBMRegressor(n_estimators=300, learning_rate=0.05)
    mdl.fit(X_en.iloc[train], y.iloc[train])
    preds_en.extend(mdl.predict(X_en.iloc[test]))
    true_en.extend(y.iloc[test])
# Metrics are computed once over the pooled out-of-sample predictions.
r2_en = r2_score(true_en, preds_en)
mae_en = mean_absolute_error(true_en, preds_en)
print('Enriched R2', r2_en, 'MAE', mae_en)

metrics = {'baseline': {'r2': r2_base, 'mae': mae_base},
           'enriched': {'r2': r2_en,  'mae': mae_en}}
# Write under DATA_DIR (defined in the setup cell) instead of repeating a
# hard-coded 'data/' path, so relocating the output folder needs one change.
(DATA_DIR / 'metrics.json').write_text(json.dumps(metrics, indent=2))