In [1]:
! pip install lightgbm, pandas, numpy, sklearn

Defaulting to user installation because normal site-packages is not writeable


ERROR: Invalid requirement: 'lightgbm,': Expected end or semicolon (after name and no valid version specifier)
    lightgbm,
            ^


In [2]:
# Compact end-to-end with final prints
import pandas as pd, numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error
from lightgbm import LGBMRegressor

# Index data
nifty50 = pd.read_csv("raw/Nifty50.csv", parse_dates=["Date"]).sort_values("Date")
midcap  = pd.read_csv("raw/NIFTYMidcap100.csv", parse_dates=["Date"]).sort_values("Date")
for df in [nifty50, midcap]:
    for col in ["Price","Open","High","Low"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace(",", "", regex=False).astype(float)
def daily_to_qtr_last(series_with_date_index):
    return series_with_date_index.resample("Q").last()
midcap_q = daily_to_qtr_last(midcap.set_index("Date")["Price"]).pct_change().rename("midcap_ret")
nifty_q  = daily_to_qtr_last(nifty50.set_index("Date")["Price"]).pct_change().rename("nifty_ret")
excess   = (midcap_q - nifty_q).rename("excess_ret")

# Rainfall data
rain = pd.read_csv("raw/AnnualRainfall.csv")
rain.columns = [c.strip().lower() for c in rain.columns]
month_map = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
rain["month_num"] = rain["month"].str.strip().str[:3].str.lower().map(month_map).astype(int)
monsoon = rain[rain["month_num"].between(6,9)].copy()
monsoon_yr = monsoon.groupby("year", as_index=False)["rainfall_mm"].sum().rename(columns={"rainfall_mm":"monsoon_total_mm"})
normal = monsoon_yr["monsoon_total_mm"].mean()
monsoon_yr["rain_anom_pct"] = 100.0*(monsoon_yr["monsoon_total_mm"]-normal)/normal
rain_q = pd.Series(monsoon_yr["rain_anom_pct"].values,
                   index=pd.to_datetime(monsoon_yr["year"].astype(int).astype(str)+"-09-30")).resample("Q").ffill().rename("rain_anom")

# Assemble features/target
df = pd.concat([excess, rain_q], axis=1)
for lag in [1,2,3]:
    df[f"excess_ret_lag{lag}"] = df["excess_ret"].shift(lag)
df["rain_anom_lag1"] = df["rain_anom"].shift(1)
df["excess_next_q"] = df["excess_ret"].shift(-1)
df = df.dropna()

# Models
try:
    tscv = TimeSeriesSplit(n_splits=5, test_size=2)
except TypeError:
    tscv = TimeSeriesSplit(n_splits=5)

X_base = df[["excess_ret_lag1"]]
y      = df["excess_next_q"]

base_pred, base_true = [], []
for tr, te in tscv.split(X_base):
    mdl = ElasticNetCV(cv=3, l1_ratio=[0.1,0.5,0.9], n_alphas=50, random_state=42)
    mdl.fit(X_base.iloc[tr], y.iloc[tr])
    base_pred.extend(mdl.predict(X_base.iloc[te]))
    base_true.extend(y.iloc[te])
base_r2  = r2_score(base_true, base_pred)
base_mae = mean_absolute_error(base_true, base_pred)

feat_cols = ["excess_ret_lag1","excess_ret_lag2","excess_ret_lag3","rain_anom_lag1"]
X_en = df[[c for c in feat_cols if c in df.columns]]
en_pred, en_true = [], []
for tr, te in tscv.split(X_en):
    gbt = LGBMRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, num_leaves=7, min_data_in_leaf=5, random_state=42)
    gbt.fit(X_en.iloc[tr], y.iloc[tr])
    en_pred.extend(gbt.predict(X_en.iloc[te]))
    en_true.extend(y.iloc[te])
en_r2  = r2_score(en_true, en_pred)
en_mae = mean_absolute_error(en_true, en_pred)

delta_r2 = en_r2 - base_r2
mae_cut  = (base_mae - en_mae) / base_mae if base_mae else np.nan
success  = (delta_r2 >= 0.10) and (mae_cut >= 0.10)

print("Rows:", len(df))
print(f"Baseline  R2={base_r2:.3f}, MAE={base_mae:.4f}")
print(f"Enriched  R2={en_r2:.3f}, MAE={en_mae:.4f}")
print(f"ΔR²={delta_r2:.3f}, MAE reduction={mae_cut*100:.1f}% | Success? {success}")


  return series_with_date_index.resample("Q").last()
  index=pd.to_datetime(monsoon_yr["year"].astype(int).astype(str)+"-09-30")).resample("Q").ffill().rename("rain_anom")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 43
[LightGBM] [Info] Number of data points in the train set: 29, number of used features: 4
[LightGBM] [Info] Start training from score 0.010852
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45
[LightGBM] [Info] Number of data points in the train set: 31, number of used features: 4
[LightGBM] [Info] Start training from score 0.009236
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 33, number o

In [3]:
# Build Monsoon (Jun–Sep) anomaly % from monthly rainfall
# Input schema (case-insensitive): year, month, rainfall_mm, [source]
# Output: quarterly series `rain_anom` (% from multi-year mean), ready to merge

import pandas as pd
from pathlib import Path

# ---- paths (change if needed) ----
RAIN_PATH = Path("raw/AnnualRainfall.csv")  # or Path("/mnt/data/AnnualRainfall.csv")

# Robust loader (case-insensitive, tolerant to month names)
rain = pd.read_csv(RAIN_PATH)
rain.columns = [c.strip().lower() for c in rain.columns]

# Expect minimal columns
need = {"year", "month", "rainfall_mm"}
if not need.issubset(set(rain.columns)):
    raise ValueError(f"AnnualRainfall.csv must have columns {need} (case-insensitive). Found {list(rain.columns)}")

# Map month → number
month_map = {
    "jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6,
    "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12
}
rain["month_num"] = (
    rain["month"].astype(str).str.strip().str[:3].str.lower().map(month_map).astype(int)
)

# Sum monsoon rainfall per year (Jun–Sep)
monsoon = rain[rain["month_num"].between(6, 9)].copy()
monsoon_yr = (
    monsoon.groupby("year", as_index=False)["rainfall_mm"]
           .sum()
           .rename(columns={"rainfall_mm": "monsoon_total_mm"})
)

# Compute "normal" as multi-year mean of monsoon totals over available years
# (Proxy for the 50-year LPA if we don't have longer history in the file)
normal = monsoon_yr["monsoon_total_mm"].mean()

# Anomaly %: +ve means wetter-than-normal monsoon
monsoon_yr["rain_anom_pct"] = 100.0 * (monsoon_yr["monsoon_total_mm"] - normal) / normal

# Optional tag for later RQ2 use (good/poor/neutral)
# Good ≥ +4%, Poor ≤ −4%, Neutral otherwise
monsoon_yr["rain_band"] = pd.cut(
    monsoon_yr["rain_anom_pct"],
    bins=[-10_000, -4, 4, 10_000],
    labels=["poor", "neutral", "good"]
)

# Convert annual anomaly to quarterly series anchored at Sep-30 (monsoon ends),
# then forward-fill so it’s available through the next quarters until next monsoon.
rain_q = pd.Series(
    monsoon_yr["rain_anom_pct"].values,
    index=pd.to_datetime(monsoon_yr["year"].astype(int).astype(str) + "-09-30")
).resample("Q").ffill().rename("rain_anom")

# Also create lag (known at t to predict t+1)
rain_q_lag1 = rain_q.shift(1).rename("rain_anom_lag1")

# Keep for merge with returns
rain_features = pd.concat([rain_q, rain_q_lag1], axis=1)

print("Monsoon years:", monsoon_yr.shape[0], "| Normal (mm):", round(normal, 2))
print(monsoon_yr.head(5))
rain_features.head(6)


Monsoon years: 10 | Normal (mm): 1022.83
   year  monsoon_total_mm  rain_anom_pct rain_band
0  2010             920.7      -9.985042      poor
1  2011            1204.5      17.761505      good
2  2012            1061.5       3.780687   neutral
3  2013            1475.4      44.246845      good
4  2014             803.8     -21.414116      poor


  ).resample("Q").ffill().rename("rain_anom")


Unnamed: 0_level_0,rain_anom,rain_anom_lag1
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-09-30,-9.985042,
2010-12-31,-9.985042,-9.985042
2011-03-31,-9.985042,-9.985042
2011-06-30,-9.985042,-9.985042
2011-09-30,17.761505,-9.985042
2011-12-31,17.761505,17.761505


In [4]:
# Merge excess returns with rainfall features and build the target
df = pd.concat([excess, rain_features], axis=1)

# Baseline lags for excess returns (1..3)
for lag in [1, 2, 3]:
    df[f"excess_ret_lag{lag}"] = df["excess_ret"].shift(lag)

# Target = next-quarter excess return
df["excess_next_q"] = df["excess_ret"].shift(-1)

# Final frame
df = df.dropna().copy()
print("Quarterly rows (after lags & target):", len(df))
df.head(8)

Quarterly rows (after lags & target): 39


Unnamed: 0,excess_ret,rain_anom,rain_anom_lag1,excess_ret_lag1,excess_ret_lag2,excess_ret_lag3,excess_next_q
2011-03-31,-0.043221,-9.985042,-9.985042,-0.050844,-0.007953,0.043205,0.023405
2011-06-30,0.023405,-9.985042,-9.985042,-0.043221,-0.050844,-0.007953,0.014606
2011-09-30,0.014606,17.761505,-9.985042,0.023405,-0.043221,-0.050844,-0.073926
2011-12-31,-0.073926,17.761505,17.761505,0.014606,0.023405,-0.043221,0.116556
2012-03-31,0.116556,17.761505,17.761505,-0.073926,0.014606,0.023405,-0.043488
2012-06-30,-0.043488,17.761505,17.761505,0.116556,-0.073926,0.014606,-0.013915
2012-09-30,-0.013915,3.780687,17.761505,-0.043488,0.116556,-0.073926,0.049375
2012-12-31,0.049375,3.780687,3.780687,-0.013915,-0.043488,0.116556,-0.092058


In [5]:
candidate_feats = [
    "excess_ret_lag1","excess_ret_lag2","excess_ret_lag3",
    "rain_anom_lag1",  # now created above
    # (add macro lags later: gdp_yoy_lag1, cpi_yoy_lag1, pmi_lag1, repo_chg_lag1)
]