# 1. Title & objective
# Monsoon & Macro – Midcap Forecasting

Starter notebook for Data Analytics Capstone.

**Objective:** Build an end-to-end pipeline that merges rainfall anomaly + macro drivers and predicts next-quarter *excess* return of the NIFTY Midcap 100.

> **Tip:** fill in download paths or API calls where you see `TODO:`. Run each section sequentially.

In [None]:
! pip install pandas numpy requests lightgbm

Defaulting to user installation because normal site-packages is not writeable


ERROR: Invalid requirement: 'pandas,': Expected end or semicolon (after name and no valid version specifier)
    pandas,
          ^


In [2]:
### Imports & directory setup

In [3]:
# 0. Imports & global config
import pandas as pd, numpy as np, json, matplotlib.pyplot as plt
import warnings, datetime as dt, requests, io, os, re
from pathlib import Path
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Working folders, relative to the notebook's current directory:
# RAW holds the input files, PROC receives the derived tables.
RAW = Path('raw')
PROC = Path('processed')
for _folder in (RAW, PROC):
    _folder.mkdir(exist_ok=True)

### Section 1 – Load raw data

In [None]:
# --- 1.1 Index prices -------------------------------------------------
# Both price files are read with 'Date' parsed as datetimes and sorted oldest-first.

midcap_csv = RAW / 'NIFTYMidcap100.csv'   # columns: Date, Close
nifty50_csv = RAW / 'Nifty50.csv'           # columns: Date, Close

midcap  = pd.read_csv(midcap_csv,  parse_dates=['Date']).sort_values('Date')
nifty50 = pd.read_csv(nifty50_csv, parse_dates=['Date']).sort_values('Date')

# --- 1.2 Rainfall  --------------------------------------------

rain = pd.read_csv(RAW / 'rain_anomaly.csv')


# Each macro driver below is read from its own file. Pointing them all at
# 'rain_anomaly.csv' would silently load rainfall data under a macro name.
# TODO: the four file names are placeholders - rename them to match the files
#       actually saved in RAW (columns / parsing options are still to be decided).

# --- 1.3 GDP --------------------------------------------
gdp = pd.read_csv(RAW / 'gdp.csv')

# --- 1.4 Consumer Price Index (CPI) --------------------------------------------
cpi = pd.read_csv(RAW / 'cpi.csv')

# --- 1.5 Purchasing Managers Index --------------------------------------------
pmi = pd.read_csv(RAW / 'pmi.csv')

# --- 1.6 Repo Rate --------------------------------------------
repo = pd.read_csv(RAW / 'repo_rate.csv')

SyntaxError: invalid syntax (3869948012.py, line 15)

### Section 2 – Resample to quarters

In [None]:
def daily_to_qtr(df, price_col):
    """Return the last observed value of ``price_col`` in each calendar quarter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like 'Date' column.
    price_col : str
        Name of the column to down-sample.

    Returns
    -------
    pandas.Series
        Values of ``price_col`` indexed by quarter-end date
        (quarters ending in March, June, September and December).
    """
    # An explicit QuarterEnd offset produces the same bins as the 'Q' string
    # alias, without depending on that alias (deprecated since pandas 2.2).
    quarter_end = pd.offsets.QuarterEnd(startingMonth=12)
    return df.set_index('Date')[price_col].resample(quarter_end).last()

# Quarter-over-quarter simple returns of each index (second column of each price
# frame is used as the price), and the mid-cap return in excess of the NIFTY 50.
midcap_q = daily_to_qtr(midcap,  midcap.columns[1]).pct_change().rename('midcap_ret')
nifty_q  = daily_to_qtr(nifty50, nifty50.columns[1]).pct_change().rename('nifty_ret')
excess   = (midcap_q - nifty_q).rename('excess_ret')

# Rainfall: forward-fill anomaly into four quarters of same FY
# Each yearly anomaly is stamped on 30 September of its 'Year' and then carried
# forward over the following quarter-ends until the next year's value appears.
rain_q = rain.set_index(pd.to_datetime(rain['Year'].astype(str) + '-09-30'))['Anomaly_%']
# Explicit QuarterEnd offset: same bins as the 'Q' alias, without relying on the
# alias itself (deprecated since pandas 2.2).
rain_q = rain_q.resample(pd.offsets.QuarterEnd(startingMonth=12)).ffill().rename('rain_anom')

# TODO: GDP, CPI, PMI, repo -> load, then resample('Q').last() or .mean()


### Section 3 – Merge & lag

In [None]:
# Minimal merge (add your macro lags later)
series = [
    excess,
    midcap_q.shift(1).rename('ret_prev_q'),
    rain_q.shift(1),                # previous quarter's anomaly; column keeps the name 'rain_anom'
    # Target = next-quarter excess return.
    # It is shifted on the complete quarterly series *before* any rows are dropped,
    # so each row is always paired with the quarter that immediately follows it.
    # Shifting after dropna() would pair non-adjacent quarters whenever an interior
    # row had been removed.
    excess.shift(-1).rename('excess_next_q'),
]

# TODO: append gdp_yoy_lag, cpi_yoy_lag, pmi_lag, repo_chg_lag
# Keep only the quarters where the features and the target are all available.
df = pd.concat(series, axis=1).dropna()

df.to_parquet(PROC / 'quarterly.parquet')
print(df.head())


### Section 4 – Baseline vs Enriched (RQ-1)

In [None]:
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics  import r2_score, mean_absolute_error
from lightgbm import LGBMRegressor


def _walk_forward(make_model, features, target, splitter):
    """Fit a fresh model on every training fold and collect out-of-fold predictions.

    Parameters
    ----------
    make_model : callable
        Zero-argument factory returning an unfitted estimator.
    features, target : pandas.DataFrame, pandas.Series
        Row-aligned inputs and outcome.
    splitter : object with ``split(features)``
        Yields (train positions, test positions) pairs.

    Returns
    -------
    (list, list, estimator)
        Predictions, matching true values, and the model fitted on the last fold.
    """
    predictions, actuals, fitted = [], [], None
    for train_pos, test_pos in splitter.split(features):
        fitted = make_model().fit(features.iloc[train_pos], target.iloc[train_pos])
        predictions.extend(fitted.predict(features.iloc[test_pos]))
        actuals.extend(target.iloc[test_pos])
    return predictions, actuals, fitted


tscv = TimeSeriesSplit(n_splits=5)
y    = df['excess_next_q']

# ----- baseline ElasticNet -----
# Single predictor: the previous quarter's mid-cap return.
X_base = df[['ret_prev_q']]
base_pred, base_true, mdl = _walk_forward(lambda: ElasticNetCV(cv=3), X_base, y, tscv)

print('Baseline  R²:', r2_score(base_true, base_pred),
      'MAE:', mean_absolute_error(base_true, base_pred))

# ----- enriched LightGBM -----
feat_cols = ['ret_prev_q', 'rain_anom']   # extend with macro lags
X_en = df[feat_cols]

en_pred, en_true, gbt = _walk_forward(
    lambda: LGBMRegressor(n_estimators=300, learning_rate=0.05, max_depth=3),
    X_en, y, tscv,
)

print('Enriched  R²:', r2_score(en_true, en_pred),
      'MAE:', mean_absolute_error(en_true, en_pred))


### Section 5 – Good vs Poor monsoon (RQ-2)

In [None]:
from scipy.stats import ttest_ind, ks_2samp

# Cut-offs on the 'rain_anom' column that define a good / poor monsoon quarter.
GOOD_RAIN_MIN = 4
POOR_RAIN_MAX = -4

good = df.loc[df['rain_anom'] >= GOOD_RAIN_MIN, 'excess_next_q']
poor = df.loc[df['rain_anom'] <= POOR_RAIN_MAX, 'excess_next_q']
print('n(good) =', len(good), ' n(poor) =', len(poor))

if len(good) < 2 or len(poor) < 2:
    # Welch's t-test needs a variance in each group, and ks_2samp rejects empty
    # input, so report the shortfall instead of failing or printing NaNs.
    print('Too few quarters in at least one group - tests and plot skipped.')
else:
    print('t-test:', ttest_ind(good, poor, equal_var=False))
    print('KS    :', ks_2samp(good, poor))

    # Optional boxplot
    # Tick labels are set on the axes rather than through boxplot's `labels=`
    # keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
    fig, ax = plt.subplots()
    ax.boxplot([good, poor])
    ax.set_xticklabels(['Good rain', 'Poor rain'])
    ax.set_ylabel('Excess Return next-Q')
    plt.show()


### Section 6 – Rain → GDP lead-lag (RQ-3) (code stub)

In [None]:
# TODO: after adding 'gdp_yoy' series
import statsmodels.api as sm

if 'gdp_yoy' not in df.columns:
    # Fail with an actionable message while this section is still a stub.
    raise KeyError("'gdp_yoy' is not a column of df yet - merge the GDP series before running this cell")

# GDP growth one row ahead, i.e. the quarter that follows each rainfall reading.
df['gdp_fwd1'] = df['gdp_yoy'].shift(-1)

# Regress next-quarter GDP growth on the rainfall anomaly (with intercept),
# using only the rows where the forward value exists.
model = sm.OLS(df['gdp_fwd1'].dropna(),
               sm.add_constant(df.loc[df['gdp_fwd1'].notna(), 'rain_anom']))
res = model.fit()
print(res.summary())

# Create engineered feature
# Coefficients are looked up by label: integer keys on a label-indexed Series
# (res.params[0]) rely on a positional fallback that pandas has deprecated.
# NOTE(review): the coefficients are estimated on every available row, so this
# feature carries full-sample information into any walk-forward evaluation.
df['gdp_pred_from_rain'] = res.params['const'] + res.params['rain_anom'] * df['rain_anom']

### Section 7 – Rain × Repo interaction (RQ-4) (code stub)

In [None]:
# TODO: ensure repo_chg_lag present
# Imported in this cell too, so it runs even when the Section 6 stub was skipped.
import statsmodels.api as sm

# Interaction term: rainfall anomaly multiplied by the lagged repo-rate change.
df['rain_repo_int'] = df['rain_anom'] * df['repo_chg_lag']

# Simple OLS to test interaction
# Rows with a missing regressor are dropped; the target is re-selected on the
# surviving index so X and y stay aligned.
X_int = df[['rain_anom', 'repo_chg_lag', 'rain_repo_int']].dropna()
y_int = df.loc[X_int.index, 'excess_next_q']
res_int = sm.OLS(y_int, sm.add_constant(X_int)).fit()
print(res_int.summary())

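API key for data.gov.in

Do not paste the key into the notebook. Keep it in an environment variable (for example `DATA_GOV_IN_API_KEY`) and read it at run time with `os.environ["DATA_GOV_IN_API_KEY"]`. The key that was previously written in this cell has been exposed to anyone with a copy of the notebook and should be regenerated.

Request URL:

https://api.data.gov.in/resource/8e0bd482-4aba-4d99-9cb9-ff124f6f1c2f?api-key=<DATA_GOV_IN_API_KEY>&format=csv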

CPI data from 2013

In [None]:
# Load the All-India general CPI workbook with the openpyxl engine.
# The file is read from the notebook's raw-data folder rather than from a
# machine-specific absolute path.
file_path = RAW / "All India General CPI-2013.xlsx"

# The context manager closes the workbook handle once the sheet has been parsed.
with pd.ExcelFile(file_path, engine="openpyxl") as xls:
    # List all sheet names to inspect the available data
    print(xls.sheet_names)

    # Load the contents of 'Sheet1' to inspect its structure
    # NOTE: this rebinds `df`, the name Sections 3-7 use for the modelling frame.
    df = xls.parse('Sheet1')

# Display the first few rows
print(df.head(10))

# Melt the dataframe to long format: one row per month
df_long = df.melt(id_vars='Year', var_name='Month', value_name='CPI')

# Convert 'Month' name to number
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_number = {label: position for position, label in enumerate(month_order, start=1)}
# Any column other than 'Year' must be one of the twelve month labels; name the
# offenders explicitly instead of failing on the first one.
unexpected = sorted(set(df_long['Month']) - set(month_number), key=str)
if unexpected:
    raise ValueError(f"Unexpected non-month column(s) in the CPI sheet: {unexpected}")
df_long['Month'] = df_long['Month'].map(month_number)

# Create a proper datetime column (first day of each month)
df_long['Date'] = pd.to_datetime(dict(year=df_long['Year'], month=df_long['Month'], day=1))

# Sort by date and set index
df_long = df_long.sort_values('Date').set_index('Date')

# Resample to quarterly average CPI
# Explicit QuarterEnd offset: same bins as the 'Q' alias, without relying on the
# alias itself (deprecated since pandas 2.2).
df_quarterly = df_long['CPI'].resample(pd.offsets.QuarterEnd(startingMonth=12)).mean().to_frame()
df_quarterly.columns = ['CPI_avg']

# Preview the quarterly CPI
df_quarterly.head(10)


In [None]:
# Compute lagged CPI (1 quarter before)
df_quarterly['CPI_lag1Q'] = df_quarterly['CPI_avg'].shift(1)

# Compute YoY change (compare to same quarter previous year)
df_quarterly['CPI_YoY_pct'] = df_quarterly['CPI_avg'].pct_change(periods=4) * 100

# Drop early rows with missing values due to lag/Yoy calc
cpi_ml_ready = df_quarterly.dropna().copy()
# `df_ml_ready` is kept as an alias, but the GDP cell rebinds that name;
# use `cpi_ml_ready` when the CPI features are needed later.
df_ml_ready = cpi_ml_ready

# Show the prepared DataFrame
# caas_jupyter_tools is not a regular installable dependency of this notebook,
# so fall back to the standard notebook display when it cannot be imported.
try:
    import caas_jupyter_tools as cj
except ImportError:
    cj = None

if cj is not None:
    cj.display_dataframe_to_user(name="Quarterly CPI (ML-Ready)", dataframe=df_ml_ready)
else:
    from IPython.display import display
    display(df_ml_ready)


Quarterly GDP

In [None]:
# Load the newly uploaded GDP file (wide-format)
# The file is read from the notebook's raw-data folder rather than from a
# machine-specific absolute path.
file_path = RAW / "GDP_Quarterly_2011-12 TO 2024-25.xlsx"
df_raw = pd.read_excel(file_path, header=None, engine="openpyxl")

# Extract rows for years, quarters, and GDP values
# Row 0 is forward-filled so a year label written once covers the columns after it.
years = df_raw.iloc[0].ffill().tolist()
quarters = df_raw.iloc[1].tolist()
gdp_values = df_raw.iloc[2].tolist()

# Construct DataFrame
df = pd.DataFrame({
    'Year': years,
    'Quarter': quarters,
    'GDP_Lakhs': gdp_values
})

# Clean GDP values: remove commas, convert to numeric
df['GDP_Lakhs'] = df['GDP_Lakhs'].astype(str).str.replace(',', '', regex=False)
df['GDP_Lakhs'] = pd.to_numeric(df['GDP_Lakhs'], errors='coerce')

# Map quarters to quarter-end dates
# NOTE(review): the year labels appear to be fiscal years written like '2011-12'
# (April to March) - confirm against the workbook. Under that convention Q1 is
# April-June of the first year and Q4 is January-March of the *following* year.
# Mapping Q1..Q4 onto calendar quarters of the first year would date every
# observation one quarter too early.
# Each entry is (years to add to the fiscal start year, month, day).
quarter_end_map = {
    'Q1': (0, 6, 30),
    'Q2': (0, 9, 30),
    'Q3': (0, 12, 31),
    'Q4': (1, 3, 31),
}


def _fy_quarter_end(fy_label, quarter_label):
    """Return the calendar quarter-end Timestamp for a fiscal year / quarter label.

    Gives ``pd.NaT`` when the quarter label is not Q1-Q4 or when the first four
    characters of the year label are not digits.
    """
    spec = quarter_end_map.get(str(quarter_label).strip())
    start_year = str(fy_label).strip()[:4]
    if spec is None or not start_year.isdigit():
        return pd.NaT
    year_shift, month, day = spec
    return pd.Timestamp(year=int(start_year) + year_shift, month=month, day=day)


df['Date'] = pd.to_datetime(
    [_fy_quarter_end(fy, qtr) for fy, qtr in zip(df['Year'], df['Quarter'])]
)

# Columns without a recognisable period (e.g. a leading label column) are dropped
# so they cannot end up in the index or in the lag calculations.
df = df[df['Date'].notna()]

# Set date as index and sort
df = df.set_index('Date').sort_index()

# Create ML features
df['GDP_Cr'] = df['GDP_Lakhs'] / 100
df['GDP_lag1Q'] = df['GDP_Cr'].shift(1)
df['GDP_YoY_pct'] = df['GDP_Cr'].pct_change(periods=4) * 100

# Drop rows with missing values due to lag or YoY computation
gdp_ml_ready = df.dropna(subset=['GDP_Cr', 'GDP_lag1Q', 'GDP_YoY_pct'])
# `df_ml_ready` is kept for compatibility; it is the same name the CPI cell uses,
# so prefer `gdp_ml_ready` when both tables are needed.
df_ml_ready = gdp_ml_ready

# Display the final ML-ready dataframe
# caas_jupyter_tools is not a regular installable dependency of this notebook,
# so fall back to the standard notebook display when it cannot be imported.
try:
    import caas_jupyter_tools as cj
except ImportError:
    cj = None

if cj is not None:
    cj.display_dataframe_to_user(name="GDP Quarterly ML Ready (Wide Format)", dataframe=df_ml_ready)
else:
    from IPython.display import display
    display(df_ml_ready)