In [3]:
! pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [4]:
# =========================
# 0) Imports, engine check, folders
# =========================
import pandas as pd, numpy as np, warnings, re
from pathlib import Path
warnings.filterwarnings('ignore')

# --- Require a Parquet engine ---
# Probe for a Parquet backend at import time so the notebook fails early, with
# install instructions, instead of failing later at the first Parquet write.
# pyarrow is tried first; fastparquet is only tried when pyarrow is missing.
# PARQUET_ENGINE ends up holding the name of whichever backend was importable.
try:
    import pyarrow  # noqa: F401  (imported only to prove it is installed)
    PARQUET_ENGINE = "pyarrow"
except ImportError:
    try:
        import fastparquet  # noqa: F401  (imported only to prove it is installed)
        PARQUET_ENGINE = "fastparquet"
    except ImportError as e:
        # Neither backend is available: re-raise with actionable guidance,
        # chained to the fastparquet import failure.
        raise ImportError(
            "Parquet output is required but no engine is installed.\n"
            "Install one of:\n"
            "  pip install pyarrow\n"
            "    or\n"
            "  pip install fastparquet"
        ) from e

# Input / output folders, relative to the notebook's working directory.
RAW = Path('./raw')            # change if your files are elsewhere
PROC = Path('./processed')     # derived outputs are written here
# parents=True lets PROC be re-pointed at a nested path that does not exist yet;
# exist_ok=True keeps the cell safe to re-run.
PROC.mkdir(parents=True, exist_ok=True)

In [5]:
# Robust column picker (case-insensitive, partial matches allowed)
def pick(df, candidates):
    """Return the first column of ``df`` that matches one of ``candidates``.

    Matching is attempted in three passes, each honouring the order of
    ``candidates`` (earlier candidates have higher priority):

    1. exact match against ``df.columns``;
    2. case-insensitive exact match;
    3. case-insensitive substring match (candidate contained in column name).

    Column labels and candidates are compared through ``str()`` in passes 2-3,
    so non-string labels (e.g. integer column names) do not raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are searched.
    candidates : iterable
        Candidate column names, in priority order.

    Returns
    -------
    The matching column label exactly as it appears in ``df.columns``,
    or ``None`` when nothing matches.
    """
    candidates = list(candidates)

    # Pass 1: exact label match.
    for cand in candidates:
        if cand in df.columns:
            return cand

    # Pre-compute lower-cased string forms once; keep the original label to return.
    lowered = [(col, str(col).lower()) for col in df.columns]
    keys = [str(cand).lower() for cand in candidates]

    # Pass 2: case-insensitive exact match (so 'date' beats 'Update' for 'Date').
    for key in keys:
        for col, low in lowered:
            if low == key:
                return col

    # Pass 3: substring match. An empty candidate would match every column,
    # so it is skipped rather than returning an arbitrary first column.
    for key in keys:
        if not key:
            continue
        for col, low in lowered:
            if key in low:
                return col

    return None

# Smart date parser: chooses the best of multiple formats by coverage
from dateutil import parser as _dateparser

def smart_parse_dates(s: pd.Series, formats=None, verbose: bool = True) -> pd.Series:
    """Parse date strings with several strategies and keep the best-covering one.

    The input is cast to ``str`` and parsed with:

    * pandas' default inference (``'infer'``),
    * pandas' inference with ``dayfirst=True`` (``'dayfirst'``),
    * each explicit ``strftime`` pattern in ``formats``.

    Every attempt uses ``errors='coerce'``, so unparseable values become ``NaT``.
    Attempts are ranked by the tuple
    ``(rows parsed, median distinct months per year, distinct months overall)``;
    the month terms penalise a day/month swap, which tends to collapse dates
    into few months. On ties the earliest attempt in the list above wins.

    Parameters
    ----------
    s : pandas.Series
        Raw date values (any dtype; converted with ``astype(str)``).
    formats : sequence of str, optional
        Explicit formats to try. Defaults to the built-in list below.
    verbose : bool, default True
        When true, print which strategy was picked and its score.

    Returns
    -------
    pandas.Series
        The parsed datetimes of the winning attempt, aligned to ``s``'s index.
    """
    if formats is None:
        formats = ['%d-%b-%Y', '%d-%b-%y', '%d/%m/%Y', '%m/%d/%Y', '%Y-%m-%d',
                   '%b %d, %Y', '%d %b %Y', '%b %Y', '%m-%Y']

    text = s.astype(str)

    attempts = [
        ('infer', pd.to_datetime(text, errors='coerce')),
        ('dayfirst', pd.to_datetime(text, errors='coerce', dayfirst=True)),
    ]
    attempts.extend(
        (fmt, pd.to_datetime(text, format=fmt, errors='coerce')) for fmt in formats
    )

    def score(dt):
        # A non-datetime result has no .dt accessor; rank it last instead of
        # letting the whole selection crash.
        if not pd.api.types.is_datetime64_any_dtype(dt):
            return (0, 0, 0)
        ok = dt.dropna()
        if ok.empty:
            return (0, 0, 0)
        months = ok.dt.month
        # Group by raw year values (not a Series) so no index alignment is involved.
        months_per_year = months.groupby(ok.dt.year.to_numpy()).nunique()
        return (len(ok), int(months_per_year.median()), int(months.nunique()))

    scored = [(name, dt, score(dt)) for name, dt in attempts]
    # max() returns the first maximal element, so earlier strategies win ties.
    best_name, best_dt, best_score = max(scored, key=lambda t: t[2])
    if verbose:
        print(f"[smart_parse_dates] picked: {best_name} | parsed={best_score[0]} | median months/yr={best_score[1]} | unique months={best_score[2]}")
    return best_dt

def parse_dates_in_df(df, date_col='Date'):
    """Return a copy of ``df`` with ``date_col`` parsed, cleaned and sorted.

    The date column is converted with ``smart_parse_dates``; rows whose date
    could not be parsed are discarded and the remaining rows are ordered by
    date. The input frame itself is not modified.
    """
    frame = df.copy()
    frame[date_col] = smart_parse_dates(frame[date_col])
    has_date = frame[date_col].notna()
    return frame.loc[has_date].sort_values(by=date_col)

def _parse_investing_prices(df, date_col='Date', price_col='Price'):
    """Return a two-column (date, price) frame with clean dtypes.

    Dates are parsed and sorted via ``parse_dates_in_df``. Prices are read as
    text, stripped of thousands separators (``,``) and converted to numbers;
    rows whose price cannot be converted are dropped.
    """
    dated = parse_dates_in_df(df, date_col=date_col)
    price_text = dated[price_col].astype(str)
    without_commas = price_text.str.replace(',', '', regex=False)
    dated[price_col] = pd.to_numeric(without_commas, errors='coerce')
    priced = dated[dated[price_col].notna()]
    return priced[[date_col, price_col]]

def investing_to_quarter_ret(df, date_col='Date', price_col='Price'):
    """Compute quarter-over-quarter returns from a price export.

    The frame is cleaned with ``_parse_investing_prices``, the last available
    price inside each calendar quarter (December-anchored quarter ends) is
    taken, and the simple percentage change between consecutive quarter-end
    prices is returned as a fraction (0.05 == +5%).

    A quarter with no price rows yields ``NaN`` for that quarter and for the
    following one; prices are deliberately NOT forward-filled, so a gap in the
    data cannot masquerade as a 0% return.

    Returns
    -------
    pandas.Series
        Indexed by quarter-end timestamps; the first element is always ``NaN``.
    """
    prices = _parse_investing_prices(df, date_col, price_col)
    # Offset object instead of the 'Q' string alias: identical bins, but not
    # affected by the alias deprecation in newer pandas releases.
    quarter_end = pd.offsets.QuarterEnd(startingMonth=12)
    q_end_price = prices.set_index(date_col)[price_col].resample(quarter_end).last()
    # fill_method=None disables pct_change's legacy implicit forward-fill.
    return q_end_price.pct_change(fill_method=None)

def monthly_to_quarter(series, how='mean'):
    """Aggregate a datetime-indexed series to calendar quarters.

    Parameters
    ----------
    series : pandas.Series
        Series with a ``DatetimeIndex`` (typically monthly observations).
    how : str, default 'mean'
        ``'mean'`` averages the observations inside each quarter. Any other
        value (conventionally ``'last'``) takes the last observation of the
        quarter; this fallback is kept for backward compatibility.

    Returns
    -------
    pandas.Series
        Indexed by December-anchored quarter-end timestamps.
    """
    # Offset object instead of the 'Q' string alias: identical bins, but not
    # affected by the alias deprecation in newer pandas releases.
    by_quarter = series.resample(pd.offsets.QuarterEnd(startingMonth=12))
    if how == 'mean':
        return by_quarter.mean()
    return by_quarter.last()

def yoy_from_monthly(series):
    """Year-over-year percentage change of a monthly series, in percent.

    Each value is compared with the value 12 rows earlier, so the series is
    expected to hold exactly one row per consecutive month; the shift is
    positional, not calendar-aware. The first 12 results are ``NaN``.

    Missing observations are NOT forward-filled: a ``NaN`` input produces a
    ``NaN`` YoY figure rather than one computed from a stale value.
    """
    # fill_method=None disables pct_change's legacy implicit forward-fill.
    return series.pct_change(12, fill_method=None) * 100.0

def check_monthly_coverage(df, date_col='Date', label='series'):
    """Print a coverage report for the dates found in ``df[date_col]``.

    Dates are parsed with ``smart_parse_dates``. The report lists, for every
    year, which calendar months have at least one row, and then states whether
    any calendar quarter between the first and last date has no rows at all.
    Nothing is returned; the function only prints.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the date column.
    date_col : str, default 'Date'
        Name of the column holding the raw dates.
    label : str, default 'series'
        Tag used as a prefix on every printed line.
    """
    dates = smart_parse_dates(df[date_col]).dropna()
    stamps = pd.DatetimeIndex(dates).sort_values()

    # With zero parsed dates the quarter check below would vacuously report
    # success, so say explicitly that coverage could not be assessed.
    if len(stamps) == 0:
        print(f"[{label}] ⚠ No parseable dates in column {date_col!r} — coverage cannot be assessed.")
        return

    ser = pd.Series(1, index=stamps)

    print(f"[{label}] Months present per year:")
    for year, rows in ser.groupby(ser.index.year):
        # Plain ints keep the printed list readable regardless of numpy version.
        months = sorted({int(m) for m in rows.index.month})
        print(f"  {year}: {months}")

    # Offset object instead of the 'Q' string alias: identical bins, but not
    # affected by the alias deprecation in newer pandas releases.
    q_counts = ser.resample(pd.offsets.QuarterEnd(startingMonth=12)).size()
    miss = q_counts[q_counts == 0]
    if len(miss) > 0:
        print(f"[{label}] ⚠ Missing {len(miss)} quarter(s) with zero rows — resample will yield NaNs.")
    else:
        print(f"[{label}] ✅ At least one row in every quarter.")


In [6]:
# Load the two index price exports; both CSVs must already be present in ./raw
n50_raw = pd.read_csv(RAW / 'Nifty50.csv')
mid_raw = pd.read_csv(RAW / 'NIFTYMidcap100.csv')

# Coverage QA up front (surfaces mis-parsed dates or truncated exports)
for _frame, _label in ((n50_raw, 'NIFTY 50'), (mid_raw, 'NIFTY Midcap 100')):
    check_monthly_coverage(_frame, 'Date', _label)

# Quarterly return per index, then mid-cap minus NIFTY 50 as the excess return
nifty_qret = investing_to_quarter_ret(n50_raw).rename('nifty_qret')
midcap_qret = investing_to_quarter_ret(mid_raw).rename('midcap_qret')
excess_ret = midcap_qret.sub(nifty_qret).rename('excess_ret')

print("Index quarterly points (excess_ret):", excess_ret.count())


[smart_parse_dates] picked: infer | parsed=188 | median months/yr=12 | unique months=12
[NIFTY 50] Months present per year:
  2010: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2011: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2012: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2013: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2014: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2015: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2016: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2017: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2018: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2019: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2020: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2021: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2022: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2023: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2024: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
  2025: [1, 2, 3, 4, 5, 6, 7, 8]
[NIFTY 50] ✅ At least one row in every quarter.
[smart_parse_dates] picked: infer | parsed=188 | median months/yr=12 | uniq

In [None]:
# CPI monthly to YoY% to quarterly mean
cpi_raw = pd.read_csv(RAW/'CPI_Monthly_Jan_2013_to_Jun_2025.csv')  # <- file name you uploaded

# Locate the date and value columns by name; candidates are in priority order.
# (For this export they are expected to resolve to 'DATE' and
# 'CPI_COMBINED_BASE2012_100' — verify against the printed columns if not.)
c_date = pick(cpi_raw, ['DATE','Date','Month','Period'])
c_val  = pick(cpi_raw, ['CPI','Index','Value','CPI_COMBINED_BASE2012_100'])

# pick() returns None when nothing matches; fail here with a readable message
# instead of an opaque "KeyError: None" further down.
if c_date is None or c_val is None:
    raise KeyError(
        f"Could not identify CPI columns (date={c_date!r}, value={c_val!r}); "
        f"available columns: {list(cpi_raw.columns)}"
    )

# Parsed copy: dates converted, unparseable rows dropped, sorted by date.
# The untouched file contents stay available as cpi_raw.
cpi = parse_dates_in_df(cpi_raw, c_date)
cpi_m = cpi.set_index(c_date)[c_val].astype(float).rename('cpi_index')

# YoY requires 12 months of history, so the first 12 rows will be NaN (expected).
# NOTE: the comparison is 12 rows back, so this assumes one row per consecutive month.
cpi_yoy_m = yoy_from_monthly(cpi_m).rename('cpi_yoy_m')

# Aggregate to quarterly (mean of three months)
cpi_yoy_q = monthly_to_quarter(cpi_yoy_m, how='mean').rename('cpi_yoy')
print("CPI monthly points:", cpi_m.shape[0], "| CPI YoY monthly (non-NaN):", cpi_yoy_m.dropna().shape[0])
print("CPI quarterly points:", cpi_yoy_q.dropna().shape[0])
