In [7]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

# Read the cleaned training set (point this at test_data.csv for the test split)
df = pd.read_csv("train_data_cleaned.csv")

# Bucket every call/put IV column under the integer strike embedded in its name
iv_cols = [name for name in df.columns if re.match(r'(call|put)_iv_\d+', name)]
strike_iv_map = defaultdict(list)
for name in iv_cols:
    first_digits = re.search(r'\d+', name)
    strike_iv_map[int(first_digits.group())].append(name)

# One unified IV series per strike: row-wise mean over that strike's columns,
# ignoring NaNs (a row is NaN only when every column for that strike is NaN)
unified_ivs = {
    f"iv_{k}": df[members].mean(axis=1, skipna=True)
    for k, members in strike_iv_map.items()
}
unified_df = pd.DataFrame(unified_ivs)

# Put the underlying and the X0..X41 feature columns in front of the unified IVs
base_cols = ['underlying'] + [f'X{i}' for i in range(42)]
df_iv = pd.concat([df[base_cols], unified_df], axis=1)

# Moneyness per strike: strike divided by the row's underlying
for iv_name in unified_df.columns:
    k = int(iv_name.split("_")[1])
    df_iv[f"m_{k}"] = k / df_iv['underlying']

# Ascending list of the distinct strikes
unique_strikes = sorted(int(iv_name.split("_")[1]) for iv_name in unified_df.columns)


In [None]:
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline
import random

def fit_spline_with_mirroring(moneyness, ivs, m_underlying=1.0):
    """Fit a clamped cubic spline through (moneyness, iv) points, mirroring a sparse side.

    Only points where both the IV and its moneyness are finite are used.
    If one side of ``m_underlying`` holds fewer than two points while the
    other holds at least two, the populated side is reflected about
    ``m_underlying`` (x -> 2 * m_underlying - x, same IV) so the fit is
    constrained on both sides.

    Parameters
    ----------
    moneyness : array-like of float
        x-coordinates, aligned element-wise with ``ivs``.
    ivs : array-like of float
        Values to interpolate; NaN/inf entries are ignored.
    m_underlying : float, default 1.0
        x-value used as the reflection centre and as the anchor for the
        end-slope estimates.

    Returns
    -------
    scipy.interpolate.CubicSpline or None
        Spline built with ``extrapolate=False`` (it evaluates to NaN outside
        the fitted x-range), or ``None`` when fewer than three usable
        distinct points are available.
    """
    # Accept lists / object-dtype arrays as well as float ndarrays.
    moneyness = np.asarray(moneyness, dtype=float)
    ivs = np.asarray(ivs, dtype=float)

    # A non-finite x would make CubicSpline raise, so filter on both arrays.
    usable = np.isfinite(ivs) & np.isfinite(moneyness)
    x, y = moneyness[usable], ivs[usable]

    if len(x) < 3:
        return None

    left = x < m_underlying
    right = x > m_underlying
    if left.sum() < 2 and right.sum() >= 2:
        mirror = right
    elif right.sum() < 2 and left.sum() >= 2:
        mirror = left
    else:
        mirror = None

    if mirror is not None:
        # Original observations go first so they win over reflected copies
        # when the de-duplication below finds a collision.
        x, y = (
            np.concatenate([x, 2 * m_underlying - x[mirror]]),
            np.concatenate([y, y[mirror]]),
        )

    # Sort and drop repeated x-values (keeping the first occurrence): a
    # reflected point can land exactly on an existing one, and CubicSpline
    # requires strictly increasing x.
    x, first_seen = np.unique(x, return_index=True)
    y = y[first_seen]
    if len(x) < 3:
        return None

    # End slopes: 1.5x the secant from each end to the point nearest
    # m_underlying; the 1e-6 keeps the division defined when that point is
    # itself the end point.
    idx_u = np.abs(x - m_underlying).argmin()
    slope_l = 1.5 * (y[idx_u] - y[0]) / (x[idx_u] - x[0] + 1e-6)
    slope_r = 1.5 * (y[-1] - y[idx_u]) / (x[-1] - x[idx_u] + 1e-6)

    return CubicSpline(x, y, bc_type=((1, slope_l), (1, slope_r)), extrapolate=False)

def process_row(row, unique_strikes, rng=None):
    """Mask half of a row's known IVs, spline-fit the rest, and emit residual records.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``iv_<strike>`` and ``m_<strike>`` for every strike in
        ``unique_strikes``, plus ``underlying`` and ``X0`` .. ``X41``.
    unique_strikes : sequence of int
        Strikes defining the column order of the IV / moneyness vectors.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for choosing the masked strikes. When omitted,
        NumPy's global ``np.random`` state is used (the historical behaviour);
        pass a seeded generator for reproducible masks.

    Returns
    -------
    list of dict
        One record per masked strike with the spline prediction, the residual
        (true IV minus prediction), moneyness features, row-level IV stats and
        the row's ``underlying`` / ``X*`` values. Empty when the row has fewer
        than 6 known IVs or no spline could be fitted.

    Notes
    -----
    * Predictions (and therefore residuals) are NaN for masked strikes that
      fall outside the x-range of the points left for fitting, because the
      spline is built with ``extrapolate=False``.
    * NOTE(review): ``iv_min`` / ``iv_max`` / ``iv_mean`` are computed from all
      known IVs of the row, including the masked ones being predicted. If these
      records train a model whose inputs will not contain the target IVs, this
      leaks target information; confirm against the downstream usage.
    """
    ivs = row[[f"iv_{s}" for s in unique_strikes]].to_numpy(dtype=float)
    moneyness = row[[f"m_{s}" for s in unique_strikes]].to_numpy(dtype=float)

    known = np.flatnonzero(~np.isnan(ivs))
    if len(known) < 6:
        return []

    # Hide half (rounded down) of the known IVs; these become the targets.
    sampler = np.random if rng is None else rng
    masked_idx = sampler.choice(known, size=len(known) // 2, replace=False)
    ivs_masked = ivs.copy()
    ivs_masked[masked_idx] = np.nan

    spline = fit_spline_with_mirroring(moneyness, ivs_masked)
    if spline is None:
        return []

    preds = spline(moneyness[masked_idx])
    residuals = ivs[masked_idx] - preds

    # Row-level fields are identical for every masked strike: build them once.
    shared = {
        "iv_min": np.nanmin(ivs),
        "iv_max": np.nanmax(ivs),
        "iv_mean": np.nanmean(ivs),
        "underlying": row['underlying'],
        **{f"X{j}": row[f"X{j}"] for j in range(42)},
    }

    return [
        {
            "residual": residuals[pos],
            "iv_spline": preds[pos],
            "moneyness": moneyness[s_idx],
            "log_moneyness": np.log(moneyness[s_idx]),
            **shared,
        }
        for pos, s_idx in enumerate(masked_idx)
    ]

# Parallel processing with progress
MASK_SEED = 42  # base seed; the row at position k is processed under seed MASK_SEED + k


def _process_row_seeded(seed, row, unique_strikes):
    """Seed NumPy's global RNG, then run ``process_row`` on one row.

    ``process_row`` draws its mask from ``np.random``; seeding right before
    each call makes the mask depend only on the row's position, not on which
    worker runs it or in what order, so the output is reproducible.
    NOTE(review): this relies on workers being separate processes (joblib's
    default backend); with a thread backend the global seed would be shared.
    """
    np.random.seed(seed)
    return process_row(row, unique_strikes)


results = Parallel(n_jobs=-1)(
    delayed(_process_row_seeded)(MASK_SEED + pos, row, unique_strikes)
    for pos, (_, row) in enumerate(tqdm(df_iv.iterrows(), total=len(df_iv)))
)

# Flatten results: one DataFrame row per masked strike across all input rows
train_residual_df = pd.DataFrame([item for sublist in results for item in sublist])


  5%|▍         | 8208/178340 [00:05<01:47, 1579.64it/s]
 78%|███████▊  | 138768/178340 [01:49<00:38, 1016.78it/s]