In [3]:
### Dependencies
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler

In [4]:
### Data Loading

def load_csv(file_name: str) -> pd.DataFrame:
    """
    Load a CSV file from the data directory and return it sorted by date.

    The file is looked up at ``<parent of cwd>/data/<file_name>``. Its
    ``Date`` column is expected to hold ``MM/DD/YY`` strings; values that do
    not match that format become ``NaT``.

    Args:
        file_name: Name of the CSV file inside the data directory.

    Returns:
        The file contents with ``Date`` converted to datetime, sorted by
        ``Date`` in ascending order, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the file does not exist in the data directory.
    """
    data_path = Path.cwd().parent / "data" / file_name
    if not data_path.exists():
        raise FileNotFoundError(f"{data_path} not found.")

    # Read 'Date' as plain text. Letting read_csv parse it (parse_dates) would
    # guess the format per element, emit a format-inference warning, and make
    # the explicit format below a no-op on an already-parsed column.
    df = pd.read_csv(data_path, dtype={'Date': str})

    # Parse 'Date' explicitly as MM/DD/YY; non-matching values become NaT
    df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y', errors='coerce')

    # Correct future years: a two-digit year such as '30' is parsed as 2030.
    # Assume dates from 1928–2020, so anything later than today is really
    # one century earlier. NaT compares False and is left untouched.
    in_future = df['Date'] > pd.Timestamp.today()
    df.loc[in_future, 'Date'] -= pd.offsets.DateOffset(years=100)

    # Sort by date
    df = df.sort_values('Date').reset_index(drop=True)
    return df



def add_features(df: pd.DataFrame, rolling_window: int = 10, horizon: int = 10) -> tuple:
    """
    Add features for ML: returns, rolling volatility, price/volume features, lags.

    Works on a copy of ``df``. Infinite values are replaced with NaN, rows
    without a current or future volatility are dropped, and the feature
    columns are standardized with ``StandardScaler``. Other feature columns
    (e.g. the lag features) may still contain NaN in the returned frame;
    callers are expected to filter those rows.

    Args:
        df: Price data with the columns 'Adj Close', 'High', 'Low', 'Close',
            'Open' and 'Volume', ordered by time.
        rolling_window: Number of rows used for the rolling standard
            deviation of returns ('Volatility'). Must be at least 2.
        horizon: Number of rows ahead at which 'Volatility' is taken as the
            target ('Volatility_future'). Must be at least 1. With
            ``horizon < rolling_window`` the target window overlaps returns
            already visible in the features.

    Returns:
        A tuple ``(prepared_df, features, target_col)``: the prepared frame
        with a fresh 0..n-1 index, the list of scaled feature column names,
        and the name of the target column.

    Raises:
        KeyError: If a required column is missing from ``df``.
        ValueError: If ``rolling_window`` or ``horizon`` is out of range, or
            if no row has both a current and a future volatility.
    """
    required = ['Adj Close', 'High', 'Low', 'Close', 'Open', 'Volume']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")
    if rolling_window < 2:
        raise ValueError("rolling_window must be at least 2 to compute a standard deviation.")
    if horizon < 1:
        raise ValueError("horizon must be at least 1.")

    df = df.copy()

    # Returns and volatility
    df['Return'] = df['Adj Close'].pct_change()
    df['Volatility'] = df['Return'].rolling(rolling_window).std()

    # Price-based features
    df['High_Low_pct'] = (df['High'] - df['Low']) / df['Close']
    df['Close_Open_pct'] = (df['Close'] - df['Open']) / df['Open']
    df['MA5'] = df['Close'].rolling(5).mean()
    df['MA10'] = df['Close'].rolling(10).mean()

    # Volume-based features
    df['Volume_pct_change'] = df['Volume'].pct_change()
    df['Volume_MA5'] = df['Volume'].rolling(5).mean()

    # Lag features
    df['Return_lag1'] = df['Return'].shift(1)
    df['Return_lag2'] = df['Return'].shift(2)
    df['Volatility_lag1'] = df['Volatility'].shift(1)

    # Target: volatility `horizon` rows ahead (10 by default)
    df['Volatility_future'] = df['Volatility'].shift(-horizon)

    # Replace inf with NaN (e.g. a percentage change from a zero value)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Fill volume-related NaNs (common in early data)
    df['Volume_pct_change'] = df['Volume_pct_change'].fillna(0)
    df['Volume_MA5'] = df['Volume_MA5'].ffill()

    # Drop only rows where the key features are missing. Building a new frame
    # via .loc + reset_index avoids assigning into a filtered slice below.
    has_volatility = df['Volatility'].notna() & df['Volatility_future'].notna()
    df = df.loc[has_volatility].reset_index(drop=True)
    if df.empty:
        raise ValueError(
            "No rows left after dropping missing volatility values; "
            "input is too short for the given rolling_window and horizon."
        )

    # Features to scale
    features = [
        'Return', 'High_Low_pct', 'Close_Open_pct', 'MA5', 'MA10',
        'Volume_pct_change', 'Volume_MA5', 'Return_lag1', 'Return_lag2', 'Volatility_lag1'
    ]

    # Scaling
    # NOTE(review): the scaler is fit on every row, so a later train/test
    # split of the returned frame sees statistics computed from test rows.
    scaler = StandardScaler()
    df[features] = scaler.fit_transform(df[features].values)

    target_col = 'Volatility_future'
    return df, features, target_col


# Load raw data
df = load_csv("SPX.csv")

# Preprocess
df_prepared, feature_cols, target_col = add_features(df)

X = df_prepared[feature_cols]
y = df_prepared[target_col]

# Keep only rows where every feature and the target are present
mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]

print(X.head(), y.head())


# Split data into train (80%) and test (20%) by time.
# The split index is taken from the filtered X (not df_prepared) so the
# 80/20 ratio still holds when the mask above removed rows.
split_idx = int(len(X) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

  df = pd.read_csv(data_path, parse_dates=['Date'])


     Return  High_Low_pct  Close_Open_pct       MA5      MA10  \
0 -1.245743      1.441459       -1.718118 -0.724791 -0.725224   
1  0.113232      1.229569        0.162853 -0.725109 -0.725223   
2 -0.455362      1.436386       -0.624143 -0.725362 -0.725264   
3  1.165124      1.867372        1.618788 -0.725466 -0.725044   
4  0.348258      1.721721        0.488155 -0.725460 -0.724928   

   Volume_pct_change  Volume_MA5  Return_lag1  Return_lag2  Volatility_lag1  
0          -0.649482   -0.615332    -0.161107     0.614987         0.420192  
1          -0.266756   -0.616119    -1.566368    -0.161073         0.641917  
2           0.276079   -0.616346     0.136393    -1.566493         0.623764  
3           0.061046   -0.616502    -0.576041     0.136461         0.646338  
4           0.529473   -0.616396     1.454387    -0.576053         0.618225   0    0.011782
1    0.011681
2    0.011807
3    0.011650
4    0.010676
Name: Volatility, dtype: float64
Train size: 14252, Test size: 3564
