In [1]:
### Dependencies
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler

In [6]:
### Data Loading

def load_csv(file_name: str, data_dir: "str | Path | None" = None) -> pd.DataFrame:
    """
    Load a CSV file from the data directory and return it sorted by date.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the data directory.
    data_dir : str or Path, optional
        Directory that contains ``file_name``. When omitted, the ``data``
        folder next to the current working directory's parent is used
        (``Path.cwd().parent / "data"``), which is the original behaviour.

    Returns
    -------
    pd.DataFrame
        The file contents with ``Date`` parsed using the ``%m-%d-%y`` format,
        rows sorted ascending by ``Date`` and the index reset to 0..n-1.

    Raises
    ------
    FileNotFoundError
        If the resolved path does not exist.
    """
    base_dir = Path.cwd().parent / "data" if data_dir is None else Path(data_dir)
    data_path = base_dir / file_name
    if not data_path.exists():
        raise FileNotFoundError(f"{data_path} not found.")

    # NOTE(review): '%y' is a two-digit year. strptime-style parsing maps
    # 69-99 to 1969-1999 and 00-68 to 2000-2068, so any rows dated before 1969
    # would be placed in the future and sorted to the end -- confirm the date
    # range of the CSV.
    df = pd.read_csv(
        data_path,
        parse_dates=['Date'],
        date_format={'Date': '%m-%d-%y'}
    )

    # A stable sort keeps rows that share a date in their original file order.
    df = df.sort_values('Date', kind='stable').reset_index(drop=True)
    return df


def add_features(df: pd.DataFrame, rolling_window: int = 10, scaler_fit_fraction: float = 1.0):
    """
    Add features for ML: returns, rolling volatility, price/volume features, lags.
    Handles NaN and infinite values and standardises the feature columns.

    Parameters
    ----------
    df : pd.DataFrame
        Price data with the columns 'Adj Close', 'High', 'Low', 'Close',
        'Open' and 'Volume'. The input frame is not modified.
    rolling_window : int, default 10
        Window length (in rows) of the rolling standard deviation of 'Return'
        that defines the 'Volatility' target. Must be at least 2.
    scaler_fit_fraction : float, default 1.0
        Fraction (0 < f <= 1) of the leading rows, counted after NaN rows are
        dropped, that the scaler is fitted on. The default fits on every row,
        which is the original behaviour; if the result is later split
        chronologically, that lets statistics of the later rows leak into the
        earlier ones. Pass the training fraction (e.g. 0.8) to fit the scaler
        on the training rows only.

    Returns
    -------
    (pd.DataFrame, list of str, str)
        The prepared frame (features standardised, target left unscaled),
        the list of feature column names, and the target column name
        ('Volatility').

    Raises
    ------
    KeyError
        If a required input column is missing.
    ValueError
        If ``rolling_window`` < 2, ``scaler_fit_fraction`` is outside (0, 1],
        or no rows remain after dropping NaN/inf values.
    """
    if rolling_window < 2:
        # A window of 1 has an undefined sample standard deviation (all NaN).
        raise ValueError("rolling_window must be at least 2.")
    if not 0.0 < scaler_fit_fraction <= 1.0:
        raise ValueError("scaler_fit_fraction must be in the interval (0, 1].")

    required = ['Adj Close', 'High', 'Low', 'Close', 'Open', 'Volume']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    df = df.copy()

    # Returns and volatility
    df['Return'] = df['Adj Close'].pct_change()
    df['Volatility'] = df['Return'].rolling(rolling_window).std()

    # Price-based features
    df['High_Low_pct'] = (df['High'] - df['Low']) / df['Close']
    df['Close_Open_pct'] = (df['Close'] - df['Open']) / df['Open']
    df['MA5'] = df['Close'].rolling(5).mean()
    df['MA10'] = df['Close'].rolling(10).mean()

    # Volume-based features
    df['Volume_pct_change'] = df['Volume'].pct_change()
    df['Volume_MA5'] = df['Volume'].rolling(5).mean()

    # Lag features
    df['Return_lag1'] = df['Return'].shift(1)
    df['Return_lag2'] = df['Return'].shift(2)
    df['Volatility_lag1'] = df['Volatility'].shift(1)

    # Replace inf with NaN (e.g. from division by zero), then drop every row
    # that has a NaN in any column, including the rolling/lag warm-up rows.
    df = (
        df.replace([np.inf, -np.inf], np.nan)
        .dropna()
        .reset_index(drop=True)
    )
    if df.empty:
        raise ValueError("No rows left after dropping NaN/inf values.")

    # Features to scale
    features = [
        'Return', 'High_Low_pct', 'Close_Open_pct', 'MA5', 'MA10',
        'Volume_pct_change', 'Volume_MA5', 'Return_lag1', 'Return_lag2', 'Volatility_lag1'
    ]

    # Scaling: fit on the leading rows only, then transform every row.
    # With the default fraction of 1.0 this equals fit_transform on all rows.
    n_fit = max(1, int(len(df) * scaler_fit_fraction))
    feature_values = df[features].values
    scaler = StandardScaler()
    scaler.fit(feature_values[:n_fit])
    df[features] = scaler.transform(feature_values)

    target_col = 'Volatility'
    return df, features, target_col


# Load raw data
df = load_csv("SPX.csv")

# Preprocess
df_prepared, feature_cols, target_col = add_features(df)

X = df_prepared[feature_cols]
y = df_prepared[target_col]

print(X.head(), y.head())


# Split data into train (80%) and test (20%) by time
TRAIN_FRACTION = 0.8
split_idx = int(len(df_prepared) * TRAIN_FRACTION)

# The split is positional, so it is only chronological if rows are date-sorted.
assert df_prepared['Date'].is_monotonic_increasing, "rows must be sorted by Date"

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# add_features standardises the features with a scaler fitted on every row,
# so the columns carry mean/std information from the test period.
# Standardisation is an affine map, so re-standardising with a scaler fitted
# on the training rows only is equivalent to scaling the raw features with
# train-only statistics, and removes that look-ahead.
split_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    split_scaler.transform(X_train), columns=feature_cols, index=X_train.index
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test), columns=feature_cols, index=X_test.index
)

assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

     Return  High_Low_pct  Close_Open_pct       MA5      MA10  \
0 -0.142291      0.565622       -0.190756 -0.686450 -0.686529   
1 -0.292893      0.426730       -0.399186 -0.686127 -0.686630   
2 -0.071371      0.344870       -0.092604 -0.685998 -0.686577   
3  0.254500      0.308508        0.358396 -0.685905 -0.686425   
4  0.332750      0.455867        0.466694 -0.685840 -0.686268   

   Volume_pct_change  Volume_MA5  Return_lag1  Return_lag2  Volatility_lag1  
0          -0.475246   -0.618562     0.522312     0.455054        -0.131453  
1          -0.261197   -0.618588    -0.183880     0.522396        -0.138714  
2          -0.081415   -0.618562    -0.372534    -0.183897        -0.395098  
3           0.113468   -0.618603    -0.095040    -0.372578        -0.670212  
4           0.435448   -0.618601     0.313170    -0.095044        -0.710004   0    0.007430
1    0.005999
2    0.004463
3    0.004241
4    0.004261
Name: Volatility, dtype: float64
Train size: 14252, Test size: 3564


In [8]:
# Train Linear Regression Model (Richard)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
# MSE is shown in scientific notation because fixed 6-decimal formatting
# leaves only one significant digit at this scale; RMSE is in the same units
# as the target. R2 is the coefficient of determination, not an accuracy:
# it is not a percentage and can be negative, so it is reported as a score.
print(f"Mean Squared Error: {mse:.3e}")
print(f"Root Mean Squared Error: {mse ** 0.5:.6f}")
print(f"R2 score: {r2:.4f}")

# First predictions
print("-------------------")
print("Sample predictions:")
print(y_pred[:5])

Mean Squared Error: 0.000003
Accuracy (R2): 71.25%
-------------------
Sample predictions:
[0.0048275  0.00585225 0.00520611 0.00334023 0.00316915]
