In [2]:
from volo_tools import volo_feature_impl, volo_feature_impl_v2
import importlib
importlib.reload(volo_feature_impl_v2)
import importlib
import pandas as pd
import glob
import os
import numpy as np
importlib.reload(volo_feature_impl)
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from volo_tools import volo_model 

In [4]:
train_window = 500
steps = 30
smoothing_window = 5
feature_processor = volo_feature_impl_v2.VoloFeatureImplV2()

index = "SPY"
index_df = feature_processor.feature_process_var_index(index, future_window=45)
# Get target_variance_index_multiplier as y and keep as data frame and indexed by date
# Copy first into y_index with date index
print(index_df.head())
y_index = index_df["target_variance_index_multiplier"].to_frame()
y_index.index = index_df.index
# remove anything with any text with future in the column name
X_index = index_df.loc[:, 
    ~index_df.columns.str.contains('Future', case=False) &
    ~index_df.columns.str.contains('target', case=False)
]
X_index.dropna(inplace=True)
print(X_index.head())

csv_files = glob.glob(os.path.join("data/", "*.csv"))
valid_symbols = [os.path.basename(f).replace("_data.csv", "") for f in csv_files if pd.read_csv(f).shape[0] > 1300]
x_data = feature_processor.feature_collector(valid_symbols)

# Merge X with x_data on index and preserve index
X_index = pd.merge(
    X_index, x_data, left_index=True, right_index=True, how="left"
)
model_processor = volo_model.VoloModel()
var_index_pred = model_processor.rolling_xgb_pca_fold(
    x_data,
    y_index,
    train_window=train_window,
    step=5,
    fold_interval=5,
    low_importance_ratio=0.3,
    n_pca_components=3
)

var_index_pred_df = var_index_pred[0]

# Use har_pred to get actual variance prediction as data frame
var_index_pred_df = var_index_pred_df.merge(
    index_df[["HAR_Pred"]],
    left_index=True,
    right_index=True,
    how="left"
)
var_index_pred_df["Predicted_Index_Var"] = var_index_pred_df["Predicted"] * var_index_pred_df["HAR_Pred"]
future_window = 45


            HAR_Pred  target_variance_index_multiplier
Date                                                  
2010-05-12  0.000149                          1.744695
2010-05-13  0.000145                          1.783744
2010-05-14  0.000147                          1.747158
2010-05-17  0.000144                          1.804784
2010-05-18  0.000146                          1.832502
            HAR_Pred
Date                
2010-05-12  0.000149
2010-05-13  0.000145
2010-05-14  0.000147
2010-05-17  0.000144
2010-05-18  0.000146


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_index.dropna(inplace=True)
  combined_features.reset_index(inplace=True)



[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 545: Combining 271 low-importance features into 3 PCA components

[PCA Fold] Window 570: Combining 191 low-importance features into 3 PCA components

[PCA Fold] Window 595: Combining 134 low-importance features into 3 PCA components

[PCA Fold] Window 620: Combining 95 low-importance features into 3 PCA components

[PCA Fold] Window 645: Combining 67 low-importance features into 3 PCA components

[PCA Fold] Window 670: Combining 48 low-importance features into 3 PCA components

[PCA Fold] Window 695: Combining 35 low-importance features into 3 PCA components

[PCA Fold] Window 720: Combining 25 low-importance features into 3 PCA components

[PCA Fold] Window 745: Combining 18 low-importance features into 3 PCA components

[PCA Fold] Window 770: Combining 14 low-importance features into 3 PCA components

[PCA Fold] Window 795: Combining 11 low-importance features into 3 PCA components


In [8]:

def run_through(stock):
    
    merged_df = feature_processor.feature_process_var_corr(index, stock, future_window)
    y_beta = merged_df[f"Future_Beta_45d_{stock}"].to_frame()

    # keep as data frame for y
    y_covar = merged_df[f"Future_Cov_45_{stock}"]
    y_correlation = merged_df[f"target_correlation"]
    y_variance = merged_df[f"target_variance"]
    # Turn into series dataframe
    y_covar = y_covar.to_frame()
    y_correlation = y_correlation.to_frame()
    y_variance = y_variance.to_frame()

    #print(y_correlation.head())


    # remove anything with any text with future in the column name
    X = merged_df.loc[:, 
        ~merged_df.columns.str.contains('Future', case=False) &
        ~merged_df.columns.str.contains('target', case=False)
    ]
    X.dropna(inplace=True)
    #print(X.head())

    csv_files = glob.glob(os.path.join("data/", "*.csv"))
    valid_symbols = [os.path.basename(f).replace("_data.csv", "") for f in csv_files if pd.read_csv(f).shape[0] > 1300]
    x_data = feature_processor.feature_collector(valid_symbols)

    # Merge X with x_data on index and preserve index
    X = pd.merge(
        X, x_data, left_index=True, right_index=True, how="left"
    )

    #print(X.head())
    # Print dimensions of X
    #print(f"Feature matrix shape: {X.shape}")
    

    model_processor = volo_model.VoloModel()
    corrpred_df = model_processor.rolling_xgb_pca_fold(
        x_data,
        y_correlation,
        train_window=train_window,
        step=5,
        fold_interval=5,
        low_importance_ratio=0.3,
        n_pca_components=3
    )
    corrpred_df[0]["Predicted"] = corrpred_df[0]["Predicted"].ewm(span=5, adjust=False).mean()
    pred_converted_corr = pd.DataFrame()
    pred_converted_corr = pd.merge(
        corrpred_df[0],
        merged_df[[f"HAR_Pred_Corr_{stock}"]],
        left_index=True,
        right_index=True,
        how="left"
    )
    # Merge again with 
    pred_converted_corr = pd.merge(
        pred_converted_corr,
        merged_df[[f"Future_Corr_45_{stock}"]],
        left_index=True,
        right_index=True,
        how="left"
    )
    pred_converted_corr["Predicted_Corr"] = pred_converted_corr["Predicted"] * pred_converted_corr[f"HAR_Pred_Corr_{stock}"]
    pred_converted_corr["Actual_Corr"] = merged_df[f"Future_Corr_45_{stock}"]
    #print(pred_converted_corr.head())
    mse_corr = mean_squared_error(pred_converted_corr["Actual_Corr"], pred_converted_corr["Predicted_Corr"])
    r2_corr = r2_score(pred_converted_corr["Actual_Corr"], pred_converted_corr["Predicted_Corr"])
    #print(f"Correlation MSE: {mse_corr:.6f} | R²: {r2_corr:.4f}")

    varpred_df = model_processor.rolling_xgb_pca_fold(
        x_data,
        y_variance,
        train_window=500,
        step=5,
        fold_interval=5,
        low_importance_ratio=0.3,
        n_pca_components=3
    )
    #print(varpred_df[0])

    # Smooth predictions by exponential moving average
    varpred_df[0]["Predicted"] = varpred_df[0]["Predicted"].ewm(span=10, adjust=False).mean()

    # From the merged_df, get the correlation for the stock by multiplying correlation with HAR_Pred_Corr_{stock} into new dataframe
    pred_converted = pd.DataFrame()
    # Merge first
    pred_converted = pd.merge(
        varpred_df[0],
        merged_df[[f"HAR_Pred_Var_{stock}"]],
        left_index=True,
        right_index=True,
        how="left"
    )
    # Merge again with 
    pred_converted = pd.merge(
        pred_converted,
        merged_df[[f"Future_Var_45_{stock}"]],
        left_index=True,
        right_index=True,
        how="left"
    )
    pred_converted["Predicted_Var"] = pred_converted["Predicted"] * pred_converted[f"HAR_Pred_Var_{stock}"]
    pred_converted["Actual_Var"] = merged_df[f"Future_Var_45_{stock}"]
    #print(pred_converted[["Predicted_Var", "Actual_Var", f"HAR_Pred_Var_{stock}"]].head())
    mse_var = mean_squared_error(pred_converted["Actual_Var"], pred_converted["Predicted_Var"])
    r2_var = r2_score(pred_converted["Actual_Var"], pred_converted["Predicted_Var"])
    #print(f"Variance MSE: {mse_var:.15f} | R²: {r2_var:.4f}")
    # Merge pred_df_spy_var and corrpred_df (The stock correlation to SPY) and varpred_df (the stock var) to get covariance prediction
    #print(pred_converted_corr)
    final_pred = pd.DataFrame()
    final_pred = pd.merge(
        var_index_pred_df[["Predicted_Index_Var"]],
        pred_converted_corr[["Predicted_Corr"]],
        left_index=True,
        right_index=True,
        how="left",
    )
    final_pred = pd.merge(
        final_pred,
        pred_converted[["Predicted_Var"]],
        left_index=True,
        right_index=True,
        how="left",
    )
    final_pred = pd.merge(
        final_pred,
        y_beta,
        left_index=True,
        right_index=True,
        how="left"
    )
    
    #print(final_pred.head())
    final_pred["Predicted_Covariance"] = np.sqrt(final_pred["Predicted_Index_Var"]) * final_pred["Predicted_Corr"] * np.sqrt(final_pred["Predicted_Var"])
    final_pred = pd.merge(
        final_pred,
        merged_df[[f"Future_Cov_45_{stock}"]],
        left_index=True,
        right_index=True,
        how="left"
    )
    final_pred["Beta_Predicted"] = final_pred["Predicted_Covariance"] / final_pred["Predicted_Index_Var"]
    final_pred.to_csv(f"results/predicted_covariance_{stock}.csv")
    
    # Beta mse
    mse_beta = mean_squared_error(y_beta[f"Future_Beta_45d_{stock}"], final_pred["Beta_Predicted"])
    r2_beta = r2_score(y_beta[f"Future_Beta_45d_{stock}"], final_pred["Beta_Predicted"])
    print(f"Beta MSE: {mse_beta:.15f} | R²: {r2_beta:.4f}")
    

    final_pred.to_csv("results/data/predicted_covariance_xgb.csv")
    # drop any na
    final_pred.dropna(inplace=True)
    mse_cov = mean_squared_error(final_pred[f"Future_Cov_45_{stock}"], final_pred["Predicted_Covariance"])
    r2_cov = r2_score(final_pred[f"Future_Cov_45_{stock}"], final_pred["Predicted_Covariance"])
    print(f"Covariance MSE: {mse_cov:.15f} | R²: {r2_cov:.4f}")
    
    # Save mse and r2 to a text file
    with open(f"results/metrics/metrics_{stock}.txt", "w") as f:
        f.write(f"Correlation MSE: {mse_corr:.15f} | R²: {r2_corr:.4f}\n")
        f.write(f"Variance MSE: {mse_var:.15f} | R²: {r2_var:.4f}\n")
        f.write(f"Covariance MSE: {mse_cov:.15f} | R²: {r2_cov:.4f}\n")
        f.write(f"Beta MSE: {mse_beta:.15f} | R²: {r2_beta:.4f}\n")

In [None]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_symbol(symbol):
    try:
        print(f"Running through symbol: {symbol}")
        run_through(symbol)
    except Exception as e:
        print(f"Error processing {symbol}: {e}")

csv_files = [f for f in os.listdir("data") if f.endswith("_data.csv")]
valid_symbols_to_run = [
    os.path.basename(f).replace("_data.csv", "")
    for f in csv_files
    if pd.read_csv(os.path.join("data", f)).shape[0] > 1300
]

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(process_symbol, symbol): symbol for symbol in valid_symbols_to_run}
    for future in as_completed(futures):
        symbol = futures[future]
        try:
            future.result()
        except Exception as e:
            print(f"{symbol} failed: {e}")


Running through symbol: AABA
Running through symbol: AAPL
Running through symbol: ABT
Running through symbol: ACN
Running through symbol: AET
Running through symbol: AGG
Running through symbol: AGN
Running through symbol: AIG
            Code_SPY Symbol_SPY  Close_SPY   Volume_SPY  \
Date                                                      
2010-01-05     59751        SPY     113.63  112000000.0   
2010-01-06     59751        SPY     113.71  116000000.0   
2010-01-07     59751        SPY     114.19  131000000.0   
2010-01-08     59751        SPY     114.57  126000000.0   
2010-01-11     59751        SPY     114.73  106000000.0   

            Adjustment Factor_SPY  Adj_Close_SPY  Log_Return_SPY  \
Date                                                               
2010-01-05                    1.0         113.63        0.002644   
2010-01-06                    1.0         113.71        0.000704   
2010-01-07                    1.0         114.19        0.004212   
2010-01-08          

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace=True)
  combined_features.reset_index(inplace=True)
  combined_features.reset_index(inplace=True)
  combined_features.reset_index(inplace=True)
  combined_features.reset_index(inplace=True)
  combined_features.reset_index(inplace=True)
  combined_features.reset_index(inplace=True)
  combined_features.reset_index(inplace=True)
  combined_features.reset_index(inplace=True)



[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 520: Combining 386 low-importance features into 3 PCA components

[PCA Fold] Window 545: Combining 271 low-importance features into 3 PCA components

[PCA Fold] Window 545: Combining 271 low-importance features into 3 PCA components

[PCA Fold] Window 545: Combining 271 low-importance features into 3 PCA components

[PCA Fold] Window 545: Combining 271 low-importance features into 3 PCA com