In [141]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [142]:


import math
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import statsmodels.api as sm
from scipy.special import expit
from scipy import stats
import seaborn as sns


from sklearn.utils import shuffle as sklearn_shuffle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


# from misc_modules import dm_test, plot_double_standard


In [143]:
etfs = ['ICLN', 'PBD', 'QCLN']  # ETF symbols
sequence_size = 4  # Number of time steps in sequence
cross_window = 3  # Number of cross-validation windows

lags = [1]
predType = 'ahead_Return'
predLabel = 'Log-Return'

pred_size=250
model_name = f'reg-{predType}'

save_path = '../../results'
data_path = '../../data'
# 'GT Sent', 'INV Sent', ,
sent_dict =  {
    'SENT': [ 'log_ovx', 'log_return', 'log_navR', 'GT_VAL_SENT', 'INV_VAL_SENT'],
    'NO_SENT': [ 'log_ovx', 'log_return', 'log_navR']
}


In [162]:
def root_mean_squared_loss(y_true,y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

def huber_loss(y_true, y_pred, delta=1.0):
    return tf.keras.losses.Huber(delta=delta)(y_true, y_pred)

def mean_squared_log_error(y_true, y_pred):
    return tf.keras.losses.MeanSquaredLogarithmicError()(y_true, y_pred)


class model:
    def __init__(self, model_type, input_shape=None):
        self.model_type = model_type
        self.input_shape = input_shape
        self.model = self._choose_model()

    def _choose_model(self):
        """Choose and instantiate the model based on the provided model type."""
        if self.model_type == 'svr':
            return SVR(kernel='rbf', C=1.0, gamma='scale')
        elif self.model_type == 'random_forest':
            return RandomForestRegressor(n_estimators=100, max_depth=10,  random_state=42)
        elif self.model_type == 'xgboost':
            return xgb.XGBRegressor(learning_rate=0.01, max_depth=10, n_estimators=100, subsample=0.8, random_state=42)
        elif self.model_type == 'lightgbm':
            return lgb.LGBMRegressor(num_leaves=31, learning_rate=0.01, n_estimators=400, random_state=42,force_col_wise=True)
        elif self.model_type == 'catboost':
            return CatBoostRegressor(iterations=100, depth=10, learning_rate=0.05, verbose=0, random_state=42)
        else:
            raise ValueError(f"Invalid model type: {self.model_type}")


    def fit(self, X_train, y_train,epochs=100, batch_size=32, validation_split=0.2,shuffle=True):
        """Fit the model depending on the type."""

        
        if self.model_type in ['svr', 'random_forest', 'xgboost', 'lightgbm', 'catboost']:
            if shuffle:
                X_train, y_train = sklearn_shuffle(X_train, y_train, random_state=42)
            self.model.fit(X_train, y_train)

    def predict(self, X):
        """Predict using the chosen model."""
        if self.model_type in ['svr', 'random_forest', 'xgboost', 'lightgbm', 'catboost']:
            return self.model.predict(X)



In [145]:
def fetch(file, lag):
    data = pd.read_csv(f"{data_path}/{file}/{file}_INPUT.csv")
    scaler = StandardScaler()
    sent_cols = sent_dict['SENT']
    data[sent_cols] = scaler.fit_transform(data[sent_cols])
    data['ahead_Return'] = data['log_return'].shift(-1 * lag)
    data['ahead_vol'] = data['Garchvol'].shift(-1 * lag)
    data['ahead_mvol'] = data['MAvol'].shift(-1*(lag))
    data = data[:-1 * lag]  # Drop rows corresponding to lag
    data.reset_index(drop=True, inplace=True)
    return data

# Create sequences of input data
def sequences(X, y, timesteps):
    """
    Generate sequences for time series models.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    alpha, beta = [], []
    n = timesteps
    for i in range(X.shape[0]):
        if i < n - 1:
            continue
        alpha.append(X[i - (n - 1):i + 1])
        beta.append(y[i])

    return np.asarray(alpha), np.asarray(beta)


In [146]:

# Define helper functions
def concat_results_and_mean(arrays):
    """
    Concatenate and compute mean across all arrays.
    """
    new_arrays = [np.array(single_arr).reshape(-1) for single_arr in arrays]
    mean_array = np.mean(new_arrays, axis=0)
    return mean_array

def getanalysis(y_true, y_pred):
    """
    Calculate MAE and Directional Accuracy.
    """
    mae = mean_absolute_error(y_true, y_pred)
    da = directional_accuracy(y_true, y_pred)
    return mae, da

def directional_accuracy(y_true, y_pred, mean_threshold=0.3):
    """
    Calculate Directional Accuracy (DA) using the mean as a threshold.
    
    Parameters:
    y_true : array-like
        True target values scaled between 0 and 1.
    y_pred : array-like
        Predicted values scaled between 0 and 1.
    mean_threshold : float
        The mean value (threshold) used to decide the direction.
    """
    
    # Determine the direction based on whether values are above or below the mean_threshold
    correct_directions = (y_true > mean_threshold) == (y_pred > mean_threshold)
    
    # Return the mean of the correctly predicted directions
    return np.mean(correct_directions)


In [147]:
def get_average_analysis(results_list, actual_list):
    """
    Calculate and return the average MAE and DA for a list of results.
    """
    mae_list = []
    da_list = []
    
    # Iterate over each segment of the results and actual values
    for results, actual in zip(results_list, actual_list):
        mae, da = getanalysis(actual, results)
        mae_list.append(mae)
        da_list.append(da)
    
    # Calculate the mean of MAE and DA
    avg_mae = np.mean(mae_list)
    avg_da = np.mean(da_list)
    
    return avg_mae, avg_da

def analyze_etfs(etfs, predType, data_path):
    for etf in etfs:
        # Fetch the data
        data = fetch(etf, 1)
        
        # Display basic info about the dataset
        # print(f"\nBasic Info for {etf}:\n", data.info())

        # Display descriptive statistics for the specified prediction type
        print(f"\nDescriptive Statistics for '{predType}' column in {etf}:\n")
        stats = pd.DataFrame(data[predType]).describe()
        print(stats)

        # Visualize distribution of the specified prediction type using a histogram
        plt.figure(figsize=(7, 6))
        sns.histplot(data[predType], kde=True, color='blue')
        plt.title(f'Distribution of {predType} for {etf}')
        plt.show()

        # Visualize boxplot for the specified prediction type
        plt.figure(figsize=(7, 6))
        sns.boxplot(x=data[predType], color='blue')
        plt.title(f'Boxplot for {predType} for {etf}')
        plt.show()

# Call the function
# analyze_etfs(etfs, predType, data_path)

In [164]:
def sliding_window_train(model,X ,Y , train_window=100, test_window = 10,  pred_size=pred_size,sequence_size=sequence_size) :

    result = []

    # .reshape(X[:-pred_size].shape[0],1,sequence_size,-1)
    model.fit(X[:-pred_size], Y[:-pred_size], epochs=200)

    if model.model_type in ['catboost','svr','xgboost'] : 
        # tw_start and tw_end , means training window start index and end index
        for tw_end in range(X.shape[0] - pred_size, X.shape[0], 10):
            tw_start = tw_end - train_window
        
            X_train, y_train = X[tw_start:tw_end], Y[tw_start:tw_end]
            # X_train = X_train.reshape(X_train.shape[0],1,sequence_size,-1)
            model.fit(X_train, y_train, epochs=50)
        
            test = X[tw_end:tw_end+test_window]  #.reshape(X[tw_end:tw_end+test_window].shape[0],1,sequence_size,-1)
            preds = model.predict(test)
            result.extend(preds)
            print(tw_end, end=' ')
    else :
        result.extend(model.predict(X[-pred_size:]))

    rmse_val = root_mean_squared_loss(Y[-pred_size:].reshape(-1), np.array(result).reshape(-1))

    print(f"model RMSE = {rmse_val}")

    return np.array(result), model

In [149]:
def train(etf, sequence_size, cross_window, model_type, sent_cols, no_sent_cols):
    lag = 1
    data = fetch(etf, lag) 

    SENT_X = data[sent_cols]
    # Y = data[[predType]]  
    NO_SENT_X = data[no_sent_cols]
    
    Y = np.array(data[[predType]])  

    SENT_X, SENT_Y = sequences(SENT_X, Y, timesteps=sequence_size)
    NO_SENT_X, NO_SENT_Y = sequences(NO_SENT_X, Y, timesteps=sequence_size)


    sent_shape = SENT_X.shape[1:]
    no_sent_shape = NO_SENT_X.shape[1:]

        # machine learning regression models
    SENT_X = SENT_X.reshape(SENT_X.shape[0], -1)
    NO_SENT_X = NO_SENT_X.reshape(NO_SENT_X.shape[0], -1)
    SENT_Y = SENT_Y.reshape(-1,)
    NO_SENT_Y = NO_SENT_Y.reshape(-1,)


    Y_PRED = Y[-pred_size:]  # Actual values for the prediction window
    sent_predictions, no_sent_predictions = [], []
    act_values = Y_PRED  # Actual values for this window

    sent_model = model(model_type,sent_shape)
    no_sent_model = model(model_type,no_sent_shape)
    
    # Perform cross validation over cross_window
    for k in range(cross_window):



        # Train and predict for SENT model
        sent_pred, sent_model = sliding_window_train(sent_model,SENT_X, SENT_Y)
        no_sent_pred, no_sent_model = sliding_window_train(no_sent_model,NO_SENT_X, NO_SENT_Y)
        
        sent_predictions.append(sent_pred)
        no_sent_predictions.append(no_sent_pred)

        mae_sent_temp, da_sent_temp = getanalysis(act_values, sent_pred)
        mae_no_sent_temp, da_no_sent_temp = getanalysis(act_values, no_sent_pred)
    
        # Print metrics for the current ETF
        print(etf , {
            "sent": {"mae": mae_sent_temp, "da": da_sent_temp},
            "no_sent": {"mae": mae_no_sent_temp, "da": da_no_sent_temp}
        })

    mean_sent = (sent_predictions[-1]) 
    mean_no_sent = (no_sent_predictions[-1])  

    # Get prediction metrics for SENT and NO_SENT
    mae_sent, da_sent = getanalysis(act_values, mean_sent)
    mae_no_sent, da_no_sent = getanalysis(act_values, mean_no_sent)

    # Print metrics for the current ETF
    print(etf , {
        "sent": {"mae": mae_sent, "da": da_sent},
        "no_sent": {"mae": mae_no_sent, "da": da_no_sent}
    })
    
    # Return the results for this ETF
    return {
        "act" : act_values,
        "sent": {"predictions": mean_sent},
        "no_sent": {"predictions": mean_no_sent}
    }

# Loop through ETFs and aggregate results
def run_all_etfs(etfs, model_type, sequence_size=4, cross_window=cross_window):
    sent_results, no_sent_results = [], []
    actual_array = []
    
    for etf in etfs:
        print(f"Training for ETF: {etf}")
        sent_cols = sent_dict['SENT']  # Columns for SENT model
        no_sent_cols = sent_dict['NO_SENT']  # Columns for NO_SENT model

        # Train and get results for the ETF
        result = train(etf, sequence_size, cross_window, model_type, sent_cols, no_sent_cols)
        
        # Store the predictions and actual values
        sent_results.append(result['sent']['predictions'])
        no_sent_results.append(result['no_sent']['predictions'])
        actual_array.append(result['act'])  # Collect actual values

    # mae_sent, da_sent = get_average_analysis(sent_results, actual_array)
    # mae_no_sent, da_no_sent = get_average_analysis(no_sent_results, actual_array)
    # Concatenate results across all ETFs
    sent_results = np.concatenate(sent_results, axis=0)
    no_sent_results = np.concatenate(no_sent_results, axis=0)
    actual_array = np.concatenate(actual_array, axis=0)

    print(sent_results.shape)
    
    # Calculate overall metrics
    mae_sent, da_sent = getanalysis(actual_array, sent_results)
    mae_no_sent, da_no_sent = getanalysis(actual_array, no_sent_results)

    # Print combined metrics
    print("Combined results:")
    print(f"SENT - MAE: {mae_sent}, DA: {da_sent}")
    print(f"NO_SENT - MAE: {mae_no_sent}, DA: {da_no_sent}")
    
    return sent_results, no_sent_results, actual_array


In [135]:
model_type = 'lightgbm'

sent_results, no_sent_results, actual_array = run_all_etfs(etfs, model_type)

Training for ETF: ICLN
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 1234, number of used features: 20
[LightGBM] [Info] Start training from score -0.020599
model RMSE = 0.8330390439088208
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 1234, number of used features: 12
[LightGBM] [Info] Start training from score -0.020599
model RMSE = 0.8395255374299379
ICLN {'sent': {'mae': 0.6441878445698258, 'da': 0.56992}, 'no_sent': {'mae': 0.6611462816342044, 'da': 0.570656}}
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 1234, number of used features: 20
[LightGBM] [Info] Start training from score -0.020599
model RMSE = 0.8330390439088208
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 1234, number of used features: 12
[LightGBM] [Info] Start training from score -0.020599
model RMSE = 0.8395255374299379
ICLN {'sent': {

In [136]:
model_type = 'svr'


sent_results, no_sent_results, actual_array = run_all_etfs(etfs, model_type,sequence_size=6)

Training for ETF: ICLN
1232 1242 1252 1262 1272 1282 1292 1302 1312 1322 1332 1342 1352 1362 1372 1382 1392 1402 1412 1422 1432 1442 1452 1462 1472 model RMSE = 0.8215029483313494
1232 1242 1252 1262 1272 1282 1292 1302 1312 1322 1332 1342 1352 1362 1372 1382 1392 1402 1412 1422 1432 1442 1452 1462 1472 model RMSE = 0.8228978883125811
ICLN {'sent': {'mae': 0.6485786235126231, 'da': 0.547104}, 'no_sent': {'mae': 0.652488768482483, 'da': 0.546368}}
1232 1242 1252 1262 1272 1282 1292 1302 1312 1322 1332 1342 1352 1362 1372 1382 1392 1402 1412 1422 1432 1442 1452 1462 1472 model RMSE = 0.8215029483313494
1232 1242 1252 1262 1272 1282 1292 1302 1312 1322 1332 1342 1352 1362 1372 1382 1392 1402 1412 1422 1432 1442 1452 1462 1472 model RMSE = 0.8228978883125811
ICLN {'sent': {'mae': 0.6485786235126231, 'da': 0.547104}, 'no_sent': {'mae': 0.652488768482483, 'da': 0.546368}}
1232 1242 1252 1262 1272 1282 1292 1302 1312 1322 1332 1342 1352 1362 1372 1382 1392 1402 1412 1422 1432 1442 1452 1462 1

In [137]:
model_type = 'catboost'                                          

sent_results, no_sent_results, actual_array = run_all_etfs(etfs, model_type )

Training for ETF: ICLN
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.7926274739728267
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.7873248931689658
ICLN {'sent': {'mae': 0.612501605002369, 'da': 0.582432}, 'no_sent': {'mae': 0.6078478436105695, 'da': 0.578752}}
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.7926274739728267
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.7873248931689658
ICLN {'sent': {'mae': 0.612501605002369, 'da': 0.582432}, 'no_sent': {'mae': 0.6078478436105695, 'da': 0.578752}}
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1

In [161]:
model_type = 'xgboost'

sent_results, no_sent_results, actual_array = run_all_etfs(etfs, model_type)

Training for ETF: ICLN
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.8041777527763126
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.8020937708383766
ICLN {'sent': {'mae': 0.6218705279684624, 'da': 0.569184}, 'no_sent': {'mae': 0.6179591764056483, 'da': 0.567712}}
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.8041777527763126
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464 1474 model RMSE = 0.8020937708383766
ICLN {'sent': {'mae': 0.6218705279684624, 'da': 0.569184}, 'no_sent': {'mae': 0.6179591764056483, 'da': 0.567712}}
1234 1244 1254 1264 1274 1284 1294 1304 1314 1324 1334 1344 1354 1364 1374 1384 1394 1404 1414 1424 1434 1444 1454 1464

In [165]:
model_type = 'random_forest'

sent_results, no_sent_results, actual_array = run_all_etfs(etfs, model_type)

Training for ETF: ICLN
model RMSE = 0.8007626303310837
model RMSE = 0.8093688856956608
ICLN {'sent': {'mae': 0.6127743323546626, 'da': 0.583168}, 'no_sent': {'mae': 0.6235114927442462, 'da': 0.58464}}
model RMSE = 0.8007626303310837
model RMSE = 0.8093688856956608
ICLN {'sent': {'mae': 0.6127743323546626, 'da': 0.583168}, 'no_sent': {'mae': 0.6235114927442462, 'da': 0.58464}}
model RMSE = 0.8007626303310837
model RMSE = 0.8093688856956608
ICLN {'sent': {'mae': 0.6127743323546626, 'da': 0.583168}, 'no_sent': {'mae': 0.6235114927442462, 'da': 0.58464}}
ICLN {'sent': {'mae': 0.6127743323546626, 'da': 0.583168}, 'no_sent': {'mae': 0.6235114927442462, 'da': 0.58464}}
Training for ETF: PBD
model RMSE = 0.7715674575224355
model RMSE = 0.7737981389282929
PBD {'sent': {'mae': 0.5778055195718544, 'da': 0.592}, 'no_sent': {'mae': 0.5787668246570582, 'da': 0.592}}
model RMSE = 0.7715674575224355
model RMSE = 0.7737981389282929
PBD {'sent': {'mae': 0.5778055195718544, 'da': 0.592}, 'no_sent': {'mae