In [None]:
#Basics
import os
import sys
sys.path.append("..")
#Toolkits
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib
import warnings
# Modelling
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
#Statistic tools - ACF/PACF
from statsmodels.tsa.stattools import acf
from statsmodels.graphics.tsaplots import plot_acf
#import our Base Class
from model_base import Prediction_Base
#ignore warnings from matplotlib.close - is needed to avoid overflow of notebook
warnings.filterwarnings("ignore")
#avoid matplot plots
%matplotlib agg

In [None]:
class BaseNN(Prediction_Base):
    """
    Neural-network (sklearn MLPRegressor) variant of the prediction pipeline.

    Data handling, plotting and statistics come from Prediction_Base; this class only
    supplies the model-specific steps (fit, save/load, MSE bookkeeping) and the run loop.
    NOTE(review): attributes such as seed, random_seeds, experimentpath, stats_mse,
    X_train/X_test and y_train/y_test are presumably provided by Prediction_Base - confirm.
    """

    def prepare_for_training(self):
        """
        Initiates the model with the current seed, fits it on the training data and
        creates the predictions for train and test.
        Assigns: rf, y_train_pred, y_test_pred
        """
        # The attribute is called `rf` although it holds an MLPRegressor; the name is kept
        # because the other methods of this file read it under that name.
        self.rf = MLPRegressor(hidden_layer_sizes=(50, 23), random_state=self.seed)
        self.rf.fit(self.X_train, self.y_train)
        #all predictions
        self.y_train_pred = self.rf.predict(self.X_train)
        self.y_test_pred = self.rf.predict(self.X_test)

    def save_model(self):
        """
        Saves the fitted model as '<experimentpath>saved_models/<seed>.joblib'.
        Creates the model directory if it does not exist.
        Assigns: Nothing
        """
        model_dir = self.experimentpath + "saved_models/"
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(model_dir, exist_ok=True)
        joblib.dump(self.rf, model_dir + f"{self.seed}.joblib")

    def load_model(self, model_seed):
        """
        Loads the model that was saved for `model_seed` from the model directory.
        Assigns: rf
        """
        self.rf = joblib.load(self.experimentpath + f"saved_models/{model_seed}.joblib")

    def register_mse_of_model(self):
        """
        Registers the MSE of the current seed for train and test.
        Assigns: stats_mse[<seed>]
        """
        # Cast to plain float so that the str(dict) written by save_mse_of_models does not
        # depend on how the installed numpy version prints its scalar types.
        mse_train = float(mean_squared_error(self.y_train, self.y_train_pred))
        mse_test = float(mean_squared_error(self.y_test, self.y_test_pred))
        self.stats_mse[f'{self.seed}'] = {
            'MSE_train_scaled': mse_train,
            'MSE_test_scaled': mse_test,
            # NOTE(review): computed from the same inputs as 'MSE_test_scaled', so both
            # values are identical. The key is kept because stored files contain it.
            'MSE_cumsum_test': mse_test,
        }

    def save_mse_of_models(self):
        """
        Saves the MSE of all registered models to '<experimentpath>statistics/mse_of_models.json'.
        Creates the directory if not existing.
        Assigns: Nothing
        """
        stats_dir = self.experimentpath + "statistics/"
        os.makedirs(stats_dir, exist_ok=True)
        try:
            with open(stats_dir + 'mse_of_models.json', 'w') as file:
                # The dict is stored as a JSON *string* holding its Python repr; the reader in
                # this file decodes it with eval(json.load(file)), so the format must stay.
                # json.dump() writes to the file itself and returns None - its result must
                # not be passed to file.write().
                json.dump(str(self.stats_mse), file)
        except OSError as exc:
            # Best effort: a failed write must not abort the run, but it must be visible.
            print(f'Could not save MSE statistics to {stats_dir}: {exc!r}')

    def run_model_configuration(self):
        """
        The main loop for the model. We set the numpy and model seed here.

        Prepares the data once, then fits, evaluates and saves one model per entry of
        random_seeds, and finally writes the statistics and comparison plots.
        """
        # Data preparation - independent of the seed
        self.read_in_data()
        self.plot_data_for_comparison()
        self.acf_plot()
        self.create_lags()
        self.split_dataset_by_years()
        self.split_dataset_by_train_test()

        # One model per seed
        for seed in self.random_seeds:
            self.set_seed(seed)
            np.random.seed(seed)
            self.prepare_for_training()
            self.return_to_original_values()
            self.plot_performance()
            # Return value is not needed here; the call is kept for whatever it records.
            self.append_regressors()
            self.save_model()
            self.register_mse_of_model()
            self.hit_ratio()

        # Summary over all seeds
        self.save_mse_of_models()
        self.plot_train_comparison()
        self.plot_test_comparison()
        self.plot_test_comparison_Patrick()
        self.model_statistics_unscaled()
        self.write_stats_to_file()

# Rolling year windows. The last year of every window is the test year, the years before
# it are the training years. Windows of length 3 are run first, then 2, then 4.
FIRST_YEAR = 2014
LAST_YEAR = 2024
WINDOW_LENGTHS = (3, 2, 4)

years = [list(range(start, start + length))
         for length in WINDOW_LENGTHS
         for start in range(FIRST_YEAR, LAST_YEAR - length + 2)]

for train_iter in years:
    for dataset in os.listdir('../data/'):
        # Skip Jupyter's checkpoint folder before anything is reported for it.
        if dataset == '.ipynb_checkpoints':
            continue
        print(f'Years {train_iter} on dataset {dataset} started running.')
        # NOTE(review): the two crypto files get lags in multiples of 7, all other files in
        # multiples of 5 - presumably 7-day vs. 5-day trading weeks; confirm.
        if dataset in ('btc_hist.csv', 'eth_hist.csv'):
            iter_lags = [1, 7, 14, 21, 28, 'auto']
        else:
            iter_lags = [1, 5, 10, 15, 20, 'auto']
        for lag in iter_lags:
            # Each lag is guarded on its own so that one failing configuration is reported
            # and does not silently cancel the remaining lags of this dataset.
            try:
                model = BaseNN(dataname=dataset,
                               train_years=train_iter[:-1],
                               test_years=[train_iter[-1]],
                               no_of_lag=lag,
                               automate_lag=(lag == 'auto'),
                               model='NN'
                               )
                model.run_model_configuration()
            except Exception as exc:
                print(f'Years {train_iter} on dataset {dataset} with lag {lag} failed: {exc!r}')
                # Do not leak figures that were open when the run failed.
                plt.close('all')
        print(f'Years {train_iter} on dataset {dataset} finished running.')


In [None]:
class top_models(BaseNN):
    """
    All methods from the model class as well as the model_base are inherited.
    The process includes finding the best 10 models, through the MSE of the training dataset.
    Various plots are made, used in the paper.

    Everything is read from and written to
    '../results/<model>/<dataname>/<lag>/<yearfolder>/', e.g. '.../auto/[2022, 2023]-[2024]/'.
    """

    def load_model_statistics(self, dataname):
        """
        Resets the bookkeeping for `dataname` and processes all of its stored runs.

        Assigns: ignore_folders, dataname, top_seeds, path, sorted_list_with_seed,
                 all_y_train_pred, all_y_test_pred, number_models
        NOTE(review): reads self.model, which is presumably set by the inherited constructor.
        """
        #entries inside the result folders that are no run folders and are skipped
        self.ignore_folders = ['original_comparison_plot.png',
                               'original_comaprisonplot.png',
                               'tex_table_top1_asset.txt']

        self.dataname = dataname
        self.top_seeds = []
        self.path = f'../results/{self.model}/{dataname}/'
        self.sorted_list_with_seed = []
        self.all_y_train_pred = {}
        self.all_y_test_pred = {}
        self.number_models = 10
        self.run_top_10_loop()

    def run_top_10_loop(self):
        """
        Walks over every <lag>/<yearfolder> below self.path and evaluates each run that has
        a 'statistics/mse_of_models.json' file.

        A failing run is reported and skipped; it does not stop the remaining runs.
        Assigns: lag, yearfolder, contentfolder (and everything the evaluated steps assign)
        """
        try:
            lag_entries = os.listdir(self.path)
        except OSError as exc:
            print(f'No results found under {self.path}: {exc!r}')
            return

        # lag / yearfolder are instance attributes because all other methods build their
        # paths from them.
        for self.lag in lag_entries:
            if self.lag in self.ignore_folders:
                continue
            lag_dir = self.path + f'{self.lag}/'
            if not os.path.isdir(lag_dir):
                continue
            for self.yearfolder in os.listdir(lag_dir):
                if self.yearfolder in self.ignore_folders:
                    continue
                stats_file = f'{lag_dir}{self.yearfolder}/statistics/mse_of_models.json'
                if not os.path.isfile(stats_file):
                    continue
                self.contentfolder = 'statistics'
                try:
                    self._top_process_run(stats_file)
                except Exception as exc:
                    print(f'Skipping {lag_dir}{self.yearfolder}/: {exc!r}')
                    # Do not leak figures that were open when the run failed.
                    plt.close('all')

    def _top_process_run(self, stats_file):
        """Evaluates the run selected by self.lag / self.yearfolder: ranking, reload, plots."""
        with open(stats_file, 'r') as file:
            # NOTE(review): eval() executes whatever the file contains. It is kept because
            # the file stores the repr of a dict and existing files may contain numpy scalar
            # reprs that a literal parser rejects. Only use on files written by this project.
            self.loaded_mse_of_models = eval(json.load(file))
        self.write_correlation_to_file()
        self.discover_top_seeds(self.number_models)
        self.load_top_models()
        self.read_in_data()
        self.split_years_create_lag_mplot()
        self.plot_top_performance()
        self.plot_top_performance_train()
        self.hitratio_plot()
        self.aggregate_and_plot()

    def _top_ensure_dir(self, name):
        """Returns '<path><lag>/<yearfolder>/<name>/' and creates it if it does not exist."""
        directory = f'{self.path}{self.lag}/{self.yearfolder}/{name}/'
        os.makedirs(directory, exist_ok=True)
        return directory

    def split_years_create_lag_mplot(self):
        """
        In this function, four things happen:
        1. the train and test years are separated and evaluated into a proper list
        2. the lags are assigned. This happens through the folder, or in case of auto, through the model parameters
        3. the respective datasets are created in split_dataset_*
        4. mplot_* are assigned to be used later to plot

        Assigns: train_years, test_years, mplot_sign, mplot_prop, no_of_lag,
                 prediction_data_for_plot_scaled, rf
        """
        # The folder is named '<train years>-<test years>', e.g. '[2014, 2015]-[2016]'.
        # The fixed offsets assume exactly one four-digit test year ('[2016]' = 6 characters).
        # json.loads parses the integer lists without executing the folder name as code.
        self.train_years = json.loads(self.yearfolder[:-7])
        self.test_years = json.loads(self.yearfolder[-6:])
        self.mplot_sign = []
        self.mplot_prop = []

        #in case the lag is 'auto' as folder, the features from the provided model is taken as lag.
        if self.lag == 'auto':
            self.no_of_lag = self.models[next(iter(self.models))].n_features_in_
        else:
            self.no_of_lag = int(self.lag)
        self.acf_plot()
        self.create_lags()

        self.split_dataset_by_years()
        self.split_dataset_by_train_test()
        self.prediction_data_for_plot_scaled = {}
        # sorted_list_with_seed holds (seed, mse) tuples. Unpack them instead of iterating over
        # both members: telling them apart with int(x) >= 1 skipped seed 0 and failed with a
        # KeyError as soon as an MSE was >= 1.
        for seed, _mse in self.sorted_list_with_seed:
            self.rf = self.models[seed]
            self.predict(seed)
            # sign strategy: go long/short with the sign of the prediction
            perf_sign = np.cumsum(self.y_test * np.sign(self.y_test_pred))
            # proportional strategy: position scaled by prediction / mean prediction
            perf_prop = np.cumsum(self.y_test * self.y_test_pred / np.mean(self.y_test_pred))

            self.mplot_sign.append(perf_sign)
            self.mplot_prop.append(perf_prop)

    def create_lags(self):
        """
        This function creates the lags for the number given in self.no_of_lag.
        It creates a new dataset called self.lags.
        The 'Close' column is log transformed and differenced; the result is shifted by
        0..no_of_lag rows, giving the columns Lag_0 ... Lag_<no_of_lag>. Rows that contain
        missing values after shifting are dropped.
        It also stores the first 'Close' value of the raw data.

        Assigns: first_value_raw_data, raw_after_diff_and_log, lags
        """
        print(f'-----------------#of lag created: {self.lag}------------------')
        self.first_value_raw_data = self.raw_data["Close"].iloc[0]
        # np.diff shortens the series by one, hence the index starts at the second row
        self.raw_after_diff_and_log = pd.DataFrame(np.diff(np.log(self.raw_data["Close"])),
                                                   index=self.raw_data.index[1:], columns=['Close'])

        shifted = [self.raw_after_diff_and_log.shift(i) for i in range(self.no_of_lag + 1)]
        #make self.lags into a dataframe again
        self.lags = pd.concat(shifted, axis=1)
        self.lags.columns = ['Lag_' + str(i) for i in range(self.no_of_lag + 1)]
        self.lags = self.lags.dropna()

    def discover_top_seeds(self, number_models = 10):
        """
        Based on the train MSE, we discover the best available seeds.
        If the number of models is lower than the requested amount, all models are returned to compensate.
        The models are sorted, lowest MSE first.

        Assigns: sorted_list_with_seed, top_seeds
        """
        self.top_seeds = [(seed, stats['MSE_train_scaled'])
                          for seed, stats in self.loaded_mse_of_models.items()]
        # Slicing already copes with fewer than number_models entries, no fallback needed.
        self.sorted_list_with_seed = sorted(self.top_seeds, key=lambda entry: entry[1])[:number_models]

    def load_top_models(self):
        """
        Based on the finding in discover_top_seeds, the best models are loaded for further use.

        Assigns: models (seed -> loaded model)
        """
        model_dir = self.path + f"{self.lag}/{self.yearfolder}/saved_models/"
        self.models = {}
        for seed, _mse in self.sorted_list_with_seed:
            self.models[seed] = joblib.load(model_dir + f"{seed}.joblib")

    def predict(self, seed):
        """
        Predictions of the model in self.rf are made and saved in the respective variables,
        directly (y_*_pred) and per seed (all_*).

        Assigns: y_train_pred, y_test_pred, all_y_train_pred[seed], all_y_test_pred[seed]
        """
        self.y_train_pred = self.rf.predict(self.X_train)
        self.y_test_pred = self.rf.predict(self.X_test)
        self.all_y_train_pred[seed] = self.y_train_pred
        self.all_y_test_pred[seed] = self.y_test_pred

    def _top_plot_sign_performance(self, X, y, tick_fontsize, filename_suffix):
        """Plots the cumulative sign-strategy performance of every top model on (X, y) against buy & hold."""
        plot_dir = self._top_ensure_dir('Top-Model-PerformancePlots')

        plt.figure(figsize=(10, 6))
        for model in self.models.values():
            plt.plot(pd.DataFrame(np.cumsum(np.sign(model.predict(X)) * y), index=X.index))
        plt.plot(pd.DataFrame(np.cumsum(y), index=X.index), label='Buy & Hold', color='black', linewidth=2)
        plt.legend()
        plt.title(f"{self.model}, Lag {self.no_of_lag}", fontdict={'fontsize': 30})
        plt.ylabel('Cumulative Performance', fontsize=15)
        plt.xticks(fontsize=tick_fontsize)
        plt.yticks(fontsize=tick_fontsize)
        plt.savefig(plot_dir + f'Best_Performances_{self.yearfolder}_{self.lag}{filename_suffix}.png')
        plt.close()

    def plot_top_performance(self):
        """
        In this function, the top cumulative performance on the test data is plotted.

        Assigns: Nothing
        Raises: ValueError if the last test prediction and y_test differ in length.

        Plot saved under: "<path><lag>/<yearfolder>/Top-Model-PerformancePlots/Best_Performances_<yearfolder>_<lag>.png"
        """
        if len(self.y_test_pred) != len(self.y_test):
            raise ValueError("y_test_pred and target_out length mismatch. Check for leap year issues.")
        self._top_plot_sign_performance(self.X_test, self.y_test, tick_fontsize=15, filename_suffix='')

    def plot_top_performance_train(self):
        """
        In this function, the top cumulative performance on the train data is plotted.

        Assigns: Nothing
        Raises: ValueError if the last train prediction and y_train differ in length.

        Plot saved under: "<path><lag>/<yearfolder>/Top-Model-PerformancePlots/Best_Performances_<yearfolder>_<lag>_train.png"
        """
        # This plot uses the train data, so the train lengths are the ones to validate.
        if len(self.y_train_pred) != len(self.y_train):
            raise ValueError("y_train_pred and y_train length mismatch. Check for leap year issues.")
        self._top_plot_sign_performance(self.X_train, self.y_train, tick_fontsize=10, filename_suffix='_train')

    def hitratio_plot(self):
        """
        We create a plot using the Hitratios provided by the models. The models are chosen by train MSE, see above for more details.
        The plot of the best 10 models and a boxplot of their test hit ratios are then saved
        under "<path><lag>/<yearfolder>/Top-Hitratio/".
        """
        plot_dir = self._top_ensure_dir('Top-Hitratio')
        run_dir = self.path + f'{self.lag}/{self.yearfolder}/'

        hitratios = {}
        for seed, _mse in self.sorted_list_with_seed:
            with open(run_dir + f'hitratio/{seed}', 'r') as file:
                # NOTE(review): eval() executes the file content; kept because the stored
                # format is a Python repr. Only use on files written by this project.
                hitratios[seed] = eval(json.load(file))

        hitratio_values_for_boxplot_test = []
        for seed, ratios in hitratios.items():
            #plots the hitratio as a regular plot.
            plt.plot(ratios['Hit Ratio Test'], label=seed, marker='o')
            #appends values for test
            hitratio_values_for_boxplot_test.append(ratios['Hit Ratio Test'])
        plt.legend()
        plt.savefig(plot_dir + f'Best-Hitratios-Test_{self.yearfolder}_{self.lag}.png')
        plt.close()

        ## Boxplot for Hitratios
        pd.DataFrame(hitratio_values_for_boxplot_test, columns=[f'Hit-Ratios {self.dataname.split("_")[0]}']).boxplot(grid=False, fontsize=15)
        plt.savefig(plot_dir + f'Best-Hitratios-Test_Boxplot_{self.yearfolder}_{self.lag}.png')
        plt.close()

    def plot_actual_performance(self):
        """
        Legacy, not further used.
        Plots the last test prediction against the true test values.
        """
        y_test_pred = pd.DataFrame(self.y_test_pred, index=self.y_test.index)
        plt.plot(y_test_pred, label='Prediction')
        plt.plot(self.y_test, label='True', color='black', linewidth=2)
        plt.savefig(self.path + f'{self.lag}/{self.yearfolder}/Top-Model-PerformancePlots/Best_Performances_{self.yearfolder}_{self.lag}_Actual_Performance.png')
        plt.close()

    def write_correlation_to_file(self):
        """
        Pearson Correlation is written into a file. For each model-lag-year combination, the
        correlation between train MSE and test MSE over all seeds is written to
        "<path><lag>/<yearfolder>/statistics/mse_corr.json".
        """
        # Extract MSE values from the loaded JSON data
        mse_train = [stats['MSE_train_scaled'] for stats in self.loaded_mse_of_models.values()]
        mse_test = [stats['MSE_test_scaled'] for stats in self.loaded_mse_of_models.values()]

        # Results in a 2x2 Matrix, we're only interested in Pearson Corr [0,1]
        correlation = np.corrcoef(mse_train, mse_test)[0, 1]

        with open(self.path + f'{self.lag}/{self.yearfolder}/statistics/mse_corr.json', "w") as file:
            json.dump(round(correlation, 6), file)

    def aggregate_and_plot(self):
        """
        The best seeds are used to create a plot that uses the mean (eg. the most recommended course of action) for the current date.

        Assigns: perf_agg_sign_mse, perf_agg_prop_mse
        Raises: ValueError if no model performance was collected.
        """
        if not self.mplot_sign:
            raise ValueError("No model performances available to aggregate.")
        plot_dir = self._top_ensure_dir('Top-Model-PerformancePlots')

        # One column per model
        mplot_sign = np.column_stack(self.mplot_sign)
        mplot_prop = np.column_stack(self.mplot_prop)
        # Aggregate performance across all models
        self.perf_agg_sign_mse = np.mean(mplot_sign, axis=1)
        # The proportional aggregate is stored but currently not plotted.
        self.perf_agg_prop_mse = np.mean(mplot_prop, axis=1)
        # Prepare data for plotting
        cumulative_y_test = np.cumsum(self.y_test)

        plt.figure(figsize=(10, 5))
        plt.plot(cumulative_y_test, label='Buy & Hold', color='black', linewidth=2)
        plt.plot(pd.DataFrame(self.perf_agg_sign_mse, index=self.X_test.index), label='Sign Aggregate', color='red')
        # Report the number of models that actually went into the mean; fewer than
        # number_models may have been available.
        plt.title(f'Aggregate Performances Over {mplot_sign.shape[1]} Seeds')
        plt.xlabel('Time')
        plt.ylabel('Cumulative Performance')
        plt.legend()
        plt.savefig(plot_dir + f'Aggregated_Performances_{self.yearfolder}_{self.lag}.png')
        plt.close()

In [None]:
               
# Evaluate the stored results of every dataset in the data folder.
for i in os.listdir('../data/'):
    if i in ['.ipynb_checkpoints', 'tex_table_top1_asset.txt']:
        continue
    print(f'Processing dataset {i}')
    try:
        t10 = top_models(model='NN', dataname=i)
        # load_model_statistics() already loads the top models of every run it evaluates,
        # so no separate load_top_models() call is needed afterwards.
        t10.load_model_statistics(dataname=i)
    except Exception as exc:
        # Keep going with the next dataset, but make the failure visible.
        print(f'Dataset {i} failed: {exc!r}')
    print(f'Finished dataset {i}')


#images/RandomForest/btc_hist.csv/[2014, 2015]-[2016]/saved_models/1.joblib