In [97]:
import numpy as np
import pandas as pd
import gym
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, accuracy_score

from datetime import datetime, timedelta

# Read closing prices for stocks; the 'Dates' column is parsed and becomes the index.
daily_prices = (
    pd.read_csv('../data/Untitled Folder/price.csv', index_col=0)
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
    .set_index('Dates')
)

# Price column pulled from the per-stock CSVs (alternative: "PX_LAST").
LABEL = "PX_OPEN"
LABEL_ADJUSTED = f"{LABEL}_ADJUSTED"

# Calendar year separating the training window from the evaluation window (alternative: 2010).
SPLIT_YEAR = 2018

# Sub-folder of ../data/directions holding the model predictions to evaluate.
# Alternatives used previously:
#   f"{SPLIT_YEAR}split_open_prices_with_sp"
#   f"{SPLIT_YEAR}split"
#   "olivier"
name = f"{SPLIT_YEAR}split_open_prices"

In [98]:
symbols = [
    "AMM", "CIMB", "DIGI", "GAM", "GENM", "GENT", "HLBK", "IOI", "KLK", "MAY",
    "MISC", "NESZ", "PBK", "PEP", "PETD", "PTG", "RHBBANK", "ROTH", "T", "TNB",
]

# Overwrite each stock's column in daily_prices with the LABEL column of its
# per-stock CSV (i.e. switch from closing prices to the price selected by LABEL).
# Assignment aligns on the 'Dates' index of daily_prices.
for stock in symbols:
    daily_prices[stock] = (
        pd.read_csv(f'../data/Day Data with Volatility/{stock} MK Equity.csv')
        .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
        .set_index('Dates')[LABEL]
    )

In [99]:
# Date windows used below. Label-based slices on a DatetimeIndex include both endpoints.
PRICE_HISTORY_START = datetime(2000, 4, 24)
SPLIT_DATE = datetime(SPLIT_YEAR, 1, 1)
EVAL_END = datetime(2020, 1, 1)

# Columns of the directions CSV holding the 10 individual model predictions.
MODEL_COLUMNS = [f'MODEL_{i+1}' for i in range(10)]


def ensemble_direction_accuracy(sym, directions_name, pred_window, price_window):
    """Accuracy of the averaged model direction against the realised price direction.

    Parameters
    ----------
    sym : str
        Column of ``daily_prices`` / suffix of the directions CSV to evaluate.
    directions_name : str
        Sub-folder of ``../data/directions`` containing ``Directions {sym}.csv``.
    pred_window : slice
        Label slice (dates) applied to the predictions.
    price_window : slice
        Label slice (dates) applied to ``daily_prices`` before it is re-indexed
        onto the prediction dates.

    Returns
    -------
    float
        ``accuracy_score`` between ``sign(actual pct change)`` and
        ``sign(mean of the 10 model columns)``, restricted to rows where the
        price actually changed.
    """
    directions = pd.read_csv(f'../data/directions/{directions_name}/Directions {sym}.csv')
    directions['Dates'] = pd.to_datetime(directions.Dates)
    directions = directions.set_index('Dates').loc[pred_window]

    # Re-index prices onto the prediction dates: per the original author, a few
    # trading days were lost when US stock prices were added as features.
    prices = daily_prices.loc[price_window].loc[directions.index, sym].values

    # Percent change between consecutive (actual) prices: y[k] is the move from row k to row k+1.
    y = prices[1:] / prices[:-1] - 1

    # Drop the first prediction row so that preds[k] is the prediction stored at row k+1,
    # i.e. it is compared with the move that ENDS on its own date.
    # NOTE(review): earlier (removed) experiments paired y with train['AVG'][:-1] instead,
    # which is a one-row different alignment -- TODO confirm which offset is intended.
    preds = directions[MODEL_COLUMNS].values[1:]

    # Days with no price change have sign 0 and are excluded from scoring.
    change_mask = y != 0

    # NOTE(review): np.sign of the mean is 0 when the models tie, which can never match a
    # +/-1 target. The recorded accuracies (~0.2) are well below 0.5 for an up/down task;
    # presumably a tie/encoding/alignment issue -- TODO verify the MODEL_i value encoding.
    return accuracy_score(np.sign(y[change_mask]), np.sign(np.mean(preds[change_mask], axis=1)))


print("TRAIN PREDICTIONS | model acc")
print("="*60)
for sym in daily_prices:
    print(ensemble_direction_accuracy(
        sym,
        name,
        pred_window=slice(None, SPLIT_DATE),
        price_window=slice(PRICE_HISTORY_START, SPLIT_DATE),
    ))

print("\n\n")
print("EVAL PREDICTIONS | model acc")
print("="*60)
for sym in daily_prices:
    print(ensemble_direction_accuracy(
        sym,
        name,
        pred_window=slice(SPLIT_DATE, EVAL_END),
        price_window=slice(SPLIT_DATE, EVAL_END),
    ))


TRAIN PREDICTIONS | model acc
0.19377251672671128
0.21175224986765484
0.21654093322157028
0.19617353812988414
0.20321984692531012
0.1976470588235294
0.22773722627737225
0.21331521739130435
0.2393854748603352
0.21267605633802816
0.24433249370277077
0.2788794460182562
0.210128055878929
0.21907641561297417
0.22300734878462408
0.25571030640668524
0.20261437908496732
0.2024851736797515
0.20476586311997783
0.20123491439797922



EVAL PREDICTIONS | model acc
0.21395348837209302
0.2224824355971897
0.22247191011235956
0.17117117117117117
0.21478060046189376
0.22857142857142856
0.2565789473684211
0.2488479262672811
0.26521739130434785
0.23218390804597702
0.2657952069716776
0.23127753303964757
0.25389755011135856
0.25565610859728505
0.2472885032537961
0.2675438596491228
0.23893805309734514
0.24839400428265523
0.16591928251121077
0.21076233183856502


In [100]:
# Sanity check: for every symbol, re-index the 2018-2019 prices onto the dates of the
# '2018split_open_prices_with_sp' predictions and show the resulting frame's summary
# (row count and non-null counts per column).
# The folder name lives in its own variable so the global `name` read by the accuracy
# cell is not silently overwritten when this cell runs.
sp_directions_name = '2018split_open_prices_with_sp'
for sym in daily_prices:
    sp_directions = pd.read_csv(f'../data/directions/{sp_directions_name}/Directions {sym}.csv')
    sp_directions['Dates'] = pd.to_datetime(sp_directions.Dates)
    sp_directions = sp_directions.set_index('Dates')
    eval_preds = sp_directions[datetime(2018,1,1):datetime(2020,1,1)]
    eval_prices = daily_prices[datetime(2018,1,1):datetime(2020,1,1)].loc[eval_preds.index]
    # DataFrame.info() prints its report itself and returns None, so wrapping it in
    # print() only adds a stray "None" line.
    eval_prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 503 entries, 2018-01-02 to 2019-12-31
Data columns (total 20 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   AMM      503 non-null    float64
 1   CIMB     503 non-null    float64
 2   DIGI     503 non-null    float64
 3   GAM      503 non-null    float64
 4   GENM     503 non-null    float64
 5   GENT     503 non-null    float64
 6   HLBK     503 non-null    float64
 7   IOI      503 non-null    float64
 8   KLK      503 non-null    float64
 9   MAY      503 non-null    float64
 10  MISC     503 non-null    float64
 11  NESZ     503 non-null    float64
 12  PBK      503 non-null    float64
 13  PEP      503 non-null    float64
 14  PETD     503 non-null    float64
 15  PTG      503 non-null    float64
 16  RHBBANK  503 non-null    float64
 17  ROTH     503 non-null    float64
 18  T        503 non-null    float64
 19  TNB      503 non-null    float64
dtypes: float64(20)
memory usage: 82.5 K

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 503 entries, 2018-01-02 to 2019-12-31
Data columns (total 20 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   AMM      503 non-null    float64
 1   CIMB     503 non-null    float64
 2   DIGI     503 non-null    float64
 3   GAM      503 non-null    float64
 4   GENM     503 non-null    float64
 5   GENT     503 non-null    float64
 6   HLBK     503 non-null    float64
 7   IOI      503 non-null    float64
 8   KLK      503 non-null    float64
 9   MAY      503 non-null    float64
 10  MISC     503 non-null    float64
 11  NESZ     503 non-null    float64
 12  PBK      503 non-null    float64
 13  PEP      503 non-null    float64
 14  PETD     503 non-null    float64
 15  PTG      503 non-null    float64
 16  RHBBANK  503 non-null    float64
 17  ROTH     503 non-null    float64
 18  T        503 non-null    float64
 19  TNB      503 non-null    float64
dtypes: float64(20)
memory usage: 82.5 K

In [None]:
from typing import List
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
import numpy as np

from dataset import DailyDataset, FEATURES

# Number of trading days per year; used below to annualize Sharpe ratios and total returns.
TRADING_DAYS_IN_YEAR = 261


# WHOLE FILE IS DEPRECATED. NOT USED BY ANY OTHER CODE AS OF SEP 17 2022.
# WE NEED NEW EVALUATION CODE FOR THE PREDICTIONS PART, AFTER WHICH THIS FILE SHOULD BE DELETED.


class ModelEvaluation():
    """Summary statistics for a strategy's daily returns next to a baseline's.

    Parameters
    ----------
    daily_returns : List[float]
        Per-day simple returns of the strategy.
    baseline_returns : List[float]
        Per-day simple returns of the baseline; must have the same length.
    conf_matrix : np.ndarray
        Stored as-is. As filled by ``evaluate_daily_model`` in this file it is a
        2x2 count matrix indexed ``[true direction, predicted direction]`` with
        index 0 = down (-1) and index 1 = up (+1).

    Raises
    ------
    ValueError
        If the two return series differ in length.
    """

    def __init__(self, daily_returns: List[float], baseline_returns: List[float], conf_matrix: np.ndarray) -> None:
        if len(daily_returns) != len(baseline_returns):
            raise ValueError(
                "daily_returns and baseline_returns must have the same length, "
                f"got {len(daily_returns)} and {len(baseline_returns)}"
            )
        self.daily_returns = np.array(daily_returns)
        self.baseline_returns = np.array(baseline_returns)
        self.time_length = len(daily_returns)
        self.conf_matrix = conf_matrix

    @staticmethod
    def _sharpe(returns: np.ndarray) -> float:
        """Annualized mean/std ratio of ``returns`` (sample std, ddof=1); 0 when the std is exactly 0."""
        std = np.std(returns, ddof=1)
        return 0 if std == 0 else np.mean(returns) / std * np.sqrt(TRADING_DAYS_IN_YEAR)

    def _annualize(self, total: float) -> float:
        """Convert a compounded return over ``time_length`` days to a per-year rate."""
        # An empty series has no horizon to scale by; report 0.0 rather than dividing by zero.
        if self.time_length == 0:
            return 0.0
        return (1 + total) ** (TRADING_DAYS_IN_YEAR / self.time_length) - 1

    @property
    def mean_return(self) -> float:
        """Arithmetic mean of the strategy's daily returns."""
        return np.mean(self.daily_returns)

    @property
    def std_return(self) -> float:
        """Sample standard deviation (ddof=1) of the strategy's daily returns."""
        return np.std(self.daily_returns, ddof=1)

    @property
    def mean_baseline_return(self) -> float:
        """Arithmetic mean of the baseline's daily returns."""
        return np.mean(self.baseline_returns)

    @property
    def std_baseline_return(self) -> float:
        """Sample standard deviation (ddof=1) of the baseline's daily returns."""
        return np.std(self.baseline_returns, ddof=1)

    @property
    def sharpe(self) -> float:
        """Annualized Sharpe-style ratio of the strategy (no risk-free rate is subtracted)."""
        return self._sharpe(self.daily_returns)

    @property
    def baseline_sharpe(self) -> float:
        """Annualized Sharpe-style ratio of the baseline (no risk-free rate is subtracted)."""
        return self._sharpe(self.baseline_returns)

    @property
    def total_return(self) -> float:
        """Compounded return of the strategy over the whole period."""
        return np.prod(self.daily_returns + 1) - 1

    @property
    def total_baseline_return(self) -> float:
        """Compounded return of the baseline over the whole period."""
        return np.prod(self.baseline_returns + 1) - 1

    @property
    def annualized_return(self) -> float:
        """Strategy's compounded return rescaled to TRADING_DAYS_IN_YEAR days (0.0 if empty)."""
        return self._annualize(self.total_return)

    @property
    def annualized_baseline_return(self) -> float:
        """Baseline's compounded return rescaled to TRADING_DAYS_IN_YEAR days (0.0 if empty)."""
        return self._annualize(self.total_baseline_return)

def evaluate_daily_model(model, val_ds: DailyDataset, device: torch.device) -> ModelEvaluation:
    """Score a model's up/down calls on a validation set against buy-and-hold.

    For every sample the model output is compared with ``X[:, -1, 0]`` (feature 0 of the
    last time step of the input window): a prediction at or above it is a long call (+1),
    otherwise a short call (-1). The call is multiplied by ``move`` to get the strategy's
    daily return, while ``move`` alone is recorded as the buy-and-hold return.

    Args:
        model: Callable taking a batch ``X`` on ``device`` and returning predictions.
            # NOTE(review): assumes the output has shape (batch,); a (batch, 1) output would
            # broadcast against X[:, -1, 0] into (batch, batch) -- TODO confirm.
            # NOTE(review): the model's train/eval mode is not changed here; presumably the
            # caller has already called model.eval() -- verify against caller.
        val_ds: Dataset yielding ``(X, <ignored>, move)`` triples.
        device: Device on which the model is run.

    Returns:
        ModelEvaluation holding the strategy returns, the buy-and-hold returns, and a 2x2
        count matrix indexed ``[true direction, predicted direction]`` (0 = down, 1 = up;
        a move of exactly 0 counts as up).
    """
    daily_returns = []
    daily_bnh_returns = []
    conf_matrix = np.zeros((2, 2))

    val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)
    # Pure inference: no_grad avoids building an autograd graph for every batch.
    with torch.no_grad():
        for X, _, move in tqdm(val_loader, desc="Eval", leave=False):
            move = move.numpy()
            # Only the reference column is needed on the CPU, so the full batch is not
            # copied back from the device after the forward pass.
            last_value = X[:, -1, 0].cpu()
            y_pred = model(X.to(device)).cpu()

            direction = np.where(y_pred >= last_value, 1, -1)
            daily_returns.extend(direction * move)
            daily_bnh_returns.extend(move)
            true_direction = np.where(move >= 0, 1, -1)

            # Rows: true direction, columns: predicted direction.
            conf_matrix[0, 0] += np.sum((direction == -1) & (true_direction == -1))
            conf_matrix[0, 1] += np.sum((direction == 1) & (true_direction == -1))
            conf_matrix[1, 0] += np.sum((direction == -1) & (true_direction == 1))
            conf_matrix[1, 1] += np.sum((direction == 1) & (true_direction == 1))
    return ModelEvaluation(daily_returns, daily_bnh_returns, conf_matrix)



class EnsembleModel(torch.nn.Module):
    """Average the predictions of several models.

    ``Module`` and ``ModuleList`` are referenced through ``torch.nn`` because this file
    only imports ``torch`` itself; the bare names were undefined.

    Args:
        models: Iterable of modules; each is called as ``model(X)`` and its output is
            written into one column of a ``(batch, n_models)`` buffer.
            # NOTE(review): assumes each model returns a (batch,) tensor -- TODO confirm.
    """

    def __init__(self, models) -> None:
        super(EnsembleModel, self).__init__()
        # ModuleList registers the members as sub-modules of the ensemble.
        self.models = torch.nn.ModuleList(models)

    def forward(self, X, return_std=False) -> torch.Tensor:
        """Run every member on ``X`` and combine the per-sample predictions.

        Returns:
            The per-sample mean over members, or, when ``return_std`` is true, the
            ``(std, mean)`` tuple from ``torch.std_mean`` (unbiased std across members).
        """
        # The buffer is created without a device argument, so it lives on torch's default
        # device regardless of where X is.
        preds = torch.zeros((X.shape[0], len(self.models)))
        for i, member in enumerate(self.models):
            preds[:, i] = member(X)
        if return_std:
            return torch.std_mean(preds, dim=1, unbiased=True)
        return torch.mean(preds, dim=1)
