In [None]:
# pip install watermark lightgbm plotly cufflinks numpy pandas optuna torch pandas_ta gluonts pandas_datareader

In [None]:
# pip install -U git+https://github.com/unit8co/darts.git@master

In [None]:
# pip install pytorch-forecasting==0.10.2

In [None]:
# 1. magic for inline plots
# 2. magic to (re)load the watermark extension; versions are printed by %watermark in the next cell
# 3. magic to enable retina (high resolution) plots
# (auto-reloading of external python modules is enabled in a separate cell further down)
# https://gist.github.com/minrk/3301035
%matplotlib inline
%reload_ext watermark
%config InlineBackend.figure_format='retina'

In [None]:
%watermark

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# conda install -c conda-forge 'u8darts'

### Library imports

In [None]:
import warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation warnings raised by the libraries imported below.
warnings.filterwarnings('ignore')

import os
import darts
import pandas as pd
import numpy as np 
from datetime import datetime
import numpy as np  # duplicate of the numpy import two lines up; harmless

import plotly
import plotly.express as px
import plotly.graph_objects as go

# pip install matplotlib==3.1.2
import matplotlib
import matplotlib.pyplot as plt

import plotly.offline
import cufflinks as cf
# NOTE(review): go_offline() is immediately followed by a config written with
# offline=False — confirm which mode is actually intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [None]:
import copy
from pathlib import Path
import warnings

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

import torch
import torch.nn.functional as F

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [None]:
# pip install -U "u8darts[torch]"

In [None]:
import pytorch_forecasting
# Display the installed version (a commented-out install cell above pins 0.10.2).
pytorch_forecasting.__version__

### Reproducibility

In [None]:
# Pin every random number generator the notebook touches to seed 0 so that a
# rerun draws the same numbers.
import random

import numpy as np
import torch

for seed_rng in (pl.seed_everything, random.seed, np.random.seed):
    seed_rng(0)

# Kept as the final expression of the cell, so its return value is still displayed.
torch.manual_seed(0)


In [None]:
from pathlib import Path

# Input files live in the "data" folder next to the current working directory.
data_path = Path.cwd().parent.joinpath("data")
# The first CSV column becomes the row index.
df_m6 = pd.read_csv(data_path.joinpath("template/M6_Universe.csv"), index_col=0)
df_m6.head()

In [None]:
# Split the universe's symbols by the value of the "class" column.
asset_class = df_m6["class"]
stocks = df_m6.loc[asset_class == "Stock", "symbol"].values
etfs = df_m6.loc[asset_class == "ETF", "symbol"].values

In [None]:
# NOTE(review): SAMPLE_SIZE is not referenced by any cell visible in this notebook — confirm it is still needed.
SAMPLE_SIZE = 100
FORECAST_HORIZON = 20  # days
# Passed as `periods` to the return helpers and used as the feature shift further down.
PERIODS = 20

In [None]:
# import numpy as np
# import pandas as pd
# import yfinance as yf
# import warnings

# warnings.filterwarnings("ignore")
# pd.options.display.float_format = '{:.4%}'.format

# # Date range
# start = '2020-01-01'
# end = '2022-04-30'

# # Tickers of assets
# df_m6 = pd.read_csv("M6_Universe.csv", index_col=0)
# df_m6.head(5)
# assets = list(df_m6["symbol"].values)

# # Downloading data
# data = yf.download(assets, start = start, end = end)
# data = data.loc[:,('Adj Close', slice(None))]
# data.columns = assets

In [None]:
pwd 

In [None]:
import pyrootutils
# Called with dotenv=True and pythonpath=True; the `src.*` imports in later
# cells presumably rely on the path this sets up — TODO confirm.
root = pyrootutils.setup_root("..", dotenv=True, pythonpath=True)

In [None]:
from tqdm.notebook import tqdm
from src.io import get_ticker_historical_data
import pandas_datareader as pdr

# Download daily price history for every symbol of the universe into
# `tickers_data` (symbol -> DataFrame); optionally dump each frame to CSV.

directory = './tickers'
save = False  # set to True to also write one CSV per ticker into `directory`

# exist_ok replaces the separate existence check.
os.makedirs(directory, exist_ok=True)

# Whole-value replacement: `.str.replace("FB", "META")` would also rewrite any
# symbol that merely contains the substring "FB".
tickers = df_m6["symbol"].replace({"FB": "META"}).to_list()
tickers_data = dict()
from_date = pd.to_datetime("2018-01-01")

# Midnight of the current New York date, as a timezone-naive timestamp.
# Timestamps are immutable, so tz_localize / tz_convert / replace must have
# their result assigned; calling them as bare statements changes nothing.
# Taking "now" directly in the target zone also removes the assumption that
# the machine's local clock is in a particular timezone.
to_date = pd.Timestamp.now(tz='America/New_York').normalize().tz_localize(None)

# to_date = pd.to_datetime("2022-04-30")
interval = '1d'

for ticker in tqdm(tickers):
    # This returns a data frame of scraped stock data from yahoo
    data = pdr.DataReader(ticker, 'yahoo', from_date, to_date)
    tickers_data[ticker] = data
    if save:
        data.reset_index().to_csv(os.path.join(directory, f'{ticker}_{interval}.csv'))

In [None]:
from typing import Optional
from src.time_features import reindex_weekdays

# Pass every downloaded frame through the project's weekday re-indexing helper,
# using 2018-01-01 as the start of the index.
weekday_start = pd.to_datetime("2018-01-01")
for symbol in list(tickers_data):
    tickers_data[symbol] = reindex_weekdays(tickers_data[symbol], start_index=weekday_start)

In [None]:
# Spot-check the last rows of one ticker's frame.
tickers_data["DRE"].tail()

In [None]:
from src.ticket_features import calculate_pct_returns, calculate_log_returns, calculate_cum_log_returns, calculate_cum_pct_returns

# Wide frame of adjusted closes: one column per ticker.
adj_close_by_ticker = {symbol: frame['Adj Close'] for symbol, frame in tickers_data.items()}
df = pd.DataFrame.from_dict(adj_close_by_ticker)

# Each return helper is applied column by column with the shared PERIODS setting.
column_wise = dict(periods=PERIODS, axis=0)
df_cum_log_returns = df.apply(calculate_cum_log_returns, **column_wise)
df_cum_prt_returns = df.apply(calculate_cum_pct_returns, **column_wise)
df_log_returns = df.apply(calculate_log_returns, **column_wise)
df_prc_returns = df.apply(calculate_pct_returns, **column_wise)

### Reindex dates and fill in with previous values 

In [None]:
# Label every (date, ticker) cell with the quantile bucket (0 = lowest) of that
# ticker's PERIODS-return among all tickers on the same date.
#
# The noise length and the bucket width are derived from the number of columns
# instead of being hard-coded to 100 tickers / 20 per bucket, so the cell also
# works when the universe is a different size. For 100 columns the result is
# the same as with the literals.
n_quantiles = 5
assets_per_quantile = df.shape[1] / n_quantiles

df_stock_returns_quantiles = (
    df.copy()
    .apply(calculate_pct_returns, periods=PERIODS, axis=0)
    # Tiny jitter so that exactly equal returns do not share a rank.
    .apply(lambda row: row + np.random.normal(0, 1e-12, size=len(row)), axis=1)
    # Drop dates on which any ticker has no return.
    .dropna()
    .rank(axis=1, ascending=True, method='min')
    # The epsilon makes a rank that is an exact multiple of the bucket width
    # fall into the lower bucket (e.g. rank 20 of 100 -> bucket 0).
    .floordiv(assets_per_quantile + 1e-12)
    .clip(upper=n_quantiles - 1)
    .astype(int)
)

In [None]:
# Make a pipeline with the steps
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from src.transformers import DateTimeTransformer, periodic_spline_transformer
from src.reduce_memory import ReduceMemoryTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from src.time_features import get_datetime_covariates
from src.strategy import CustomStrategy1, CustomStrategy
from src.ticket_features import upper_shadow, lower_shadow, upper_shadow_percent, lower_shadow_percent

tickers_data_enriched = {}

date_time_transforms = make_pipeline(
    DateTimeTransformer()
)

memory_transforms = make_pipeline(
    ReduceMemoryTransformer()
)

for k, v in tickers_data.items():
    df = v.copy()
    #df = reindex_weekdays(df, drop_nonweekdays=True, start_index=pd.to_datetime("2018-01-01"))
    # df.ta.strategy(CustomStrategy)
    # df.ta.percent_return(cumulative=False, append=True)
    # df.ta.percent_return(cumulative=False, length=PERIODS, append=True)
    # df[f"cum_log_returns_{PERIODS}"] = df[["Adj Close"]].apply(calculate_cum_log_returns, periods=PERIODS, axis=0).values
    # df[f"log_returns_{PERIODS}"] = df[["Adj Close"]].apply(calculate_log_returns, periods=PERIODS, axis=0).values
    # df[f"cum_prc_returns_{PERIODS}"] = df[["Adj Close"]].apply(calculate_cum_pct_returns, periods=PERIODS, axis=0).values
    # df[f"prc_returns_{PERIODS}"] = df[["Adj Close"]].apply(calculate_pct_returns, periods=PERIODS, axis=0).values
    df['high2low'] = df['High'] / df['Low']
    df['high_low'] = df['High'] - df['Low']
    # df[f'var_{PERIODS}'] = df['Adj Close'].rolling(20).var()
#     df['target_var'] = df[f'PCTRET_{PERIODS}'].var()
    df['upper_shadow'] = upper_shadow(df)
    df['lower_shadow'] = lower_shadow(df)
    df['upper_shadow_percent'] = upper_shadow_percent(df)
    df['lower_shadow_percent'] = lower_shadow_percent(df)    
    df["log_volume"] = np.log(df["Volume"] + 1e-8)
    df["log_high"] = np.log(df["High"] + 1e-8)
    df["log_low"] = np.log(df["Low"] + 1e-8)
    df = df.fillna(method="bfill")
    df = memory_transforms.fit_transform(df)
    
#     df["GICS_sector/ETF_type"] = df_m6[df_m6["symbol"]==k]["GICS_sector/ETF_type"].values[0]
#     df["GICS_industry/ETF_subtype"] = df_m6[df_m6["symbol"]==k]["GICS_industry/ETF_subtype"].values[0]
    df["group"] = k
#     df["ticket"] = "stock" if k in stocks else "etf"
    #df["month"] = df.index.month #.astype(str).astype("category")  # categories have be strings
    #df["day_of_week"] = df.index.day_of_week #.astype(str).astype("category")  # categories have be strings
    #     scaler = MinMaxScaler() #StandardScaler()
    #     df_scaled = pd.DataFrame(data=scaler.fit_transform(df), 
    #                              index=df.index,
    #                              columns=df.columns)
    #     df_scaled.dropna(inplace=True)
    #     tickers_data_enriched[k] = df_scaled
    tickers_data_enriched[k] = df#[df_stock_returns_quantiles.index[0]:]

In [None]:
# Date/time covariates from a fixed start date up to the last labelled date,
# then passed through the weekday re-indexing helper with drop_nonweekdays=True.
end_index = df_stock_returns_quantiles.index[-1]
start_index = pd.Timestamp("2019-12-01")

covariates = reindex_weekdays(
    get_datetime_covariates(start_index, end_index),
    drop_nonweekdays=True,
)

In [None]:
# tickers_data_enriched = {k: tickers_data_enriched[k].drop("group", axis=1) for k in tickers_data_enriched.keys()}

In [None]:
def _target_with_lagged_features(symbol):
    """Join one ticker's "target" column with its feature frame shifted by PERIODS rows."""
    target = (
        df_stock_returns_quantiles[[symbol]]
        .astype(int)
        .rename(columns={symbol: "target"})
    )
    lagged_features = tickers_data_enriched[symbol].shift(PERIODS)
    joined = pd.concat([target, lagged_features], axis=1)
    # Rows in which every column is missing carry no information.
    return joined.dropna(how="all", axis=0)


# Long format: the per-ticker frames are stacked on top of each other.
# (The date covariates are currently not joined in.)
data = pd.concat(
    [_target_with_lagged_features(symbol) for symbol in tickers_data_enriched.keys()]
)

In [None]:
# Keep only the rows in which every column (target and all features) is present.
data = data.dropna()

In [None]:
# Preview the index label found at half of (row count / 100); the literal 100
# presumably stands for the number of tickers — TODO confirm.
data.index[int((data.shape[0]/100)*0.5)]

In [None]:
from datetime import timedelta

# Boundaries of the train / test date ranges.
# `data` stacks the tickers on top of each other, so position 0 and position -1
# belong to the first and to the last ticker respectively.
train_start = data.index[0]
test_end = data.index[-1]

# Share of one ticker's rows that goes to training.
TRAIN_FRACTION = 0.5

# Number of rows per ticker, derived from the data instead of assuming exactly
# 100 tickers. With 100 tickers this gives the same cutoff as the literal did.
n_tickers = data["group"].nunique()
rows_per_ticker = data.shape[0] / n_tickers
training_cutoff = data.index[int(rows_per_ticker * TRAIN_FRACTION)]

# +1 day -20 days: the test range starts 19 calendar days before the training
# cutoff, so the two ranges overlap.
# NOTE(review): presumably this leaves history for the first test windows —
# confirm that the overlap is intended.
test_cutoff = training_cutoff + timedelta(days=1) - timedelta(days=20)
training_cutoff

In [None]:
from sklearn.model_selection import train_test_split

groupby = data.groupby("group")


def _date_window(first, last):
    """Build a per-group callable that keeps the rows labelled first..last and drops "group"."""
    def _take(group):
        return group.loc[first:last].drop(columns=["group"])
    return _take


# Each ticker is cut at the same dates; the results are indexed by (group, date).
groupby_train = groupby.apply(_date_window(train_start, training_cutoff))
groupby_test = groupby.apply(_date_window(test_cutoff, test_end))

In [None]:
# List the distinct day-of-week numbers that occur among the training dates.
groupby_train.index.get_level_values(1).dayofweek.unique()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Feature matrices: every column except "target".
train_features = groupby_train.drop(columns=["target"]).values
test_features = groupby_test.drop(columns=["target"]).values

# The scaler is fitted on the training features only and then reused for test.
X_scaler = MinMaxScaler()
X_train = X_scaler.fit_transform(train_features)
X_test = X_scaler.transform(test_features)

In [None]:
# Write the scaled values back into every column except "target".
for frame, scaled in ((groupby_train, X_train), (groupby_test, X_test)):
    frame.loc[:, frame.columns != "target"] = scaled

In [None]:
import tqdm

# Number of consecutive dates that make up one input window.
LOOK_BACK = 20


def build_window_batches(frame, look_back, desc=None, verbose=False):
    """Cut a (ticker, date)-indexed frame into sliding windows with next-date targets.

    For every start position ``i`` the window covers ``dates[i]`` up to and
    including ``dates[i + look_back - 1]``; its label is the "target" value at
    ``dates[i + look_back]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose index has the ticker on level 0 and the date on level 1,
        with one "target" column and feature columns otherwise.
    look_back : int
        Number of dates per window.
    desc : str, optional
        Label shown on the progress bar.
    verbose : bool, default False
        Print the date range of every window.

    Returns
    -------
    (batches_X, batches_Y) : tuple of nested lists
        ``batches_X[w][t]`` holds the feature rows of ticker ``t`` in window
        ``w``; ``batches_Y[w][t]`` holds the matching target rows.
    """
    is_target = frame.columns == "target"
    dates = frame.index.get_level_values(1).unique()
    tickets = frame.index.get_level_values(0).unique()
    expected_shape = (look_back, int((~is_target).sum()))

    batches_X, batches_Y = [], []
    # Window length and target offset both come from `look_back`; mixing two
    # different constants here only works while they happen to be equal.
    for i in tqdm.tqdm(range(len(dates) - look_back), desc=desc):
        start = dates[i].date()
        end = dates[i + look_back - 1].date()
        next_target = dates[i + look_back].date()
        if verbose:
            print(f"{start}:{end} => {next_target}")

        # The date slices do not depend on the ticker, so take them once per
        # window instead of once per ticker.
        window_rows = frame.xs(slice(start, end), level=1)
        target_rows = frame.xs(slice(next_target, next_target), level=1)

        batch_X, batch_Y = [], []
        for ticket in tickets:
            # Features are rounded through float16, as before.
            df_X = window_rows.loc[window_rows.index == ticket, ~is_target].astype(np.float16)
            df_Y = target_rows.loc[target_rows.index == ticket, is_target].astype(int)
            # Incomplete windows are reported, not dropped.
            if df_X.shape != expected_shape:
                print(f"{ticket} shape: {df_X.shape}")
            if df_Y.empty:
                print(f"{ticket}")
                print(df_Y.values)
            batch_X.append(df_X.values.tolist())
            batch_Y.append(df_Y.values.tolist())
        batches_X.append(batch_X)
        batches_Y.append(batch_Y)
    return batches_X, batches_Y


train_batches_X, train_batches_Y = build_window_batches(groupby_train, LOOK_BACK, desc="train windows")
test_batches_X, test_batches_Y = build_window_batches(groupby_test, LOOK_BACK, desc="test windows")

In [None]:

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Features become float32 tensors and labels int64 tensors, all created on `device`.
feature_kw = dict(dtype=torch.float32, device=device)
label_kw = dict(dtype=torch.int64, device=device)

train_batches_X = torch.tensor(train_batches_X, **feature_kw)
train_batches_Y = torch.tensor(train_batches_Y, **label_kw)
test_batches_X = torch.tensor(test_batches_X, **feature_kw)
test_batches_Y = torch.tensor(test_batches_Y, **label_kw)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from src.mv_cnn_train_utils import RPSLoss, train_model, run_epoch, batchify_data

batch_size = 32

X_train, y_train = train_batches_X, train_batches_Y
X_test, y_test = test_batches_X, test_batches_Y

# The first 8/10 of the windows stay in train; the remainder becomes dev.
dev_split_index = int(8 * len(X_train) / 10)
head, tail = slice(None, dev_split_index), slice(dev_split_index, None)
# Both right-hand sides are evaluated before either name is rebound.
X_train, X_dev = X_train[head, :, :, :], X_train[tail, :, :, :]
y_train, y_dev = y_train[head, :, :, :], y_train[tail, :, :, :]

# Batch the three splits in the order train, dev, test.
train_batches, dev_batches, test_batches = (
    batchify_data(features, labels, batch_size)
    for features, labels in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test))
)

In [None]:
from src.mv_cnn_model import MultivariateCNN, MultivariateMLP
import torch

N_CLASSES = 5
# The three dimensions of a single training window, read off the first one.
N_SERIES, N_DIM1, N_DIM2 = train_batches_X[0].shape

cnn_settings = dict(
    input_dimension=(N_DIM1, N_DIM2),
    in_channels=N_SERIES,
    n_outputs=N_CLASSES,
    n_cnn_layers=2,
    conv_kernel_size=3,
    pool_kernel_size=2,
)
model = MultivariateCNN(**cnn_settings).to(device)
print(model)

# Adam with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# train_model returns the model that is used by the cells below.
model = train_model(train_batches, dev_batches, model, optimizer, n_epochs=100)

In [None]:
## Evaluate the model on test data
loss = run_epoch(test_batches, model.eval(), None)
print(f"Valid | mean loss {np.mean(loss):.6f} | std loss: {np.std(loss):.6f}")

In [None]:
## Evaluate the model on test data
loss = run_epoch(test_batches, model.eval(), None)
[print('Valid | loss{}: {:.6f} '.format(i, loss[i])) for i in range(len(loss))]

In [None]:
# Mean of the losses after replacing every value that is not below 0.15 with 0.16.
# NOTE(review): the two thresholds are unexplained magic numbers — confirm their meaning.
np.where(np.array(loss)<0.15, np.array(loss), 0.16).mean()

In [None]:
# Run the model on the first training batch and display its raw output.
x, y = train_batches[0]['x'], train_batches[0]['y']
preds = model(x)
preds#.shape

In [None]:
# Softmax along dim 1 of the first element of `preds`.
F.softmax(preds[0], dim=1)

In [None]:
# Cumulative softmax of preds[0] minus the cumulative one-hot encoding of the
# labels taken at y[:, 1, 0, 0], both accumulated along the last axis.
# NOTE(review): looks like the per-class gap of a ranked-probability-style
# score (cf. the imported RPSLoss) — confirm.
diff = torch.cumsum(F.softmax(preds[0], dim=-1), dim=-1) - torch.cumsum(F.one_hot(y[:,1,0,0].to(torch.int64), num_classes=5), dim=-1)

In [None]:
# Both stacked entries are the same expression, so this equals the mean of diff**2.
# NOTE(review): confirm whether the second entry was meant to be a different term.
torch.mean(torch.stack([torch.mean(diff**2, dim=0), torch.mean(diff**2, dim=0)]))#, axis=1)

In [None]:
# Mean of `diff` along axis 1.
torch.mean(diff, axis=1)#.detach().numpy().mean()