In [None]:
# pip install watermark lightgbm plotly cufflinks numpy pandas optuna torch pandas_ta gluonts pandas_datareader

In [None]:
# pip install -U git+https://github.com/unit8co/darts.git@master

In [None]:
# Notebook-wide IPython setup:
# 1. render matplotlib figures inline
# 2. (re)load the `watermark` extension, used in the next cell to print version info
# 3. enable retina (high resolution) plots
#    https://gist.github.com/minrk/3301035
# NOTE(review): this comment used to list a fourth magic "so that the notebook will
# reload external python modules", but no autoreload magic is present in this cell --
# add `%load_ext autoreload` / `%autoreload 2` if live reloading of local modules is wanted.
%matplotlib inline
%reload_ext watermark
%config InlineBackend.figure_format='retina'

In [None]:
%watermark

In [None]:
# conda install -c conda-forge 'u8darts'

### Library imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import darts
import pandas as pd
import numpy as np 
from datetime import datetime
import numpy as np

import plotly
import plotly.express as px
import plotly.graph_objects as go

# pip install matplotlib==3.1.2
import matplotlib
import matplotlib.pyplot as plt

import plotly.offline
import cufflinks as cf
# Switch cufflinks to offline plotting.
cf.go_offline()
# NOTE(review): this passes offline=False immediately after go_offline(); the two
# settings look contradictory -- confirm which mode is actually intended.
cf.set_config_file(offline=False, world_readable=True)

In [None]:
# pip install -U "u8darts[torch]"

In [None]:
darts.__version__

### Reproducibility

In [None]:
# Seed every random number generator this notebook relies on (Python, NumPy,
# PyTorch) with one shared value so that repeated runs are reproducible.
import random

import numpy as np
import torch

SEED = 0

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
df_m6 = pd.read_csv("M6_Universe.csv", index_col=0)
df_m6.head(5)

In [None]:
# Split the M6 universe by asset class; each result is an array of ticker symbols.
is_stock = df_m6["class"] == "Stock"
is_etf = df_m6["class"] == "ETF"
stocks = df_m6.loc[is_stock, "symbol"].values
etfs = df_m6.loc[is_etf, "symbol"].values

In [None]:
SAMPLE_SIZE = 100
FORECAST_HORIZON = 28 #days
PERIODS = 28

In [None]:
%%time 

from tqdm.notebook import tqdm
from utils import get_ticker_historical_data
import pandas_datareader as pdr

# Download the daily price history of the first SAMPLE_SIZE tickers of the M6 universe.
directory = './tickers'
save = False  # set to True to also write every downloaded frame to `directory` as CSV

# exist_ok=True replaces the separate "does it exist?" check
os.makedirs(directory, exist_ok=True)

tickers = df_m6["symbol"].to_list()
tickers_data = dict()

# Fixed download window, so that every run works on the same history.
# (The previous version also built a "today" timestamp, but the results of its
# tz_localize()/replace() calls were never assigned and the value was overwritten
# by the fixed end date right afterwards, i.e. those statements had no effect.)
from_date = pd.to_datetime("2000-01-01")
to_date = pd.to_datetime("2022-05-01")
interval = '1d'

for ticker in tqdm(tickers[:SAMPLE_SIZE]):
    # This returns a data frame of scraped stock data from yahoo.
    # (A commented-out alternative used utils.get_ticker_historical_data with the
    # same ticker / from_date / to_date / interval arguments.)
    data = pdr.DataReader(ticker, 'yahoo', from_date, to_date)
    tickers_data[ticker] = data
    if save:
        data.reset_index().to_csv(os.path.join(directory, f'{ticker}_{interval}.csv'))

In [None]:
def calculate_pct_returns(x: pd.Series, periods: int) -> pd.Series:
    """Gross return of ``x`` over ``periods`` rows.

    Computed as ``1 + x.pct_change(periods)``: a value of 1.0 means "unchanged",
    2.0 means the value doubled. Rows without a reference value are NaN.
    """
    relative_change = x.pct_change(periods=periods)
    return 1 + relative_change

def calculate_cum_pct_returns(x: pd.Series, periods: int) -> pd.Series:
    """Cumulative product of the ``periods``-row gross returns, as a percentage.

    The gross returns ``1 + x.pct_change(periods)`` are compounded with
    ``cumprod``; the result is expressed as percent change (``(p - 1) * 100``).
    """
    gross = 1 + x.pct_change(periods=periods)
    compounded = gross.cumprod()
    return (compounded - 1) * 100

def calculate_cum_log_returns(x: pd.Series, periods: int) -> pd.Series:
    """Running sum of the ``periods``-row log returns of ``x``.

    Each log return is ``log(1 + x.pct_change(periods))``; the series returned is
    their cumulative sum.
    """
    log_growth = np.log(1 + x.pct_change(periods=periods))
    return log_growth.cumsum()

def calculate_log_returns(x: pd.Series, periods: int) -> pd.Series:
    """Log return of ``x`` over ``periods`` rows: ``log(1 + x.pct_change(periods))``."""
    gross = 1 + x.pct_change(periods=periods)
    return np.log(gross)

# Adjusted close prices, one column per downloaded ticker.
adj_close_by_ticker = {}
for symbol, history in tickers_data.items():
    adj_close_by_ticker[symbol] = history['Adj Close']
df = pd.DataFrame.from_dict(adj_close_by_ticker)

# Column-wise return transforms, each computed over a lag of PERIODS rows.
df_stock_prc_returns = df.apply(calculate_pct_returns, axis=0, periods=PERIODS)
df_stock_log_returns = df.apply(calculate_log_returns, axis=0, periods=PERIODS)
df_stock_cum_prt_returns = df.apply(calculate_cum_pct_returns, axis=0, periods=PERIODS)
df_stock_cum_log_returns = df.apply(calculate_cum_log_returns, axis=0, periods=PERIODS)

In [None]:
df_stock_returns = df_stock_prc_returns.copy()

#### Predicting Ranks

In [None]:
# df_stock_returns_quantiles = df_stock_returns.dropna().apply(lambda x: (x.rank(ascending=True) // 20 +1).clip(upper=5), axis=0).astype(int)
# for ticket in df_stock_returns_quantiles.columns:
#     df_stock_returns_quantiles[[ticket]].plot()#(kind='hist')
#     plt.show()

### Reindex dates and fill in with previous values 

In [None]:
from gluonts.time_feature.holiday import (
    squared_exponential_kernel,
    SpecialDateFeatureSet,
    NEW_YEARS_DAY,
    MARTIN_LUTHER_KING_DAY,
    PRESIDENTS_DAY,
    GOOD_FRIDAY,
    MEMORIAL_DAY,
    INDEPENDENCE_DAY,
    LABOR_DAY,
    THANKSGIVING,
    CHRISTMAS_DAY,
    SUPERBOWL,
    CHRISTMAS_EVE,
    EASTER_SUNDAY,
    EASTER_MONDAY,
    MOTHERS_DAY,
    COLUMBUS_DAY,
    NEW_YEARS_EVE,
    BLACK_FRIDAY,
    CYBER_MONDAY
)

# Two gluonts special-date feature sets that share one squared exponential kernel
# (alpha=1.0): `sfs` covers the first group of dates, `sfs2` the remaining ones.
kernel = squared_exponential_kernel(alpha=1.0)

_holidays = [
    NEW_YEARS_DAY,
    MARTIN_LUTHER_KING_DAY,
    PRESIDENTS_DAY,
    GOOD_FRIDAY,
    MEMORIAL_DAY,
    INDEPENDENCE_DAY,
    LABOR_DAY,
    THANKSGIVING,
    CHRISTMAS_DAY,
]
_other_holidays = [
    SUPERBOWL,
    CHRISTMAS_EVE,
    EASTER_SUNDAY,
    EASTER_MONDAY,
    MOTHERS_DAY,
    COLUMBUS_DAY,
    NEW_YEARS_EVE,
    BLACK_FRIDAY,
    CYBER_MONDAY,
]

sfs = SpecialDateFeatureSet(_holidays, kernel)
sfs2 = SpecialDateFeatureSet(_other_holidays, kernel)

In [None]:
import pandas_ta as ta

# Create our own Custom Strategy: the set of pandas_ta indicators that is applied to
# every ticker frame further down (via `df.ta.strategy(CustomStrategy)`). All price
# indicators read the "Adj Close" column; the last entry smooths "Volume".
# NOTE(review): the `description` string mentions "SMA 50,200", but the list below
# configures SMA 20 and 5 plus EMA 8 and 21 (the SMA 200 / EMA 50 entries are
# commented out) -- confirm which one is intended.
CustomStrategy = ta.Strategy(
    name="Momo and Volatility",
    description="SMA 50,200, BBANDS, RSI, MACD and Volume SMA 20",
    ta=[
        {"kind": "sma", "length": 20, "close": "Adj Close"},
        {"kind": "sma", "length": 5, "close": "Adj Close"},
        #{"kind": "sma", "length": 200, "close": "Adj Close"},
        {"kind": "ema", "length": 8, "close": "Adj Close"},
        {"kind": "ema", "length": 21, "close": "Adj Close"},
#         {"kind": "ema", "length": 50, "close": "Adj Close"},
        {"kind": "bbands", "length": 20, "close": "Adj Close"},
        {"kind": "rsi", "close": "Adj Close"},
        {"kind": "stochrsi", "length": 14, "close": "Adj Close"},
        {"kind": "macd", "fast": 8, "slow": 21, "close": "Adj Close"},
        # NOTE(review): "fast"/"slow" may not be parameter names of pandas_ta's
        # stoch indicator -- verify that these two values actually take effect.
        {"kind": "stoch", "fast": 9, "slow": 6, "close": "Adj Close"},
        {"kind": "macd", "fast": 12, "slow": 26, "close": "Adj Close"},
        {"kind": "sma", "close": "Volume", "length": 20, "prefix": "Volume"},
    ]
)

# Candlestick shape features: absolute and relative size of the upper and lower shadows
def upper_shadow(df):
    """Absolute upper candle shadow: ``High`` minus the larger of ``Close``/``Open``."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top


def lower_shadow(df):
    """Absolute lower candle shadow: the smaller of ``Close``/``Open`` minus ``Low``."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']
                
def upper_shadow_percent(df):
    """Upper candle shadow relative to the body top: ``High / max(Close, Open) - 1``."""
    body_top = np.maximum(df['Close'], df['Open'])
    return (df['High'] / body_top) - 1


def lower_shadow_percent(df):
    """Lower candle shadow relative to the low: ``min(Close, Open) / Low - 1``."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return (body_bottom / df['Low']) - 1
                        

In [None]:
# Make a pipeline with the steps
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from transformers import DateTimeTransformer, periodic_spline_transformer
from reduce_memory import ReduceMemoryTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Single-step sklearn pipelines around the project's custom transformers. The step
# names are the ones make_pipeline would generate (lower-cased class names).
date_time_transforms = Pipeline(
    steps=[("datetimetransformer", DateTimeTransformer())]
)
memory_transforms = Pipeline(
    steps=[("reducememorytransformer", ReduceMemoryTransformer())]
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from nyse_holidays import NYSECalendar

def get_datetime_covariates(start_index, end_index) -> pd.DataFrame:
    """Build a daily frame of calendar-based covariates for [start_index, end_index].

    The frame contains 0/1 flags for weekends and NYSE holidays, two special-date
    features taken from the module-level `sfs` and `sfs2` feature sets, whatever
    `date_time_transforms` adds, and periodic spline encodings of the `month` and
    `day_of_week` columns. After `memory_transforms`, every column is MinMax-scaled.

    NOTE(review): the MinMaxScaler is created and fit inside every call, on the
    requested date range only, so frames built for different ranges are not scaled
    consistently with each other -- confirm this is acceptable for all callers.

    Parameters
    ----------
    start_index, end_index
        First and last date of the range; passed to `pd.date_range` (daily
        frequency) and to `NYSECalendar.holidays`.

    Returns
    -------
    pd.DataFrame
        The scaled covariates, with the index and columns of the frame returned
        by `memory_transforms`.
    """
    calendar = NYSECalendar()
    index = pd.date_range(start=start_index, end=end_index, freq='D')
    holiday_dates = calendar.holidays(start_index, end_index, return_name=True).index
    covariates = pd.DataFrame(index=index)
    # both indicator columns start at 0; matching days are then flagged with 1
    covariates.loc[:, ['one_hot_weekends', 'one_hot_holidays']] = 0
    covariates.loc[covariates.index.isin(holiday_dates), 'one_hot_holidays'] = 1 
    covariates.loc[covariates.index.day_name().isin(['Saturday', 'Sunday']),'one_hot_weekends'] = 1
    # maximum over axis 0 of the feature-set output
    # (assumes one row per special date and one column per day -- TODO confirm)
    covariates.loc[:,'kernel_holidays'] = sfs(covariates.index).max(axis=0) # np.prod(sfs(covariates.index), axis=1)
    covariates.loc[:,'kernel_other_holidays'] = sfs2(covariates.index).max(axis=0)
    covariates = covariates.round(3)

    # presumably adds the 'month' and 'day_of_week' columns read just below --
    # verify against DateTimeTransformer
    covariates = date_time_transforms.fit_transform(covariates)
    month_splines = periodic_spline_transformer(12, n_splines=6).fit_transform(covariates[['month']])
    weekday_splines = periodic_spline_transformer(7, n_splines=3).fit_transform(covariates[['day_of_week']])
    # month splines first, weekday splines after; columns are named spline_0, spline_1, ...
    splines = np.concatenate((month_splines, weekday_splines), axis=1)
    spline_names = [f"spline_{i}" for i in range(splines.shape[1])]
    covariates.loc[:, spline_names] = splines
    covariates = memory_transforms.fit_transform(covariates)
    
    # scale every column with a scaler fit on this frame only (see NOTE in the docstring)
    scaler = MinMaxScaler() #StandardScaler()
    covariates = pd.DataFrame(data=scaler.fit_transform(covariates), 
                              index=covariates.index, 
                              columns=covariates.columns)

    return covariates

In [None]:
# Put the returns on a continuous daily calendar; dates that are missing from the
# original index take the last available value.
start_index = df_stock_returns.index[0]
end_index = df_stock_returns.index[-1]
# end_index = pd.Timestamp("2022-03-06")
returns_calendar = pd.date_range(start=start_index, end=end_index, freq='D')
df_stock_returns = df_stock_returns.reindex(returns_calendar).ffill()

In [None]:
covariates = get_datetime_covariates(start_index, end_index)

In [None]:
np.unique(df_m6[["GICS_sector/ETF_type"]].values)

In [None]:
# (This cell contained only a stray "," which raises a SyntaxError and stops
# "Restart Kernel -> Run All"; it is intentionally left without code.)

In [None]:
from sklearn import preprocessing

# One integer label encoder per categorical attribute of the M6 universe, fitted
# on the distinct values found in the corresponding column.
sector_labels = np.unique(df_m6["GICS_sector/ETF_type"].to_numpy())
industry_labels = np.unique(df_m6["GICS_industry/ETF_subtype"].to_numpy())

le_sector = preprocessing.LabelEncoder()
le_industry = preprocessing.LabelEncoder()
le_sector.fit(sector_labels)
le_industry.fit(industry_labels)

In [None]:
# For every ticker, build a MinMax-scaled frame of past covariates: technical
# indicators, return features, candle-shape features and static ticker attributes.
tickers_data_enriched = {}

for symbol, history in tickers_data.items():
    features = history.copy()

    # indicators of the custom strategy plus the one-step percent return
    features.ta.strategy(CustomStrategy)
    features.ta.percent_return(cumulative=False, append=True)
#     features.ta.percent_return(cumulative=False, length=PERIODS, append=True)

    # continuous daily calendar up to the shared end_index: gaps take the previous
    # value, anything still missing at the start takes the next one
    ticker_calendar = pd.date_range(start=features.index[0], end=end_index, freq='D')
    features = features.reindex(ticker_calendar).ffill().bfill()

    features['high2low'] = features['High'] / features['Low']

    # Whole-history dispersion of the adjusted close, broadcast to every row.
    # NOTE(review): these two columns (and the static attributes below) are constant
    # within a ticker, so the per-ticker scaling further down leaves them without
    # any variation -- confirm they are useful as covariates.
    adj_close = features['Adj Close']
    features['std'] = adj_close.std()
    features['var'] = adj_close.var()

    adj_close_frame = features[["Adj Close"]]
    features[f"cum_log_returns_{PERIODS}"] = adj_close_frame.apply(
        calculate_cum_log_returns, periods=PERIODS, axis=0
    ).values
    features[f"log_returns_{PERIODS}"] = adj_close_frame.apply(
        calculate_log_returns, periods=PERIODS, axis=0
    ).values

    features['upper_shadow'] = upper_shadow(features)
    features['lower_shadow'] = lower_shadow(features)
    features['upper_shadow_percent'] = upper_shadow_percent(features)
    features['lower_shadow_percent'] = lower_shadow_percent(features)

    # static attributes of the ticker, label-encoded from its row in the universe table
    universe_row = df_m6[df_m6["symbol"] == symbol]
    features["GICS_sector/ETF_type"] = le_sector.transform(
        universe_row["GICS_sector/ETF_type"].values
    )[0]
    features["GICS_industry/ETF_subtype"] = le_industry.transform(
        universe_row["GICS_industry/ETF_subtype"].values
    )[0]
    #     features["group"] = symbol
    features["ticket"] = 1 if symbol in stocks else 0  # 1 when the symbol is in `stocks`
    features["log_volume"] = np.log(features["Volume"] + 1e-8)

    features = memory_transforms.fit_transform(features)

    # scale every column to [0, 1] per ticker, then drop rows that still hold NaN
    scaler = MinMaxScaler() #StandardScaler()
    scaled_values = scaler.fit_transform(features)
    df_scaled = pd.DataFrame(data=scaled_values,
                             index=features.index,
                             columns=features.columns)
    tickers_data_enriched[symbol] = df_scaled.dropna()

In [None]:
from darts import TimeSeries
from darts.dataprocessing.transformers import (
    Scaler,
    MissingValuesFiller,
    Mapper,
    InvertibleMapper,
)
# Per-ticker containers, all aligned by position with df_stock_returns.columns.
scaled_series = list()
future_covariates = list()
past_covariates = list()
scalers = list()


def _as_daily_series(frame):
    """Turn a frame with an unnamed date index into a daily darts TimeSeries."""
    return TimeSeries.from_dataframe(frame.reset_index(),
                                     time_col='index',
                                     fill_missing_dates=False,
                                     freq='D')


for column in tqdm(df_stock_returns.columns):
    # target: the ticker's return column without missing rows
    target = df_stock_returns[[column]].dropna(axis=0)
    first_day, last_day = target.index[0], target.index[-1]

    # scale the target with its own scaler (kept for inverting forecasts later)
    # and fill whatever is still missing by quadratic interpolation
    scaler = Scaler()
    filler = MissingValuesFiller()
    scaled_target = scaler.fit_transform(_as_daily_series(target))
    scaled_series.append(filler.transform(scaled_target, method="quadratic"))
    scalers.append(scaler)

    # covariates restricted to the date span of the target
    past_window = tickers_data_enriched[column].loc[first_day:last_day, :]
    future_window = covariates.loc[first_day:last_day, :]
    past_covariates.append(_as_daily_series(past_window))
    future_covariates.append(_as_daily_series(future_window))

In [None]:
# from darts.utils.statistics import plot_acf, check_seasonality

# for serie in scaled_series[:1]:
#     plot_acf(serie, m=125, alpha=0.05, max_lag=540)

In [None]:
# for serie in scaled_series:
#     for m in range(2, 25):
#         is_seasonal, period = check_seasonality(serie, m=m, alpha=0.05)
#         if is_seasonal:
#             print("There is seasonality of order {}.".format(period))
#     print("")

In [None]:
# [serie.plot() for serie in scaled_series[:]]

In [None]:
# scaled_series[0].pd_dataframe()
# scaled_series[0].values()
# scaled_series[0].all_values()

### Future covariates

In [None]:
# from darts import concatenate
# from darts.utils.timeseries_generation import datetime_attribute_timeseries as dt_attr
# from darts.utils.timeseries_generation import holidays_timeseries as holiday_attr
# from darts.utils.timeseries_generation import linear_timeseries

# future_covs = [concatenate(
#                         [
#                             dt_attr(series.time_index, "month", one_hot=True, dtype=np.float32),
#                             #dt_attr(series.time_index, "month", cyclic=True, dtype=np.float32),
#                             dt_attr(series.time_index, "week", one_hot=True, dtype=np.float32),
#                             #dt_attr(series.time_index, "week", cyclic=True, dtype=np.float32),
#                             dt_attr(series.time_index, "weekday", one_hot=True, dtype=np.float32),
#                             #dt_attr(series.time_index, "weekday", cyclic=True, dtype=np.float32),
#                             dt_attr(series.time_index, "day", one_hot=True, dtype=np.float32),
#                             #dt_attr(series.time_index, "day", cyclic=True, dtype=np.float32),
#                             (dt_attr(series.time_index, "year", dtype=np.float32) - 2000) / 12,
#                             holiday_attr(series.time_index, country_code="US", dtype=np.float32),
#                             linear_timeseries(start=series.time_index[0], end=series.time_index[-1], dtype=np.float32)
#                         ],
#                             axis="component",
#                         ) for series in scaled_series]

# future_covs = [concatenate(
#                         [
#                             dt_attr(series.time_index, "month", cyclic=True, dtype=np.float32),
#                             dt_attr(series.time_index, "week", cyclic=True, dtype=np.float32),
#                             dt_attr(series.time_index, "weekday", cyclic=True, dtype=np.float32),
#                             dt_attr(series.time_index, "day_of_week", cyclic=True, dtype=np.float32),
#                             (dt_attr(series.time_index, "year", dtype=np.float32) - 2000) / 12,
#                             holiday_attr(series.time_index, country_code="US", dtype=np.float32),
#                             linear_timeseries(start=series.time_index[0], end=series.time_index[-1], dtype=np.float32)
#                         ],
#                             axis="component",
#                         ) for series in scaled_series]

# future_covs.plot()
# plt.title(
#     "one multivariate time series of 2 dimensions, containing covariates for the air series:"
# );

In [None]:
# Backtesting configuration
forecast_horizon = 28
start_split = 0.8  # split point handed to TimeSeries.split_before
input_chunk_length = 28
quantiles = [0.05, 0.5, 0.95]


def _split_all(series_list):
    """Split every series of ``series_list`` at ``start_split``."""
    return [serie.split_before(start_split) for serie in series_list]


# Each entry is the pair returned by split_before (part before / part after the split).
splited_series = _split_all(scaled_series)  # optionally only series with n_timesteps > 500
splited_past_covariates = _split_all(past_covariates)
splited_future_covariates = _split_all(future_covariates)


In [None]:
from typing import List
from utils import print_error_metrics
from darts.metrics import mape, r2_score, mse, rmse, mae
import matplotlib.pyplot as plt 
from tqdm import tqdm

def backtest_local_models(models,
                          scaled_series,
                          past_covariates,
                          future_covariates,
                          forecast_horizon,
                          start_split,
                          verbose=False) -> List:
    """Backtest one already-fitted model per series, without retraining.

    The four sequences are paired by position. For every pair the model's
    ``historical_forecasts`` is called with stride 1, ``retrain=False``,
    ``last_points_only=True`` and ``overlap_end=True``, starting at ``start_split``.

    Returns the list of historical forecasts, in input order.
    """
    jobs = list(zip(models, scaled_series, past_covariates, future_covariates))
    backtests = []
    for local_model, target, past_cov, future_cov in tqdm(jobs):
        forecast = local_model.historical_forecasts(
            series=target,
            past_covariates=past_cov,
            future_covariates=future_cov,
            forecast_horizon=forecast_horizon,
            start=start_split,
            stride=1,
            retrain=False,
            last_points_only=True,
            overlap_end=True,
            verbose=verbose,
        )
        backtests.append(forecast)
    return backtests

def backtest_global_model(model,
                          scaled_series,
                          past_covariates,
                          future_covariates,
                          forecast_horizon,
                          start_split,
                          verbose=False) -> List:
    """Backtest a single already-fitted model on every series, without retraining.

    Series and covariates are paired by position. ``historical_forecasts`` is
    called with stride 1, 100 samples, ``retrain=False``, ``last_points_only=True``
    and ``overlap_end=False``, starting at ``start_split``.

    Returns the list of historical forecasts, in input order.
    """
    jobs = list(zip(scaled_series, past_covariates, future_covariates))
    backtests = []
    for target, past_cov, future_cov in tqdm(jobs):
        forecast = model.historical_forecasts(
            series=target,
            past_covariates=past_cov,
            future_covariates=future_cov,
            forecast_horizon=forecast_horizon,
            start=start_split,
            stride=1,
            num_samples=100,
            retrain=False,
            last_points_only=True,
            overlap_end=False,
            verbose=verbose,
        )
        backtests.append(forecast)
    return backtests

def fit_local_models(models,
                     scaled_series,
                     past_covariates,
                     future_covariates) -> List:
    """Fit one model per series on the first element of every split.

    ``scaled_series``, ``past_covariates`` and ``future_covariates`` hold indexable
    splits (e.g. the pairs produced by ``split_before``); only element 0 of each is
    used for fitting. Returns ``models`` (fitted in place).
    """
    jobs = list(zip(models, scaled_series, past_covariates, future_covariates))
    for local_model, target_split, past_split, future_split in tqdm(jobs):
        train_target = target_split[0]
        train_past = past_split[0]
        train_future = future_split[0]
        local_model.fit(series=train_target,
                        past_covariates=train_past,
                        future_covariates=train_future)
    return models

def full_fit_local_models(models,
                          scaled_series,
                          past_covariates,
                          future_covariates) -> List:
    """Fit one model per series on the complete (unsplit) series and covariates.

    The four sequences are paired by position. Returns ``models`` (fitted in place).
    """
    jobs = list(zip(models, scaled_series, past_covariates, future_covariates))
    for local_model, target, past_cov, future_cov in tqdm(jobs):
        local_model.fit(series=target,
                        past_covariates=past_cov,
                        future_covariates=future_cov)
    return models

def predict_local_models(models,
                         forecast_horizon,
                         past_covariates,
                         future_covariates) -> List:
    """Forecast ``forecast_horizon`` steps with every fitted local model.

    Models and covariates are paired by position; each model predicts from the end
    of the series it was fitted on. Returns the list of forecasts, in input order.
    """
    jobs = list(zip(models, past_covariates, future_covariates))
    predictions = []
    for local_model, past_cov, future_cov in tqdm(jobs):
        predictions.append(
            local_model.predict(n=forecast_horizon,
                                past_covariates=past_cov,
                                future_covariates=future_cov)
        )
    return predictions

def predict_global_model(model,
                         targets,
                         forecast_horizon,
                         past_covariates,
                         future_covariates) -> List:
    """Forecast ``forecast_horizon`` steps for every target series with one global model.

    Parameters
    ----------
    model
        A fitted model whose ``predict`` accepts ``n``, ``series``,
        ``past_covariates`` and ``future_covariates``.
    targets
        The series to forecast, one per entry of the covariate sequences.
    forecast_horizon
        Number of steps to predict for every target.
    past_covariates, future_covariates
        Covariates paired with ``targets`` by position.

    Returns
    -------
    List
        One forecast per target, in input order.
    """
    jobs = list(zip(targets, past_covariates, future_covariates))
    predictions = []
    for target, past_cov, future_cov in tqdm(jobs):
        # The target has to be handed to the model explicitly: previously it was
        # only iterated over, so every forecast was made without `series` and the
        # covariates of each ticker were combined with the wrong target.
        predictions.append(
            model.predict(n=forecast_horizon,
                          series=target,
                          past_covariates=past_cov,
                          future_covariates=future_cov)
        )
    return predictions

def fit_global_model(model,
                     scaled_series,
                     past_covariates,
                     future_covariates) -> List:
    """Fit a single global model on the training part of all series at once.

    ``scaled_series``, ``past_covariates`` and ``future_covariates`` hold indexable
    splits (e.g. the pairs produced by ``split_before``), paired by position; only
    element 0 of each split is used.

    The model is fitted with ONE ``fit`` call that receives the lists of training
    series and covariates. Calling ``fit`` once per series (as was done before)
    refits the regression model used in this notebook on every call, which leaves
    a model trained on the last series only.

    Returns the fitted ``model`` (returned unchanged when there is nothing to fit).
    """
    training_rows = [
        (target_split[0], past_split[0], future_split[0])
        for target_split, past_split, future_split in zip(scaled_series,
                                                          past_covariates,
                                                          future_covariates)
    ]
    if not training_rows:
        # nothing to train on: keep the former "no series, no fit" behavior
        return model

    train_series, train_past, train_future = (list(column) for column in zip(*training_rows))
    model.fit(series=train_series,
              past_covariates=train_past,
              future_covariates=train_future)
    return model

def calculate_loss(scalers,
                   splited_series,
                   backtests,
                   log=False,
                   scaling=False) -> float:
    """Score backtests against the validation part (element 1) of every split.

    For each series RMSE and MAE are computed between the backtest and the part of
    the validation series that intersects it. With ``scaling=True`` both are first
    passed through the series' scaler ``inverse_transform``; with ``log=True`` both
    are then mapped through ``exp(x) - 1``.

    Prints mean and standard deviation of both metrics and returns the mean RMSE.
    """
    def _undo_log(value):
        return (np.exp(value) - 1)

    rmse_losses = []
    mae_losses = []
    for scaler, split, predicted in tqdm(list(zip(scalers, splited_series, backtests))):
        actual = split[1]

        if scaling:
            actual = scaler.inverse_transform(actual)
            predicted = scaler.inverse_transform(predicted)

        if log:
            actual = actual.map(_undo_log)
            predicted = predicted.map(_undo_log)

        rmse_losses.append(rmse(actual.slice_intersect(predicted), predicted))
        mae_losses.append(mae(actual.slice_intersect(predicted), predicted))

    mean_rmse = np.mean(rmse_losses)
    std_rmse = np.std(rmse_losses)
    mean_mae = np.mean(mae_losses)
    std_mae = np.std(mae_losses)
    print(f"rmse_mean = {mean_rmse}, rmse_std = {std_rmse}")
    print(f"mae_mean = {mean_mae}, mae_std = {std_mae}")
    return mean_rmse
    
def inverse_forecasts(scalers, forecasts, log=False):
    """Bring forecasts back to the original scale of their series.

    Every forecast is passed through the ``inverse_transform`` of the scaler at the
    same position; with ``log=True`` it is additionally mapped through
    ``exp(x) - 1``. Returns the list of transformed forecasts.
    """
    def _restore(scaler, forecast):
        restored = scaler.inverse_transform(forecast)
        if log:
            restored = restored.map(lambda x: (np.exp(x) - 1))
        return restored

    return [_restore(scaler, forecast) for scaler, forecast in zip(scalers, forecasts)]


def get_residuals(scalers, forecasts, series, log=False):
    """Residuals (forecast minus actual) of every series, on the original scale.

    Forecast and series are both inverse-transformed with the scaler at the same
    position (and mapped through ``exp(x) - 1`` when ``log=True``); the series is
    cut to the span of the forecast before subtracting. Returns one data frame
    (``pd_dataframe()`` of the difference) per series.
    """
    def _undo_log(value):
        return (np.exp(value) - 1)

    residuals = []
    for scaler, predicted, actual in tqdm(list(zip(scalers, forecasts, series))):
        predicted = scaler.inverse_transform(predicted)
        actual = scaler.inverse_transform(actual)

        if log:
            predicted = predicted.map(_undo_log)
            actual = actual.map(_undo_log)

        difference = predicted - actual.slice_intersect(predicted)
        residuals.append(difference.pd_dataframe())
    return residuals

def plot_prediction_forecasts(scalers, series, forecasts, slicing=True, log=False, scaling=False) -> None:
    """Plot every series together with its forecast, one figure per series.

    ``scaling=True`` inverse-transforms both with the series' scaler, ``log=True``
    maps both through ``exp(x) - 1``. With ``slicing=True`` only the part of the
    series that intersects the forecast is drawn, otherwise the whole series.
    """
    def _undo_log(value):
        return (np.exp(value) - 1)

    for scaler, actual, predicted in tqdm(list(zip(scalers, series, forecasts))):
        if scaling:
            actual = scaler.inverse_transform(actual)
            predicted = scaler.inverse_transform(predicted)

        if log:
            actual = actual.map(_undo_log)
            predicted = predicted.map(_undo_log)

        shown = actual.slice_intersect(predicted) if slicing else actual
        shown.plot(label='data')
        predicted.plot(lw=2, label='forecast')
        plt.legend()
        plt.show()

def plot_backtest_forecasts(scalers, splited_series, backtests, slicing=True, log=False, scaling=False) -> None:
    """Plot the validation part (element 1) of every split against its backtest.

    ``scaling=True`` inverse-transforms both with the series' scaler, ``log=True``
    maps both through ``exp(x) - 1``. With ``slicing=True`` only the part of the
    validation series that intersects the backtest is drawn. One figure per series.
    """
    def _undo_log(value):
        return (np.exp(value) - 1)

    for scaler, split, predicted in tqdm(list(zip(scalers, splited_series, backtests))):
        actual = split[1]

        if scaling:
            actual = scaler.inverse_transform(actual)
            predicted = scaler.inverse_transform(predicted)

        if log:
            actual = actual.map(_undo_log)
            predicted = predicted.map(_undo_log)

        shown = actual.slice_intersect(predicted) if slicing else actual
        shown.plot(label='data')
        predicted.plot(lw=2, label='forecast')
        plt.legend()
        plt.show()

In [None]:
# from sklearn.linear_model import RidgeCV
# from darts.models.forecasting.regression_ensemble_model import RegressionEnsembleModel
# from darts.utils.utils import ModelMode, SeasonalityMode
# from darts.models import Theta, RegressionModel, ExponentialSmoothing

# stat_models = [RegressionEnsembleModel(
#                                 forecasting_models=[
#                                                     ExponentialSmoothing(trend=ModelMode.ADDITIVE, 
#                                                                          seasonal=SeasonalityMode.NONE,
#                                                                          seasonal_periods=7,
#                                                                         ), 
#                                                     Theta(theta=2, 
#                                                           seasonality_period=7, 
#                                                           season_mode=SeasonalityMode.ADDITIVE
#                                                     )
#                                                    ], 
#                                 regression_train_n_points=int(len(scaled_series[0])*0.5*(1-start_split)),
#                                 regression_model=KernelRidge()
# ) 
#                 for model in range(len(scaled_series))]

### Model per series 

In [None]:
from darts.models.forecasting.gradient_boosted_model import LightGBMModel
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from darts.models import RegressionModel
# from darts.models.forecasting.regression_model import RegressionModel

from sklearn.linear_model import (RidgeCV, 
                                  #TweedieRegressor, 
                                  SGDRegressor, 
                                  LassoCV, 
                                  HuberRegressor, 
                                  ElasticNetCV,
                                  #BayesianRidge,
                                 )
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR, LinearSVR, NuSVR

# init the models: one independent RidgeCV-backed regression model per series
def _new_local_model():
    """Fresh, unfitted regression model using forecast_horizon for lags and output length."""
    return RegressionModel(
        lags=forecast_horizon,
        lags_past_covariates=forecast_horizon,
        lags_future_covariates=(int(forecast_horizon / 2), 7),
        output_chunk_length=forecast_horizon,
        model=RidgeCV(),  # (l1_ratio=0.4),
    )


models = [_new_local_model() for _ in range(len(scaled_series))]

# models = [LightGBMModel(lags=7, 
#                         lags_past_covariates=7,
#                         lags_future_covariates=(7,2),
#                         output_chunk_length=forecast_horizon
      
#                          )
#           for model in range(len(scaled_series))]

# Single model for the "global" branch (local = False); same settings as the
# per-series models.
global_model_kwargs = dict(
    lags=forecast_horizon,
    lags_past_covariates=forecast_horizon,
    lags_future_covariates=(int(forecast_horizon / 2), 7),
    output_chunk_length=forecast_horizon,
)
model = RegressionModel(model=RidgeCV(), **global_model_kwargs)


# models = [RegressionEnsembleModel(
#                                 forecasting_models=[RegressionModel(lags=14, 
#                                                                     lags_past_covariates=14,
#                                                                     lags_future_covariates=(7,2),
#                                                                     model=RidgeCV())
#                                                    #, LassoCV(), HuberRegressor()
#                                                    ], 
#                                 regression_train_n_points=int(len(scaled_series[0])*0.5*(1-start_split)),
#                                 regression_model=ElasticNet()
#                 ) 
#                 for model in range(len(scaled_series))
# ]
# estimators = [
#     ('lr', RidgeCV()),
#     ('svr', LassoCV()),
#     ('huber', HuberRegressor())
# ]

# models = [RegressionModel(lags=14, 
#                           lags_past_covariates=14,
#                           lags_future_covariates=(7,2),
#                           model=StackingRegressor(
#                                               estimators=estimators,
#                                               final_estimator=RandomForestRegressor(n_estimators=10)
#                             )
#                         )
#            for model in range(len(scaled_series))
# ]

# local = True  -> one model per series
# local = False -> a single model shared by all series
local = True

if local:
    # Fit on the training part of every split only. The backtest below starts at
    # `start_split` with retrain=False; fitting on the complete series (as was done
    # before) would score the models on data they had already been trained on.
    models = fit_local_models(models, splited_series, splited_past_covariates, splited_future_covariates)
    print("start_backtesting")
    backtests = backtest_local_models(models, scaled_series, past_covariates,
                                      future_covariates, forecast_horizon, start_split=start_split, verbose=True)
else:
    model = fit_global_model(model, splited_series, splited_past_covariates, splited_future_covariates)
    backtests = backtest_global_model(model, scaled_series, past_covariates,
                                      future_covariates, forecast_horizon, start_split=start_split, verbose=True)

In [None]:
loss = calculate_loss(scalers, splited_series, backtests)

In [None]:
plot_backtest_forecasts(scalers, splited_series, backtests)

## Submission 

In [None]:
# init the models 
from darts.models import RegressionModel
from sklearn.linear_model import RidgeCV, TweedieRegressor, ElasticNet, BayesianRidge, LassoCV

# Settings shared by every submission model: 28 target lags, 28 past-covariate lags,
# future-covariate lags (7, 2), and the whole forecast horizon produced in one output chunk.
submission_model_kwargs = dict(
    lags=28,
    lags_past_covariates=28,
    lags_future_covariates=(7, 2),
    output_chunk_length=forecast_horizon,
)

# One independent RidgeCV-backed RegressionModel per series, then fit them all.
models = [
    RegressionModel(model=RidgeCV(), **submission_model_kwargs)
    for _ in range(len(scaled_series))
]
models = full_fit_local_models(models, scaled_series, past_covariates, future_covariates)

In [None]:
# from datetime import timedelta
# predict_past_covariates = [covariate.slice(pd.Timestamp("2022-03-07")-timedelta(days=7), 
#                                            pd.Timestamp("2022-02-06")) for covariate in past_covariates]

In [None]:
# from datetime import timedelta
# predict_covariates = get_datetime_covariates(pd.Timestamp("2022-02-07")-timedelta(days=7), 
#                                              pd.Timestamp("2022-02-06")+timedelta(days=forecast_horizon+2))

In [None]:
# Covariates for 2022-04-30 .. 2022-05-27 (28 calendar days when both endpoints are counted).
# NOTE(review): presumably this is the window being forecast; confirm against the submission period.
predict_covariates = get_datetime_covariates(pd.Timestamp("2022-04-30"), pd.Timestamp("2022-05-27"))

In [None]:
# Wrap the forecast-window covariates as a daily TimeSeries, one copy per modelled series.
# The number of copies follows len(future_covariates) rather than a literal 100, because the
# list is paired index-by-index with future_covariates when the two are concatenated.
predict_future_covariates = [TimeSeries.from_dataframe(predict_covariates.reset_index(),
                                                       time_col='index',
                                                       fill_missing_dates=False,
                                                       freq='D'
                              ) for _ in range(len(future_covariates))]

In [None]:
from darts import concatenate
# Extend each historical future-covariate series along the time axis (axis=0)
# with the matching forecast-window covariates.
new_predict_future_covariates = [
    historical_cov.concatenate(predict_future_covariates[idx], axis=0)
    for idx, historical_cov in enumerate(future_covariates)
]

In [None]:
# Forecast `forecast_horizon` steps with the fitted per-series models.
# Past covariates are passed unchanged; future covariates are the versions
# extended with the forecast window (new_predict_future_covariates).
forecasts = predict_local_models(models=models,
                                 forecast_horizon=forecast_horizon,
                                 past_covariates=past_covariates, #predict_past_covariates, 
                                 future_covariates=new_predict_future_covariates)

In [None]:
# Plot the forecasts with slicing switched off and scaling switched on (both flags are interpreted by the helper).
plot_backtest_forecasts(scalers, splited_series, forecasts, slicing=False, scaling=True)

In [None]:
# Inverse-transform every forecast, convert it to a DataFrame and subtract 1 from each value.
# NOTE(review): the "- 1" presumably turns a gross ratio into a return; confirm against the scaling step.
scaled_forecast_dfs = list(
    map(
        lambda inverse_forecast: inverse_forecast.pd_dataframe() - 1,
        inverse_forecasts(scalers, forecasts),
    )
)

In [None]:
# Join the per-series forecast frames side by side (one column block per series).
return_forecasts = pd.concat(scaled_forecast_dfs, axis=1)

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before the first file of this submission is written.
os.makedirs("./results", exist_ok=True)
return_forecasts.reset_index().to_csv("./results/means_sub3.csv", index=False)

In [None]:
# Interactive line chart of every forecast column (DataFrame.iplot is provided by cufflinks).
return_forecasts.iplot()

In [None]:
# Backtest the submission models; the result is the input of the residual computation that follows.
backtests = backtest_local_models(models, scaled_series, past_covariates, 
                                  future_covariates, forecast_horizon, 
                                  start_split=start_split, verbose=True)

In [None]:
def get_residuals(scalers, forecasts, series, log=False, n_points=60):
    """Return forecast-minus-actual residuals for the tail of every series.

    Each ``(scaler, forecast, serie)`` triple is first mapped back through
    ``scaler.inverse_transform``. When ``log`` is true, both are additionally
    passed through ``exp(x) - 1``. The last ``n_points`` entries of the
    forecast and of the series are then subtracted.

    Args:
        scalers: fitted scalers, one per series, exposing ``inverse_transform``.
        forecasts: forecast series aligned with ``scalers``.
        series: actual series aligned with ``scalers``.
        log: apply ``exp(x) - 1`` to both sides after inverse scaling.
        n_points: number of trailing points to compare (default 60, the
            previously hard-coded value). Must be positive.

    Returns:
        A list with one DataFrame of residuals per input triple; iteration
        stops at the shortest of the three inputs.

    Raises:
        ValueError: if ``n_points`` is not positive (a ``[-0:]`` slice would
            silently select the whole series instead of an empty tail).
    """
    if n_points <= 0:
        raise ValueError("n_points must be a positive integer")

    def _undo_log(x):
        # Inverse of log(1 + x).
        return np.exp(x) - 1

    residuals = []
    for scaler, forecast, serie in zip(scalers, forecasts, series):
        forecast = scaler.inverse_transform(forecast)
        serie = scaler.inverse_transform(serie)
        if log:
            forecast = forecast.map(_undo_log)
            serie = serie.map(_undo_log)

        # Positional tail slices.
        # NOTE(review): assumes forecast and series end on the same time stamp; confirm.
        tail_diff = forecast[-n_points:] - serie[-n_points:]
        residuals.append(tail_diff.pd_dataframe())
    return residuals

In [None]:
# Residuals of the submission backtests, written side by side (one column block per series).
residuals = get_residuals(scalers, backtests, scaled_series)
residuals_wide = pd.concat(residuals, axis=1)
residuals_wide.reset_index().to_csv("./results/residuals_sub3.csv", index=False)

In [None]:
# scaled_series_dfs = [scaled_serie.pd_dataframe() for scaled_serie in inverse_forecasts(scalers, scaled_series)]
# df_serie = pd.concat(scaled_series_dfs, axis=1) #.reset_index() # .to_csv("./means.csv", index=False)

In [None]:
# returns = (df_serie.iloc[-1,:] - pd.concat(scaled_forecast_dfs, axis=1).iloc[-1,:])/pd.concat(scaled_forecast_dfs, axis=1).iloc[-1,:]

In [None]:
# Save only the last forecast row, i.e. the value at the final forecast step for every column.
return_forecasts.iloc[-1].reset_index().to_csv("./results/returns_sub3.csv", index=False)

In [None]:
# Display the last forecast row (the same values that are written to returns_sub3.csv).
return_forecasts.iloc[-1]

In [None]:
# Draw 100 joint scenarios from a multivariate normal centred on the last forecast row,
# using the covariance of the backtest residuals. Draws come from NumPy's global RNG.
residual_frame = pd.concat(residuals, axis=1)
cov = residual_frame.cov().values
mean = return_forecasts.iloc[-1].values
samples = np.random.multivariate_normal(
    mean=mean,
    cov=cov,
    size=100,
    check_valid='warn',
    tol=1e-8,
)

In [None]:
# Inspect the simulated draws for the first column of `samples` (the first entry of `mean`).
pd.DataFrame(data=samples).iloc[:,0]

In [None]:
group_names = ['strong sell', 'sell', 'hold', 'buy', 'strong buy']

# Quintile label of every simulated value, ranked WITHIN its own scenario.
# Each row of `samples` is one joint draw for all assets. The quintile an asset
# falls into is defined relative to the other assets in the same draw, so the
# cut is applied per row. Pooling all rows into one global cut would, for
# example, label every asset 'strong buy' in a scenario where the whole market
# rises, which is not a relative ranking.
# `out` stays a flat, row-major sequence of labels (scenario 0 first), so it can
# still be reshaped into a (scenario, asset) grid.
quintile_edges = [0, .2, .4, .6, .8, 1.]
out = np.concatenate([
    np.asarray(pd.qcut(scenario, q=quintile_edges, labels=group_names), dtype=object)
    for scenario in np.asarray(samples)
])

In [None]:
# Put the flat label sequence back on a 100 x 100 grid and preview the first two rows.
label_grid = np.array(list(out)).reshape(100, 100)
counts_df = pd.DataFrame(label_grid)
counts_df.head(2)

In [None]:
# Share of scenarios in which each column of counts_df received each label.
# * Series.value_counts replaces the deprecated top-level pd.value_counts.
# * Dividing by len(counts_df) keeps the shares correct if the number of
#   scenarios is ever changed from 100.
# * reindex (instead of column selection) keeps all five labels in the expected
#   order even when one of them never occurs, where selection would raise KeyError.
counts_df_norm = (
    counts_df
    .apply(lambda labels: labels.value_counts())
    .div(len(counts_df))
    .T
    .reindex(columns=['strong sell', 'sell', 'hold', 'buy', 'strong buy'])
    .fillna(0.0)
)

In [None]:
# Load the submission template (first CSV column becomes the index) and preview it.
df_submission = pd.read_csv("template.csv", index_col=0)
df_submission.head(5)

In [None]:
# Overwrite every column except the last one with the label shares.
# NOTE(review): the assignment is purely positional; it assumes the template's row order and its
# first columns match counts_df_norm's rows and label order. Confirm against template.csv.
df_submission.iloc[:,:-1] = counts_df_norm.values

In [None]:
# Write the filled-in template, index included, as the submission file.
df_submission.to_csv("./results/submission_sub3.csv")

In [None]:
# from metrics import portfolio_rps
# df = pd.DataFrame(df_stock_prc_returns.iloc[-4,:]).T
# idxs = df.T.round(4).apply(lambda x: (x.rank(ascending=True) // 20 +1).clip(upper=5), axis=0).astype(int)
# a = np.zeros((100,5))
# np.put_along_axis(a, idxs.values-1, 1, axis=1)
# probs = pd.read_csv("./results/pilot_submission.csv", index_col=0).iloc[:,:-1].values
# print(portfolio_rps(probs=probs, outcome=a))


from metrics import RPS_calculation, IR_calculation
# Asset price data, as provided by the M6 submission platform.
asset_data = pd.read_csv("assets_m6.csv")

# Submission to score (laid out like the platform's template).
# NOTE(review): this reads pilot_submission.csv, not the submission_sub3.csv written
# earlier in the notebook; confirm that this is the file meant to be evaluated.
submission_data = pd.read_csv("./results/pilot_submission.csv")

# Aliases under the names the evaluation helpers use.
hist_data, submission = asset_data, submission_data

# Run the evaluation; the 'RPS' entry is the cell's displayed output.
rps_result = RPS_calculation(hist_data=hist_data, submission=submission)
rps_result['RPS']

# IR_calculation(hist_data, submission)['IR']

In [None]:
from darts.models import RNNModel
from darts.models import TFTModel
from darts.models import TCNModel
from darts.models import TransformerModel
from darts.models import BlockRNNModel

from darts.utils.likelihood_models import (
    GaussianLikelihood,
    PoissonLikelihood,
    NegativeBinomialLikelihood,
    BernoulliLikelihood,
    GammaLikelihood,
    GumbelLikelihood,
    LaplaceLikelihood,
    BetaLikelihood,
    ExponentialLikelihood,
    DirichletLikelihood,
    GeometricLikelihood,
    CauchyLikelihood,
    ContinuousBernoulliLikelihood,
    HalfNormalLikelihood,
    LogNormalLikelihood,
    WeibullLikelihood,
    QuantileRegression,
)



# Two-layer block RNN with a quantile-regression likelihood, emitting the whole forecast horizon.
# input_chunk_length and quantiles are expected from earlier cells (not defined in this one).
brnn_no_cov = BlockRNNModel(input_chunk_length=input_chunk_length,
                            output_chunk_length=forecast_horizon,
                            n_rnn_layers=2,
                            likelihood=QuantileRegression(quantiles=quantiles),
                            random_state=42)

# Two-layer LSTM RNNModel with a Beta likelihood.
# NOTE(review): the name `deepar` is rebound to a TFTModel by the very next statement,
# so this instance is never fitted or used. Rename one of the two or drop this one.
deepar = RNNModel(input_chunk_length=38, 
                  output_chunk_length=19, 
                  n_rnn_layers=2, 
                  model="LSTM",
                  #hidden_dim=20,
                  dropout=0.1,
                  batch_size=256,
                  n_epochs=10,
                  optimizer_kwargs={"lr": 1e-3},
                  random_state=0,
                  likelihood=BetaLikelihood()#QuantileRegression(quantiles=quantiles),
                 )

# Temporal Fusion Transformer with a quantile-regression likelihood.
# NOTE(review): this is bound to `deepar`, replacing the RNNModel assigned just before;
# it is the model the later `deepar.fit(...)` loop trains.
# add_relative_index=False and add_encoders=None: no index or encoder features are generated
# by the model itself.
deepar = TFTModel(
        input_chunk_length=input_chunk_length,
        output_chunk_length=forecast_horizon,
        hidden_size=64,
        lstm_layers=1,
        num_attention_heads=4,
        dropout=0.1,
        batch_size=256,
        n_epochs=10,
        add_relative_index=False,
        add_encoders=None,
        likelihood=QuantileRegression(
            quantiles=quantiles
        ),  # QuantileRegression is set per default
        # loss_fn=MSELoss(),
        random_state=42,
)

# Temporal convolutional network with a quantile-regression likelihood.
# n_epochs is not passed here, unlike the other models in this cell, so the library default applies.
deeptcn = TCNModel(
    input_chunk_length=input_chunk_length,
    output_chunk_length=forecast_horizon,
    batch_size=256,
    kernel_size=2,
    num_filters=4,
    dilation_base=2,
    dropout=0.1,
    random_state=0,
    likelihood=QuantileRegression(
            quantiles=quantiles
        )
)

# Transformer forecaster (no likelihood argument, unlike the quantile-based models in this cell).
# Checkpoints are saved under the model name "transformer"; force_reset=True is passed,
# which per the darts docs resets any previously saved model with that name.
trans_model = TransformerModel(
    input_chunk_length=input_chunk_length,
    output_chunk_length=forecast_horizon,
    batch_size=256,
    n_epochs=10,
    model_name="transformer",
    nr_epochs_val_period=1,
    d_model=16,
    nhead=8,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dim_feedforward=128,
    dropout=0.1,
    activation="relu",
    random_state=42,
    save_checkpoints=True,
    force_reset=True,
)


from darts.models import NBEATSModel

# Encoder specification: cyclic month/day, raw dayofweek/day attributes and a relative
# position index, all on the "future" side, scaled with a Scaler.
# NOTE(review): no model in this cell receives this dict (the TFTModel sets add_encoders=None
# and the N-BEATS model has no add_encoders argument); confirm whether it is still needed.
encoders = {
    "cyclic": {"future": ["month","day"]},
    "datetime_attribute": {"future": ["dayofweek", "day"]},
    "position": {"future": ["relative"]},
    #"position": {"past": ["absolute"], "future": ["relative"]},
    #"custom": {"past": [lambda idx: (idx.day - 1950) / 50]},
    "transformer": Scaler(),
}


# Generic-architecture N-BEATS (10 stacks x 1 block, 4 layers of width 512) with a
# quantile-regression likelihood, saved under the model name "nbeats_run".
model_nbeats = NBEATSModel(
    input_chunk_length=input_chunk_length,
    output_chunk_length=forecast_horizon,
    generic_architecture=True,
    num_stacks=10,
    num_blocks=1,
    num_layers=4,
    layer_widths=512,
    n_epochs=10,
    nr_epochs_val_period=1,
    batch_size=256,
    model_name="nbeats_run",
    likelihood=QuantileRegression(quantiles=quantiles),
)

# Call fit on the single N-BEATS model once per series, one series after another.
# Element 0 of each split is used for training; element 1, trimmed with drop_after(0.5),
# is used for validation. Future covariates are unpacked but not passed to this model.
nbeats_splits = zip(splited_series, splited_past_covariates, splited_future_covariates)
for serie, past_cov, future_cov in nbeats_splits:
    model_nbeats.fit(
        series=serie[0],
        past_covariates=past_cov[0],
        val_series=serie[1].drop_after(0.5),
        val_past_covariates=past_cov[1].drop_after(0.5),
        verbose=True,
    )

# Call fit on the model bound to `deepar` once per series, using future covariates only.
# Element 0 of each split is used for training; element 1, trimmed with drop_after(0.5),
# is used for validation. Past covariates are unpacked but deliberately not passed.
deepar_splits = zip(splited_series, splited_past_covariates, splited_future_covariates)
for serie, past_cov, future_cov in deepar_splits:
    deepar.fit(
        series=serie[0],
        future_covariates=future_cov[0],
        val_series=serie[1].drop_after(0.5),
        val_future_covariates=future_cov[1].drop_after(0.5),
        verbose=True,
    )
    


### Naive models

In [None]:
from darts.models import NaiveEnsembleModel
from darts.models import NaiveSeasonal
from darts.models import NaiveDrift
from sklearn.linear_model import RidgeCV, TweedieRegressor, ElasticNet, BayesianRidge, LassoCV
# TweedieRegressor(power=2, alphas = np.logspace(-6, 6, 25))
from darts.models.forecasting.linear_regression_model import LinearRegressionModel


# One regression ensemble per series: a drift forecaster plus 14- and 7-step seasonal
# naive forecasters, combined by an ElasticNet.
# regression_train_n_points is half of the (1 - start_split) share of the first series' length.
naive_models = [
    RegressionEnsembleModel(
        forecasting_models=[NaiveDrift(), NaiveSeasonal(14), NaiveSeasonal(7)],
        regression_train_n_points=int(len(scaled_series[0])*0.5*(1-start_split)),
        regression_model=ElasticNet(),
    )
    for _ in range(len(scaled_series))
]

# Fit every ensemble on element 0 of its own split, with future covariates only.
# Note that the loop rebinds the notebook-level name `model` on every iteration.
naive_fit_inputs = zip(
    naive_models,
    splited_series,
    splited_past_covariates,
    splited_future_covariates,
)
for model, serie, past_cov, future_cov in naive_fit_inputs:
    model.fit(
        series=serie[0],
        future_covariates=future_cov[0],
    )

# Historical forecasts for every ensemble, starting halfway between start_split and the end
# of the series, retraining at each step and keeping only the last point of each forecast.
# Past covariates take part in the zip but are not passed to the models.
backtests = [
    model.historical_forecasts(
        series=serie,
        start=start_split+0.5*(1-start_split),
        future_covariates=future_cov,
        forecast_horizon=forecast_horizon,
        stride=1,
        retrain=True,
        last_points_only=True,
        verbose=True,
    )
    for model, serie, past_cov, future_cov in zip(
        naive_models, scaled_series, past_covariates, future_covariates
    )
]

In [None]:
from utils import print_error_metrics

calculate_loss(scalers, splited_series, backtests)

# Maximum number of leading points drawn per figure.
slice_size = 1000

# One figure per series: actuals vs. backtest, both mapped back from the scaled log space.
for scaler, serie_list, backtest, covs in zip(scalers, splited_series, backtests, past_covariates):
    # Undo the scaling, then the log transform (exp(x) - 1), on both curves.
    val_serie = scaler.inverse_transform(serie_list[1]).map(lambda x: (np.exp(x) - 1))
    backtest = scaler.inverse_transform(backtest).map(lambda x: (np.exp(x) - 1))

    # Actuals restricted to the time span of the backtest; computed once and reused
    # for both the plot and the error metrics.
    actual = val_serie.slice_intersect(backtest)

    actual[:slice_size].plot(label='data')
    backtest[:slice_size].plot(lw=2, label='forecast')
    #covs.slice_intersect(backtest)[:slice_size].plot(label='covariates')
    error = print_error_metrics(actual.values(), backtest.values())

    # Single title. The MAPE/RMSE title that used to be set first was replaced by this
    # one before the figure was shown, so it only cost two extra metric evaluations.
    plt.title(error)
    plt.legend()
    plt.show()
    
# print(
#     f"Mean Absolute Error:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
#     f"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}"
# )

In [None]:
# backtest the models 
from darts.utils.statistics import plot_hist

# Backtest `model` on every series (no retraining, stride 1, last point of each forecast)
# and draw a histogram of the raw, unreduced error scores for each series.
# NOTE(review): `future_covs` is not defined anywhere in this section (other cells use
# `future_covariates`), and `model` is whatever object that name was last bound to.
# Confirm both before running this cell on a fresh kernel.
for serie, past_cov, future_cov in list(zip(scaled_series, past_covariates, future_covs)):
    raw_errors = model.backtest(
        series=serie,
        past_covariates=past_cov,
        future_covariates=future_cov,
        start=start_split,
        forecast_horizon=forecast_horizon,
        stride=1,
        retrain=False,
        last_points_only=True,
        #metric=mape, 
        reduction=None, 
        verbose=True,
    )

    # Unit-width bins from 0 up to (but excluding) the largest error.
    plot_hist(
        raw_errors,
        bins=np.arange(0, max(raw_errors), 1),
        title="Individual backtest error scores (histogram)",
    )


In [None]:
def eval_model(model):
    """Fit ``model`` on the global ``train`` series and print its MAPE on ``val``.

    The model forecasts ``len(val)`` steps; the score comes from the ``mape``
    function in the notebook namespace. Nothing is returned.
    """
    model.fit(train)
    horizon = len(val)
    forecast = model.predict(horizon)
    score = mape(val, forecast)
    print("model {} obtains MAPE: {:.2f}%".format(model, score))
    
from darts.utils.statistics import plot_residuals_analysis, plot_hist, display_forecast
# 7-step historical forecasts of the N-BEATS model from 2017-09-01, every 5th step, without retraining.
# NOTE(review): `series` and `best_theta_model` are not defined in this section, and
# `display_forecast` is imported from darts.utils.statistics just above. Confirm that the
# installed darts exposes it there and that both names exist on a fresh kernel.
pred_series = model_nbeats.historical_forecasts(
    series,
    start=pd.Timestamp("20170901"),
    forecast_horizon=7,
    stride=5,
    retrain=False,
    verbose=True,
)
display_forecast(pred_series, series, "7 day", start_date=pd.Timestamp("20170901"))


# Residual diagnostics for a different model (`best_theta_model`) on the same series.
plot_residuals_analysis(best_theta_model.residuals(series))


In [None]:
# Backtest `best_theta_model` from the 60% point with a 3-step horizon, keeping every
# individual MAPE score (reduction=None) instead of an aggregate.
# NOTE(review): `best_theta_model`, `series` and `mape` are not defined in this section.
raw_errors = best_theta_model.backtest(
    series, start=0.6, forecast_horizon=3, metric=mape, reduction=None, verbose=True
)

from darts.utils.statistics import plot_hist

# Unit-width bins from 0 up to (but excluding) the largest error.
plot_hist(
    raw_errors,
    bins=np.arange(0, max(raw_errors), 1),
    title="Individual backtest error scores (histogram)",
)
