# Models For Prediction (Comparison)

In [71]:
#
import os
import numpy as np
import pandas as pd
from pandas import Timestamp
import seaborn as sns
import matplotlib.pyplot as plt
#
from talib import RSI, BBANDS, MACD, ATR
#
from zipline.api import order_target_percent, record, symbol, set_benchmark, get_open_orders
import zipline
import datetime
import pytz
from zipline.data.bundles import load
from trading_calendars import get_calendar
import pyfolio as pf
#
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost
#
from matplotlib.ticker import FuncFormatter
%load_ext zipline

The zipline extension is already loaded. To reload it, use:
  %reload_ext zipline


## Load and Prep Data

### General Factors

In [46]:
# Load the per-symbol factor table and index it by (Symbol, Date).
df = pd.read_csv("../data/signals/General_Factors.csv")
df['Date'] = pd.to_datetime(df['Date'])
# Build the index from the named columns instead of "the first two columns",
# so the index stays correct even if the CSV column order changes.
df = df.set_index(["Symbol", "Date"])
df = df.dropna()

# TODO: clarify what the lagged-return columns were intended for; they are
# dropped here together with the raw OHLC prices.
lag_cols = [c for c in df.columns if 'lag' in c]
df = df.drop(columns=lag_cols + ['Open', 'High', 'Low', 'Close'])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Volume,rsi,bb_high,bb_low,atr,macd,return_1d,return_2d,return_3d,return_4d,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ADA,2018-06-11,1.004216e+08,33.512455,0.216036,0.021987,-0.404601,-0.349924,0.011576,-0.062333,-0.044647,-0.038902,...,0,0,0,1,0,0,0,0,0,0
ADA,2018-06-12,6.949217e+07,30.063208,0.269702,-0.021351,-0.405151,-0.383543,-0.079092,-0.034822,-0.067953,-0.053377,...,0,0,0,1,0,0,0,0,0,0
ADA,2018-06-13,7.014928e+07,28.857432,0.291623,-0.010164,-0.409419,-0.413876,-0.032376,-0.056023,-0.034007,-0.059183,...,0,0,0,1,0,0,0,0,0,0
ADA,2018-06-14,1.431842e+08,35.174686,0.254994,0.077835,-0.408323,-0.416873,0.075489,0.020132,-0.014077,-0.007726,...,0,0,0,1,0,0,0,0,0,0
ADA,2018-06-15,8.906098e+07,33.089452,0.282673,0.061770,-0.420484,-0.426485,-0.046255,0.012789,-0.002495,-0.022222,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TFUEL,2022-03-16,2.340533e+07,47.924529,0.091847,0.112104,0.145477,-0.495777,0.020510,0.004641,0.011658,0.000422,...,1,0,0,0,0,0,0,0,0,0
TFUEL,2022-03-17,2.249906e+07,50.268468,0.071962,0.129365,0.125531,-0.423620,0.022745,0.021627,0.010640,0.014418,...,1,0,0,0,0,0,0,0,0,0
TFUEL,2022-03-18,2.254040e+07,52.445625,0.054721,0.145075,0.130008,-0.340504,0.021005,0.021875,0.021420,0.013221,...,1,0,0,0,0,0,0,0,0,0
TFUEL,2022-03-19,2.129223e+07,53.545190,0.050250,0.152796,0.115289,-0.260929,0.010329,0.015653,0.018012,0.018636,...,1,0,0,0,0,0,0,0,0,0


### NFT and Sentiment Data

In [47]:
# Daily NFT market statistics. The "Unnamed: 0" column (presumably a saved
# row index -- verify against the CSV) is discarded.
nft_df = pd.read_csv("../data/NFT_markettracker.csv")
nft_df = nft_df.drop(columns=["Unnamed: 0"])
nft_df["Date"] = pd.to_datetime(nft_df["Date"])
nft_df.head()

Unnamed: 0,Date,Number of sales,Sales USD,Average USD,Active market wallets,Primary Sales,Secondary sales,Primary sales USD,Secondary sales USD,Unique buyers,Unique sellers
0,2017-10-02,0,0.0,,0,0,0,0.0,0.0,0,0
1,2017-10-03,0,0.0,,0,0,0,0.0,0.0,0,0
2,2017-10-04,1,58.61,58.61,2,0,1,0.0,58.61,1,1
3,2017-10-05,0,0.0,,0,0,0,0.0,0.0,0,0
4,2017-10-06,0,0.0,,0,0,0,0.0,0.0,0,0


In [48]:
# Daily Reddit NFT sentiment aggregates.
sent_df = pd.read_csv("../data/signals/Reddit_NFT_Agg.csv")
# Parse this frame's own Date column. Borrowing nft_df["Date"] would only be
# correct if both files had exactly the same rows in exactly the same order,
# and would silently mislabel the sentiment rows otherwise.
sent_df["Date"] = pd.to_datetime(sent_df["Date"])
sent_df.head()

Unnamed: 0,Date,Polarity_Sum,Polarity_Mean,Sentiment
0,2017-10-02,0.0,0.0,0
1,2017-10-03,0.0,0.0,0
2,2017-10-04,0.0,0.0,0
3,2017-10-05,0.0,0.0,0
4,2017-10-06,0.0,0.0,0


Combine NFT and Sentiment with general factors and returns

In [49]:
# Date span of the factor table, taken from its first and last index entries.
factor_dates = df.index.get_level_values("Date")
initial_date, end_date = factor_dates[0], factor_dates[-1]

# Keep only the NFT / sentiment rows inside that span (both ends inclusive),
# then drop the Date column and renumber the rows from zero.
nft_in_range = nft_df["Date"].between(initial_date, end_date)
sent_in_range = sent_df["Date"].between(initial_date, end_date)
nft_df = nft_df[nft_in_range].drop(columns="Date").reset_index(drop=True)
sent_df = sent_df[sent_in_range].drop(columns="Date").reset_index(drop=True)

# Sanity check: one NFT row and one sentiment row per ADA observation.
n_ada_rows = len(df.loc["ADA", :])
len(nft_df) == n_ada_rows and len(sent_df) == n_ada_rows

True

In [50]:
# Stack the per-date NFT and sentiment rows once per symbol so they line up
# row-for-row with the (Symbol, Date) factor table.
# The repetition count is derived from the data instead of being hard-coded.
# NOTE(review): this positional stacking assumes every symbol covers the same
# dates in the same order -- confirm, or join on Date instead.
n_symbols = df.index.get_level_values("Symbol").nunique()
tiled_df1 = pd.DataFrame(np.tile(nft_df, (n_symbols, 1)), columns=nft_df.columns)
tiled_df2 = pd.DataFrame(np.tile(sent_df, (n_symbols, 1)), columns=sent_df.columns)

len(tiled_df1) == len(df) and len(tiled_df2) == len(df)

True

In [51]:
# Attach the tiled NFT and sentiment columns by position: raw values are
# assigned, so no index alignment takes place.
for extra in (tiled_df1, tiled_df2):
    df[extra.columns] = extra.values
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Volume,rsi,bb_high,bb_low,atr,macd,return_1d,return_2d,return_3d,return_4d,...,Active market wallets,Primary Sales,Secondary sales,Primary sales USD,Secondary sales USD,Unique buyers,Unique sellers,Polarity_Sum,Polarity_Mean,Sentiment
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ADA,2018-06-11,1.004216e+08,33.512455,0.216036,0.021987,-0.404601,-0.349924,0.011576,-0.062333,-0.044647,-0.038902,...,660.0,2864.0,517.0,20244.22,22356.97,492.0,280.0,0.00,0.00,0.0
ADA,2018-06-12,6.949217e+07,30.063208,0.269702,-0.021351,-0.405151,-0.383543,-0.079092,-0.034822,-0.067953,-0.053377,...,692.0,2296.0,607.0,18200.13,18781.24,504.0,294.0,0.00,0.00,0.0
ADA,2018-06-13,7.014928e+07,28.857432,0.291623,-0.010164,-0.409419,-0.413876,-0.032376,-0.056023,-0.034007,-0.059183,...,718.0,1671.0,601.0,23514.14,27754.78,527.0,283.0,0.00,0.00,0.0
ADA,2018-06-14,1.431842e+08,35.174686,0.254994,0.077835,-0.408323,-0.416873,0.075489,0.020132,-0.014077,-0.007726,...,566.0,1556.0,370.0,14447.34,31315.95,432.0,205.0,0.00,0.00,0.0
ADA,2018-06-15,8.906098e+07,33.089452,0.282673,0.061770,-0.420484,-0.426485,-0.046255,0.012789,-0.002495,-0.022222,...,641.0,4665.0,588.0,26308.40,24206.09,450.0,280.0,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TFUEL,2022-03-16,2.340533e+07,47.924529,0.091847,0.112104,0.145477,-0.495777,0.020510,0.004641,0.011658,0.000422,...,14188.0,2802.0,8912.0,2356432.46,52330293.33,7755.0,7031.0,75.51,0.11,1.0
TFUEL,2022-03-17,2.249906e+07,50.268468,0.071962,0.129365,0.125531,-0.423620,0.022745,0.021627,0.010640,0.014418,...,14414.0,3109.0,9892.0,3957555.73,98901027.48,8040.0,7481.0,77.64,0.13,1.0
TFUEL,2022-03-18,2.254040e+07,52.445625,0.054721,0.145075,0.130008,-0.340504,0.021005,0.021875,0.021420,0.013221,...,15830.0,3121.0,10624.0,5639611.59,80177042.29,8538.0,8322.0,68.56,0.12,1.0
TFUEL,2022-03-19,2.129223e+07,53.545190,0.050250,0.152796,0.115289,-0.260929,0.010329,0.015653,0.018012,0.018636,...,17014.0,4497.0,11419.0,5475732.05,82829439.82,9487.0,8720.0,58.61,0.10,1.0


In [52]:
# Show every column name on one line (the frame display truncates them).
print(list(df.columns))

['Volume', 'rsi', 'bb_high', 'bb_low', 'atr', 'macd', 'return_1d', 'return_2d', 'return_3d', 'return_4d', 'return_5d', 'return_6d', 'return_7d', 'return_14d', 'return_28d', 'target_1d', 'target_2d', 'target_3d', 'target_4d', 'target_5d', 'target_6d', 'target_7d', 'target_14d', 'target_28d', 'year_2018', 'year_2019', 'year_2020', 'year_2021', 'year_2022', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'Number of sales', 'Sales USD', 'Average USD', 'Active market wallets', 'Primary Sales', 'Secondary sales', 'Primary sales USD', 'Secondary sales USD', 'Unique buyers', 'Unique sellers', 'Polarity_Sum', 'Polarity_Mean', 'Sentiment']


In [63]:
# Machine Learning for Algorithmic Trading Chapter 6
class MultipleTimeSeriesCV:
    """Walk-backward time-series splitter for frames indexed by symbol and date.

    Yields ``(train_idx, test_idx)`` pairs of positional row indices. Folds are
    built from the most recent date backwards: fold ``i`` tests on a block of
    ``test_period_length`` dates and trains on the dates immediately before
    it, leaving a gap of ``lookahead - 1`` dates between the two so that
    overlapping outcomes are purged.

    Parameters
    ----------
    n_splits : int
        Number of folds to generate.
    train_period_length : int
        Base length of the training window in dates; the window actually
        spans ``train_period_length + lookahead - 1`` dates.
    test_period_length : int
        Number of dates in each test window.
    lookahead : int
        Forecast horizon in dates. Must be set to an integer before calling
        ``split``; the ``None`` default makes the index arithmetic fail.
    date_idx : str
        Name of the index level that holds the dates.
    shuffle : bool
        If True, the training indices of each fold are returned in random
        order (drawn from NumPy's global random state).
    """

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 date_idx='Date',
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle
        self.date_idx = date_idx

    def split(self, X, y=None, groups=None):
        """Generate positional train/test indices for each fold.

        ``X`` must carry an index level named ``self.date_idx``; ``y`` and
        ``groups`` are accepted for scikit-learn compatibility and ignored.
        Raises ``IndexError`` if ``X`` has too few distinct dates for the
        requested folds.
        """
        unique_dates = X.index.get_level_values(self.date_idx).unique()
        # Newest date first, so position 0 is the most recent observation.
        days = sorted(unique_dates, reverse=True)
        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        # Row positions of X paired with each row's date.
        dates = X.reset_index()[[self.date_idx]]
        for train_start, train_end, test_start, test_end in split_idx:
            # Windows are half-open: (older boundary, newer boundary].
            train_idx = dates[(dates[self.date_idx] > days[train_start])
                              & (dates[self.date_idx] <= days[train_end])].index.to_numpy()
            test_idx = dates[(dates[self.date_idx] > days[test_start])
                             & (dates[self.date_idx] <= days[test_end])].index.to_numpy()
            if self.shuffle:
                # Shuffling `list(train_idx)` in place only reorders a
                # throwaway copy; permute the array that is actually yielded.
                train_idx = np.random.permutation(train_idx)
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_splits

In [66]:
def plot_preds_scatter(df, ticker=None):
    """Draw a regression joint plot of predicted against actual values.

    Parameters
    ----------
    df : DataFrame with ``predicted`` and ``actuals`` columns.
    ticker : optional key selected on the first index level; when omitted,
        every row of ``df`` is plotted.
    """
    data = df if ticker is None else df.loc[pd.IndexSlice[ticker, :], :]

    grid = sns.jointplot(data=data,
                         x='predicted', y='actuals',
                         kind='reg',
                         robust=True, ci=None,
                         line_kws={'lw': 1, 'color': 'k'},
                         scatter_kws={'s': 1})

    joint_ax = grid.ax_joint
    # Both axes show percentages with one decimal; each gets its own formatter.
    for axis in (joint_ax.yaxis, joint_ax.xaxis):
        axis.set_major_formatter(
            FuncFormatter(lambda value, _: '{:.1%}'.format(value)))
    joint_ax.set_xlabel('Predicted')
    joint_ax.set_ylabel('Actuals')

In [67]:
def plot_ic_distribution(df, ax=None):
    """Plot the distribution of ``df.ic`` and annotate its mean and median.

    Parameters
    ----------
    df : object exposing an ``ic`` Series (e.g. a scores DataFrame).
    ax : optional matplotlib Axes to draw on; when omitted, the Axes returned
        by seaborn is used.
    """
    ic_values = df.ic
    if ax is None:
        ax = sns.distplot(ic_values)
    else:
        sns.distplot(ic_values, ax=ax)

    mean, median = ic_values.mean(), ic_values.median()
    # Dashed vertical reference line at zero.
    ax.axvline(0, lw=1, ls='--', c='k')
    # Summary text placed in axes coordinates (upper-left corner).
    ax.text(x=.05, y=.9,
            s=f'Mean: {mean:8.2f}\nMedian: {median:5.2f}',
            horizontalalignment='left',
            verticalalignment='center',
            transform=ax.transAxes)
    ax.set_xlabel('Information Coefficient')
    sns.despine()
    plt.tight_layout()

In [68]:
def plot_rolling_ic(df, window=21):
    """Plot rolling means of the ``ic`` and ``rmse`` columns on two stacked axes.

    Parameters
    ----------
    df : DataFrame with ``ic`` and ``rmse`` columns.
    window : int, default 21
        Number of rows in the rolling-mean window.

    Each panel also shows the full-sample mean of its series as a dashed
    line; the IC panel additionally has a solid line at zero.
    """
    fig, axes = plt.subplots(nrows=2, sharex=True, figsize=(14, 8))
    # Rows without a full window are NaN after rolling and are dropped.
    rolling_result = df.sort_index().rolling(window).mean().dropna()

    mean_ic = df.ic.mean()
    rolling_result.ic.plot(ax=axes[0],
                           title=f'Information Coefficient (Mean: {mean_ic:.2f})',
                           lw=1)
    axes[0].axhline(0, lw=.5, ls='-', color='k')
    axes[0].axhline(mean_ic, lw=1, ls='--', color='k')

    mean_rmse = df.rmse.mean()
    rolling_result.rmse.plot(ax=axes[1],
                             title=f'Root Mean Squared Error (Mean: {mean_rmse:.2%})',
                             lw=1,
                             ylim=(0, df.rmse.max()))
    axes[1].axhline(mean_rmse, lw=1, ls='--', color='k')
    sns.despine()
    plt.tight_layout()

## Models

Target and Features

In [64]:
# Targets: every column whose name contains "target".
y = df.filter(like='target')
# Features: everything else, with Volume excluded as well.
X = df.drop(columns=[*y.columns, 'Volume'])

Cross-validation indices

In [69]:
# NOTE(review): MONTH = 12 and YEAR = 12 * MONTH (= 144) do not look like day
# counts per month/year -- confirm the intended units before relying on
# n_splits.
MONTH = 12
YEAR = 12 * MONTH
train_period_length = 63
test_period_length = 10
lookahead = 1  # return period
n_splits = 3 * YEAR // test_period_length

# cross-validation
cv = MultipleTimeSeriesCV(
    n_splits=n_splits,
    train_period_length=train_period_length,
    test_period_length=test_period_length,
    lookahead=lookahead,
)

In [None]:
# Linear-regression baseline: fit on each CV training window, predict the
# matching test window, and score the predictions day by day.
target = f'target_{lookahead}d'
lr_predictions, lr_scores = [], []

model_names = ("ols", "rf", "xgb")
eval_dict = {}


def _daily_ic(day):
    """Spearman rank correlation of predicted vs. actuals for one day, times 100."""
    return spearmanr(day.predicted, day.actuals)[0] * 100


def _daily_rmse(day):
    """Root mean squared error of predicted vs. actuals for one day."""
    return np.sqrt(mean_squared_error(y_true=day.actuals, y_pred=day.predicted))


lr = LinearRegression()
for i, (train_idx, test_idx) in enumerate(cv.split(X), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[target].iloc[train_idx], y[target].iloc[test_idx]
    lr.fit(X=X_train, y=y_train)
    y_pred = lr.predict(X_test)

    # One row per test observation: realised target next to the prediction.
    preds = y_test.to_frame('actuals').assign(predicted=y_pred)
    preds_by_day = preds.groupby(level='Date')
    scores = pd.concat(
        [preds_by_day.apply(_daily_ic).to_frame('ic'),
         preds_by_day.apply(_daily_rmse).to_frame('rmse')],
        axis=1)

    lr_scores.append(scores)
    lr_predictions.append(preds)

# Collapse the per-fold results into single frames.
lr_scores = pd.concat(lr_scores)
lr_predictions = pd.concat(lr_predictions)