# OLS / Regression / Prediction

In [1]:
#
import os
import numpy as np
import pandas as pd
from pandas import Timestamp
import seaborn as sns
import matplotlib.pyplot as plt
#
from talib import RSI, BBANDS, MACD, ATR
#
from zipline.api import order_target_percent, record, symbol, set_benchmark, get_open_orders
import zipline
import datetime
import pytz
from zipline.data.bundles import load
from trading_calendars import get_calendar
import pyfolio as pf
#
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
#
%load_ext zipline

## Load Data

In [13]:
# Read the factor table and key every row by its (Symbol, Date) pair. The two
# key columns move into a MultiIndex and no longer appear among the data columns.
# 'Symbol' and 'Date' are the first two columns of the file, as the displayed
# index below shows.
df = (
    pd.read_csv("../data/signals/General_Factors.csv")
    .set_index(["Symbol", "Date"])
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,rsi,bb_high,bb_low,atr,macd,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ADA,2017-10-02,0.024969,0.030088,0.019969,0.025808,8.310030e+07,,,,,,...,0,0,0,0,0,0,0,1,0,0
ADA,2017-10-03,0.025671,0.027172,0.020575,0.020827,2.971484e+07,,,,,,...,0,0,0,0,0,0,0,1,0,0
ADA,2017-10-04,0.020828,0.022962,0.020828,0.021973,1.243013e+07,,,,,,...,0,0,0,0,0,0,0,1,0,0
ADA,2017-10-05,0.022035,0.022245,0.020839,0.021512,7.055960e+06,,,,,,...,0,0,0,0,0,0,0,1,0,0
ADA,2017-10-06,0.021286,0.021576,0.018071,0.018481,6.207887e+06,,,,,,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TFUEL,2022-04-13,0.164733,0.172376,0.163245,0.169795,2.146501e+07,43.725938,0.210931,0.072361,0.157398,-0.253717,...,0,1,0,0,0,0,0,0,0,0
TFUEL,2022-04-14,0.169287,0.170751,0.160710,0.163625,2.503801e+07,41.011059,0.235502,0.054108,0.143556,-0.338698,...,0,1,0,0,0,0,0,0,0,0
TFUEL,2022-04-15,0.163616,0.167434,0.163308,0.167315,2.003002e+07,43.279151,0.223333,0.084948,0.103987,-0.378491,...,0,1,0,0,0,0,0,0,0,0
TFUEL,2022-04-16,0.167315,0.167884,0.162870,0.165885,1.721647e+07,42.595503,0.227485,0.091741,0.071255,-0.415208,...,0,1,0,0,0,0,0,0,0,0


In [None]:
# TODO: clarify what the lagged-return features are used for; until that is
# settled, every column whose name contains 'lag' is removed from df.
# The candidate columns are read from df itself: `uni_prices` is not defined
# anywhere earlier in this notebook, so referencing it fails on a fresh kernel.
# Reading from df also makes the cell safe to re-run (nothing left to drop).
df = df.drop(columns=[c for c in df.columns if 'lag' in c])

In [None]:
# Machine Learning for Algorithmic Trading Chapter 6
class MultipleTimeSeriesCV:
    """Time-ordered cross-validation splitter for panel data.

    Generates ``(train_idx, test_idx)`` pairs of positional row indices.
    The unique dates found in the index level named ``date_idx`` are sorted
    newest-first; split 0 tests on the most recent ``test_period_length``
    dates, split 1 on the ``test_period_length`` dates before those, and so
    on. Each training window covers the dates immediately preceding its test
    window, so that outcomes overlapping the test window are purged.

    Only the level named ``date_idx`` is read from ``X.index``; any other
    levels (e.g. a symbol level) are ignored by the splitter.

    Parameters
    ----------
    n_splits : int
        Number of (train, test) pairs to generate.
    train_period_length : int
        Length of each training window, in unique dates.
    test_period_length : int
        Length of each test window, in unique dates.
    lookahead : int or None
        Offset, in unique dates, used when placing the training window
        relative to the test window. ``None`` is treated as 1.
    date_idx : str
        Name of the index level holding the dates.
    shuffle : bool
        If True, the training indices of every split are returned in random
        order (drawn from NumPy's global random state).
    """

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 date_idx='date',
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle
        self.date_idx = date_idx

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` arrays of positional row indices.

        Parameters
        ----------
        X : pandas object whose index has a level named ``self.date_idx``.
        y, groups : ignored; accepted for scikit-learn API compatibility.

        Raises
        ------
        IndexError
            When a split needs more history than ``X`` has unique dates. The
            error surfaces when that split is requested, so earlier splits
            are still yielded.
        """
        # The constructor default is None, which cannot take part in the
        # index arithmetic below; fall back to a one-period lookahead.
        lookahead = 1 if self.lookahead is None else self.lookahead

        # Read the date level directly instead of copying X via reset_index().
        date_values = X.index.get_level_values(self.date_idx)
        # Newest date first: position k means "k unique dates before the end".
        days = sorted(date_values.unique(), reverse=True)

        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + lookahead - 1
            train_start_idx = train_end_idx + self.train_length + lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        for train_start, train_end, test_start, test_end in split_idx:
            # train_start is the largest of the four positions, so checking
            # it alone is enough to detect a window that runs off the data.
            if train_start >= len(days):
                raise IndexError(
                    'not enough unique dates for the requested splits: '
                    'need more than {} but only {} are available'.format(
                        train_start, len(days)))

            # Windows are half-open: (start date, end date].
            train_mask = ((date_values > days[train_start])
                          & (date_values <= days[train_end]))
            test_mask = ((date_values > days[test_start])
                         & (date_values <= days[test_end]))
            train_idx = np.flatnonzero(np.asarray(train_mask, dtype=bool))
            test_idx = np.flatnonzero(np.asarray(test_mask, dtype=bool))

            if self.shuffle:
                # permutation() returns a shuffled copy; shuffling a temporary
                # list (as the previous code did) discarded the result.
                train_idx = np.random.permutation(train_idx)
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits; all arguments are ignored."""
        return self.n_splits

In [None]:
# Bind X to the same DataFrame object as df (plain assignment, no copy is made).
X = df

In [None]:
# NOTE(review): MONTH = 12 and YEAR = 12 * MONTH make YEAR equal to 144. If these
# are meant to be period lengths in days, confirm the intended values.
MONTH = 12
YEAR = 12 * MONTH
train_period_length = 63
test_period_length = 10
# Number of back-to-back test windows of test_period_length that fit in 3 * YEAR.
n_splits = int(3 * YEAR / test_period_length)
lookahead = 1

# cross-validation
# The factor frame loaded earlier is indexed by levels named 'Symbol' and
# 'Date', so the splitter must be given the date level's actual name; with its
# default of 'date' the level lookup inside split() would fail.
cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          test_period_length=test_period_length,
                          lookahead=lookahead,
                          train_period_length=train_period_length,
                          date_idx='Date')