In [None]:
#!pip install lightgbm
#!pip install pandas
#!pip install yfinance
#!pip install tqdm
#!pip install pip install scikit-optimize
#!pip install matplotlib
#!sudo apt-get install gcc -y
#!pip install bt

In [5]:
import lightgbm as lgb
import pandas as pd
from pandas import Timestamp as ts
import yfinance as yf
from tqdm import tqdm
import numpy as np
import skopt 
import skopt.plots
from skopt.callbacks import CheckpointSaver
from IPython.display import clear_output
import matplotlib.pyplot as plt
import bt
import os
from google.cloud import storage
from io import StringIO
# CONSTANTS
HYPERPARAMETER_OPTIMIZATION = False

In [6]:
# Downloadig S&P500 information from wikipedia and Yahoo Finance
companyMetadata = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

#if not os.path.isfile('history.csv'):
tickers = yf.Tickers(list(companyMetadata['Symbol']))
history = tickers.history(period="20y")
history.to_csv('history.csv')


[*********************100%***********************]  505 of 505 completed

2 Failed downloads:
- BRK.B: No data found, symbol may be delisted
- BF.B: No data found for this date range, symbol may be delisted


In [7]:
cleanHistory = history[['Open','Close','High','Low','Volume']] \
        .reorder_levels([1,0],axis=1)\
        .sort_index(axis=1)\
        .dropna(axis= 1, how= 'all')\
        .sort_index()
tickers = cleanHistory.columns.get_level_values(0).unique()

In [None]:
def extractFeatures(companyHistory, days):
    features = {}#pd.DataFrame()
    calculateLabel = lambda series: series/series.shift(1)-1
    calculateRatioFeature = lambda series1,series2,days: series1.shift(1+days)/series2.shift(1)

    features['label'] = calculateLabel(companyHistory['Open'])
    for price in ['Open','Close','High','Low']:
        for day in range(days):
            features[f'{price}-{day}/Close'] = calculateRatioFeature(companyHistory[price],companyHistory['Close'],day+1)
    for day in range(days):
        features[f'Volume-{day}/Volume'] = calculateRatioFeature(companyHistory['Volume'],companyHistory['Volume'],day+1)
    return pd.DataFrame.from_dict(features).dropna(axis=0,how='any')

DAYS = 20

features = []
for ticker in tqdm(tickers):
    features.append(extractFeatures(cleanHistory[ticker],DAYS))
features = pd.concat(features,  keys= tickers).reorder_levels([1,0]).sort_index(level=0)
train = features.loc[:ts('2018-01-1')]
valid = features.loc[ts('2018-01-1'):ts('2020-01-1')]
test = features.loc[ts('2020-01-1'):]
del features
bucket = storage.Client().bucket('sp500-bucket')
bucket.blob('train.csv').upload_from_string(train.to_csv(), 'text/csv')
bucket.blob('valid.csv').upload_from_string(valid.to_csv(), 'text/csv')
bucket.blob('test.csv' ).upload_from_string(test.to_csv() , 'text/csv')

## Downloading From Google Cloud Platform

In [None]:
bucket = storage.Client().bucket('sp500-bucket')
train = pd.read_csv(StringIO(bucket.blob('train.csv').download_as_string()))
valid = pd.read_csv(StringIO(bucket.blob('valid.csv').download_as_string()))
test  = pd.read_csv(StringIO(bucket.blob('test.csv' ).download_as_string()))

# Hyperparameter Optimization

In [7]:
search_space = [
skopt.space.Integer(100,500, name = 'num_leaves'),
skopt.space.Integer(8, 50, name ='max_depth'),
skopt.space.Real(0.001,0.5, name = 'learning_rate'),
skopt.space.Real(0.01,1,name = 'colsample_bytree') ,
skopt.space.Real(0.01,1, name = 'subsample'),
skopt.space.Real(0, 1000, name = 'reg_alpha'),
skopt.space.Real(0, 1000, name = 'reg_lambda'), 
# min_split_gain = skopt.space.Real(name = 'min_split_gain')
# min_child_weight = skopt.space.Real(name = 'min_child_weight') 
# min_child_samples = skopt.space.Integer(name = 'min_child_samples') 
# subsample_freq = skopt.space.Integer(name = 'subsample_freq') 
]

@skopt.utils.use_named_args(search_space)
def objective(**params):
    model = lgb.LGBMRegressor(n_estimators = 1000, **params)
    eval_result = {}
    model.fit(train.drop('label', axis = 1),train['label'], 
        eval_set = (valid.drop('label', axis = 1),valid['label']),
        #verbose=False,
        callbacks= [lgb.record_evaluation(eval_result),
            lgb.early_stopping(10, verbose=True)])
    return min(eval_result['valid_0']['l2'])


class ConvergencePlotCallback(object):
    def __init__(self, figsize=(12,8)):
        self.fig = plt.figure(figsize=figsize)

    def __call__(self, res):
        clear_output(wait=True)
        skopt.plots.plot_convergence(res, yscale="log")
        plt.show()
plot_callback = ConvergencePlotCallback(figsize=(12,8))
checkpoint_callback = CheckpointSaver("checkpoint.pkl")

if HYPERPARAMETER_OPTIMIZATION:
    results = skopt.gp_minimize(objective,                  # the function to minimize
                  search_space,      # the bounds on each dimension of x
                  n_calls=1000,         # the number of evaluations of f
                  callback=[checkpoint_callback,plot_callback],
                  noise=0.1**2)

    skopt.plots.plot_convergence(results)

# Model Taining

## Load and Fit Optimal Model 

In [8]:
if os.path.isfile('checkpoint.pkl'):
    optimalParams = dict(zip ([i.name for i in skopt.load('checkpoint.pkl')['space']],
                          skopt.load('checkpoint.pkl')['x']))
else:
    print("Running Base Model")
    optimalParams = {}
model = lgb.LGBMRegressor(**optimalParams)
eval_result = {}
model.fit(train.drop('label', axis = 1),train['label'], 
    eval_set = (valid.drop('label', axis = 1),valid['label']),
    callbacks= [lgb.record_evaluation(eval_result),
        lgb.log_evaluation(),
        lgb.early_stopping(10, verbose=True)])

[1]	valid_0's l2: 0.000262269
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.000235695
[3]	valid_0's l2: 0.000215279
[4]	valid_0's l2: 0.000199517
[5]	valid_0's l2: 0.000189404
[6]	valid_0's l2: 0.00017958
[7]	valid_0's l2: 0.000171994
[8]	valid_0's l2: 0.000166203
[9]	valid_0's l2: 0.000161706
[10]	valid_0's l2: 0.000158243
[11]	valid_0's l2: 0.000155625
[12]	valid_0's l2: 0.000153608
[13]	valid_0's l2: 0.000152017
[14]	valid_0's l2: 0.000150789
[15]	valid_0's l2: 0.000149849
[16]	valid_0's l2: 0.0001491
[17]	valid_0's l2: 0.000148556
[18]	valid_0's l2: 0.000148117
[19]	valid_0's l2: 0.000147786
[20]	valid_0's l2: 0.000147553
[21]	valid_0's l2: 0.000147443
[22]	valid_0's l2: 0.000147248
[23]	valid_0's l2: 0.000147149
[24]	valid_0's l2: 0.000147049
[25]	valid_0's l2: 0.000146966
[26]	valid_0's l2: 0.000146907
[27]	valid_0's l2: 0.000146909
[28]	valid_0's l2: 0.000146875
[29]	valid_0's l2: 0.000146844
[30]	valid_0's l2: 0.000146819
[31]	valid_0's l2: 0.

LGBMRegressor(colsample_bytree=0.8711716053585946,
              learning_rate=0.12123094384928863, max_depth=14, num_leaves=452,
              reg_alpha=6.335223311452995, reg_lambda=640.0933782281086,
              subsample=0.4551545343587453)

## Pedictions

In [9]:
score = pd.DataFrame(model.predict(test.drop('label', axis = 1)),index = test.index)
score.index.rename('Ticker',level=1, inplace = True)
score = pd.pivot_table(score, values=0, index='Date', columns='Ticker')
close = cleanHistory.reorder_levels([1,0],axis=1)['Close']

# Backtesting

In [38]:
class TopkDropoutStrategy(bt.Algo):
    def __init__(self, score, ):
        self.score = score
        self.holding = None
        self.weights = None
        
    def topkDropoutStrategy(score , holding, k = 100, drop = 1):
        if holding is None:
            selected = list(score.index[:k]) #Score is already sorted
        else:
            holdingSorted = score[holding].sort_values(ascending=False)
            continueToBeHeld = holdingSorted[:-drop]
            rest = score.drop(continueToBeHeld.index)\
                        .sort_values(ascending=False)
            bought = rest.index[:drop]
            selected = list(continueToBeHeld.index)+list(bought)
        return selected
    
    def __call__(self, target):
        if target.now in self.score.index:
            score_sorted = self.score.loc[target.now].sort_values(ascending=False)
            selected = TopkDropoutStrategy.topkDropoutStrategy(score_sorted,self.holding)
                    
            target.temp['selected'] = selected
            target.temp['holding'] = self.holding
            self.holding = selected
            # return True because we want to keep on moving down the stack of algs
            return True

class Weight(bt.Algo):
    def __init__(self):
        self.weights = None
        
    def setWeights(holding, selected, past_weights):
        weights = {}
        numTickers = len(selected)
        if past_weights is None:
            for ticker in selected:
                weights[ticker] = 1/numTickers
        else:
            weights = {}
            bought = set(selected)-set(holding)
            sold = set(holding)-set(selected)
            
            soldCapital = 0
            for ticker in sold:
                soldCapital += past_weights.pop(ticker)
            buyCapitalPerSecurity = soldCapital/len(sold)
            
            for ticker in selected:
                if ticker in holding:
                    weights[ticker] = past_weights[ticker]
                else:
                    weights[ticker] = buyCapitalPerSecurity
        return weights

    def __call__(self, target):
        holding, selected = target.temp['holding'], target.temp['selected']
        weights = Weight.setWeights(holding, selected, self.weights)
        target.temp['weights'] = weights
        self.weights = weights
        return True
    
# first we create the Strategy
s = bt.Strategy('topk', [TopkDropoutStrategy(score),
                         Weight(),
                         bt.algos.Rebalance(),
                         bt.algos.RunMonthly(),#bt.algos.RunQuarterly
                         bt.algos.WeighEqually(),
                         bt.algos.Rebalance(),
                         #bt.algos.PrintTempData(),
                        ])

benchmark = bt.Strategy('benchmark',[bt.algos.SelectAll(),
                                    bt.algos.WeighEqually(),
                                    bt.algos.Rebalance()])

# now we create the Backtest
def commissions(q,p):
    return abs(q)*p*0.02
t = bt.Backtest(s, close.loc[ts('2020-01-1'):], 
                commissions= commissions,
                integer_positions=False,
                )
b = bt.Backtest(benchmark, bt.get('spy').loc[ts('2020-01-1'):])

# and let's run it!
res = bt.run(t,b)

In [None]:
%matplotlib inline
res.plot()

<AxesSubplot:title={'center':'Equity Progression'}>

In [None]:
res.display()