In [2]:
# Import libraries
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import networkx as nx

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Set paths
fig_path = './figures/'
data_path = './data/'

------------------------------------
**Get the companies**
------------------------------------

In [3]:
# Read data

mydateparser = lambda x: datetime.strptime(x, "%Y-%m-%d")
snp = pd.read_csv(data_path+"snp_allstocks_2015_2019.csv", index_col='Date', parse_dates=True, date_parser=mydateparser)
info = pd.read_csv(data_path+'snp_info.csv', index_col=0)

snp500 = pd.read_csv(data_path+"snp_500_2015_2019.csv", index_col='Date', parse_dates=True, date_parser=mydateparser)
snp500 = snp500['Adj Close']

# https://www.slickcharts.com/sp500
# https://datahub.io/core/s-and-p-500-companies-financials
detailed_info = pd.read_csv(data_path+'constituents-financials.csv', index_col=0)
stocks_sorted = detailed_info.sort_values('Market Cap', ascending=False)['Sector']

In [4]:
stocks_by_sector = {sector: [] for sector in info['GICS Sector'].unique()}

for stock in stocks_sorted.index[:160]:
    if stock in ['PCLN', 'TWX', 'AET', 'MON', 'PX', 'ESRX']:
        continue
    stock = 'BRK-B' if stock == 'BRK.B' else stock
    
    sector = info.set_index('Symbol').loc[stock]['GICS Sector']
    stocks_by_sector[sector].append(stock)
# stocks

In [5]:
# Select top three stocks (by market cap) within each GICS Sector
stocks = {}
for sector in stocks_by_sector:
    stocks[sector] = stocks_by_sector[sector][:3]

In [6]:
# use a subset of stocks instead for easy understanding
# order them alphabetically
tmp = {}
tmp['Communication Services'] = stocks['Communication Services']
tmp['Consumer Discretionary'] = stocks['Consumer Discretionary']
tmp['Financials'] = stocks['Financials']
tmp['Information Technology'] = stocks['Information Technology']

stocks = tmp
stocks

{'Communication Services': ['GOOGL', 'GOOG', 'FB'],
 'Consumer Discretionary': ['AMZN', 'HD', 'MCD'],
 'Financials': ['JPM', 'BAC', 'WFC'],
 'Information Technology': ['AAPL', 'MSFT', 'V']}

------------------------------------
**Get the data**
------------------------------------
- Standardize individual asset names (e.g. asset_1)

In [7]:
sector_col = []
stock_col = []
for sector in stocks:
    for i, stock in enumerate(stocks[sector]):
        stock_name = 'asset_' + str(i+1)
        sector_col.append(sector)
        stock_col.append(stock_name)

df = pd.DataFrame(columns=[sector_col, stock_col], index=snp.index)
df.columns.names = ['Sector', 'Asset']
# df.head()

In [8]:
for sector in stocks:
    for i, stock in enumerate(stocks[sector]):
        stock_name = 'asset_' + str(i+1)
        df.loc[:,(sector, stock_name)] = snp[stock]
df.head()

Sector,Communication Services,Communication Services,Communication Services,Consumer Discretionary,Consumer Discretionary,Consumer Discretionary,Financials,Financials,Financials,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2015-01-02,529.549988,521.937744,78.449997,308.519989,103.43,93.260002,62.490002,17.9,54.700001,109.330002,46.759998,66.254997
2015-01-05,519.460022,511.057617,77.190002,302.190002,101.260002,92.230003,60.549999,17.379999,53.200001,106.25,46.330002,64.792503
2015-01-06,506.640015,499.212799,76.150002,295.290009,100.949997,92.400002,58.98,16.860001,52.09,106.260002,45.650002,64.375
2015-01-07,505.149994,498.357513,76.150002,298.420013,104.410004,94.010002,59.07,16.940001,52.400002,107.75,46.23,65.237503
2015-01-08,506.910004,499.928864,78.18,300.459991,106.720001,94.360001,60.389999,17.290001,53.560001,111.889999,47.59,66.112503


------------------------------------
**Represent DataFrame as one column Multi-index data**
------------------------------------

In [9]:
df_tensor = df.stack([0, 1])
df_tensor.head()

Date        Sector                  Asset  
2015-01-02  Communication Services  asset_1    529.549988
                                    asset_2    521.937744
                                    asset_3     78.449997
            Consumer Discretionary  asset_1    308.519989
                                    asset_2    103.430000
dtype: float64

In [10]:
# DON'T RUN -- for visualization and help
# df_tensor.reorder_levels([1,2,0])[:30]
# df_tensor.reorder_levels([1,2,0])['Information Technology']['asset_1']

------------------------------------
**Tensorize the data**
------------------------------------

In [11]:
# from hottbox.core import Tensor
# from hottbox.pdtools import pd_to_tensor

In [12]:
# tensor = pd_to_tensor(df_tensor)

In [13]:
# tensor

------------------------------------
**Create training samples and labels, i.e. training data, $\mathcal{D}$**
------------------------------------
Create the dataset with tensor samples and vector labels: 
- Samples are stock prices of each stock windowed at a particular window length up to the prediction date, $t$. Samples are represented as third order tensors, with modes being `['Sector', 'Asset', 'Date']`. The samples are of size (#Sectors, #Assets in each sector, Window length). 
- Labels are stock prices of each stock at the prediction date, $t$. They are represented as vectors.

In [14]:
from hottbox.core import Tensor
from hottbox.pdtools import pd_to_tensor

In [15]:
w = 22 # window length

## Creating Samples
X = []
for i in range(w, len(snp.index)):
    df_t = df.iloc[i-w:i]
    df_t = df_t.stack([0, 1])
    X_t = pd_to_tensor(df_t)
    
    X.append(X_t)

In [16]:
if sum(snp.index != snp500.index) == 0:
    print('Dates are the same, we are good to go!')

Dates are the same, we are good to go!


In [17]:
y = []

## Labels are all assets' price movements
for i in range(w, len(snp.index)):
    price_old = np.array(df.iloc[i-1])
    price_now = np.array(df.iloc[i])
    diff = price_now - price_old
    y_t = np.sign(np.sign(diff) + 1.)
    
    y.append(y_t)


## Labels are the S&P 500 Index movements
# for i in range(w, len(snp500.index)):
#     price_old = snp500[i-1]
#     price_now = snp500[i]
#     diff = price_now - price_old
#     y_t = str(np.sign(diff))
    
#     y.append(y_t)


## Labels are the individual asset price movements
# for i in range(w, len(snp.index)):
#     price_old = np.array(df.iloc[i-1])
#     price_now = np.array(df.iloc[i])
#     diff = price_now - price_old
#     y_t = np.sign(diff)
    
#     y.append(y_t[3])
    
    
y = np.array(y)


# y = y[:,3]

------------------------------------
**Tensor Ensemble Learning**
------------------------------------

In [18]:
from hottbox.core import Tensor, TensorTKD
from hottbox.algorithms.decomposition import HOSVD, HOOI
# from hottbox.utils.generation import residual_tensor
from hottbox.algorithms.classification import TelVI

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [19]:
# Helper function to display scores of multiclass classification
def print_scores(scores):
    result = []
    for score in scores:
        s = "{:.2f}%".format(score * 100)
        result.append(s)
        
    print('[' + ", ".join(result) + ']')

In [20]:
class GRTEL:
    def __init__(self, base_clf, n_classes=1, probability=False, verbose=False):
        self.probability = probability
        self.verbose = verbose
        self.n_classes = n_classes
        self.base_clf = base_clf
        
        self.models = [TelVI(base_clf=self.base_clf, probability=self.probability, verbose=self.verbose) for _ in range(self.n_classes)]
        
        
    def fit(self, X, y):
        if n_classes == 1:
            self.models[0].fit(X, y)
        elif n_classes > 1:
            for i in range(self.n_classes):
                self.models[i].fit(X, y[:,i])
        
        
    def score(self, X, y):
        if n_classes == 1:
            return self.models[0].score(X, y)
        elif n_classes > 1:
            scores = []
            for i in range(self.n_classes):
                scores.append(self.models[i].score(X, y[:, i]))
            return scores
    
    
    def grid_search(self, X, y, search_params):
        if n_classes == 1:
            self.models[0].grid_search(X, y, search_params)
        elif n_classes > 1:
            for i in range(self.n_classes):
                self.models[i].grid_search(X, y[:,i], search_params)

In [21]:
# Represent each sample in Tucker form and store it in a list
algo = HOSVD()
rank = (3,2,2)
X_tk = [algo.decompose(sample, rank=rank) for sample in X]

# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_tk, y, test_size=0.33, random_state=42)

# Initialise classifier
R = np.sum(rank) # number of base classifiers required
n_classes = 1 if y.ndim == 1 else y.shape[1]

grtel = GRTEL(base_clf=[SVC(gamma='auto') for _ in range(R)],
              n_classes=n_classes,
              probability=True,
              verbose=False)


# Train classifer
grtel.fit(X_train, y_train)

In [22]:
score = grtel.score(X_train, y_train)
score = score if n_classes > 1 else [score]
print("Classification accuracy (Train):")
print_scores(score)
print()

score = grtel.score(X_test, y_test)
score = score if n_classes > 1 else [score]
print("Classification accuracy (Test):")
print_scores(score)
print()

Classification accuracy (Train):
[52.35%, 52.66%, 55.08%, 53.87%, 53.72%, 54.78%, 51.59%, 52.50%, 49.01%, 52.35%, 53.41%, 55.08%]

Classification accuracy (Test):
[54.15%, 53.23%, 50.77%, 58.46%, 51.08%, 53.85%, 50.15%, 51.38%, 49.85%, 50.15%, 54.46%, 56.00%]



In [23]:
search_params = [dict(gamma=[0.001, 0.01, 1, 10], C=[0.1, 1, 10, 100]) for _ in range(R)]

print("\n\tPerforming grid search for each base classifer")
grtel.grid_search(X_train, y_train, search_params)

print("\n\tTrain base classifiers with optimal hyperparameters")
grtel.fit(X_train, y_train)


	Performing grid search for each base classifer

	Train base classifiers with optimal hyperparameters


In [24]:
score = grtel.score(X_train, y_train)
score = score if n_classes > 1 else [score]
print("Classification accuracy (Train):")
print_scores(score)
print()

score = grtel.score(X_test, y_test)
score = score if n_classes > 1 else [score]
print("Classification accuracy (Test):")
print_scores(score)
print()

Classification accuracy (Train):
[52.35%, 52.66%, 55.08%, 53.87%, 53.72%, 54.78%, 51.59%, 52.50%, 49.01%, 52.35%, 53.41%, 55.08%]

Classification accuracy (Test):
[54.15%, 53.23%, 50.77%, 58.46%, 51.08%, 53.85%, 50.15%, 51.38%, 49.85%, 50.15%, 54.46%, 56.00%]

