In [19]:
# Import libraries
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import networkx as nx

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Set paths
fig_path = './figures/'
data_path = './data/'

------------------------------------
**Get the companies**
------------------------------------

In [20]:
# Read data

mydateparser = lambda x: datetime.strptime(x, "%Y-%m-%d")
snp = pd.read_csv(data_path+"snp_allstocks_2015_2019.csv", index_col='Date', parse_dates=True, date_parser=mydateparser)
info = pd.read_csv(data_path+'snp_info.csv', index_col=0)

# https://www.slickcharts.com/sp500
# https://datahub.io/core/s-and-p-500-companies-financials
detailed_info = pd.read_csv(data_path+'constituents-financials.csv', index_col=0)
stocks_sorted = detailed_info.sort_values('Market Cap', ascending=False)['Sector']

In [3]:
stocks_by_sector = {sector: [] for sector in info['GICS Sector'].unique()}

for stock in stocks_sorted.index[:160]:
    if stock in ['PCLN', 'TWX', 'AET', 'MON', 'PX', 'ESRX']:
        continue
    stock = 'BRK-B' if stock == 'BRK.B' else stock
    
    sector = info.set_index('Symbol').loc[stock]['GICS Sector']
    stocks_by_sector[sector].append(stock)
# stocks

In [4]:
# Select top three stocks (by market cap) within each GICS Sector
stocks = {}
for sector in stocks_by_sector:
    stocks[sector] = stocks_by_sector[sector][:3]

In [5]:
# use a subset of stocks instead for easy understanding
# order them alphabetically
tmp = {}
tmp['Communication Services'] = stocks['Communication Services']
tmp['Consumer Discretionary'] = stocks['Consumer Discretionary']
tmp['Financials'] = stocks['Financials']
tmp['Information Technology'] = stocks['Information Technology']

stocks = tmp
stocks

{'Communication Services': ['GOOGL', 'GOOG', 'FB'],
 'Consumer Discretionary': ['AMZN', 'HD', 'MCD'],
 'Financials': ['JPM', 'BAC', 'WFC'],
 'Information Technology': ['AAPL', 'MSFT', 'V']}

------------------------------------
**Get the data**
------------------------------------
- Standardize individual asset names (e.g. asset_1)

In [6]:
sector_col = []
stock_col = []
for sector in stocks:
    for i, stock in enumerate(stocks[sector]):
        stock_name = 'asset_' + str(i+1)
        sector_col.append(sector)
        stock_col.append(stock_name)

df = pd.DataFrame(columns=[sector_col, stock_col], index=snp.index)
df.columns.names = ['Sector', 'Asset']
# df.head()

In [7]:
for sector in stocks:
    for i, stock in enumerate(stocks[sector]):
        stock_name = 'asset_' + str(i+1)
        df.loc[:,(sector, stock_name)] = snp[stock]
df

Sector,Communication Services,Communication Services,Communication Services,Consumer Discretionary,Consumer Discretionary,Consumer Discretionary,Financials,Financials,Financials,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2015-01-02,529.549988,521.937744,78.449997,308.519989,103.430000,93.260002,62.490002,17.900000,54.700001,109.330002,46.759998,66.254997
2015-01-05,519.460022,511.057617,77.190002,302.190002,101.260002,92.230003,60.549999,17.379999,53.200001,106.250000,46.330002,64.792503
2015-01-06,506.640015,499.212799,76.150002,295.290009,100.949997,92.400002,58.980000,16.860001,52.090000,106.260002,45.650002,64.375000
2015-01-07,505.149994,498.357513,76.150002,298.420013,104.410004,94.010002,59.070000,16.940001,52.400002,107.750000,46.230000,65.237503
2015-01-08,506.910004,499.928864,78.180000,300.459991,106.720001,94.360001,60.389999,17.290001,53.560001,111.889999,47.590000,66.112503
...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-24,984.669983,976.219971,124.059998,1343.959961,158.139999,170.279999,92.139999,22.730000,43.599998,146.830002,94.129997,121.730003
2018-12-26,1047.849976,1039.459961,134.179993,1470.900024,168.279999,174.029999,95.959999,24.110001,45.590000,157.169998,100.559998,130.229996
2018-12-27,1052.900024,1043.880005,134.520004,1461.640015,170.320007,175.710007,97.040001,24.370001,45.529999,156.149994,101.180000,132.009995
2018-12-28,1046.680054,1037.079956,133.199997,1478.020020,170.220001,175.559998,96.830002,24.389999,45.779999,156.229996,100.389999,130.940002


------------------------------------
**Represent DataFrame as one column Multi-index data**
------------------------------------

In [8]:
df_tensor = df.stack([0, 1])
df_tensor

Date        Sector                  Asset  
2015-01-02  Communication Services  asset_1    529.549988
                                    asset_2    521.937744
                                    asset_3     78.449997
            Consumer Discretionary  asset_1    308.519989
                                    asset_2    103.430000
                                                  ...    
2018-12-31  Financials              asset_2     24.639999
                                    asset_3     46.080002
            Information Technology  asset_1    157.740005
                                    asset_2    101.570000
                                    asset_3    131.940002
Length: 12072, dtype: float64

In [9]:
# DON'T RUN -- for visualization and help
# df_tensor.reorder_levels([1,2,0])[:30]
# df_tensor.reorder_levels([1,2,0])['Information Technology']['asset_1']

------------------------------------
**Tensorize the data**
------------------------------------

In [10]:
# from hottbox.core import Tensor
# from hottbox.pdtools import pd_to_tensor

In [11]:
# tensor = pd_to_tensor(df_tensor)

In [12]:
# tensor

------------------------------------
**Create training samples and labels, i.e. training data, $\mathcal{D}$**
------------------------------------
Create the dataset with tensor samples and vector labels: 
- Samples are stock prices of each stock windowed at a particular window length up to the prediction date, $t$. Samples are represented as third order tensors, with modes being `['Sector', 'Asset', 'Date']`. The samples are of size (#Sectors, #Assets in each sector, Window length). 
- Labels are stock prices of each stock at the prediction date, $t$. They are represented as vectors.

In [13]:
from hottbox.core import Tensor
from hottbox.pdtools import pd_to_tensor

In [14]:
w = 22 # window length

X = []
y = []
for i in range(w, len(snp.index)):
    df_t = df.iloc[i-w:i]
    df_t = df_t.stack([0, 1])
    X_t = pd_to_tensor(df_t)
    
    price_old = np.array(df.iloc[i-1])
    price_now = np.array(df.iloc[i])
    diff = price_now - price_old
    y_t = np.sign(diff)
    
    X.append(X_t)
    y.append(y_t)

------------------------------------
**Tensor Ensemble Learning**
------------------------------------

In [15]:
from hottbox.core import Tensor, TensorTKD
from hottbox.algorithms.decomposition import HOSVD, HOOI
# from hottbox.utils.generation import residual_tensor
from hottbox.algorithms.classification import TelVI

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [16]:
# Represent each sample in Tucker form and store it in a list
algo = HOSVD()
rank = (5,2,2)
X_tk = [algo.decompose(sample, rank=rank) for sample in X]

# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_tk, y, test_size=0.5)

In [17]:
# Initialise classifier
R = np.sum(rank) # number of base classifiers required

telvi = TelVI(base_clf=[SVC(gamma='auto') for _ in range(R)],
             probability = True,
             verbose=True)


In [18]:
# Train classifer
telvi.fit(X_train, y_train)

Base classifier #0 (SVC): Learning model parameters


ValueError: y should be a 1d array, got an array of shape (492, 12) instead.