In [1]:
# Import libraries
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import networkx as nx

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Set paths (relative to the notebook's working directory)
fig_path = './figures/'  # presumably the output folder for figures; not used in the cells visible here
data_path = './data/'  # prefix prepended to every CSV file name passed to pd.read_csv below

------------------------------------
**Get the companies**
------------------------------------

In [2]:
# Read data


def mydateparser(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``."""
    return datetime.strptime(x, "%Y-%m-%d")


# Prices of the individual stocks, indexed by date
snp = pd.read_csv(
    data_path+"snp_allstocks_2015_2019.csv",
    index_col='Date',
    parse_dates=True,
    date_parser=mydateparser,
)
# Per-company information (provides the 'Symbol' and 'GICS Sector' columns used later)
info = pd.read_csv(data_path+'snp_info.csv', index_col=0)

# Index data; only the 'Adj Close' column is kept
snp500 = pd.read_csv(
    data_path+"snp_500_2015_2019.csv",
    index_col='Date',
    parse_dates=True,
    date_parser=mydateparser,
)['Adj Close']

# https://www.slickcharts.com/sp500
# https://datahub.io/core/s-and-p-500-companies-financials
detailed_info = pd.read_csv(data_path+'constituents-financials.csv', index_col=0)
# 'Sector' column of the constituents, ordered from largest to smallest 'Market Cap'
stocks_sorted = detailed_info.sort_values(by='Market Cap', ascending=False).loc[:, 'Sector']

In [3]:
# Group the 160 largest companies (by market cap) under their GICS sector.
# Insertion order is preserved, so each sector's list stays sorted by market cap.
stocks_by_sector = {sector: [] for sector in info['GICS Sector'].unique()}

# Build the Symbol -> GICS Sector lookup once instead of re-indexing `info`
# on every loop iteration.
sector_by_symbol = info.set_index('Symbol')['GICS Sector']

# Tickers that are skipped outright.
# NOTE(review): presumably these are missing from the price/info data -- confirm.
excluded_symbols = {'PCLN', 'TWX', 'AET', 'MON', 'PX', 'ESRX'}

for stock in stocks_sorted.index[:160]:
    if stock in excluded_symbols:
        continue
    # The two data sources spell this ticker differently
    stock = 'BRK-B' if stock == 'BRK.B' else stock

    sector = sector_by_symbol.loc[stock]
    stocks_by_sector[sector].append(stock)
# stocks

In [4]:
# Keep the first three entries of every sector list
# (the lists are ordered by market cap, so these are the three largest)
stocks = {
    sector: members[:3]
    for sector, members in stocks_by_sector.items()
}

In [5]:
# Restrict the analysis to four sectors to keep things easy to follow.
# The sectors are listed (and therefore stored) in alphabetical order.
tmp = {
    sector: stocks[sector]
    for sector in (
        'Communication Services',
        'Consumer Discretionary',
        'Financials',
        'Information Technology',
    )
}

stocks = tmp
stocks

{'Communication Services': ['GOOGL', 'GOOG', 'FB'],
 'Consumer Discretionary': ['AMZN', 'HD', 'MCD'],
 'Financials': ['JPM', 'BAC', 'WFC'],
 'Information Technology': ['AAPL', 'MSFT', 'V']}

------------------------------------
**Get the data**
------------------------------------
- Standardize individual asset names (e.g. asset_1)

In [6]:
# First column level: the sector name, repeated once per stock in that sector
sector_col = [sector for sector in stocks for _ in stocks[sector]]
# Second column level: anonymised names asset_1, asset_2, ... restarting in every sector
stock_col = [
    'asset_' + str(position)
    for sector in stocks
    for position in range(1, len(stocks[sector]) + 1)
]

# Empty frame sharing the dates of `snp`, with (Sector, Asset) MultiIndex columns
df = pd.DataFrame(columns=[sector_col, stock_col], index=snp.index)
df.columns.names = ['Sector', 'Asset']
# df.head()

In [7]:
# Copy each selected stock's price series into its (sector, asset_k) column
for sector, members in stocks.items():
    for position, stock in enumerate(members, start=1):
        df.loc[:,(sector, 'asset_' + str(position))] = snp[stock]
df.head()

Sector,Communication Services,Communication Services,Communication Services,Consumer Discretionary,Consumer Discretionary,Consumer Discretionary,Financials,Financials,Financials,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3,asset_1,asset_2,asset_3
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2015-01-02,529.549988,521.937744,78.449997,308.519989,103.43,93.260002,62.490002,17.9,54.700001,109.330002,46.759998,66.254997
2015-01-05,519.460022,511.057617,77.190002,302.190002,101.260002,92.230003,60.549999,17.379999,53.200001,106.25,46.330002,64.792503
2015-01-06,506.640015,499.212799,76.150002,295.290009,100.949997,92.400002,58.98,16.860001,52.09,106.260002,45.650002,64.375
2015-01-07,505.149994,498.357513,76.150002,298.420013,104.410004,94.010002,59.07,16.940001,52.400002,107.75,46.23,65.237503
2015-01-08,506.910004,499.928864,78.18,300.459991,106.720001,94.360001,60.389999,17.290001,53.560001,111.889999,47.59,66.112503


------------------------------------
**Represent DataFrame as one column Multi-index data**
------------------------------------

In [8]:
# Move both column levels (Sector, Asset) into the row index,
# giving a single Series indexed by (Date, Sector, Asset)
df_tensor = df.stack(level=[0, 1])
df_tensor.head()

Date        Sector                  Asset  
2015-01-02  Communication Services  asset_1    529.549988
                                    asset_2    521.937744
                                    asset_3     78.449997
            Consumer Discretionary  asset_1    308.519989
                                    asset_2    103.430000
dtype: float64

In [9]:
# DON'T RUN -- for visualization and help
# df_tensor.reorder_levels([1,2,0])[:30]
# df_tensor.reorder_levels([1,2,0])['Information Technology']['asset_1']

------------------------------------
**Tensorize the data**
------------------------------------

In [10]:
# from hottbox.core import Tensor
# from hottbox.pdtools import pd_to_tensor

In [11]:
# tensor = pd_to_tensor(df_tensor)

In [12]:
# tensor

------------------------------------
**Helper Functions**
------------------------------------

In [13]:
def downturn_confidence(actual, predicted):
    """Evaluate the confidence in the predicted downturns.

    A downturn is a prediction equal to 0. Only positions where ``predicted``
    is 0 are considered.

    Parameters
    ----------
    actual : indexable sequence of true labels.
    predicted : indexable sequence of predicted labels; indexed at every
        position ``0 .. len(actual) - 1``.

    Returns
    -------
    None when no downturn was predicted; otherwise the tuple
    ``(n, x, x / n)`` where ``n`` is the number of predicted downturns and
    ``x`` is how many of them match ``actual``.
    """
    flagged = [k for k in range(len(actual)) if predicted[k] == 0]
    if not flagged:
        return None

    n_flagged = len(flagged)
    n_correct = sum(1 for k in flagged if predicted[k] == actual[k])
    return (n_flagged, n_correct, n_correct / n_flagged)


def print_scores(scores):
    """Display the scores of a multiclass classification.

    Each score is multiplied by 100 and rendered with two decimals followed
    by ``%``; the results are printed on one line as ``[a%, b%, ...]``.

    Parameters
    ----------
    scores : iterable of numbers (fractions, e.g. 0.5 for 50%).
    """
    formatted = ["{:.2f}%".format(value * 100) for value in scores]
    print('[' + ", ".join(formatted) + ']')
    
def print_1_percentage(y):
    """Print the percentage of labels equal to 1, one value per label column.

    The number of label columns is taken from the shape of ``y`` itself
    rather than from a notebook-level ``n_classes`` variable, so the helper
    works regardless of which cells have been executed.

    Parameters
    ----------
    y : array-like
        Either a 1-D vector of labels (one value is printed) or a 2-D array
        of shape (n_samples, n_label_columns) (one value per column).
    """
    labels = np.asarray(y)
    # Fraction of ones along the sample axis: a scalar for 1-D input,
    # one entry per column for 2-D input.
    fractions = np.mean(labels == 1., axis=0)
    # atleast_1d turns the scalar case into a one-element sequence
    print_scores(list(np.atleast_1d(fractions)))

------------------------------------
**Create training samples and labels, i.e. training data, $\mathcal{D}$**
------------------------------------
Create the dataset with tensor samples and vector labels: 
- Samples are stock prices of each stock windowed at a particular window length up to the prediction date, $t$. Samples are represented as third order tensors, with modes being `['Sector', 'Asset', 'Date']`. The samples are of size (#Sectors, #Assets in each sector, Window length). 
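- Labels are the binary price movements of each stock: 1 if the price at the prediction date, $t$, is not lower than the price on the last day of the sample window, and 0 otherwise. They are represented as vectors.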

In [14]:
from hottbox.core import Tensor
from hottbox.pdtools import pd_to_tensor

In [15]:
w = 66  # window length (number of rows in every sample)
# Predict the price 22 rows into the future; one is subtracted so the value
# can be used directly as an index offset
future_days = 22 - 1

## Creating Samples
# One tensor per window df.iloc[end-w:end]; the last `future_days` rows never
# end a window, so that every sample still has a label further ahead
X = [
    pd_to_tensor(df.iloc[end-w:end].stack([0, 1]))
    for end in range(w, len(snp.index) - future_days)
]

In [16]:
# Sanity check: the stock prices and the index series must cover the same dates.
# Index.equals also copes with indexes of different length (an element-wise
# `!=` would raise there), and a mismatch is reported instead of passing silently.
if snp.index.equals(snp500.index):
    print('Dates are the same, we are good to go!')
else:
    print('WARNING: the dates of `snp` and `snp500` differ!')

Dates are the same, we are good to go!


In [17]:
## Labels are all assets' price movements
# For every target row t: compare the prices at t with the prices
# future_days + 1 rows earlier. sign(sign(diff) + 1) maps a non-negative
# change to 1 and a negative change to 0.
y = np.array([
    np.sign(np.sign(np.array(df.iloc[t]) - np.array(df.iloc[t-future_days-1])) + 1.)
    for t in range(w+future_days, len(snp.index))
])


## Labels are the S&P 500 Index movements
# for i in range(w, len(snp500.index)):
#     price_old = snp500[i-1]
#     price_now = snp500[i]
#     diff = price_now - price_old
#     y_t = str(np.sign(diff))
    
#     y.append(y_t)

------------------------------------
**Tensor Ensemble Learning**
------------------------------------

In [22]:
from hottbox.core import Tensor, TensorTKD
from hottbox.algorithms.decomposition import HOSVD, HOOI
# from hottbox.utils.generation import residual_tensor
from hottbox.algorithms.classification import TelVI

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [23]:
class GRTEL:
    """Collection of ``TelVI`` models, one per label column.

    With ``n_classes == 1`` a single model is trained on ``y`` as given.
    With ``n_classes > 1`` model ``i`` is trained on column ``y[:, i]``.

    Parameters
    ----------
    base_clfs : sequence
        ``base_clfs[i]`` is passed as ``base_clf`` to the ``TelVI`` model for
        label column ``i``; must hold at least ``n_classes`` entries.
    n_classes : int, default 1
        Number of label columns (and therefore of ``TelVI`` models).
    probability : bool, default False
        Forwarded unchanged to every ``TelVI`` model.
    verbose : bool, default False
        Forwarded unchanged to every ``TelVI`` model.
    """

    def __init__(self, base_clfs, n_classes=1, probability=False, verbose=False):
        self.probability = probability
        self.verbose = verbose
        self.n_classes = n_classes
        self.models = [
            TelVI(base_clf=base_clfs[i], probability=self.probability, verbose=self.verbose)
            for i in range(self.n_classes)
        ]

    def fit(self, X, y):
        """Fit every model; prints the column index as progress when there are several."""
        # Use the instance's own n_classes -- not a notebook-level global --
        # so the object behaves the same wherever it is used.
        if self.n_classes == 1:
            self.models[0].fit(X, y)
        elif self.n_classes > 1:
            for i in range(self.n_classes):
                print(i, end=" - ")
                self.models[i].fit(X, y[:, i])
            print()

    def score(self, X, y):
        """Return the score of the single model, or a list with one score per label column.

        Returns None when ``n_classes`` is smaller than 1.
        """
        if self.n_classes == 1:
            return self.models[0].score(X, y)
        elif self.n_classes > 1:
            return [self.models[i].score(X, y[:, i]) for i in range(self.n_classes)]

    def grid_search(self, X, y, search_params):
        """Run ``grid_search`` on every model with the same ``search_params``."""
        if self.n_classes == 1:
            self.models[0].grid_search(X, y, search_params)
        elif self.n_classes > 1:
            for i in range(self.n_classes):
                print(i, end=" - ")
                self.models[i].grid_search(X, y[:, i], search_params)
            print()

    def predict(self, X):
        """Return a list holding each model's predictions for ``X`` (also when there is one model)."""
        return [self.models[i].predict(X) for i in range(self.n_classes)]
        

In [24]:
# Decompose every sample into Tucker form at a fixed multilinear rank
algo = HOSVD()
rank = (6,2,2)
X_tk = []
for sample in X:
    X_tk.append(algo.decompose(sample, rank=rank))

# Split into train and test set
# NOTE(review): train_test_split shuffles by default and consecutive samples
# share most of their window, so train and test samples overlap in time --
# confirm this is intended (a chronological split is kept commented out below).
X_train, X_test, y_train, y_test = train_test_split(
    X_tk, y, test_size=0.33, random_state=42
)

# test_size = 0.33
# k = int(len(X_tk) * (1. - 0.33))

# X_train, X_test = X_tk[:k], X_tk[k:]
# y_train, y_test = y[:k], y[k:]


# Initialise classifier
R = np.sum(rank) # number of base classifiers required per class
n_classes = 1 if y.ndim == 1 else y.shape[1]

# One independent list of R SVCs for every label column
base_clfs = [
    [SVC(gamma='auto') for _ in range(R)]
    for _ in range(n_classes)
]

grtel = GRTEL(
    base_clfs=base_clfs,
    n_classes=n_classes,
    probability=True,
    verbose=False,
)


# Train classifier
grtel.fit(X_train, y_train)


# Scores (a single score is wrapped in a list so print_scores can iterate over it)
score = grtel.score(X_train, y_train)
if n_classes <= 1:
    score = [score]
print("\nClassification accuracy (Train):")
print_scores(score)
print()

score = grtel.score(X_test, y_test)
if n_classes <= 1:
    score = [score]
print("Classification accuracy (Test):")
print_scores(score)
print()

# Share of 1 labels in the test set, printed next to the accuracies for comparison
print("Percentage of 1s (Test):")
print_1_percentage(y_test)
print()

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 

Classification accuracy (Train):
[58.37%, 58.21%, 63.58%, 72.85%, 60.98%, 68.29%, 62.44%, 59.02%, 53.33%, 60.81%, 68.29%, 70.08%]

Classification accuracy (Test):
[63.49%, 62.83%, 63.82%, 75.66%, 60.53%, 62.17%, 62.50%, 60.20%, 54.93%, 56.91%, 71.05%, 67.11%]

Percentage of 1s (Test):
[63.16%, 62.17%, 63.82%, 75.66%, 60.53%, 62.17%, 62.50%, 59.87%, 54.93%, 56.25%, 71.05%, 67.11%]



In [25]:
# One hyper-parameter grid per base classifier (all R grids are identical)
search_params = [
    {'gamma': [0.001, 0.01, 1, 10], 'C': [0.1, 1, 10, 100]}
    for _ in range(R)
]

print("\tPerforming grid search for each base classifer")
grtel.grid_search(X_train, y_train, search_params)

print("\tTrain base classifiers with optimal hyperparameters")
grtel.fit(X_train, y_train)
print()

# Scores after tuning (a single score is wrapped in a list for print_scores)
score = grtel.score(X_train, y_train)
if n_classes <= 1:
    score = [score]
print("Classification accuracy (Train):")
print_scores(score)
print()

score = grtel.score(X_test, y_test)
if n_classes <= 1:
    score = [score]
print("Classification accuracy (Test):")
print_scores(score)
print()

print("Percentage of 1s (Test):")
print_1_percentage(y_test)
print()

	Performing grid search for each base classifer
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 
	Train base classifiers with optimal hyperparameters
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 

Classification accuracy (Train):
[93.17%, 100.00%, 98.54%, 79.02%, 99.84%, 100.00%, 92.52%, 100.00%, 100.00%, 100.00%, 90.89%, 100.00%]

Classification accuracy (Test):
[70.39%, 74.01%, 74.67%, 77.63%, 69.08%, 63.16%, 69.74%, 76.32%, 84.54%, 73.36%, 74.01%, 68.09%]

Percentage of 1s (Test):
[63.16%, 62.17%, 63.82%, 75.66%, 60.53%, 62.17%, 62.50%, 59.87%, 54.93%, 56.25%, 71.05%, 67.11%]



In [26]:
# Downturn confidence on the test set, one entry per label column.
# Iterating over grtel.models (instead of a hard-coded 12) keeps this cell
# valid when the number of selected assets changes.
results = [
    downturn_confidence(y_test[:, i], model.predict(X_test))
    for i, model in enumerate(grtel.models)
]
results

[(38, 30, 0.7894736842105263),
 (52, 43, 0.8269230769230769),
 (41, 36, 0.8780487804878049),
 (4, 4, 1.0),
 (28, 27, 0.9642857142857143),
 (1, 1, 1.0),
 (28, 26, 0.9285714285714286),
 (51, 44, 0.8627450980392157),
 (81, 72, 0.8888888888888888),
 (71, 62, 0.8732394366197183),
 (19, 19, 1.0),
 (1, 1, 1.0)]

In [28]:
# df_zeros = pd.DataFrame([grtel.models[i].predict(X_test) for i in range(n_classes)]).T

# print(len(df_zeros[df_zeros[0] == 0]))
# df_zeros[df_zeros[0] == 0]

In [29]:
# Downturn confidence on the training set, one entry per label column.
# Iterating over grtel.models (instead of a hard-coded 12) keeps this cell
# valid when the number of selected assets changes.
results = [
    downturn_confidence(y_train[:, i], model.predict(X_train))
    for i, model in enumerate(grtel.models)
]
results

[(173, 171, 0.9884393063583815),
 (232, 229, 0.9870689655172413),
 (183, 182, 0.994535519125683),
 (30, 30, 1.0),
 (185, 185, 1.0),
 (186, 186, 1.0),
 (150, 149, 0.9933333333333333),
 (161, 161, 1.0),
 (284, 284, 1.0),
 (251, 249, 0.9920318725099602),
 (105, 105, 1.0),
 (184, 184, 1.0)]

In [30]:
# df_zeros = pd.DataFrame([grtel.models[i].predict(X_train) for i in range(n_classes)]).T

# print(len(df_zeros[df_zeros[0] == 0]))
# df_zeros[df_zeros[0] == 0]