In [1]:
# Allows imports from other packages in the project
import sys
import os

# The notebook is expected to be started from its own directory, so the
# project root is the parent of the current working directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))

# Guarded so that re-running this cell does not pile up duplicate entries
# in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd

In [3]:
snp_info_path = "../data/snp_info.csv"

# The first CSV column is read as a temporary index and is then replaced by
# the `Symbol` column, so `info` ends up keyed by symbol.
snp_info_raw = pd.read_csv(snp_info_path, index_col=0)
info = snp_info_raw.set_index("Symbol")

In [4]:
# One column per sector, one row per asset position within that sector
stocks_by_sector_path = "../data/stocks_by_sector.csv"
stocks_by_sector_df = pd.read_csv(stocks_by_sector_path)
stocks_by_sector_df

Unnamed: 0,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
0,T,AMZN,WMT,MRO,BAC,PFE,CSX,AAPL
1,GOOGL,F,KO,KMI,WFC,MRK,GE,AMD
2,GOOG,GM,KR,XOM,C,GILD,DAL,NVDA
3,CMCSA,EBAY,PG,HAL,RF,BMY,AAL,MU
4,VZ,SBUX,MDLZ,WMB,JPM,BSX,CPRT,MSFT
5,NFLX,NKE,MO,COP,KEY,ABT,LUV,INTC
6,DIS,M,MNST,SLB,MS,CVS,FAST,CSCO
7,IPG,MGM,COTY,DVN,HBAN,ABBV,CAT,HPE
8,EA,TJX,WBA,CVX,SCHW,JNJ,JCI,ORCL
9,NWSA,TGT,PM,MPC,SYF,MDT,UAL,AMAT


In [5]:
# Map every ticker to its (row, column) position in `stocks_by_sector_df`.
# Should a ticker occur more than once, its last position wins.
ticker_to_idx = {}
for row_pos, row_tickers in enumerate(stocks_by_sector_df.values):
    for col_pos, ticker in enumerate(row_tickers):
        ticker_to_idx[ticker] = (row_pos, col_pos)

## Get Samples and Labels

In [6]:
# The sample file carries a three-row column header and dates in its first
# column, which becomes the (parsed) index.
samples_path = "../data/samples_sector.csv"
samples = pd.read_csv(samples_path, header=[0, 1, 2], index_col=0, parse_dates=True)
samples.head()

Sector,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,...,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_9,asset_9,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,26.540785,66.984964,97.740045,-2.259955,0.069637,0.016062,0.047704,14024206,37.272999,58.189602,...,0.004329,-16351600,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700
2016-01-25,26.435045,63.461389,78.712858,-21.287142,0.093537,0.032122,0.030928,-26045860,36.681,49.067155,...,-0.017172,-33145200,16.91,47.80152,38.129494,-61.870506,-0.07806,-0.114917,0.013789,-91721800
2016-01-26,26.73716,68.86132,92.129624,-7.870376,0.12882,0.052022,0.044248,29134620,36.689499,49.199084,...,-0.003958,-13416300,17.35,57.746426,63.025232,-36.974768,-0.035484,-0.09857,0.035821,-75935100
2016-01-27,26.797583,69.888271,86.610891,-13.389109,0.157869,0.073679,0.051571,95682832,35.879002,38.286812,...,0.024354,-33983900,17.09,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300
2016-01-28,26.835346,70.587786,88.7029,-11.2971,0.180559,0.095447,0.03586,130350978,37.415001,58.442415,...,0.015522,-18511100,16.68,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400


In [7]:
# Labels are indexed by the parsed `Date` column
labels_path = "../data/labels_sector.csv"
labels = pd.read_csv(labels_path, parse_dates=True, index_col="Date")
labels.head()


Unnamed: 0_level_0,T,GOOGL,GOOG,CMCSA,VZ,NFLX,DIS,IPG,EA,NWSA,...,AAPL,AMD,NVDA,MU,MSFT,INTC,CSCO,HPE,ORCL,AMAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
2016-01-25,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-26,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-27,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-28,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Tensor Ensemble Learning

In [8]:
# from grtel.decomposition import GLTD
def GLTD(*args, **kwargs):
    """Placeholder for the GLTD decomposition, which is not implemented yet.

    Accepts any arguments so that it can be called like a real decomposition
    class, and fails loudly instead of returning an unusable object.

    Raises:
        NotImplementedError: always, until the real `GLTD` can be imported
            from `grtel.decomposition`.
    """
    raise NotImplementedError("TODO: Implement GLTD")
from grtel.utils import print_scores, print_1_percentage, confusion_matrix_metrics
from grtel.classification import GRTEL

from hottbox.core import Tensor, TensorTKD
from hottbox.pdtools import pd_to_tensor
from hottbox.algorithms.decomposition import HOOI

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [9]:
# One tensor per row of `samples`; the column levels are reordered so that
# `Sector` becomes the third mode of the tensor.
mode_order = ['Metrics', 'Asset', 'Sector']
X: list[Tensor] = [
    pd_to_tensor(samples.iloc[row_pos].reorder_levels(mode_order))
    for row_pos in range(len(samples))
]

# Labels as a plain array, one row per sample
y = np.array(labels)

# Example of the first sample and label
print(X[0], "\n\n", y[0])

This tensor is of order 3 and consists of 640 elements.
Sizes and names of its modes are (8, 10, 8) and ['Metrics', 'Asset', 'Sector'] respectively. 

 [1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.
 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1.
 0. 1. 0. 0. 1. 1. 1. 1.]


In [10]:
# Decompose every sample into Tucker form and collect the results in a list
use_hooi = True
rank = (4, 5, 4)

if use_hooi:
    algo = HOOI()
else:
    algo = GLTD()

X_tk: list[TensorTKD] = []
for sample in X:
    X_tk.append(algo.decompose(sample, rank=rank))

test_size = 0.25
is_random_split = False

if not is_random_split:
    # Ordered split: the leading (1 - test_size) share of the samples is
    # used for training, the remainder for testing.
    k = int(len(X_tk) * (1. - test_size))
    X_train, y_train = X_tk[:k], y[:k]
    X_test, y_test = X_tk[k:], y[k:]
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_tk, y, test_size=test_size, random_state=42
    )

  S = np.sqrt(S)


In [11]:
# Initialise the classifier

# Fixed seed for the decision trees. scikit-learn permutes the candidate
# features at every split, so without a seed the fitted trees (and therefore
# the reported scores) can differ from run to run.
tree_random_state = 42

# number of base classifiers required per class
R = np.sum(rank)
# `y` holds one column per class; a 1-D `y` means a single class
n_classes = 1 if y.ndim == 1 else y.shape[1]

# One independent list of R decision trees for every class
base_classifiers = [
    [DecisionTreeClassifier(random_state=tree_random_state) for _ in range(R)]
    for _ in range(n_classes)
]

grtel = GRTEL(
    base_classifiers=base_classifiers,
    n_classes=n_classes,
    probability=True,
)
grtel.fit(X_train, y_train)


0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 


## Performance

In [12]:
# IMPORTANT NOTE:
# The line `y_pred = df.idxmax(axis=1).as_matrix()` has to be updated to
#   `y_pred = df.idxmax(axis=1).values` in the hottbox library's
#   `BaseTensorEnsembleClassifier._proba_to_label` method.
# `as_matrix()` has been deprecated in pandas since 0.23.0 (and was removed in 1.0.0).

In [13]:
num_sectors = stocks_by_sector_df.shape[1]
num_assets = stocks_by_sector_df.shape[0]


def _scores_to_frame(scores):
    """Arrange a flat score sequence like `stocks_by_sector_df`.

    The scores are reshaped to (num_sectors, num_assets) and transposed, so
    the result has one row per asset position and one column per sector.
    """
    score_grid = np.array(scores).reshape(num_sectors, num_assets)
    frame = pd.DataFrame(score_grid).T
    frame.columns = stocks_by_sector_df.columns
    return frame


# train accuracy
train_score = grtel.score(X_train, y_train)
train_accuracy_df = _scores_to_frame(train_score)

# test accuracy
test_score = grtel.score(X_test, y_test)
test_accuracy_df = _scores_to_frame(test_score)

# confusion matrices: one 2x2 matrix per (sector, asset) pair
conf_matrices = np.array(grtel.confusion_matrices(X_test, y_test)).reshape(
    num_sectors, num_assets, 2, 2
)

In [14]:
stocks_to_analyze = ["WMT", "TGT", "PFE", "MSFT", "CVX", "DVN", "JPM", "AAPL", "NVDA"]

# Performance metrics: one row per metric, one column per analysed ticker
performance_df = pd.DataFrame(
    columns=stocks_to_analyze,
    index=['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity'],
    dtype=float,
)
for ticker in stocks_to_analyze:
    # (row, column) position of the ticker in `stocks_by_sector_df`
    idx = ticker_to_idx[ticker]

    # Values are written with a single `.loc[row, column]` call. Chained
    # indexing (`df[col][row] = value`) may assign into a temporary copy and
    # has no effect under pandas Copy-on-Write.
    performance_df.loc['train acc', ticker] = train_accuracy_df.iloc[idx]
    performance_df.loc['test acc', ticker] = test_accuracy_df.iloc[idx]

    # `conf_matrices` is laid out as (sector, asset, 2, 2), i.e. transposed
    # with respect to `idx`.
    conf_matrix = conf_matrices[idx[1], idx[0]]
    # The accuracy returned here is not used; the table takes accuracy from
    # `grtel.score` above.
    _accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    performance_df.loc['precision', ticker] = precision
    performance_df.loc['recall', ticker] = recall
    performance_df.loc['downturn_precision', ticker] = downturn_precision
    performance_df.loc['specificity', ticker] = specificity

performance_df

Unnamed: 0,WMT,TGT,PFE,MSFT,CVX,DVN,JPM,AAPL,NVDA
train acc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
test acc,0.568306,0.453552,0.420765,0.535519,0.459016,0.491803,0.442623,0.535519,0.415301
precision,0.587786,0.487179,0.648148,0.63,0.478261,0.433962,0.414013,0.531915,0.410853
recall,0.754902,0.587629,0.286885,0.567568,0.836957,0.267442,0.866667,0.797872,0.630952
downturn_precision,0.519231,0.393939,0.325581,0.421687,0.318182,0.515385,0.615385,0.547619,0.425926
specificity,0.333333,0.302326,0.688525,0.486111,0.076923,0.690722,0.148148,0.258427,0.232323


In [15]:
# Scores: report train accuracy first, then test accuracy
score_reports = (
    ("Classification accuracy (Train):", X_train, y_train),
    ("Classification accuracy (Test):", X_test, y_test),
)
for report_title, X_split, y_split in score_reports:
    print(report_title)
    score = grtel.score(X_split, y_split)
    print_scores(score)
    print()

# Share of positive labels in the test split, for comparison with the accuracies
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes)
print()

Classification accuracy (Train):
[100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%]

Classification accuracy (Test):
[57.92%, 46.99%, 44.26%, 55.19%, 41.53%, 49.73%, 49.73%, 36.61%, 49.18%, 45.36%, 53.01%, 55.74%, 62.30%, 53.01%, 53.55%, 40.98%, 62.84%, 43.17%, 49.73%, 45.36%, 56.83%, 58.47%, 51.91%, 60.66%, 53.55%, 51.37%, 53.5

## Grid Search

In [16]:
# Candidate hyperparameter values for the decision trees.
# "auto" is left out of `max_features` because it does not work anymore.
max_features = ["sqrt", None, "log2"]
max_depth = [*range(10, 70, 10), None]
min_samples_split = [2, 5, 10, 20, 30]
min_samples_leaf = [1, 3, 5, 7, 12, 14]

search_grid = dict(
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# The same grid object is supplied once per base classifier (R entries)
search_params = []
for clf_pos in range(R):
    search_params.append(search_grid)

print("Performing grid search for each base classifer and for each class...")
grtel.grid_search(X_train, y_train, search_params)

print("Train base classifiers with optimal hyperparameters...")
grtel.fit(X_train, y_train)

Performing grid search for each base classifer and for each class...
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 
Train base classifiers with optimal hyperparameters...
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 



## Performance after Grid Search

In [17]:
num_assets, num_sectors = stocks_by_sector_df.shape
sector_names = stocks_by_sector_df.columns

# train accuracy: reshape to (sector, asset), label the sector axis and
# transpose so that rows are asset positions and columns are sectors
train_score = grtel.score(X_train, y_train)
train_accuracy_df = pd.DataFrame(
    np.array(train_score).reshape(num_sectors, num_assets),
    index=sector_names,
).T

# test accuracy, arranged the same way
test_score = grtel.score(X_test, y_test)
test_accuracy_df = pd.DataFrame(
    np.array(test_score).reshape(num_sectors, num_assets),
    index=sector_names,
).T

# confusion matrices: one 2x2 matrix per (sector, asset) pair
conf_matrices = np.array(grtel.confusion_matrices(X_test, y_test)).reshape(
    num_sectors, num_assets, 2, 2
)

In [18]:
# Performance metrics: one row per metric, one column per analysed ticker
performance_df = pd.DataFrame(
    columns=stocks_to_analyze,
    index=['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity'],
    dtype=float,
)
for ticker in stocks_to_analyze:
    # (row, column) position of the ticker in `stocks_by_sector_df`
    idx = ticker_to_idx[ticker]

    # Values are written with a single `.loc[row, column]` call. Chained
    # indexing (`df[col][row] = value`) may assign into a temporary copy and
    # has no effect under pandas Copy-on-Write.
    performance_df.loc['train acc', ticker] = train_accuracy_df.iloc[idx]
    performance_df.loc['test acc', ticker] = test_accuracy_df.iloc[idx]

    # `conf_matrices` is laid out as (sector, asset, 2, 2), i.e. transposed
    # with respect to `idx`.
    conf_matrix = conf_matrices[idx[1], idx[0]]
    # The accuracy returned here is not used; the table takes accuracy from
    # `grtel.score` above.
    _accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    performance_df.loc['precision', ticker] = precision
    performance_df.loc['recall', ticker] = recall
    performance_df.loc['downturn_precision', ticker] = downturn_precision
    performance_df.loc['specificity', ticker] = specificity

performance_df

Unnamed: 0,WMT,TGT,PFE,MSFT,CVX,DVN,JPM,AAPL,NVDA
train acc,0.992714,0.996357,0.985428,0.981785,1.0,1.0,0.994536,0.994536,0.981785
test acc,0.469945,0.453552,0.497268,0.57377,0.519126,0.535519,0.513661,0.530055,0.502732
precision,0.538462,0.5,0.664062,0.604938,0.5,0.571429,0.477876,0.536585,0.483516
recall,0.411765,0.659794,0.696721,0.882883,0.804348,0.093023,0.72,0.93617,0.52381
downturn_precision,0.428571,0.4,0.327273,0.380952,0.485714,0.538462,0.7,0.684211,0.565217
specificity,0.555556,0.255814,0.295082,0.111111,0.186813,0.938144,0.453704,0.146067,0.525253


In [19]:
# Scores after the grid search: train accuracy first, then test accuracy
tuned_score_reports = (
    ("Classification accuracy (Train):", X_train, y_train),
    ("Classification accuracy (Test):", X_test, y_test),
)
for report_title, X_split, y_split in tuned_score_reports:
    print(report_title)
    score = grtel.score(X_split, y_split)
    print_scores(score)
    print()

# Share of positive labels in the test split, for comparison with the accuracies
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes)
print()

Classification accuracy (Train):
[99.82%, 98.91%, 100.00%, 98.91%, 99.45%, 100.00%, 100.00%, 100.00%, 99.09%, 99.82%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 99.45%, 100.00%, 99.64%, 99.64%, 99.64%, 99.27%, 99.27%, 99.09%, 100.00%, 100.00%, 100.00%, 100.00%, 99.82%, 100.00%, 99.82%, 100.00%, 99.82%, 100.00%, 100.00%, 99.82%, 100.00%, 98.91%, 100.00%, 100.00%, 98.72%, 99.64%, 100.00%, 100.00%, 100.00%, 99.45%, 100.00%, 99.09%, 100.00%, 100.00%, 99.82%, 98.54%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 99.64%, 100.00%, 100.00%, 96.17%, 100.00%, 100.00%, 98.54%, 100.00%, 99.45%, 100.00%, 99.82%, 99.27%, 99.45%, 99.64%, 98.18%, 100.00%, 98.18%, 99.45%, 100.00%, 100.00%, 99.64%, 99.45%]

Classification accuracy (Test):
[56.28%, 50.82%, 42.62%, 43.72%, 37.70%, 37.16%, 47.54%, 51.91%, 42.08%, 51.91%, 49.18%, 38.25%, 65.03%, 56.83%, 61.20%, 55.74%, 60.66%, 44.26%, 49.73%, 45.36%, 46.99%, 51.37%, 54.10%, 48.63%, 53.01%, 55.19%, 55.74%, 61.20%, 45.36%, 52.46%, 55.74%, 3