In [1]:
# Allows imports from other packages in the project
import sys
import os

# The project root is taken to be the parent of the directory the notebook runs in.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))

# Guarded so that re-running this cell does not stack duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd

In [3]:
snp_info_path = "../data/snp_info.csv"

# Read the table (first CSV column is the stored index), then re-index it by `Symbol`.
info = pd.read_csv(snp_info_path, index_col=0)
info = info.set_index("Symbol")

In [4]:
# Table of tickers: one column per cluster, one row per asset slot within a cluster.
stocks_by_cluster_path = "../data/stocks_by_cluster.csv"
stocks_by_cluster_df = pd.read_csv(stocks_by_cluster_path)
stocks_by_cluster_df

Unnamed: 0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8
0,TGT,PFE,DVN,BAC,MSFT,SLB,AMAT,GOOG
1,TJX,ABBV,MRO,JPM,ABT,CVX,MU,GOOGL
2,WBA,JNJ,XOM,MS,MGM,HAL,NVDA,AMZN
3,WMT,GILD,MPC,C,ORCL,COP,AAPL,BSX
4,CVS,BMY,WMB,RF,EBAY,KMI,NFLX,MDLZ


In [5]:
# Map each ticker to its (row, column) position in `stocks_by_cluster_df`.
# `np.ndenumerate` walks the 2-D value array row by row, yielding ((row, col), value).
ticker_to_idx = {
    ticker: position
    for position, ticker in np.ndenumerate(stocks_by_cluster_df.values)
}

## Get Samples and Labels


In [6]:
# The first three CSV rows form the column MultiIndex (levels `Cluster`, `Asset`,
# `Metrics`); the first CSV column holds the dates and becomes the row index.
samples_path = "../data/samples_cluster.csv"
samples = pd.read_csv(samples_path, header=[0, 1, 2], index_col=0, parse_dates=True)
samples.head()

Cluster,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,...,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_4,asset_4,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,69.720001,40.841917,34.171927,-65.828073,-0.609793,-0.426234,-0.023256,-28661600,34.474998,52.461434,...,0.024096,-48541400,41.599998,49.475466,40.25153,-59.74847,-0.223801,-0.216184,0.014882,-36938000
2016-01-25,68.919998,36.140205,25.786151,-74.213849,-0.624351,-0.467303,-0.048329,-33973800,34.380001,50.599646,...,0.023351,-57890800,40.200001,38.612846,10.901476,-89.098524,-0.26304,-0.225897,-0.035509,-50403200
2016-01-26,70.440002,49.009156,41.719109,-58.280891,-0.535363,-0.481309,-0.039804,-29921200,34.845001,58.848049,...,-0.023047,-68694300,41.349998,49.186828,37.193741,-62.806259,-0.219761,-0.224634,-0.018281,-42275700
2016-01-27,70.690002,50.887399,44.339654,-55.660346,-0.445382,-0.473958,-0.013261,-25959800,34.595001,53.324693,...,-0.018803,-78069200,41.119999,47.306391,42.857094,-57.142906,-0.198261,-0.219238,0.013057,-50018100
2016-01-28,70.989998,53.27066,54.381726,-45.618274,-0.352089,-0.449137,0.008237,-21886500,34.695,55.262643,...,-0.069383,-86157700,41.549999,51.320984,65.614033,-34.385967,-0.15225,-0.205594,0.004594,-38397600


In [7]:
# One label column per ticker, indexed by the parsed `Date` column.
labels_path = "../data/labels_cluster.csv"
labels = pd.read_csv(labels_path, parse_dates=True, index_col="Date")
labels.head()


Unnamed: 0_level_0,TGT,TJX,WBA,WMT,CVS,PFE,ABBV,JNJ,GILD,BMY,...,AMAT,MU,NVDA,AAPL,NFLX,GOOG,GOOGL,AMZN,BSX,MDLZ
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-25,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-26,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-27,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-28,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Tensor Ensemble Learning

In [8]:
# from grtel.decomposition import GLTD
def GLTD(*args, **kwargs):
    """Placeholder for the GLTD decomposition, which is not implemented yet.

    The notebook instantiates its decomposition algorithms without arguments
    (`HOOI()` / `GLTD()`), so the stub accepts any call signature and fails
    immediately with an explicit error instead of returning `None`.

    Raises
    ------
    NotImplementedError
        Always, until the real `GLTD` can be imported from `grtel.decomposition`.
    """
    raise NotImplementedError("TODO: Implement GLTD")
from grtel.utils import print_scores, print_1_percentage, confusion_matrix_metrics
from grtel.classification import GRTEL

from hottbox.core import Tensor, TensorTKD
from hottbox.pdtools import pd_to_tensor
from hottbox.algorithms.decomposition import HOOI

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Build one tensor per row of `samples`. The column levels are reordered so that
# `Cluster` becomes the third mode of the tensor.
mode_order = ['Metrics', 'Asset', 'Cluster']
X: list[Tensor] = [
    pd_to_tensor(samples.iloc[row_pos].reorder_levels(mode_order))
    for row_pos in range(len(samples))
]

# Labels as a plain array, one row per sample
y = np.array(labels)

# Example of the first sample and label
print(X[0], '\n\n', y[0])

This tensor is of order 3 and consists of 320 elements.
Sizes and names of its modes are (8, 5, 8) and ['Metrics', 'Asset', 'Cluster'] respectively. 

 [1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [10]:
# Represent each sample in Tucker form and store in a list
use_hooi = True
rank = (4, 4, 4)

if use_hooi:
    algo = HOOI()
else:
    algo = GLTD()

# One decomposition per sample, kept in the original sample order
X_tk: list[TensorTKD] = []
for sample in X:
    X_tk.append(algo.decompose(sample, rank=rank))

test_size = 0.25
is_random_split = False

if not is_random_split:
    # Ordered split: the leading (1 - test_size) share is train, the remainder is test
    k = int(len(X_tk) * (1. - test_size))
    X_train, y_train = X_tk[:k], y[:k]
    X_test, y_test = X_tk[k:], y[k:]
else:
    X_train, X_test, y_train, y_test = train_test_split(X_tk, y, test_size=test_size, random_state=42)

  S = np.sqrt(S)


In [11]:
# Initialise the classifier

# number of base classifiers required per class
R = np.sum(rank)
# a 1-D label array means a single class; otherwise one class per label column
n_classes = 1 if y.ndim == 1 else y.shape[1]

# Fix the seed of every tree: an unseeded DecisionTreeClassifier draws on random
# state while building its splits, so scores would change from run to run.
# 42 matches the seed already used for `train_test_split` in this notebook.
base_classifiers = [
    [DecisionTreeClassifier(random_state=42) for _ in range(R)]
    for _ in range(n_classes)
]

grtel = GRTEL(
    base_classifiers=base_classifiers,
    n_classes=n_classes,
    probability=True,
)
grtel.fit(X_train, y_train)

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 


## Performance

In [12]:
# IMPORTANT NOTE:
# Line `y_pred = df.idxmax(axis=1).as_matrix()` has to be updated to
#   `y_pred = df.idxmax(axis=1).values` in the hottbox library
#   `BaseTensorEnsembleClassifier._proba_to_label` method.
# `as_matrix()` has been deprecated in pandas since 0.23.0

In [13]:
# The cluster table has one row per asset slot and one column per cluster
num_assets, num_clusters = stocks_by_cluster_df.shape

# train accuracy: per-class scores folded into (cluster, asset), then transposed
# so the frame is laid out like `stocks_by_cluster_df`
train_score = grtel.score(X_train, y_train)
train_grid = np.array(train_score).reshape(num_clusters, num_assets)
train_accuracy_df = pd.DataFrame(train_grid.T, columns=stocks_by_cluster_df.columns)

# test accuracy, same layout
test_score = grtel.score(X_test, y_test)
test_grid = np.array(test_score).reshape(num_clusters, num_assets)
test_accuracy_df = pd.DataFrame(test_grid.T, columns=stocks_by_cluster_df.columns)

# confusion matrices: one 2x2 matrix per class, indexed as [cluster, asset]
conf_matrices = np.array(grtel.confusion_matrices(X_test, y_test)).reshape(
    num_clusters, num_assets, 2, 2
)

In [14]:
stocks_to_analyze = ["WMT", "TGT", "PFE", "MSFT", "CVX", "DVN", "JPM", "AAPL", "NVDA"]

# Performance metrics: one column per analysed ticker, one row per metric
performance_df = pd.DataFrame(
    columns=stocks_to_analyze,
    index=['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity'],
    dtype=float,
)
for ticker in stocks_to_analyze:
    # idx is the (row, column) position of the ticker in the cluster table
    idx = ticker_to_idx[ticker]

    # Write with a single .loc[row, column] call. Chained indexing
    # (`performance_df[ticker][row] = ...`) may assign into a temporary copy.
    performance_df.loc['train acc', ticker] = train_accuracy_df.iloc[idx]
    performance_df.loc['test acc', ticker] = test_accuracy_df.iloc[idx]

    # conf_matrices is indexed [cluster, asset], i.e. (column, row) of the table
    conf_matrix = conf_matrices[idx[1], idx[0]]
    # accuracy is unpacked but not stored in this table
    accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    performance_df.loc['precision', ticker] = precision
    performance_df.loc['recall', ticker] = recall
    performance_df.loc['downturn_precision', ticker] = downturn_precision
    performance_df.loc['specificity', ticker] = specificity

performance_df

Unnamed: 0,WMT,TGT,PFE,MSFT,CVX,DVN,JPM,AAPL,NVDA
train acc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
test acc,0.486339,0.464481,0.42623,0.530055,0.480874,0.431694,0.508197,0.519126,0.404372
precision,0.528169,0.496241,0.613333,0.606838,0.490683,0.372549,0.444444,0.527273,0.40458
recall,0.735294,0.680412,0.377049,0.63964,0.858696,0.22093,0.8,0.617021,0.630952
downturn_precision,0.341463,0.38,0.296296,0.393939,0.409091,0.492424,0.6875,0.506849,0.403846
specificity,0.17284,0.22093,0.52459,0.361111,0.098901,0.670103,0.305556,0.41573,0.212121


In [15]:
# Scores: per-class accuracy on the train split, then on the test split
for header, features, targets in (
    ("Classification accuracy (Train):", X_train, y_train),
    ("Classification accuracy (Test):", X_test, y_test),
):
    print(header)
    score = grtel.score(features, targets)
    print_scores(score)
    print()

# Share of positive labels per class in the test split, for comparison with the accuracies
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes)
print()

Classification accuracy (Train):
[100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%]

Classification accuracy (Test):
[46.45%, 49.18%, 42.62%, 48.63%, 57.38%, 42.62%, 51.91%, 46.99%, 60.11%, 52.46%, 43.17%, 53.55%, 61.75%, 44.26%, 46.45%, 44.26%, 50.82%, 56.28%, 53.55%, 48.09%, 53.01%, 48.63%, 44.81%, 56.28%, 55.19%, 43.72%, 48.09%, 46.99%, 53.01%, 38.80%, 43.17%, 40.44%, 40.44%, 51.91%, 35.52%, 47.54%, 50.27%, 50.82%, 50.82%, 56.83%]

Percentage of 1s (Test):
[53.01%, 63.39%, 64.48%, 55.74%, 56.83%, 66.67%, 48.09%, 57.38%, 48.09%, 61.20%, 46.99%, 45.36%, 49.18%, 43.17%, 46.45%, 42.08%, 40.98%, 36.61%, 40.44%, 43.72%, 60.66%, 59.56%, 38.80%, 54.64%, 40.98%, 34.43%, 50.27%, 36.61%, 5

## Grid Search

In [16]:
# Candidate hyperparameter values for the decision trees.
# "auto" is left out of `max_features` because it does not work anymore.
max_features = ["sqrt", None, "log2"]
max_depth = [*range(10, 70, 10), None]
min_samples_split = [2, 5, 10, 20, 30]
min_samples_leaf = [1, 3, 5, 7, 12, 14]

search_grid = dict(
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# One entry per base classifier; every entry is the same grid object
search_params = [search_grid for _ in range(R)]

print("Performing grid search for each base classifer and for each class...")
grtel.grid_search(X_train, y_train, search_params)

# Fit again after the search
print("Train base classifiers with optimal hyperparameters...")
grtel.fit(X_train, y_train)

Performing grid search for each base classifer and for each class...
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 
Train base classifiers with optimal hyperparameters...
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 


## Performance after Grid Search

In [17]:
# The cluster table has one row per asset slot and one column per sector/cluster
num_assets, num_sectors = stocks_by_cluster_df.shape

# train accuracy: per-class scores folded into (sector, asset), then transposed
# so the frame is laid out like `stocks_by_cluster_df`
train_score = grtel.score(X_train, y_train)
train_grid = np.array(train_score).reshape(num_sectors, num_assets)
train_accuracy_df = pd.DataFrame(train_grid.T, columns=stocks_by_cluster_df.columns)

# test accuracy, same layout
test_score = grtel.score(X_test, y_test)
test_grid = np.array(test_score).reshape(num_sectors, num_assets)
test_accuracy_df = pd.DataFrame(test_grid.T, columns=stocks_by_cluster_df.columns)

# confusion matrices: one 2x2 matrix per class, indexed as [sector, asset]
conf_matrices = np.array(grtel.confusion_matrices(X_test, y_test)).reshape(
    num_sectors, num_assets, 2, 2
)

In [18]:
# Performance metrics: one column per analysed ticker, one row per metric
performance_df = pd.DataFrame(
    columns=stocks_to_analyze,
    index=['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity'],
    dtype=float,
)
for ticker in stocks_to_analyze:
    # idx is the (row, column) position of the ticker in the cluster table
    idx = ticker_to_idx[ticker]

    # Write with a single .loc[row, column] call. Chained indexing
    # (`performance_df[ticker][row] = ...`) may assign into a temporary copy.
    performance_df.loc['train acc', ticker] = train_accuracy_df.iloc[idx]
    performance_df.loc['test acc', ticker] = test_accuracy_df.iloc[idx]

    # conf_matrices is indexed [cluster, asset], i.e. (column, row) of the table
    conf_matrix = conf_matrices[idx[1], idx[0]]
    # accuracy is unpacked but not stored in this table
    accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    performance_df.loc['precision', ticker] = precision
    performance_df.loc['recall', ticker] = recall
    performance_df.loc['downturn_precision', ticker] = downturn_precision
    performance_df.loc['specificity', ticker] = specificity

performance_df

Unnamed: 0,WMT,TGT,PFE,MSFT,CVX,DVN,JPM,AAPL,NVDA
train acc,1.0,0.989071,0.989071,0.990893,1.0,0.990893,0.98725,0.996357,0.981785
test acc,0.535519,0.491803,0.382514,0.491803,0.502732,0.513661,0.442623,0.584699,0.453552
precision,0.545455,0.525,0.558824,0.571429,0.513761,0.428571,0.443478,0.601695,0.468966
recall,0.647059,0.649485,0.311475,0.36036,0.608696,0.069767,0.68,0.755319,0.809524
downturn_precision,0.419355,0.460317,0.269565,0.371681,0.513514,0.526627,0.647059,0.646154,0.578947
specificity,0.320988,0.337209,0.508197,0.583333,0.417582,0.917526,0.407407,0.47191,0.222222


In [19]:
# Scores: per-class accuracy on the train split, then on the test split
for header, features, targets in (
    ("Classification accuracy (Train):", X_train, y_train),
    ("Classification accuracy (Test):", X_test, y_test),
):
    print(header)
    score = grtel.score(features, targets)
    print_scores(score)
    print()

# Share of positive labels per class in the test split, for comparison with the accuracies
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes)
print()

Classification accuracy (Train):
[98.91%, 99.82%, 100.00%, 100.00%, 99.82%, 98.91%, 100.00%, 100.00%, 99.82%, 100.00%, 99.09%, 99.82%, 98.36%, 99.27%, 99.82%, 99.64%, 98.72%, 100.00%, 99.82%, 97.81%, 99.09%, 99.82%, 98.72%, 99.82%, 100.00%, 100.00%, 100.00%, 99.45%, 100.00%, 99.64%, 98.91%, 100.00%, 98.18%, 99.64%, 100.00%, 99.82%, 100.00%, 100.00%, 97.81%, 99.82%]

Classification accuracy (Test):
[49.18%, 59.56%, 50.82%, 53.55%, 48.63%, 38.25%, 48.63%, 43.72%, 51.37%, 43.72%, 51.37%, 48.63%, 57.92%, 43.17%, 40.98%, 50.82%, 44.26%, 49.73%, 48.63%, 43.72%, 49.18%, 58.47%, 42.62%, 61.20%, 57.38%, 62.30%, 50.27%, 39.34%, 52.46%, 48.63%, 46.99%, 39.89%, 45.36%, 58.47%, 47.54%, 46.45%, 54.64%, 42.62%, 67.21%, 53.01%]

Percentage of 1s (Test):
[53.01%, 63.39%, 64.48%, 55.74%, 56.83%, 66.67%, 48.09%, 57.38%, 48.09%, 61.20%, 46.99%, 45.36%, 49.18%, 43.17%, 46.45%, 42.08%, 40.98%, 36.61%, 40.44%, 43.72%, 60.66%, 59.56%, 38.80%, 54.64%, 40.98%, 34.43%, 50.27%, 36.61%, 54.10%, 54.10%, 36.61%, 36.