In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# %load_ext autoreload
# %autoreload 2

In [2]:
def mydateparser(x):
    """Parse a ``YYYY-MM-DD`` date string into a ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) if ``x`` does not
    match the ``%Y-%m-%d`` format.
    """
    return datetime.strptime(x, "%Y-%m-%d")
# Auxiliary info table; the first CSV column is used as the row index.
# NOTE(review): `info` is not referenced in the cells visible here — confirm it is still needed.
info = pd.read_csv(filepath_or_buffer='../data/snp_info.csv', index_col=0)

In [3]:
# Tickers singled out for the detailed per-stock results tables.
# Sector notes follow the columns of stocks_by_sector.csv.
stocks_analysis = [
    'KO',    # Consumer Staples
    'TGT',   # Consumer Discretionary
    'PFE',   # Health Care
    'MSFT',  # Information Technology
    'CVX',   # Energy
    'DVN',   # Energy
    'DAL',   # Industrials
    'JPM',   # Financials
    'PEP',   # Consumer Staples
    'AAPL',  # Information Technology
]

In [4]:
# Ticker grid: one column per sector, one row per asset slot within the sector.
stocks = pd.read_csv(filepath_or_buffer='stocks_by_sector.csv')
stocks

Unnamed: 0,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
0,T,F,KO,MRO,BAC,PFE,GE,AAPL
1,TWTR,GM,KR,KMI,WFC,MRK,DAL,AMD
2,FB,EBAY,PG,XOM,C,GILD,CSX,MU
3,CMCSA,SBUX,WMT,HAL,RF,BMY,AAL,MSFT
4,VZ,NKE,MDLZ,WMB,JPM,BSX,LUV,INTC
5,NFLX,M,MO,COP,KEY,ABT,FAST,CSCO
6,DIS,MGM,COTY,SLB,MS,CVS,CAT,HPE
7,ATVI,TJX,WBA,DVN,HBAN,ABBV,JCI,ORCL
8,IPG,TGT,PM,CVX,SCHW,JNJ,UAL,NVDA
9,DISCA,NWL,PEP,COG,SYF,MDT,UNP,AMAT


In [5]:
# Map every ticker to its (row, column) position inside the `stocks` grid.
# Rows are visited first, then columns, so a repeated ticker keeps its last position.
idx_dict = {
    stocks.iloc[row, col]: (row, col)
    for row in range(len(stocks.index))
    for col in range(len(stocks.columns))
}

In [6]:
import itertools

# Flatten the ticker grid column by column: all tickers of the first sector,
# then all tickers of the second sector, and so on.
all_stocks = list(
    itertools.chain.from_iterable(
        stocks.iloc[:, col] for col in range(len(stocks.columns))
    )
)

---------
Get Samples and Labels
---------

In [7]:
# Feature table: rows are dates, columns are a 3-level header (Sector / Asset / Metrics).
# The index holds ISO `YYYY-MM-DD` strings, which `parse_dates=True` parses natively;
# the `date_parser` argument is deprecated in pandas 2.x, so it is not passed any more.
samples = pd.read_csv('samples_sector.csv', index_col=0, header=[0, 1, 2], parse_dates=True)
samples.head()

Sector,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,...,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_9,asset_9,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,35.139999,66.984998,97.740062,-2.259938,0.092199,0.021266,0.047704,10592300,17.84,30.079617,...,-0.039824,-9242700,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700
2016-01-25,35.0,63.461411,78.712861,-21.287139,0.123844,0.042529,0.030928,-19672100,17.02,23.781919,...,-0.042453,-16010900,16.91,47.80152,38.129494,-61.870506,-0.07806,-0.114917,0.013789,-91721800
2016-01-26,35.400002,68.861369,92.129714,-7.870286,0.170558,0.068877,0.044248,22005000,17.01,23.71206,...,-0.049039,-10312300,17.35,57.746426,63.025232,-36.974768,-0.035484,-0.09857,0.035821,-75935100
2016-01-27,35.48,69.888295,86.610888,-13.389112,0.209019,0.097551,0.051571,72268000,16.780001,21.997274,...,-0.030759,-16104800,17.09,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300
2016-01-28,35.529999,70.587814,88.702907,-11.297093,0.23906,0.126372,0.03586,98452400,16.49,19.903236,...,-0.021625,-23024200,16.68,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400


In [8]:
# Binary label table: rows are dates, one column per ticker.
# The 'Date' index holds ISO `YYYY-MM-DD` strings, which `parse_dates=True` parses natively;
# the `date_parser` argument is deprecated in pandas 2.x, so it is not passed any more.
labels = pd.read_csv('labels_sector.csv', index_col='Date', parse_dates=True)
labels.head()

Unnamed: 0_level_0,T,TWTR,FB,CMCSA,VZ,NFLX,DIS,ATVI,IPG,DISCA,...,AAPL,AMD,MU,MSFT,INTC,CSCO,HPE,ORCL,NVDA,AMAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2016-01-25,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-26,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-27,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-28,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


------------------------------------
Tensor Ensemble Learning
------------------------------------

In [9]:
from GRTEL.decomposition import GLTD
from GRTEL.utils import downturn_confidence, print_scores, print_1_percentage, confusion_matrix_metrics
from GRTEL.classification import GRTEL

from hottbox.core import Tensor, TensorTKD
from hottbox.pdtools import pd_to_tensor
from hottbox.algorithms.decomposition import HOSVD, HOOI
from hottbox.utils.generation import residual_tensor
from hottbox.algorithms.classification import TelVI

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Build one order-3 tensor per row of `samples`. The column levels are reordered
# so that 'Sector' becomes the third mode of the tensor.
mode_order = ['Metrics', 'Asset', 'Sector']
X = [
    pd_to_tensor(samples.iloc[row].reorder_levels(mode_order))
    for row in range(len(samples))
]

# Label matrix (one column per ticker); its rows are expected to line up with X.
y = np.array(labels)

print(X[0], '\n\n', y[0])

This tensor is of order 3 and consists of 640 elements.
Sizes and names of its modes are (8, 10, 8) and ['Metrics', 'Asset', 'Sector'] respectively. 

 [1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 0. 0. 1. 1. 1. 0. 1.]


In [28]:
# Decompose every sample tensor into Tucker form with the multilinear rank below
# and collect the results in a list.
algo = HOOI()
# algo = GLTD()  # alternative decomposition, currently disabled
rank = (4,5,4)
X_tk = [algo.decompose(tensor, rank=rank) for tensor in X]


# Order-preserving split (no shuffling): the first (1 - test_size) share of the
# samples is used for training and the remainder for testing.
test_size = 0.25
k = int(len(X_tk) * (1. - test_size))

X_train = X_tk[:k]
X_test = X_tk[k:]
y_train = y[:k]
y_test = y[k:]

  S = np.sqrt(S)


In [29]:
# Initialise classifier
R = np.sum(rank)  # number of base classifiers required per class
n_classes = 1 if y.ndim == 1 else y.shape[1]

# Fixed seed for every tree: DecisionTreeClassifier randomly permutes the candidate
# features at each split, so without `random_state` repeated runs of this notebook
# can produce different trees and therefore different metrics.
TREE_SEED = 42

# One list of R base classifiers per class (i.e. per label column of y).
base_clfs = [
    [DecisionTreeClassifier(random_state=TREE_SEED) for _ in range(R)]
    for _ in range(n_classes)
]

grtel = GRTEL(base_clfs=base_clfs,
              n_classes=n_classes,
              probability=True,
              verbose=False)


# Train classifier
grtel.fit(X_train, y_train)

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 


In [30]:
### Classifier performance results
# Scores come back as one flat entry per class; reshaping to (n_sectors, n_assets) and
# transposing gives the same (asset row, sector column) layout as the `stocks` grid.
n_assets, n_sectors = stocks.shape

# train accuracy
train_score = grtel.score(X_train, y_train)
train_accuracy_df = pd.DataFrame(np.array(train_score).reshape(n_sectors, n_assets)).T
train_accuracy_df.columns = stocks.columns

# test accuracy
test_score = grtel.score(X_test, y_test)
test_accuracy_df = pd.DataFrame(np.array(test_score).reshape(n_sectors, n_assets)).T
test_accuracy_df.columns = stocks.columns

# confusion matrices (one 2x2 matrix per class, indexed as [sector, asset])
conf_matrices = grtel.confusion_matrices(X_test, y_test)
conf_matrices = np.array(conf_matrices).reshape(n_sectors, n_assets, 2, 2)


## Print performance results
metric_rows = ['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity']
results = pd.DataFrame(columns=all_stocks, index=metric_rows, dtype=float)
for stock in all_stocks:
    row, col = idx_dict[stock]  # position of the ticker in the `stocks` grid

    # Write with a single .loc call: chained `results[stock][...] = value` assigns into a
    # temporary object and leaves `results` unchanged under pandas Copy-on-Write.
    results.loc['train acc', stock] = train_accuracy_df.iloc[row, col]
    results.loc['test acc', stock] = test_accuracy_df.iloc[row, col]

    conf_matrix = conf_matrices[col, row]
    _accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    results.loc['precision', stock] = precision
    results.loc['recall', stock] = recall
    results.loc['downturn_precision', stock] = downturn_precision
    results.loc['specificity', stock] = specificity

results

Unnamed: 0,T,TWTR,FB,CMCSA,VZ,NFLX,DIS,ATVI,IPG,DISCA,...,AAPL,AMD,MU,MSFT,INTC,CSCO,HPE,ORCL,NVDA,AMAT
train acc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
test acc,0.486339,0.47541,0.409836,0.448087,0.437158,0.431694,0.480874,0.398907,0.469945,0.568306,...,0.519126,0.311475,0.497268,0.491803,0.431694,0.469945,0.437158,0.464481,0.459016,0.448087
precision,0.447917,0.526316,0.439024,0.53,0.615385,0.442308,0.495146,0.392857,0.473684,0.513889,...,0.517442,0.465517,0.401515,0.560811,0.47482,0.547368,0.406593,0.509434,0.438017,0.306818
recall,0.511905,0.4,0.367347,0.495327,0.278261,0.5,0.542553,0.511628,0.48913,0.45679,...,0.946809,0.221311,0.80303,0.747748,0.680412,0.490566,0.430233,0.54,0.630952,0.402985
downturn_precision,0.528736,0.439252,0.386139,0.349398,0.366412,0.417722,0.4625,0.408451,0.465909,0.603604,...,0.545455,0.24,0.745098,0.2,0.295455,0.386364,0.467391,0.402597,0.5,0.578947
specificity,0.464646,0.566265,0.458824,0.381579,0.705882,0.362637,0.41573,0.298969,0.450549,0.656863,...,0.067416,0.491803,0.324786,0.097222,0.151163,0.441558,0.443299,0.373494,0.313131,0.474138


In [31]:
# Metrics for the selected tickers, plus two summary columns:
#   Average - row-wise mean over the selected tickers only
#   Overall - row-wise mean over every column of `results`
final_results = results[stocks_analysis].assign(
    Average=lambda selected: selected.mean(axis=1),
    Overall=results.mean(axis=1),
)
final_results

Unnamed: 0,KO,TGT,PFE,MSFT,CVX,DVN,DAL,JPM,PEP,AAPL,Average,Overall
train acc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
test acc,0.431694,0.551913,0.530055,0.491803,0.540984,0.377049,0.469945,0.448087,0.508197,0.519126,0.486885,0.471311
precision,0.508621,0.563025,0.655172,0.560811,0.533333,0.294118,0.647059,0.402985,0.534591,0.517442,0.521716,0.471368
recall,0.556604,0.690722,0.622951,0.747748,0.695652,0.232558,0.205607,0.72,0.841584,0.946809,0.626023,0.492039
downturn_precision,0.298507,0.53125,0.313433,0.2,0.555556,0.426087,0.42953,0.571429,0.333333,0.545455,0.420458,0.458703
specificity,0.25974,0.395349,0.344262,0.097222,0.384615,0.505155,0.842105,0.259259,0.097561,0.067416,0.325268,0.443829


In [23]:
# NOTE(review): every line of this cell is commented out, so it executes nothing.
# It looks like an earlier way of printing the train/test scores — confirm whether
# the cell can be deleted.
# #Scores
# score = grtel.score(X_train, y_train)
# print("\nClassification accuracy (Train):")
# print_scores(score); print()

# score = grtel.score(X_test, y_test)
# print("Classification accuracy (Test):")
# print_scores(score); print()

# print("Percentage of 1s (Test):")
# print_1_percentage(y_test, n_classes); print()

-------
Grid Search
------

In [28]:
# Hyperparameter grid shared by all base decision trees.
# 'auto' is intentionally not listed: for DecisionTreeClassifier it was an alias of
# 'sqrt', was deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
max_features = ['sqrt', None, 'log2']
max_depth = list(range(10, 70, 10))
max_depth.append(None)  # None lets the tree grow without a depth limit
min_samples_split = [2, 5, 10, 20, 30]
min_samples_leaf = [1, 3, 5, 7, 12, 14]

search_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# The same grid is supplied once for each of the R base classifiers.
search_params = [search_grid for _ in range(R)]

print("\tPerforming grid search for each base classifer and for each class")
grtel.grid_search(X_train, y_train, search_params)

print("\tTrain base classifiers with optimal hyperparameters")
grtel.fit(X_train, y_train); print()

	Performing grid search for each base classifer and for each class
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 
	Train base classifiers with optimal hyperparameters
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 



In [29]:
### Classifier performance results
# Scores come back as one flat entry per class; reshaping to (n_sectors, n_assets) and
# transposing gives the same (asset row, sector column) layout as the `stocks` grid.
n_assets, n_sectors = stocks.shape

# train accuracy
train_score = grtel.score(X_train, y_train)
train_accuracy_df = pd.DataFrame(np.array(train_score).reshape(n_sectors, n_assets)).T
train_accuracy_df.columns = stocks.columns

# test accuracy
test_score = grtel.score(X_test, y_test)
test_accuracy_df = pd.DataFrame(np.array(test_score).reshape(n_sectors, n_assets)).T
test_accuracy_df.columns = stocks.columns

# confusion matrices (one 2x2 matrix per class, indexed as [sector, asset])
conf_matrices = grtel.confusion_matrices(X_test, y_test)
conf_matrices = np.array(conf_matrices).reshape(n_sectors, n_assets, 2, 2)


## Print performance results (selected tickers only)
metric_rows = ['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity']
results = pd.DataFrame(columns=stocks_analysis, index=metric_rows, dtype=float)
for stock in stocks_analysis:
    row, col = idx_dict[stock]  # position of the ticker in the `stocks` grid

    # Write with a single .loc call: chained `results[stock][...] = value` assigns into a
    # temporary object and leaves `results` unchanged under pandas Copy-on-Write.
    results.loc['train acc', stock] = train_accuracy_df.iloc[row, col]
    results.loc['test acc', stock] = test_accuracy_df.iloc[row, col]

    conf_matrix = conf_matrices[col, row]
    _accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    results.loc['precision', stock] = precision
    results.loc['recall', stock] = recall
    results.loc['downturn_precision', stock] = downturn_precision
    results.loc['specificity', stock] = specificity

results

Unnamed: 0,KO,TGT,PFE,MSFT,CVX,DVN,DAL,JPM,PEP,AAPL
train acc,0.998179,1.0,0.989071,0.994536,0.978142,1.0,0.996357,0.994536,0.996357,1.0
test acc,0.409836,0.52459,0.486339,0.568306,0.557377,0.491803,0.431694,0.437158,0.502732,0.47541
precision,0.52518,0.537975,0.613861,0.590643,0.543624,0.378378,0.569444,0.380597,0.545455,0.496503
recall,0.688679,0.876289,0.508197,0.90991,0.880435,0.162791,0.383178,0.68,0.534653,0.755319
downturn_precision,0.25,0.52,0.268293,0.166667,0.676471,0.506849,0.405405,0.510204,0.440476,0.425
specificity,0.142857,0.151163,0.360656,0.027778,0.252747,0.762887,0.592105,0.231481,0.45122,0.191011


In [30]:
# Per-class accuracy on the train split, then on the test split.
score_reports = (
    ("Classification accuracy (Train):", X_train, y_train),
    ("Classification accuracy (Test):", X_test, y_test),
)
for heading, features, targets in score_reports:
    score = grtel.score(features, targets)
    print(heading)
    print_scores(score)
    print()

# Share of positive labels in the test split, for comparison with the accuracies above.
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes)
print()

Classification accuracy (Train):
[100.00%, 100.00%, 100.00%, 99.45%, 99.82%, 99.82%, 100.00%, 99.45%, 100.00%, 100.00%, 100.00%, 100.00%, 98.72%, 100.00%, 99.27%, 100.00%, 100.00%, 99.82%, 100.00%, 100.00%, 99.82%, 98.91%, 99.82%, 99.82%, 100.00%, 99.64%, 100.00%, 99.82%, 100.00%, 99.64%, 99.82%, 99.82%, 99.45%, 100.00%, 100.00%, 100.00%, 99.45%, 100.00%, 97.81%, 100.00%, 100.00%, 100.00%, 100.00%, 99.27%, 99.45%, 100.00%, 98.36%, 99.64%, 99.64%, 100.00%, 98.91%, 100.00%, 99.82%, 100.00%, 99.64%, 99.09%, 99.27%, 99.82%, 100.00%, 100.00%, 97.27%, 99.64%, 100.00%, 99.64%, 100.00%, 100.00%, 96.17%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 99.45%, 99.82%, 100.00%, 100.00%, 99.82%, 99.64%, 99.64%]

Classification accuracy (Test):
[44.26%, 42.08%, 38.80%, 46.45%, 46.45%, 36.61%, 50.82%, 36.61%, 55.19%, 45.36%, 55.19%, 56.83%, 56.83%, 45.90%, 49.73%, 46.99%, 55.19%, 45.36%, 52.46%, 51.91%, 40.98%, 53.01%, 62.84%, 39.89%, 46.99%, 45.36%, 65.03%, 41.53%, 40.98%, 50.27%, 44.81%, 38.