In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [2]:
def mydateparser(x):
    """Parse a ``YYYY-MM-DD`` date string into a ``datetime`` object.

    Raises ``ValueError`` (from ``datetime.strptime``) if ``x`` does not
    match the ``%Y-%m-%d`` format.
    """
    return datetime.strptime(x, "%Y-%m-%d")
# Load '../data/snp_info.csv'; its first column becomes the row index.
info = pd.read_csv(
    '../data/snp_info.csv',
    index_col=0,
)

In [3]:
# Tickers singled out for analysis (one per line for easier diffing).
stocks_analysis = [
    'KO',
    'TGT',
    'PFE',
    'MSFT',
    'CVX',
    'DVN',
    'DAL',
    'JPM',
    'PEP',
    'AAPL',
]

In [4]:
# Table of tickers: one column per sector, one row per asset slot.
# The CSV's own index column is not consumed here, so it shows up as 'Unnamed: 0'.
stocks = pd.read_csv(
    'stocks_by_sector.csv',
)
stocks

Unnamed: 0,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
0,T,F,KO,MRO,BAC,PFE,GE,AAPL
1,TWTR,GM,KR,KMI,WFC,MRK,DAL,AMD
2,FB,EBAY,PG,XOM,C,GILD,CSX,MU
3,CMCSA,SBUX,WMT,HAL,RF,BMY,AAL,MSFT
4,VZ,NKE,MDLZ,WMB,JPM,BSX,LUV,INTC
5,NFLX,M,MO,COP,KEY,ABT,FAST,CSCO
6,DIS,MGM,COTY,SLB,MS,CVS,CAT,HPE
7,ATVI,TJX,WBA,DVN,HBAN,ABBV,JCI,ORCL
8,IPG,TGT,PM,CVX,SCHW,JNJ,UAL,NVDA
9,DISCA,NWL,PEP,COG,SYF,MDT,UNP,AMAT


In [5]:
# Map every cell value of `stocks` to its (row, column) position.
# Iteration is row-major, so a repeated value keeps its LAST position.
# NOTE(review): `stocks` still contains the 'Unnamed: 0' column, so the row
# numbers 0..9 also become keys and sector columns start at index 1 -- confirm
# this offset is what downstream code expects.
idx_dict = {
    stocks.iloc[row, col]: (row, col)
    for row in range(len(stocks.index))
    for col in range(len(stocks.columns))
}

---------
Get Samples and Labels
---------

In [6]:
# Feature table: rows are dates, columns are a 3-level header
# (Sector / Asset / Metrics) as shown in the rendered output below.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 in favour of
# `date_format`; left untouched to stay compatible with the pinned environment.
samples = pd.read_csv(
    'samples_sector.csv',
    index_col=0,
    header=[0, 1, 2],
    parse_dates=True,
    date_parser=mydateparser,
)
samples.head()

Sector,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,...,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_9,asset_9,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,35.139999,66.984998,97.740062,-2.259938,0.092199,0.021266,0.047704,10592300,17.84,30.079617,...,-0.039824,-9242700,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700
2016-01-25,35.0,63.461411,78.712861,-21.287139,0.123844,0.042529,0.030928,-19672100,17.02,23.781919,...,-0.042453,-16010900,16.91,47.80152,38.129494,-61.870506,-0.07806,-0.114917,0.013789,-91721800
2016-01-26,35.400002,68.861369,92.129714,-7.870286,0.170558,0.068877,0.044248,22005000,17.01,23.71206,...,-0.049039,-10312300,17.35,57.746426,63.025232,-36.974768,-0.035484,-0.09857,0.035821,-75935100
2016-01-27,35.48,69.888295,86.610888,-13.389112,0.209019,0.097551,0.051571,72268000,16.780001,21.997274,...,-0.030759,-16104800,17.09,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300
2016-01-28,35.529999,70.587814,88.702907,-11.297093,0.23906,0.126372,0.03586,98452400,16.49,19.903236,...,-0.021625,-23024200,16.68,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400


In [7]:
# Binary label table: indexed by 'Date', one column per ticker.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 in favour of
# `date_format`; left untouched to stay compatible with the pinned environment.
labels = pd.read_csv(
    'labels_sector.csv',
    index_col='Date',
    parse_dates=True,
    date_parser=mydateparser,
)
labels.head()

Unnamed: 0_level_0,T,TWTR,FB,CMCSA,VZ,NFLX,DIS,ATVI,IPG,DISCA,...,AAPL,AMD,MU,MSFT,INTC,CSCO,HPE,ORCL,NVDA,AMAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2016-01-25,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-26,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-27,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-28,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


------------------------------------
Tensor Ensemble Learning
------------------------------------

In [8]:
from GRTEL.decomposition import GLTD
from GRTEL.utils import downturn_confidence, print_scores, print_1_percentage
from GRTEL.classification import GRTEL

from hottbox.core import Tensor, TensorTKD
from hottbox.pdtools import pd_to_tensor
from hottbox.algorithms.decomposition import HOSVD, HOOI
from hottbox.utils.generation import residual_tensor
from hottbox.algorithms.classification import TelVI

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Build one tensor per date (row of `samples`). The column levels are
# reordered so that 'Sector' becomes the third mode of each tensor.
X = [
    pd_to_tensor(
        samples.iloc[row].reorder_levels(['Metrics', 'Asset', 'Sector'])
    )
    for row in range(len(samples))
]

# Label matrix as a plain NumPy array (one row per date).
y = np.array(labels)

# Sanity check: show the first tensor and its label vector.
print(X[0], '\n\n', y[0])

This tensor is of order 3 and consists of 640 elements.
Sizes and names of its modes are (8, 10, 8) and ['Metrics', 'Asset', 'Sector'] respectively. 

 [1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 0. 0. 1. 1. 1. 0. 1.]


In [10]:
# Represent each sample in Tucker form and store it in a list
# Multilinear rank used for the Tucker representation of every sample.
rank = (4,3,3)

# Decomposition algorithm (alternative that was tried: algo = GLTD()).
algo = HOOI()

# Represent each sample in Tucker form and collect the results in a list.
X_tk = []
for sample in X:
    X_tk.append(algo.decompose(sample, rank=rank))

# Shuffled 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_tk,
    y,
    test_size=0.25,
    random_state=42,
)

# Alternative (disabled): chronological split that keeps the last 25% as test.
# test_size = 0.25
# k = int(len(X_tk) * (1. - test_size))
# X_train, X_test = X_tk[:k], X_tk[k:]
# y_train, y_test = y[:k], y[k:]

  S = np.sqrt(S)


In [11]:
# Initialise classifier
# --- Initialise classifier -------------------------------------------------
# Number of base classifiers required per class (sum of the Tucker rank entries).
R = np.sum(rank)
# A 1-D label array means a single class; otherwise one class per label column.
n_classes = 1 if y.ndim == 1 else y.shape[1]

# One list of R decision trees per class.
# The trees are seeded (same seed as the train/test split) so that repeated
# runs of this cell give the same fitted models and the same scores.
base_clfs = [
    [DecisionTreeClassifier(random_state=42) for _ in range(R)]
    for _ in range(n_classes)
]

grtel = GRTEL(base_clfs=base_clfs,
              n_classes=n_classes,
              probability=True,
              verbose=False)


# --- Train classifier ------------------------------------------------------
grtel.fit(X_train, y_train)


# --- Scores ----------------------------------------------------------------
# `print_scores` is given a list, so a single-class score is wrapped in one.
score = grtel.score(X_train, y_train)
score = score if n_classes > 1 else [score]
print("\nClassification accuracy (Train):")
print_scores(score); print()

score = grtel.score(X_test, y_test)
score = score if n_classes > 1 else [score]
print("Classification accuracy (Test):")
print_scores(score); print()

# Share of positive labels in the test set, for comparison with the accuracy above.
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes); print()

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 


AttributeError: 'Series' object has no attribute 'as_matrix'

In [12]:
# Per-class downturn confidence on the TEST set: compare the true labels of
# class `cls` with the predictions of that class's fitted model.
results = [
    downturn_confidence(y_test[:, cls], grtel.models[cls].predict(X_test))
    for cls in range(n_classes)
]
results

[(104, 82, 0.7884615384615384),
 (71, 48, 0.676056338028169),
 (59, 42, 0.711864406779661),
 (65, 45, 0.6923076923076923),
 (101, 76, 0.7524752475247525),
 (57, 46, 0.8070175438596491),
 (84, 64, 0.7619047619047619),
 (73, 59, 0.8082191780821918),
 (109, 76, 0.6972477064220184),
 (128, 82, 0.640625),
 (123, 82, 0.6666666666666666),
 (105, 73, 0.6952380952380952),
 (87, 56, 0.6436781609195402),
 (88, 65, 0.7386363636363636),
 (103, 71, 0.6893203883495146),
 (115, 78, 0.6782608695652174),
 (62, 45, 0.7258064516129032),
 (82, 58, 0.7073170731707317),
 (94, 66, 0.7021276595744681),
 (96, 66, 0.6875),
 (71, 56, 0.7887323943661971),
 (96, 70, 0.7291666666666666),
 (70, 49, 0.7),
 (52, 37, 0.7115384615384616),
 (79, 65, 0.8227848101265823),
 (89, 67, 0.7528089887640449),
 (140, 93, 0.6642857142857143),
 (90, 71, 0.7888888888888889),
 (66, 58, 0.8787878787878788),
 (69, 54, 0.782608695652174),
 (97, 71, 0.7319587628865979),
 (116, 84, 0.7241379310344828),
 (79, 59, 0.7468354430379747),
 (92, 7

In [14]:
# Per-class downturn confidence on the TRAIN set: compare the true labels of
# class `cls` with the predictions of that class's fitted model.
results = [
    downturn_confidence(y_train[:, cls], grtel.models[cls].predict(X_train))
    for cls in range(n_classes)
]
results

[(267, 267, 1.0),
 (256, 256, 1.0),
 (223, 223, 1.0),
 (235, 235, 1.0),
 (259, 259, 1.0),
 (228, 228, 1.0),
 (255, 255, 1.0),
 (240, 240, 1.0),
 (272, 272, 1.0),
 (288, 288, 1.0),
 (304, 304, 1.0),
 (291, 291, 1.0),
 (254, 254, 1.0),
 (248, 248, 1.0),
 (267, 267, 1.0),
 (286, 286, 1.0),
 (228, 228, 1.0),
 (241, 241, 1.0),
 (267, 267, 1.0),
 (280, 280, 1.0),
 (233, 233, 1.0),
 (279, 279, 1.0),
 (238, 238, 1.0),
 (222, 222, 1.0),
 (249, 249, 1.0),
 (265, 265, 1.0),
 (327, 327, 1.0),
 (246, 246, 1.0),
 (234, 234, 1.0),
 (246, 246, 1.0),
 (281, 281, 1.0),
 (280, 280, 1.0),
 (265, 265, 1.0),
 (280, 280, 1.0),
 (275, 275, 1.0),
 (258, 258, 1.0),
 (298, 298, 1.0),
 (291, 291, 1.0),
 (245, 245, 1.0),
 (257, 257, 1.0),
 (230, 230, 1.0),
 (272, 272, 1.0),
 (254, 254, 1.0),
 (241, 241, 1.0),
 (235, 235, 1.0),
 (255, 255, 1.0),
 (250, 250, 1.0),
 (234, 234, 1.0),
 (236, 236, 1.0),
 (280, 280, 1.0),
 (242, 242, 1.0),
 (238, 238, 1.0),
 (281, 281, 1.0),
 (239, 239, 1.0),
 (197, 197, 1.0),
 (214, 214

-------
Grid Search
------

In [15]:
# --- Hyperparameter grid for every base decision tree -----------------------
# 'auto' is intentionally not listed: for DecisionTreeClassifier it was only an
# alias of 'sqrt' (deprecated in scikit-learn 1.1, removed in 1.3), so keeping
# it would duplicate the 'sqrt' candidates or fail on current scikit-learn.
max_features = ['sqrt', None, 'log2']
# Depths 10, 20, ..., 60 plus None (no depth limit).
max_depth = list(range(10, 70, 10))
max_depth.append(None)
min_samples_split = [2, 5, 10, 20, 30]
min_samples_leaf = [1, 3, 5, 7, 12, 14]

search_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# The same grid is used for each of the R base classifiers of a class.
search_params = [search_grid for _ in range(R)]

print("\tPerforming grid search for each base classifer and for each class")
grtel.grid_search(X_train, y_train, search_params)

# Refit after the search.
# NOTE(review): assumes grid_search stores the selected hyperparameters on the
# base classifiers so this fit uses them -- confirm in GRTEL.
print("\tTrain base classifiers with optimal hyperparameters")
grtel.fit(X_train, y_train); print()

# `print_scores` is given a list, so a single-class score is wrapped in one.
score = grtel.score(X_train, y_train)
score = score if n_classes > 1 else [score]
print("Classification accuracy (Train):")
print_scores(score); print()

score = grtel.score(X_test, y_test)
score = score if n_classes > 1 else [score]
print("Classification accuracy (Test):")
print_scores(score); print()

# Share of positive labels in the test set, for comparison with the accuracy above.
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes); print()

	Performing grid search for each base classifer and for each class
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 
	Train base classifiers with optimal hyperparameters
0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 

Classification accuracy (Train):
[100.00%, 99.64%, 99.64%, 100.00%, 99.82%, 99.82%, 100.00%, 100.

TypeError: print_1_percentage() missing 1 required positional argument: 'n_classes'

In [16]:
# Per-class downturn confidence on the TEST set after the grid search:
# true labels of class `cls` versus that class's model predictions.
results = [
    downturn_confidence(y_test[:, cls], grtel.models[cls].predict(X_test))
    for cls in range(n_classes)
]
results

[(98, 75, 0.7653061224489796),
 (74, 42, 0.5675675675675675),
 (56, 37, 0.6607142857142857),
 (70, 45, 0.6428571428571429),
 (110, 79, 0.7181818181818181),
 (73, 49, 0.6712328767123288),
 (86, 61, 0.7093023255813954),
 (68, 49, 0.7205882352941176),
 (122, 81, 0.6639344262295082),
 (137, 84, 0.6131386861313869),
 (130, 79, 0.6076923076923076),
 (105, 80, 0.7619047619047619),
 (87, 59, 0.6781609195402298),
 (102, 73, 0.7156862745098039),
 (92, 59, 0.6413043478260869),
 (131, 83, 0.6335877862595419),
 (62, 41, 0.6612903225806451),
 (86, 60, 0.6976744186046512),
 (99, 62, 0.6262626262626263),
 (125, 74, 0.592),
 (86, 59, 0.686046511627907),
 (122, 75, 0.6147540983606558),
 (62, 46, 0.7419354838709677),
 (44, 30, 0.6818181818181818),
 (79, 59, 0.7468354430379747),
 (82, 58, 0.7073170731707317),
 (143, 89, 0.6223776223776224),
 (95, 68, 0.7157894736842105),
 (52, 45, 0.8653846153846154),
 (66, 52, 0.7878787878787878),
 (113, 74, 0.6548672566371682),
 (120, 81, 0.675),
 (91, 63, 0.69230769230

In [17]:
# Per-class downturn confidence on the TRAIN set after the grid search:
# true labels of class `cls` versus that class's model predictions.
results = [
    downturn_confidence(y_train[:, cls], grtel.models[cls].predict(X_train))
    for cls in range(n_classes)
]
results

[(270, 266, 0.9851851851851852),
 (260, 250, 0.9615384615384616),
 (228, 218, 0.956140350877193),
 (235, 235, 1.0),
 (267, 259, 0.9700374531835206),
 (228, 226, 0.9912280701754386),
 (256, 252, 0.984375),
 (239, 239, 1.0),
 (281, 272, 0.9679715302491103),
 (316, 287, 0.9082278481012658),
 (324, 304, 0.9382716049382716),
 (295, 291, 0.9864406779661017),
 (256, 252, 0.984375),
 (249, 248, 0.9959839357429718),
 (276, 261, 0.9456521739130435),
 (286, 286, 1.0),
 (229, 224, 0.9781659388646288),
 (243, 238, 0.9794238683127572),
 (267, 266, 0.9962546816479401),
 (328, 280, 0.8536585365853658),
 (229, 228, 0.9956331877729258),
 (314, 276, 0.8789808917197452),
 (239, 238, 0.99581589958159),
 (200, 197, 0.985),
 (243, 236, 0.9711934156378601),
 (265, 264, 0.9962264150943396),
 (337, 326, 0.9673590504451038),
 (247, 244, 0.9878542510121457),
 (233, 231, 0.9914163090128756),
 (245, 242, 0.9877551020408163),
 (302, 281, 0.9304635761589404),
 (296, 280, 0.9459459459459459),
 (271, 264, 0.97416974169