In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [2]:
def mydateparser(x):
    """Parse an ISO-style ``YYYY-MM-DD`` string into a ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) when ``x`` does not
    match that format.
    """
    return datetime.strptime(x, "%Y-%m-%d")
# Load the info table, using the CSV's first column as the row index.
# (Presumably S&P company metadata, judging by the file name — verify.)
# NOTE(review): `info` is not referenced in the cells visible here — confirm it is used further down.
info = pd.read_csv('../data/snp_info.csv', index_col=0)

In [3]:
# Tickers singled out for the per-stock results table.
# Sector annotations follow the `stocks` grid loaded in the next cell.
stocks_analysis = [
    'KO',    # Consumer Staples
    'TGT',   # Consumer Discretionary
    'PFE',   # Health Care
    'MSFT',  # Information Technology
    'CVX',   # Energy
    'DVN',   # Energy
    'DAL',   # Industrials
    'JPM',   # Financials
    'PEP',   # Consumer Staples
    'AAPL',  # Information Technology
]

In [4]:
# Load the ticker grid: one column per sector, one row per asset slot
# (the displayed output below shows 10 rows x 8 sector columns).
stocks = pd.read_csv('stocks_by_sector.csv')
# Display the full grid; it is small enough to show in one piece.
stocks

Unnamed: 0,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
0,T,F,KO,MRO,BAC,PFE,GE,AAPL
1,TWTR,GM,KR,KMI,WFC,MRK,DAL,AMD
2,FB,EBAY,PG,XOM,C,GILD,CSX,MU
3,CMCSA,SBUX,WMT,HAL,RF,BMY,AAL,MSFT
4,VZ,NKE,MDLZ,WMB,JPM,BSX,LUV,INTC
5,NFLX,M,MO,COP,KEY,ABT,FAST,CSCO
6,DIS,MGM,COTY,SLB,MS,CVS,CAT,HPE
7,ATVI,TJX,WBA,DVN,HBAN,ABBV,JCI,ORCL
8,IPG,TGT,PM,CVX,SCHW,JNJ,UAL,NVDA
9,DISCA,NWL,PEP,COG,SYF,MDT,UNP,AMAT


In [5]:
# Reverse lookup: ticker -> (row, column) position inside the `stocks` grid.
# Cells are visited row by row, so a ticker that appeared twice would keep
# its last position.
idx_dict = {
    stocks.iloc[row, col]: (row, col)
    for row in range(stocks.shape[0])
    for col in range(stocks.shape[1])
}

---------
Get Samples and Labels
---------

In [6]:
# Feature matrix: rows are indexed by the first CSV column, parsed as dates;
# columns form a three-level header built from the first three CSV rows
# (shown as Sector / Asset / Metrics in the output below).
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 in favour of
# `date_format` — confirm against the pinned pandas version.
samples = pd.read_csv(
    'samples_sector.csv',
    index_col=0,
    header=[0, 1, 2],
    parse_dates=True,
    date_parser=mydateparser,
)
samples.head()

Sector,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,...,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_9,asset_9,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,35.139999,66.984998,97.740062,-2.259938,0.092199,0.021266,0.047704,10592300,17.84,30.079617,...,-0.039824,-9242700,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700
2016-01-25,35.0,63.461411,78.712861,-21.287139,0.123844,0.042529,0.030928,-19672100,17.02,23.781919,...,-0.042453,-16010900,16.91,47.80152,38.129494,-61.870506,-0.07806,-0.114917,0.013789,-91721800
2016-01-26,35.400002,68.861369,92.129714,-7.870286,0.170558,0.068877,0.044248,22005000,17.01,23.71206,...,-0.049039,-10312300,17.35,57.746426,63.025232,-36.974768,-0.035484,-0.09857,0.035821,-75935100
2016-01-27,35.48,69.888295,86.610888,-13.389112,0.209019,0.097551,0.051571,72268000,16.780001,21.997274,...,-0.030759,-16104800,17.09,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300
2016-01-28,35.529999,70.587814,88.702907,-11.297093,0.23906,0.126372,0.03586,98452400,16.49,19.903236,...,-0.021625,-23024200,16.68,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400


In [7]:
# Target matrix: one 0/1 column per ticker (see the preview below), indexed
# by the parsed 'Date' column.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 in favour of
# `date_format` — confirm against the pinned pandas version.
labels = pd.read_csv(
    'labels_sector.csv',
    index_col='Date',
    parse_dates=True,
    date_parser=mydateparser,
)
labels.head()

Unnamed: 0_level_0,T,TWTR,FB,CMCSA,VZ,NFLX,DIS,ATVI,IPG,DISCA,...,AAPL,AMD,MU,MSFT,INTC,CSCO,HPE,ORCL,NVDA,AMAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2016-01-25,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-26,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-27,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-28,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


------------------------------------
Training and Prediction (Base)
------------------------------------

In [8]:
from GRTEL.classification import MultiClassifier
from GRTEL.utils import downturn_confidence, print_scores, print_1_percentage, confusion_matrix_metrics 

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Features stay a DataFrame; targets become a plain 2-D array with one
# column per label.
X = samples
y = np.array(labels)

# Ordered hold-out instead of a shuffled `train_test_split`: the leading
# (1 - test_size) share of rows is used for training and the trailing share
# for testing. Rows are assumed to be in date order — verify against the CSV.
test_size = 0.25
k = int(len(X) * (1. - test_size))

# Positional cut at row k for both features and targets.
X_train, y_train = X.iloc[:k], y[:k]
X_test, y_test = X.iloc[k:], y[k:]

In [10]:
# Initialise classifier
# One output per label column of `y` (y has shape (n_samples, n_labels)).
n_classes = y.shape[1]

# MultiClassifier comes from the project-local GRTEL package; presumably it
# wraps one binary classifier per label column — TODO confirm in GRTEL.classification.
multi_clf = MultiClassifier(n_classes)

# Train classifier on the training split only; the test split is held out for scoring.
multi_clf.fit(X_train, y_train)

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 


In [11]:
def _to_sector_grid(per_classifier_values):
    """Lay a flat per-classifier sequence out in the same grid as `stocks`.

    The flat sequence is reshaped to (n_sectors, n_assets) and transposed so
    the result has one row per asset slot and one column per sector, with the
    sector names of `stocks` as column labels.
    Assumes the values are ordered sector by sector, like the label columns —
    TODO confirm against MultiClassifier.score.
    """
    grid = np.array(per_classifier_values).reshape(stocks.shape[1], stocks.shape[0])
    frame = pd.DataFrame(grid).T
    frame.columns = stocks.columns
    return frame


# train accuracy
train_score = multi_clf.score(X_train, y_train)
train_accuracy_df = _to_sector_grid(train_score)

# test accuracy
test_score = multi_clf.score(X_test, y_test)
test_accuracy_df = _to_sector_grid(test_score)

# confusion matrices
# Kept in (sector, asset, 2, 2) layout, i.e. NOT transposed like the accuracy
# frames above, so they are indexed with the (row, col) pair swapped below.
conf_matrices = multi_clf.confusion_matrices(X_test, y_test)
conf_matrices = np.array(conf_matrices).reshape(stocks.shape[1], stocks.shape[0], 2, 2)


## Print performance results
results = pd.DataFrame(
    columns=stocks_analysis,
    index=['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity'],
    dtype=float,
)
for stock in stocks_analysis:
    # (row, col) position of this ticker inside the `stocks` grid.
    idx = idx_dict[stock]

    # Write with a single .loc[row_label, column_label] call. The previous
    # `results[stock]['train acc'] = ...` form is chained assignment, which
    # can end up writing to a temporary object and leave `results` unchanged
    # (always the case with pandas Copy-on-Write enabled).
    results.loc['train acc', stock] = train_accuracy_df.iloc[idx]
    results.loc['test acc', stock] = test_accuracy_df.iloc[idx]

    # conf_matrices is indexed [sector, asset], hence the swapped order.
    conf_matrix = conf_matrices[idx[1], idx[0]]
    accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    results.loc['precision', stock] = precision
    results.loc['recall', stock] = recall
    results.loc['downturn_precision', stock] = downturn_precision
    results.loc['specificity', stock] = specificity

results

Unnamed: 0,KO,TGT,PFE,MSFT,CVX,DVN,DAL,JPM,PEP,AAPL
train acc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
test acc,0.540984,0.453552,0.469945,0.480874,0.464481,0.486339,0.661202,0.530055,0.502732,0.519126
precision,0.683333,0.489655,0.650602,0.611111,0.48,0.439394,0.689076,0.44086,0.5625,0.583333
recall,0.386792,0.731959,0.442623,0.396396,0.782609,0.337209,0.766355,0.546667,0.445545,0.223404
downturn_precision,0.471545,0.315789,0.32,0.396396,0.393939,0.512821,0.609375,0.622222,0.456311,0.503401
specificity,0.753247,0.139535,0.52459,0.611111,0.142857,0.618557,0.513158,0.518519,0.573171,0.831461


In [12]:
# Scores: per-classifier accuracy on each split, then the share of positive
# labels in the test split as a baseline to read the accuracies against.
for header, features, targets in (
    ("\nClassification accuracy (Train):", X_train, y_train),
    ("Classification accuracy (Test):", X_test, y_test),
):
    # Score first, then print, matching the original statement order.
    score = multi_clf.score(features, targets)
    print(header)
    print_scores(score)
    print()

print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes)
print()


Classification accuracy (Train):
[100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%]

Classification accuracy (Test):
[53.55%, 56.28%, 53.55%, 39.34%, 53.01%, 42.62%, 56.28%, 46.99%, 48.63%, 51.37%, 63.39%, 53.01%, 42.62%, 59.02%, 61.20%, 49.18%, 60.11%, 59.02%, 45.36%, 49.73%, 54.10%, 52.46%, 51.37%, 44.81%, 60.11%, 45.90%, 57.