In [1]:
# Allows imports from other packages in the project
import sys
import os
# The notebook's working directory is assumed to sit one level below the
# project root, so the root is resolved as its parent directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))

# Guarded append: re-running this cell must not stack duplicate entries
# onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the S&P constituent metadata and key it by ticker symbol.
snp_info_path = "../data/snp_info.csv"

# The first CSV column is consumed as the initial index; "Symbol" then
# replaces it as the row index.
snp_info_raw = pd.read_csv(snp_info_path, index_col=0)
info = snp_info_raw.set_index("Symbol")

In [4]:
# Ticker grid: one column per sector, one row per asset slot within the sector.
stocks_by_sector_path = "../data/stocks_by_sector.csv"
stocks_by_sector_df = pd.read_csv(stocks_by_sector_path)

# Display the full grid (it is small) so the sector/ticker layout is visible.
stocks_by_sector_df

Unnamed: 0,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
0,T,AMZN,WMT,MRO,BAC,PFE,CSX,AAPL
1,GOOGL,F,KO,KMI,WFC,MRK,GE,AMD
2,GOOG,GM,KR,XOM,C,GILD,DAL,NVDA
3,CMCSA,EBAY,PG,HAL,RF,BMY,AAL,MU
4,VZ,SBUX,MDLZ,WMB,JPM,BSX,CPRT,MSFT
5,NFLX,NKE,MO,COP,KEY,ABT,LUV,INTC
6,DIS,M,MNST,SLB,MS,CVS,FAST,CSCO
7,IPG,MGM,COTY,DVN,HBAN,ABBV,CAT,HPE
8,EA,TJX,WBA,CVX,SCHW,JNJ,JCI,ORCL
9,NWSA,TGT,PM,MPC,SYF,MDT,UAL,AMAT


In [5]:
# Map every ticker to its (row, column) position inside stocks_by_sector_df,
# i.e. (asset slot within the sector, sector column).
ticker_to_idx = {}
for row_pos, grid_row in enumerate(stocks_by_sector_df.values):
    for col_pos, ticker in enumerate(grid_row):
        ticker_to_idx[ticker] = (row_pos, col_pos)

## Get Samples and Labels

In [6]:
# Feature matrix indexed by date. The CSV carries three header rows, which
# become a three-level column index (Sector / Asset / Metrics in the
# recorded output).
samples_path = "../data/samples_sector.csv"
samples = pd.read_csv(
    samples_path,
    header=[0, 1, 2],
    index_col=0,
    parse_dates=True,
)

# Preview only the first rows; the frame is very wide.
samples.head()

Sector,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,...,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_9,asset_9,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,26.540785,66.984964,97.740045,-2.259955,0.069637,0.016062,0.047704,14024206,37.272999,58.189602,...,0.004329,-16351600,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700
2016-01-25,26.435045,63.461389,78.712858,-21.287142,0.093537,0.032122,0.030928,-26045860,36.681,49.067155,...,-0.017172,-33145200,16.91,47.80152,38.129494,-61.870506,-0.07806,-0.114917,0.013789,-91721800
2016-01-26,26.73716,68.86132,92.129624,-7.870376,0.12882,0.052022,0.044248,29134620,36.689499,49.199084,...,-0.003958,-13416300,17.35,57.746426,63.025232,-36.974768,-0.035484,-0.09857,0.035821,-75935100
2016-01-27,26.797583,69.888271,86.610891,-13.389109,0.157869,0.073679,0.051571,95682832,35.879002,38.286812,...,0.024354,-33983900,17.09,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300
2016-01-28,26.835346,70.587786,88.7029,-11.2971,0.180559,0.095447,0.03586,130350978,37.415001,58.442415,...,0.015522,-18511100,16.68,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400


In [7]:
# Target matrix indexed by date, with one column per ticker (values are
# 0.0 / 1.0 in the recorded output).
labels_path = "../data/labels_sector.csv"
labels = pd.read_csv(labels_path, index_col="Date", parse_dates=True)

# Preview only the first rows.
labels.head()


Unnamed: 0_level_0,T,GOOGL,GOOG,CMCSA,VZ,NFLX,DIS,IPG,EA,NWSA,...,AAPL,AMD,NVDA,MU,MSFT,INTC,CSCO,HPE,ORCL,AMAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
2016-01-25,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-26,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-27,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-28,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Training and Prediction (Base)

In [8]:
from grtel.classification import MultiClassifier
from grtel.utils import print_scores, print_1_percentage, confusion_matrix_metrics

from sklearn.model_selection import train_test_split

In [9]:
# Features stay a DataFrame; targets become a plain 2-D array with one
# column per label column.
X = samples
y = np.array(labels)

# The split below pairs rows of X and y purely by position, so a length
# mismatch would silently misalign features and targets.
if len(X) != len(y):
    raise ValueError(
        f"samples and labels must have the same number of rows, got {len(X)} and {len(y)}"
    )

test_size = 0.25
is_random_split = False

if is_random_split:
    # Shuffled split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
else:
    # Ordered split: the first (1 - test_size) share of rows is used for
    # training and the remaining tail for testing, preserving row order.
    k = int(len(X) * (1. - test_size))
    # .iloc makes the positional row slicing explicit.
    X_train, X_test = X.iloc[:k], X.iloc[k:]
    y_train, y_test = y[:k], y[k:]

In [10]:
# Initialize the classifier
# One class per label column (y is 2-D: rows x label columns).
n_classes = y.shape[1]

# MultiClassifier comes from the project's grtel package; presumably it
# trains one underlying model per label column — TODO confirm against
# grtel.classification. The recorded run printed a counter 0..79 during fit.
multi_classifier = MultiClassifier(n_classes)
multi_classifier.fit(X_train, y_train)

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 


## Performance

In [11]:
# Grid dimensions: columns of the ticker grid are sectors, rows are asset slots.
num_assets, num_sectors = stocks_by_sector_df.shape


def _scores_to_grid(flat_scores):
    """Arrange a flat per-classifier score list like stocks_by_sector_df.

    The flat list is read sector by sector (num_sectors groups of
    num_assets values) and transposed so rows are asset slots and columns
    are sectors, labelled with the sector names.
    """
    by_sector = np.array(flat_scores).reshape(num_sectors, num_assets)
    return pd.DataFrame(by_sector.T, columns=stocks_by_sector_df.columns)


# train accuracy
train_score = multi_classifier.score(X_train, y_train)
train_accuracy_df = _scores_to_grid(train_score)

# test accuracy
test_score = multi_classifier.score(X_test, y_test)
test_accuracy_df = _scores_to_grid(test_score)

# confusion matrices: one 2x2 matrix per classifier, indexed [sector, asset]
conf_matrices = np.array(
    multi_classifier.confusion_matrices(X_test, y_test)
).reshape(num_sectors, num_assets, 2, 2)

In [12]:
stocks_to_analyze = ["WMT", "TGT", "PFE", "MSFT", "CVX", "DVN", "JPM", "AAPL", "NVDA"]

# Performance metrics: one column per analysed ticker, one row per metric.
performance_df = pd.DataFrame(
    columns=stocks_to_analyze,
    index=['train acc', 'test acc', 'precision', 'recall', 'downturn_precision', 'specificity'],
    dtype=float,
)
for ticker in stocks_to_analyze:
    # idx is (row, column) in the ticker grid, i.e. (asset slot, sector).
    idx = ticker_to_idx[ticker]

    # Single-step .loc[row, column] assignment. The chained form
    # performance_df[ticker][metric] = value writes to a temporary object
    # under pandas Copy-on-Write and would leave this table full of NaN.
    performance_df.loc['train acc', ticker] = train_accuracy_df.iloc[idx]
    performance_df.loc['test acc', ticker] = test_accuracy_df.iloc[idx]

    # conf_matrices is indexed [sector, asset], the reverse of idx.
    conf_matrix = conf_matrices[idx[1], idx[0]]
    accuracy, precision, recall, specificity, downturn_precision = confusion_matrix_metrics(conf_matrix)

    performance_df.loc['precision', ticker] = precision
    performance_df.loc['recall', ticker] = recall
    performance_df.loc['downturn_precision', ticker] = downturn_precision
    performance_df.loc['specificity', ticker] = specificity

performance_df

Unnamed: 0,WMT,TGT,PFE,MSFT,CVX,DVN,JPM,AAPL,NVDA
train acc,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
test acc,0.557377,0.464481,0.355191,0.453552,0.469945,0.459016,0.47541,0.502732,0.393443
precision,0.672131,0.496241,0.521739,0.627907,0.44898,0.428571,0.37931,0.52459,0.402878
recall,0.401961,0.680412,0.393443,0.243243,0.23913,0.453488,0.44,0.340426,0.666667
downturn_precision,0.5,0.38,0.186813,0.4,0.477612,0.48913,0.5625,0.491803,0.363636
specificity,0.753086,0.22093,0.278689,0.777778,0.703297,0.463918,0.5,0.674157,0.161616


In [13]:
# Scores: per-classifier accuracy on each split, printed under its heading.
score_sections = (
    ("Classification accuracy (Train):", X_train, y_train),
    ("Classification accuracy (Test):", X_test, y_test),
)
for heading, split_X, split_y in score_sections:
    print(heading)
    score = multi_classifier.score(split_X, split_y)
    print_scores(score)
    print()

# Share of positive labels in the test targets, for comparison with the
# accuracies above.
print("Percentage of 1s (Test):")
print_1_percentage(y_test, n_classes)
print()

Classification accuracy (Train):
[100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%, 100.00%]

Classification accuracy (Test):
[48.63%, 47.54%, 55.19%, 45.90%, 48.09%, 43.72%, 55.19%, 56.28%, 42.62%, 60.66%, 52.46%, 46.99%, 57.92%, 33.88%, 41.53%, 62.30%, 56.28%, 57.92%, 60.66%, 46.45%, 55.74%, 39.34%, 38.25%, 46.45%, 63.39%, 57.38%, 72.6