In [1]:
# Allows imports from other packages in the project
import sys
import os
# Resolve the directory one level above the kernel's working directory and
# expose it on the import path so project packages can be imported.
# NOTE(review): this relies on the kernel being started from the notebook's
# own folder — confirm if notebooks are ever launched from elsewhere.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard the append: re-running this cell must not stack duplicate entries
# on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd

In [3]:
# Load the cluster table: one column per cluster, one ticker symbol per row.
clusters_csv = '../data/stocks_by_cluster.csv'
stocks_by_cluster_df = pd.read_csv(clusters_csv)
# Bare last expression so the frame is rendered as the cell output.
stocks_by_cluster_df

Unnamed: 0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8
0,TGT,PFE,DVN,BAC,MSFT,SLB,AMAT,GOOG
1,TJX,ABBV,MRO,JPM,ABT,CVX,MU,GOOGL
2,WBA,JNJ,XOM,MS,MGM,HAL,NVDA,AMZN
3,WMT,GILD,MPC,C,ORCL,COP,AAPL,BSX
4,CVS,BMY,WMB,RF,EBAY,KMI,NFLX,MDLZ


## Get the data and the indicators

Stock data is collected with `yfinance` for dates `2016-01-02 - 2019-01-01` (3 years). Certain indicators are then calculated for analysis. These indicators are:

- Relative Strength Index (RSI)
- Stochastic Oscillator
- Williams %R
- Moving Average Convergence Divergence (MACD)
- Price Rate Of Change
- On Balance Volume

A new dataframe with columns `['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume']` is then created to represent a single stock.

These dataframes are then concatenated column-wise to obtain the resulting samples dataframe of shape `(T, NM)`, where `T` is the number of time steps, `N` the number of assets, and `M` the number of indicators for each asset.

Asset names are standardized (e.g. asset_1) for tensorization.

Note that the labels are set according to the price change `n` days-out (set to be 9 below).

In [4]:
from grtel.indicators import get_indicators

In [5]:
# Indicator columns kept for every asset, in the order they appear in the
# final samples frame.
FEATURE_COLUMNS = ['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume']

samples = []
labels = []
for cluster, tickers in stocks_by_cluster_df.items():
    # Asset numbering restarts at 1 inside each cluster, so every cluster
    # exposes the same standardized asset names (asset_1, asset_2, ...).
    for position, ticker in enumerate(tickers, start=1):
        asset_name = f'asset_{position}'

        indicators = get_indicators(ticker)

        # Feature block for this ticker, re-keyed with a three-level column
        # index of (cluster, asset, metric).
        features = indicators[FEATURE_COLUMNS].copy()
        width = len(features.columns)
        features.columns = pd.MultiIndex.from_arrays([
            [cluster] * width,
            [asset_name] * width,
            features.columns,
        ])
        samples.append(features)

        # The label series comes from the 'Prediction' column and is named
        # after the real ticker rather than the standardized asset name.
        labels.append(indicators['Prediction'].rename(ticker))

# Stitch the per-ticker pieces side by side; rows are aligned on the index.
samples_df = pd.concat(samples, axis=1)
samples_df.columns.names = ['Cluster', 'Asset', 'Metrics']

labels_df = pd.concat(labels, axis=1)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [6]:
# Render the assembled feature frame; its columns carry the three levels
# named 'Cluster', 'Asset' and 'Metrics'.
samples_df

Cluster,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,...,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_4,asset_4,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,69.720001,40.841917,34.171927,-65.828073,-0.609793,-0.426234,-0.023256,-28661600,34.474998,52.461434,...,0.024096,-48541400,41.599998,49.475466,40.251530,-59.748470,-0.223801,-0.216184,0.014882,-36938000
2016-01-25,68.919998,36.140205,25.786151,-74.213849,-0.624351,-0.467303,-0.048329,-33973800,34.380001,50.599646,...,0.023351,-57890800,40.200001,38.612846,10.901476,-89.098524,-0.263040,-0.225897,-0.035509,-50403200
2016-01-26,70.440002,49.009156,41.719109,-58.280891,-0.535363,-0.481309,-0.039804,-29921200,34.845001,58.848049,...,-0.023047,-68694300,41.349998,49.186828,37.193741,-62.806259,-0.219761,-0.224634,-0.018281,-42275700
2016-01-27,70.690002,50.887399,44.339654,-55.660346,-0.445382,-0.473958,-0.013261,-25959800,34.595001,53.324693,...,-0.018803,-78069200,41.119999,47.306391,42.857094,-57.142906,-0.198261,-0.219238,0.013057,-50018100
2016-01-28,70.989998,53.270660,54.381726,-45.618274,-0.352089,-0.449137,0.008237,-21886500,34.695000,55.262643,...,-0.069383,-86157700,41.549999,51.320984,65.614033,-34.385967,-0.152250,-0.205594,0.004594,-38397600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,66.779999,26.379582,10.806684,-89.193316,-3.875616,-3.644456,-0.064444,17665000,45.730000,34.577989,...,0.036869,166023400,44.209999,51.426356,48.333296,-51.666704,0.350623,0.456104,-0.007409,-195223100
2018-12-12,67.790001,33.742171,26.179624,-73.820376,-3.800247,-3.675614,-0.056900,23116300,46.099998,38.569402,...,0.012278,173017100,44.070000,47.964201,42.499992,-57.500008,0.314939,0.427871,-0.007879,-202785800
2018-12-13,66.680000,29.944648,10.778461,-89.221539,-3.786437,-3.697779,-0.063088,17249500,45.810001,36.552604,...,-0.001620,167946300,44.349998,54.961391,54.166600,-45.833400,0.305729,0.403442,-0.004042,-196683800
2018-12-14,67.169998,33.742858,18.113758,-81.886242,-3.693379,-3.696899,-0.053410,23721200,45.220001,32.556282,...,-0.049642,161123800,43.549999,38.080230,20.833320,-79.166680,0.231211,0.368996,-0.031792,-203185800


In [7]:
# Render the label frame; it has one column per ticker symbol.
labels_df

Unnamed: 0_level_0,TGT,TJX,WBA,WMT,CVS,PFE,ABBV,JNJ,GILD,BMY,...,AMAT,MU,NVDA,AAPL,NFLX,GOOG,GOOGL,AMZN,BSX,MDLZ
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-25,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-26,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-27,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-28,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-12,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-13,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-14,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Persist both frames as CSV under ../data: features first, then labels.
for frame, destination in (
    (samples_df, '../data/samples_cluster.csv'),
    (labels_df, '../data/labels_cluster.csv'),
):
    frame.to_csv(destination)