In [1]:
# Make packages that live under the project root importable from this notebook.
import sys
import os

# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to be run from a sub-directory of the project).
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd

In [3]:
# Load the sector -> ticker table: one column per sector, one ticker per row.
stocks_by_sector_df = pd.read_csv("../data/stocks_by_sector.csv")
stocks_by_sector_df

Unnamed: 0,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
0,T,AMZN,WMT,MRO,BAC,PFE,CSX,AAPL
1,GOOGL,F,KO,KMI,WFC,MRK,GE,AMD
2,GOOG,GM,KR,XOM,C,GILD,DAL,NVDA
3,CMCSA,EBAY,PG,HAL,RF,BMY,AAL,MU
4,VZ,SBUX,MDLZ,WMB,JPM,BSX,CPRT,MSFT
5,NFLX,NKE,MO,COP,KEY,ABT,LUV,INTC
6,DIS,M,MNST,SLB,MS,CVS,FAST,CSCO
7,IPG,MGM,COTY,DVN,HBAN,ABBV,CAT,HPE
8,EA,TJX,WBA,CVX,SCHW,JNJ,JCI,ORCL
9,NWSA,TGT,PM,MPC,SYF,MDT,UAL,AMAT


## Get the data and the indicators

Stock data is collected with `yfinance` for dates `2016-01-02 - 2019-01-01` (3 years). Certain indicators are then calculated for analysis. These indicators are:

- Relative Strength Index (RSI)
- Stochastic Oscillator
- Williams %R
- Moving Average Convergence Divergence (MACD)
- Price Rate Of Change
- On Balance Volume

A new dataframe with columns `['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume']` is then created to represent a single stock.

These per-stock dataframes are then concatenated column-wise to obtain the resulting samples dataframe of shape `(T, NM)`, where `T` is the number of time steps, `N` the number of assets, and `M` the number of indicators for each asset.

Asset names are standardized (e.g. asset_1) for tensorization.

Note that the labels are set according to the price change `n` days-out (set to be 9 below).

In [4]:
from grtel.indicators import get_indicators

In [5]:
# Feature columns kept for every asset, in the order they appear in the output.
feature_columns = ['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume']

samples = []
labels = []
for sector, tickers in stocks_by_sector_df.items():
    # Asset numbering restarts at 1 inside every sector.
    for position, ticker in enumerate(tickers, start=1):
        indicators_df = get_indicators(ticker)

        # Features: tag each metric column with (sector, standardized asset name, metric).
        features = indicators_df[feature_columns].copy()
        features.columns = pd.MultiIndex.from_product(
            [[sector], [f'asset_{position}'], features.columns]
        )

        # Labels: keep the real ticker symbol as the column name.
        target = indicators_df['Prediction'].copy()
        target.name = ticker

        samples.append(features)
        labels.append(target)

# Side-by-side concatenation: one block of feature columns per asset.
samples_df = pd.concat(samples, axis=1)
samples_df.columns.names = ['Sector', 'Asset', 'Metrics']

# One label column per ticker.
labels_df = pd.concat(labels, axis=1)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [6]:
# Preview the feature frame; its columns are a 3-level index (Sector / Asset / Metrics).
samples_df

Sector,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,...,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_9,asset_9,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,26.540785,66.984964,97.740045,-2.259955,0.069637,0.016062,0.047704,14024206,37.272999,58.189602,...,0.004329,-16351600,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700
2016-01-25,26.435045,63.461389,78.712858,-21.287142,0.093537,0.032122,0.030928,-26045860,36.681000,49.067155,...,-0.017172,-33145200,16.910000,47.801520,38.129494,-61.870506,-0.078060,-0.114917,0.013789,-91721800
2016-01-26,26.737160,68.861320,92.129624,-7.870376,0.128820,0.052022,0.044248,29134620,36.689499,49.199084,...,-0.003958,-13416300,17.350000,57.746426,63.025232,-36.974768,-0.035484,-0.098570,0.035821,-75935100
2016-01-27,26.797583,69.888271,86.610891,-13.389109,0.157869,0.073679,0.051571,95682832,35.879002,38.286812,...,0.024354,-33983900,17.090000,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300
2016-01-28,26.835346,70.587786,88.702900,-11.297100,0.180559,0.095447,0.035860,130350978,37.415001,58.442415,...,0.015522,-18511100,16.680000,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,22.432024,36.639805,18.662003,-81.337997,-0.177166,-0.180185,-0.024310,35111028,53.082500,47.241232,...,-0.020658,682301800,34.020000,39.676060,16.000019,-83.999981,-0.062314,0.159786,-0.055000,597071100
2018-12-12,22.779455,46.469092,34.859142,-65.140858,-0.175076,-0.179164,-0.018868,89267527,53.686501,52.226371,...,-0.018054,703705800,34.200001,41.599638,14.450871,-85.549129,-0.131203,0.101588,-0.078416,608966600
2018-12-13,22.590633,42.349233,26.056326,-73.943674,-0.186507,-0.180632,-0.021590,24262040,53.676998,52.136943,...,-0.013988,680410400,33.709999,37.812369,5.009603,-94.990397,-0.222768,0.036717,-0.077449,599290800
2018-12-14,22.824774,48.839002,36.971882,-63.028118,-0.174659,-0.179438,-0.032650,80851654,52.585499,42.493301,...,-0.044299,658976100,32.650002,30.810807,0.332786,-99.667214,-0.376527,-0.045932,-0.124195,588467900


In [7]:
# Preview the label frame; one column per ticker, named after the ticker symbol.
labels_df

Unnamed: 0_level_0,T,GOOGL,GOOG,CMCSA,VZ,NFLX,DIS,IPG,EA,NWSA,...,AAPL,AMD,NVDA,MU,MSFT,INTC,CSCO,HPE,ORCL,AMAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
2016-01-25,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-26,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-27,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-28,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Persist both frames for later use.
# NOTE: samples_df has three column levels (Sector / Asset / Metrics), so its CSV
# is written with a multi-row header; when loading it back, presumably
# `pd.read_csv(..., header=[0, 1, 2], index_col=0)` is needed - verify in the consumer.
samples_df.to_csv('../data/samples_sector.csv')
labels_df.to_csv('../data/labels_sector.csv')