In [1]:
# Allows imports from other packages in the project
import sys
import os
# Resolve the project root as the parent of the notebook's working directory
# and expose it on sys.path so project packages can be imported from here.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard the append: re-running this cell in a live kernel would otherwise
# keep adding duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd

In [3]:
# Load the ticker table: each column holds the tickers assigned to one cluster.
cluster_table_path = '../data/stocks_by_cluster.csv'
stocks_by_cluster_df = pd.read_csv(cluster_table_path)
stocks_by_cluster_df

Unnamed: 0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8
0,AMAT,DVN,MSFT,JPM,CVX,GOOG,PFE,TGT
1,MU,MRO,ABT,BAC,SLB,GOOGL,ABBV,TJX
2,NVDA,HAL,MGM,MS,COP,AMZN,JNJ,WBA
3,AAPL,XOM,ORCL,C,KMI,BSX,GILD,WMT
4,NFLX,MPC,EBAY,RF,F,MDLZ,BMY,CVS


## Get the data and the indicators

We collect the stock data from yfinance for the dates between '2016-01-02' and '2019-01-01' (3 years) and calculate the following indicators, which are helpful for analysis:

Relative Strength Index (RSI)
Stochastic Oscillator
Williams %R
Moving Average Convergence Divergence (MACD)
Price Rate Of Change
On Balance Volume
After calculating these indicators for each day, we add them to the dataframe of the stock as new columns. We select the columns ['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume'] at the end to obtain a new dataframe representing the stock.

We do this for each of the stocks we have selected and concatenate the individual dataframes to obtain a resulting dataframe of shape (T, N×M), where T is the number of time steps, N is the number of assets, and M is the number of indicators for each asset.

We also standardize the names of individual assets (e.g. asset_1) for tensorization.

Also note that the labels are set according to the price change a fixed number of days out (set to be 9 below).

In [4]:
from grtel.indicators import get_rsi, get_stochastic_oscillator, get_williams, get_macd, get_obv, get_indicators

In [5]:
# Build one feature frame and one label series per ticker, then join them
# column-wise into `samples_df` (features) and `labels_df` (labels).
samples, labels = [], []

# Columns kept from each ticker's indicator frame, in output order.
feature_columns = ['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume']

for cluster, cluster_tickers in stocks_by_cluster_df.items():
    for position, ticker in enumerate(cluster_tickers, start=1):
        # Assets are named by their 1-based position within the cluster.
        asset_name = f'asset_{position}'

        indicator_frame = get_indicators(ticker)

        # Features: relabel the selected columns with a three-level header
        # (cluster, asset, metric) so every ticker gets a uniform layout.
        features = indicator_frame[feature_columns].copy()
        width = len(features.columns)
        features.columns = pd.MultiIndex.from_arrays(
            [[cluster] * width, [asset_name] * width, features.columns]
        )
        samples.append(features)

        # Labels: the 'Prediction' column, named after the ticker itself.
        labels.append(indicator_frame['Prediction'].rename(ticker))

samples_df = pd.concat(samples, axis=1)
samples_df.columns = samples_df.columns.set_names(['Cluster', 'Asset', 'Metrics'])

labels_df = pd.concat(labels, axis=1)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [6]:
# Display the combined feature frame; its columns carry the
# (Cluster, Asset, Metrics) levels assigned when it was built.
samples_df

Cluster,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,...,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_4,asset_4,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5,asset_5
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700,11.070000,32.955810,...,-0.013377,120042900,95.639999,54.832811,68.541642,-31.458358,-0.050732,-0.090357,0.022232,-1321000
2016-01-25,16.910000,47.801520,38.129494,-61.870506,-0.078060,-0.114917,0.013789,-91721800,10.480000,27.248351,...,-0.011990,158513100,94.070000,43.171986,35.833336,-64.166664,-0.091800,-0.090656,-0.008955,-7055900
2016-01-26,17.350000,57.746426,63.025232,-36.974768,-0.035484,-0.098570,0.035821,-75935100,10.550000,28.933255,...,0.005973,186836700,94.040001,42.970516,39.486050,-60.513950,-0.123628,-0.097442,-0.017243,-11584900
2016-01-27,17.090000,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300,10.390000,27.267746,...,0.032784,156193800,94.339996,45.884517,46.495290,-53.504710,-0.127802,-0.103654,0.002444,-7679300
2016-01-28,16.680000,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400,9.890000,22.580823,...,0.018395,190028700,93.489998,39.316729,26.635507,-73.364493,-0.183022,-0.119819,-0.021150,-11754000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,34.020000,39.676060,16.000019,-83.999981,-0.062314,0.159786,-0.055000,597071100,35.209999,37.702548,...,-0.012521,739386900,73.410004,25.598058,18.737689,-81.262311,-0.354557,0.392181,-0.076604,-85443400
2018-12-12,34.200001,41.599638,14.450871,-85.549129,-0.131203,0.101588,-0.078416,608966600,36.029999,44.116271,...,-0.044634,710469300,74.500000,36.029502,29.487160,-70.512840,-0.488715,0.216002,-0.071882,-75301600
2018-12-13,33.709999,37.812369,5.009603,-94.990397,-0.222768,0.036717,-0.077449,599290800,35.020000,38.485228,...,-0.044506,686342400,73.370003,30.854822,18.343202,-81.656798,-0.678397,0.037122,-0.082875,-85995500
2018-12-14,32.650002,30.810807,0.332786,-99.667214,-0.376527,-0.045932,-0.124195,588467900,34.200001,34.374934,...,-0.059396,651861600,71.879997,25.321635,3.648867,-96.351133,-0.938139,-0.157930,-0.103741,-96620200


In [7]:
# Display the label frame; each column is named after the ticker it belongs to.
labels_df

Unnamed: 0_level_0,AMAT,MU,NVDA,AAPL,NFLX,DVN,MRO,HAL,XOM,MPC,...,PFE,ABBV,JNJ,GILD,BMY,TGT,TJX,WBA,WMT,CVS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2016-01-25,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2016-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2016-01-27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2016-01-28,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Persist the feature and label frames next to the input data.
for frame, destination in (
    (samples_df, '../data/samples_cluster.csv'),
    (labels_df, '../data/labels_cluster.csv'),
):
    frame.to_csv(destination)