In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import yfinance as yf

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [2]:
# Load the ticker universe. Each column of the CSV is treated as one sector
# by the feature-building loop further down; the frame is shown as the cell output.
stocks = pd.read_csv(filepath_or_buffer='stocks_by_sector.csv')
stocks

Unnamed: 0,Communication Services,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
0,T,F,KO,MRO,BAC,PFE,GE,AAPL
1,TWTR,GM,KR,KMI,WFC,MRK,DAL,AMD
2,FB,EBAY,PG,XOM,C,GILD,CSX,MU
3,CMCSA,SBUX,WMT,HAL,RF,BMY,AAL,MSFT
4,VZ,NKE,MDLZ,WMB,JPM,BSX,LUV,INTC
5,NFLX,M,MO,COP,KEY,ABT,FAST,CSCO
6,DIS,MGM,COTY,SLB,MS,CVS,CAT,HPE
7,ATVI,TJX,WBA,DVN,HBAN,ABBV,JCI,ORCL
8,IPG,TGT,PM,CVX,SCHW,JNJ,UAL,NVDA
9,DISCA,NWL,PEP,COG,SYF,MDT,UNP,AMAT


------------------------------------
**Get the data and the indicators**
------------------------------------
We collect the stock data from yfinance for the period `'2016-01-02'` to `'2019-01-01'` (3 years) and calculate several indicators that are helpful for analysis:
- Relative Strength Index (RSI)
- Stochastic Oscillator
- Williams %R
- Moving Average Convergence Divergence (MACD)
- Price Rate Of Change
- On Balance Volume

After calculating these indicators for each day, we add them to the dataframe of the stock as new columns. We select the columns `['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume']` at the end to obtain a new dataframe representing the stock.

We do this for each of the stocks we have selected and concatenate the individual dataframes to obtain a resulting dataframe of shape `(T, NxM)`, where `T` is the number of time steps, `N` is the number of assets, and `M` is the number of indicators for each asset.

We also standardize the names of individual assets (e.g. `asset_1`) for tensorization.

Also note that the labels are set according to the price change $n$ days out ($n = 9$ below).

In [3]:
from GRTEL.indicators import get_RSI, get_Stochastic_Oscillator, get_Williams, get_MACD, get_OBV

In [4]:
# Build one feature frame and one binary label series per ticker, then join
# them column-wise into `samples` (3-level columns: Sector / Asset / Metrics)
# and `labels` (one column per ticker).

# Download window handed to yfinance.
START_DATE = '2016-01-02'
END_DATE = '2019-01-01'

# Look-back (in rows) of the Price Rate Of Change indicator.
ROC_PERIODS = 9

# Label horizon: every row is labelled from the close `days_out` rows later.
# Defined once here because it does not depend on the asset being processed.
days_out = 9

# Columns kept as model inputs for every asset.
FEATURE_COLUMNS = ['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9',
                   'Price Rate Of Change', 'On Balance Volume']

# Per-asset pieces are collected in their own lists so that `samples` and
# `labels` only ever refer to the final DataFrames.
sample_frames = []
label_series = []
for sector in stocks.columns:
    for i, stock in enumerate(stocks[sector].values):

        # Assets are renamed by their position inside the sector
        # (asset_1, asset_2, ...) so every sector exposes the same asset names.
        stock_name = 'asset_' + str(i+1)

        # get the original data
        data = yf.download(stock, start=START_DATE, end=END_DATE)

        # Fail loudly when a ticker yields no rows (e.g. delisted or renamed
        # symbol) instead of letting an empty frame flow into the indicators
        # and the final concat.
        if data.empty:
            raise ValueError(
                f"No price data returned for ticker {stock!r} (sector {sector!r}) "
                f"between {START_DATE} and {END_DATE}."
            )

        # Newer yfinance releases can return (field, ticker) MultiIndex columns
        # even for a single ticker; keep only the field level so that
        # data['Close'] is a Series. This is a no-op for flat columns.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)

        # calculate change in price
        # NOTE(review): not used directly in this cell; presumably consumed by
        # the GRTEL indicator helpers - confirm before removing.
        data['change_in_price'] = data['Close'].diff()

        # calculate indicators
        data['RSI'] = get_RSI(data)
        data['k_percent'] = get_Stochastic_Oscillator(data)
        data['r_percent'] = get_Williams(data)

        # Calculate the MACD
        macd, ema_9_macd = get_MACD(data)
        data['MACD'] = macd
        data['MACD_EMA9'] = ema_9_macd

        # Calculate the Price Rate of Change over ROC_PERIODS rows
        data['Price Rate Of Change'] = data['Close'].pct_change(periods=ROC_PERIODS)

        # Calculate On Balance Volume
        data['On Balance Volume'] = get_OBV(data)

        # Create the prediction column (to keep this a binary classifier,
        # flat days are counted as up days): sign(diff) is -1/0/+1, adding 1
        # gives 0/1/2, and the outer sign collapses that to 0/1/1.
        # Rows without a close `days_out` rows ahead stay NaN here and are
        # removed by the dropna below.
        data['Prediction'] = np.sign(np.sign(data['Close'].shift(-days_out) - data['Close']) + 1.)

        # Drop rows with NaN.
        data = data.dropna()

        X_i = data[FEATURE_COLUMNS].copy()
        # Tag every feature column with its sector and standardized asset name.
        X_i.columns = pd.MultiIndex.from_product([[sector], [stock_name], FEATURE_COLUMNS])

        y_i = data['Prediction'].copy()
        y_i.name = stock

        sample_frames.append(X_i)
        label_series.append(y_i)


samples = pd.concat(sample_frames, axis=1)
samples.columns.names = ['Sector', 'Asset', 'Metrics']

labels = pd.concat(label_series, axis=1)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [5]:
# Show the assembled feature frame (column levels: Sector / Asset / Metrics).
samples

Sector,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,Communication Services,...,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology,Information Technology
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_9,asset_9,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10,asset_10
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,35.139999,66.984998,97.740062,-2.259938,0.092199,0.021266,0.047704,10592300,17.840000,30.079617,...,-0.039824,-9242700,17.049999,51.119297,43.165445,-56.834555,-0.097708,-0.124555,0.015485,-82509700
2016-01-25,35.000000,63.461411,78.712861,-21.287139,0.123844,0.042529,0.030928,-19672100,17.020000,23.781919,...,-0.042453,-16010900,16.910000,47.801520,38.129494,-61.870506,-0.078060,-0.114917,0.013789,-91721800
2016-01-26,35.400002,68.861369,92.129714,-7.870286,0.170558,0.068877,0.044248,22005000,17.010000,23.712060,...,-0.049039,-10312300,17.350000,57.746426,63.025232,-36.974768,-0.035484,-0.098570,0.035821,-75935100
2016-01-27,35.480000,69.888295,86.610888,-13.389112,0.209019,0.097551,0.051571,72268000,16.780001,21.997274,...,-0.030759,-16104800,17.090000,51.107526,70.454523,-29.545477,-0.018663,-0.082221,0.054938,-88525300
2016-01-28,35.529999,70.587814,88.702907,-11.297093,0.239060,0.126372,0.035860,98452400,16.490000,19.903236,...,-0.021625,-23024200,16.680000,42.266121,47.976887,-52.023113,-0.031403,-0.071871,-0.007733,-106022400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,29.700001,36.639813,18.662008,-81.337992,-0.234568,-0.238566,-0.024310,26518900,34.450001,65.458987,...,-0.036037,1013352200,34.020000,39.676060,16.000019,-83.999981,-0.062314,0.159786,-0.055000,597071100
2018-12-12,30.160000,46.469108,34.859168,-65.140832,-0.231801,-0.237213,-0.018868,67422600,36.250000,74.545905,...,-0.069782,1029705600,34.200001,41.599638,14.450871,-85.549129,-0.131203,0.101588,-0.078416,608966600
2018-12-13,29.910000,42.349247,26.056346,-73.943654,-0.246935,-0.239157,-0.021590,18324800,35.889999,70.279251,...,-0.053826,1017921000,33.709999,37.812369,5.009603,-94.990397,-0.222768,0.036717,-0.077449,599290800
2018-12-14,30.219999,48.838972,36.971827,-63.028173,-0.231248,-0.237575,-0.032650,61066200,35.869999,70.022338,...,-0.103898,1006125500,32.650002,30.810807,0.332786,-99.667214,-0.376527,-0.045932,-0.124195,588467900


In [6]:
# Show the label frame: one column per ticker, values 0.0 (down) or 1.0 (flat/up).
labels

Unnamed: 0_level_0,T,TWTR,FB,CMCSA,VZ,NFLX,DIS,ATVI,IPG,DISCA,...,AAPL,AMD,MU,MSFT,INTC,CSCO,HPE,ORCL,NVDA,AMAT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2016-01-25,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-26,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-27,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-28,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


-------
Store samples and labels as .csv files
-----

In [7]:
# Export is left commented out; uncomment both lines to write the frames to CSV.
# samples.to_csv('samples_sector.csv')
# labels.to_csv('labels_sector.csv')