In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import copy

import yfinance as yf

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

%load_ext autoreload
%autoreload 2

In [2]:
# Ticker universe grouped by cluster: the displayed table has one column per
# cluster (cluster_1 ... cluster_8) and one ticker symbol per row.
stocks = pd.read_csv(filepath_or_buffer='stocks_by_cluster.csv')
stocks

Unnamed: 0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8
0,PEP,TGT,ABT,MSFT,CVX,DVN,DAL,JPM
1,KO,TJX,PFE,AMAT,SLB,MRO,AAL,BAC
2,PG,WBA,MDT,AAPL,COP,KMI,UAL,MS
3,PM,WMT,ABBV,INTC,HAL,WMB,UNP,C
4,MDLZ,CVS,JNJ,NFLX,XOM,MGM,FAST,KEY
5,MO,M,BSX,NVDA,CAT,HPE,IPG,SCHW
6,NWL,KR,GILD,ATVI,RF,DISCA,GM,HBAN


------------------------------------
**Get the data and the indicators**
------------------------------------
We collect the stock data from yfinance for the period `'2016-01-02'` to `'2019-01-01'` (3 years) and calculate several indicators that are helpful for the analysis:
- Relative Strength Index (RSI)
- Stochastic Oscillator
- Williams %R
- Moving Average Convergence Divergence (MACD)
- Price Rate Of Change
- On Balance Volume

After calculating these indicators for each day, we add them to the dataframe of the stock as new columns. We select the columns `['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9', 'Price Rate Of Change', 'On Balance Volume']` at the end to obtain a new dataframe representing the stock.

We do this for each of the selected stocks and concatenate the individual dataframes to obtain a resulting dataframe of shape `(T, NxM)`, where `T` is the number of time steps, `N` is the number of assets, and `M` is the number of indicators for each asset.

We also standardize the names of individual assets (e.g. `asset_1`) for tensorization.

Also note that the labels are set according to the price change $n$ days out ($n = 9$ below).

In [3]:
from GRTEL.indicators import get_RSI, get_Stochastic_Oscillator, get_Williams, get_MACD, get_OBV

In [4]:
# Build the feature frame (`samples`) and the label frame (`labels`).
#
# For every ticker in every cluster column of `stocks` we download daily price
# data, append the technical indicators as new columns, derive a binary
# up/down label `days_out` rows ahead, and finally concatenate the per-ticker
# results column-wise.

# --- parameters (hoisted out of the loop so they are set exactly once) -------
START_DATE = '2016-01-02'
END_DATE = '2019-01-01'
ROC_PERIODS = 9   # look-back (in rows) of the Price Rate Of Change
days_out = 9      # label horizon (in rows); lowercase name kept from the original cell
FEATURE_COLUMNS = ['Close', 'RSI', 'k_percent', 'r_percent', 'MACD', 'MACD_EMA9',
                   'Price Rate Of Change', 'On Balance Volume']

# Per-ticker pieces are collected under their own names so that `samples` and
# `labels` only ever refer to the final DataFrames.
sample_frames = []
label_series = []
for cluster in stocks.columns:
    for i, stock in enumerate(stocks[cluster].values):

        # Standardized per-cluster asset name (asset_1, asset_2, ...).
        stock_name = 'asset_' + str(i + 1)

        # get the original data
        data = yf.download(stock, start=START_DATE, end=END_DATE)
        if data.empty:
            # NOTE(review): yfinance is expected to hand back an empty frame
            # when a ticker cannot be fetched (e.g. delisted symbol) - confirm.
            # Failing here beats silently adding an all-NaN asset to the
            # outer-joined result below.
            raise ValueError(f"No price data returned for ticker {stock!r} ({cluster})")

        # calculate change in price
        # (not selected as a feature; kept because the indicator helpers
        # receive the whole frame and may read it - TODO confirm)
        data['change_in_price'] = data['Close'].diff()

        # calculate indicators
        data['RSI'] = get_RSI(data)
        data['k_percent'] = get_Stochastic_Oscillator(data)
        data['r_percent'] = get_Williams(data)

        # Calculate the MACD
        macd, ema_9_macd = get_MACD(data)
        data['MACD'] = macd
        data['MACD_EMA9'] = ema_9_macd

        # Calculate the 9-day Price Rate of Change
        data['Price Rate Of Change'] = data['Close'].pct_change(periods=ROC_PERIODS)

        # Calculate On Balance Volume
        data['On Balance Volume'] = get_OBV(data)

        # Create the prediction column (to keep this a binary classifier,
        # flat days are counted as up days):
        #   sign(future - today) is -1 / 0 / +1; adding 1 gives 0 / 1 / 2;
        #   the outer sign maps that to 0 / 1 / 1.
        # The last `days_out` rows have no future close and therefore stay NaN.
        data['Prediction'] = np.sign(np.sign(data['Close'].shift(-days_out) - data['Close']) + 1.)

        # Drop every row that has a NaN in any column; this removes the
        # unlabeled trailing rows as well as the leading rows for which the
        # rolling quantities are not yet defined.
        data = data.dropna()

        # Features of this asset, keyed by (cluster, asset, metric).
        X_i = data[FEATURE_COLUMNS].copy()
        X_i.columns = pd.MultiIndex.from_product([[cluster], [stock_name], FEATURE_COLUMNS])

        # Labels of this asset; the column is named after the ticker symbol.
        y_i = data['Prediction'].rename(stock)

        sample_frames.append(X_i)
        label_series.append(y_i)


# Column-wise concatenation aligns the per-ticker frames on their date index.
samples = pd.concat(sample_frames, axis=1)
samples.columns.names = ['Cluster', 'Asset', 'Metrics']

labels = pd.concat(label_series, axis=1)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [5]:
# Inspect the combined feature frame; its columns carry the
# (Cluster, Asset, Metrics) levels assigned when it was built.
samples

Cluster,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,cluster_1,...,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8,cluster_8
Asset,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_1,asset_2,asset_2,...,asset_6,asset_6,asset_7,asset_7,asset_7,asset_7,asset_7,asset_7,asset_7,asset_7
Metrics,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume,Close,RSI,...,Price Rate Of Change,On Balance Volume,Close,RSI,k_percent,r_percent,MACD,MACD_EMA9,Price Rate Of Change,On Balance Volume
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2016-01-22,95.849998,46.758102,39.393925,-60.606075,-0.449477,-0.355034,-0.013990,3217800,42.060001,55.344782,...,-0.109310,-121219600,9.15,20.659193,25.892845,-74.107155,-0.166997,-0.107536,-0.089552,-59805200
2016-01-25,95.940002,47.496642,40.757622,-59.242378,-0.390672,-0.362422,-0.015394,7624800,42.160000,57.575863,...,-0.124217,-136501300,8.80,15.455201,10.267876,-89.732124,-0.194758,-0.125617,-0.128713,-131099000
2016-01-26,96.320000,50.819987,47.158198,-52.841802,-0.318094,-0.353299,-0.013317,11360000,42.080002,55.037601,...,-0.129617,-124863700,8.05,9.523649,7.801429,-92.198571,-0.258863,-0.153038,-0.209234,-219455100
2016-01-27,95.940002,47.360939,46.945914,-53.054086,-0.282311,-0.338775,-0.005803,5564900,42.090000,55.321647,...,-0.088777,-135186600,8.26,19.504577,16.475110,-83.524890,-0.291544,-0.181377,-0.152820,-182006200
2016-01-28,97.449997,59.881188,78.066898,-21.933102,-0.156237,-0.301598,0.017117,10734800,42.570000,66.903600,...,-0.117354,-147285700,8.48,28.975531,25.291817,-74.708183,-0.299122,-0.205358,-0.139086,-154026100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,117.290001,51.231303,40.379770,-59.620230,0.927007,1.284166,0.007300,252404100,49.540001,53.128143,...,-0.090007,374435000,12.76,12.519003,3.211038,-96.788962,-0.345539,-0.153765,-0.127820,75425200
2018-12-12,117.000000,49.392346,36.708873,-63.291127,0.844125,1.196158,-0.012658,247514400,49.220001,47.253034,...,-0.082786,386212100,12.72,12.193951,2.714951,-97.285049,-0.406593,-0.204331,-0.133515,58750900
2018-12-13,118.349998,57.572562,53.797458,-46.202542,0.877262,1.132379,0.000676,253695600,49.470001,52.034475,...,-0.061821,377215000,12.41,9.896204,2.766787,-97.233213,-0.474524,-0.258370,-0.144138,44891200
2018-12-14,113.949997,35.806561,3.361329,-96.638671,0.542230,1.014349,-0.065524,246291400,49.340000,49.350329,...,-0.080580,369464400,12.17,8.470405,0.735311,-99.264689,-0.541483,-0.314992,-0.165867,31520100


In [6]:
# Inspect the binary labels: one column per ticker symbol, one row per date.
labels

Unnamed: 0_level_0,PEP,KO,PG,PM,MDLZ,MO,NWL,TGT,TJX,WBA,...,FAST,IPG,GM,JPM,BAC,MS,C,KEY,SCHW,HBAN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-22,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-25,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2016-01-26,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2016-01-27,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2016-01-28,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-12-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


-------
Store samples and labels as .csv files
-----

In [7]:
# Persisting is currently disabled: both writes below are commented out, so
# running this cell saves nothing. Uncomment them to write the frames to CSV.
# samples.to_csv('samples_cluster.csv')
# labels.to_csv('labels_cluster.csv')