# Data Collection, Cleaning, and Concatenation

This notebook collects, joins, and exports all of the initial variables used in the feature set

Imports

In [43]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import finnhub
import yfinance as yf
import talib as ta

Get dates for a ten-year window and convert to UNIX Timestamp integers

In [44]:
# Create datetime objects
start = pd.Timestamp("2011-08-01 23:59:00")
end = pd.Timestamp("2022-08-01 23:59:00")

# Convert datetime objects to UNIX timestamp integers
start_unix = int(pd.Timestamp.timestamp(start))
end_unix = int(pd.Timestamp.timestamp(end))

Get S&P 500 data from Yahoo!

In [45]:
# Fetch market data and use only the adjusted close
gspc = yf.download('^GSPC', start='2012-08-01', end='2022-08-02')

[*********************100%***********************]  1 of 1 completed


In [46]:
gspc = gspc.drop(columns=['Close'])
gspc.columns = ['open', 'high', 'low', 'close', 'volume']
gspc.index.names = ['date']
gspc = gspc.drop(index='2022-08-02', errors='ignore')
gspc.tail()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-07-26,3953.219971,3953.219971,3910.73999,3921.050049,3083420000
2022-07-27,3951.429932,4039.560059,3951.429932,4023.610107,3584170000
2022-07-28,4026.129883,4078.949951,3992.969971,4072.429932,3882850000
2022-07-29,4087.330078,4140.149902,4079.219971,4130.290039,3817740000
2022-08-01,4112.379883,4144.950195,4096.02002,4118.629883,3540960000


Check for nulls or missing data

In [47]:
gspc.isnull().values.any()

False

Check shape and start/end dates

In [50]:
display(gspc.shape, gspc)

(2516, 5)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-08-01,1379.319946,1385.030029,1373.349976,1375.319946,4440920000
2012-08-02,1375.130005,1375.130005,1354.650024,1365.000000,4193740000
2012-08-03,1365.449951,1394.160034,1365.449951,1390.989990,3751170000
2012-08-06,1391.040039,1399.630005,1391.040039,1394.229980,3122050000
2012-08-07,1394.459961,1407.140015,1394.459961,1401.349976,3682490000
...,...,...,...,...,...
2022-07-26,3953.219971,3953.219971,3910.739990,3921.050049,3083420000
2022-07-27,3951.429932,4039.560059,3951.429932,4023.610107,3584170000
2022-07-28,4026.129883,4078.949951,3992.969971,4072.429932,3882850000
2022-07-29,4087.330078,4140.149902,4079.219971,4130.290039,3817740000


## Create Variables with TA Lib

In [51]:
technicals = gspc.copy()

close = technicals['close']
high = technicals['high']
low = technicals['low']

# Hilbert Transform - Instantaneous Trendline
technicals['trend'] = ta.HT_TRENDLINE(close)

# RSI
technicals['rsi'] = ta.RSI(close, timeperiod=20)

# Stochastic RSI
technicals['rsi_fast_k'], technicals['rsi_fast_d'] = ta.STOCHRSI(
    close, 
    timeperiod=14, 
    fastk_period=5, 
    fastd_period=3, 
    fastd_matype=0
)

# Williams' %R
technicals['williams_r'] = ta.WILLR(high, low, close, timeperiod=14)

# MACD, MACD Signal, MACD Histogram
technicals['macd'], technicals['macd_signal'], technicals['macd_hist'] = ta.MACD(
    close, 
    fastperiod=12, 
    slowperiod=26, 
    signalperiod=9
)

# Moving Averages
technicals['ma_20'] = ta.MA(close, timeperiod=20, matype=0)
technicals['ma_50'] = ta.MA(close, timeperiod=50, matype=0)
technicals['ma_65'] = ta.MA(close, timeperiod=65, matype=0)
technicals['ma_200'] = ta.MA(close, timeperiod=200, matype=0)

# Bollinger Bands
technicals['bb_upp'], technicals['bb_mid'], technicals['bb_low'] = ta.BBANDS(
    close, 
    timeperiod=5, 
    nbdevup=2, 
    nbdevdn=2, 
    matype=0
)

# Percentage Price Oscillator
technicals['ppo'] = ta.PPO(close, fastperiod=12, slowperiod=26, matype=0)

# Momentum
technicals['mom'] = ta.MOM(close, timeperiod=10)

# Rate of Change
technicals['roc'] = ta.ROC(close, timeperiod=10)

# Exponential Moving Averages
technicals['ema_20'] = ta.EMA(close, timeperiod=20)
technicals['ema_50'] = ta.EMA(close, timeperiod=50)
technicals['ema_65'] = ta.EMA(close, timeperiod=65)
technicals['ema_200'] = ta.EMA(close, timeperiod=200)

Slice the data from Aug 1, 2012 onward

In [52]:
# Remove all data before Aug 1, 2012 for an exact ten years
technicals = technicals.loc['2012-08-01':]
display(technicals.shape, technicals.head())

(2516, 27)

Unnamed: 0_level_0,open,high,low,close,volume,trend,rsi,rsi_fast_k,rsi_fast_d,williams_r,...,bb_upp,bb_mid,bb_low,ppo,mom,roc,ema_20,ema_50,ema_65,ema_200
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,1379.319946,1385.030029,1373.349976,1375.319946,4440920000,,,,,,...,,,,,,,,,,
2012-08-02,1375.130005,1375.130005,1354.650024,1365.0,4193740000,,,,,,...,,,,,,,,,,
2012-08-03,1365.449951,1394.160034,1365.449951,1390.98999,3751170000,,,,,,...,,,,,,,,,,
2012-08-06,1391.040039,1399.630005,1391.040039,1394.22998,3122050000,,,,,,...,,,,,,,,,,
2012-08-07,1394.459961,1407.140015,1394.459961,1401.349976,3682490000,,,,,,...,1411.931823,1385.377979,1358.824134,,,,,,,


Remove high, low, and open from our initial variable set

In [53]:
technicals = technicals.drop(columns=['high', 'low', 'open'])

Check for nulls

In [54]:
technicals[technicals.isnull().any(axis=1)]

Unnamed: 0_level_0,close,volume,trend,rsi,rsi_fast_k,rsi_fast_d,williams_r,macd,macd_signal,macd_hist,...,bb_upp,bb_mid,bb_low,ppo,mom,roc,ema_20,ema_50,ema_65,ema_200
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,1375.319946,4440920000,,,,,,,,,...,,,,,,,,,,
2012-08-02,1365.000000,4193740000,,,,,,,,,...,,,,,,,,,,
2012-08-03,1390.989990,3751170000,,,,,,,,,...,,,,,,,,,,
2012-08-06,1394.229980,3122050000,,,,,,,,,...,,,,,,,,,,
2012-08-07,1401.349976,3682490000,,,,,,,,,...,1411.931823,1385.377979,1358.824134,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-05-13,1633.770020,2910600000,1603.001356,65.056930,60.471618,52.803112,-3.704289,19.026607,15.289300,3.737306,...,1637.543240,1630.557983,1623.572727,1.400810,40.160034,2.520067,1601.962685,1572.394825,1560.222992,
2013-05-14,1650.339966,3457790000,1605.692330,68.136064,100.000000,75.716774,-1.033466,20.517537,16.334948,4.182590,...,1651.240321,1635.433984,1619.627648,1.543432,52.770020,3.303143,1606.570045,1575.451497,1562.953809,
2013-05-15,1658.780029,3657440000,1608.362800,69.573602,100.000000,86.823873,-3.228837,22.125109,17.492980,4.632129,...,1664.546267,1640.652002,1616.757737,1.663051,76.080078,4.806981,1611.542425,1578.719283,1565.857634,
2013-05-16,1650.469971,3513130000,1610.892102,66.465804,21.236852,73.745617,-13.738966,22.469553,18.488295,3.981258,...,1665.437569,1645.411987,1625.386405,1.785360,52.880005,3.309986,1615.249810,1581.533035,1568.421644,


## Get Bond Data from CSVs

In [55]:
# Import bond data CSVs
# We are using only the % change column of each bond
bond_3mt = pd.read_csv('../csv/us-3mt-bond.csv', usecols=['Date', 'Change %'], index_col='Date', parse_dates=True, infer_datetime_format=True)
bond_2yr = pd.read_csv('../csv/us-2yr-bond.csv', usecols=['Date', 'Change %'], index_col='Date', parse_dates=True, infer_datetime_format=True)
bond_5yr = pd.read_csv('../csv/us-5yr-bond.csv', usecols=['Date', 'Change %'], index_col='Date', parse_dates=True, infer_datetime_format=True)
bond_10yr = pd.read_csv('../csv/us-10yr-bond.csv', usecols=['Date', 'Change %'], index_col='Date', parse_dates=True, infer_datetime_format=True)

bond_3mt.columns = ['bond_3mt']
bond_2yr.columns = ['bond_2yr']
bond_5yr.columns = ['bond_5yr']
bond_10yr.columns = ['bond_10yr']

In [56]:
# Join bonds
bonds = pd.concat([bond_3mt, bond_2yr, bond_5yr, bond_10yr], join='inner', axis='columns')

# Remove percentage symbol from each row and convert to decimal value
bonds = bonds.apply(
    lambda bond: bond.str.rstrip("%").replace(',','', regex=True).astype(float) / 100
)

bonds.head()

Unnamed: 0_level_0,bond_3mt,bond_2yr,bond_5yr,bond_10yr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-08-01,0.0651,-0.016,-0.0279,-0.0384
2022-07-31,0.0022,0.0105,0.0081,0.0048
2022-07-29,-0.0054,0.0077,-0.0062,-0.0048
2022-07-28,-0.0265,-0.0387,-0.0487,-0.0411
2022-07-27,-0.0312,-0.0246,-0.0221,-0.0065


Check for null values

In [57]:
bonds[bonds.isnull().any(axis=1)]

Unnamed: 0_level_0,bond_3mt,bond_2yr,bond_5yr,bond_10yr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


Check shape and dates

In [58]:
display(bonds.shape, bonds)

(3012, 4)

Unnamed: 0_level_0,bond_3mt,bond_2yr,bond_5yr,bond_10yr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-08-01,0.0651,-0.0160,-0.0279,-0.0384
2022-07-31,0.0022,0.0105,0.0081,0.0048
2022-07-29,-0.0054,0.0077,-0.0062,-0.0048
2022-07-28,-0.0265,-0.0387,-0.0487,-0.0411
2022-07-27,-0.0312,-0.0246,-0.0221,-0.0065
...,...,...,...,...
2012-08-07,0.0495,0.1167,0.0846,0.0403
2012-08-06,0.2454,0.0017,-0.0240,-0.0019
2012-08-03,-0.1099,0.0573,0.0900,0.0609
2012-08-02,-0.0521,-0.0340,-0.0378,-0.0302


## Get features from Yahoo!

In [71]:
# Row 1: USD/x Currency pairs
# Row 2: US/Global Indices
# Row 3: Futures
# Row 4: Large US Company Stocks
yf_tickers = \
"CAD=X CNY=X HKD=X AUD=X JPY=X EUR=X \
^IXIC ^DJI ^DJT ^RUT ^VIX ^FTSE ^HSI ^N225 \
ES=F YM=F NQ=F ZB=F ZN=F ZT=F CL=F NG=F GC=F SI=F HG=F KE=F ZC=F ZF=F ZS=F \
AAPL MSFT AMZN TSLA UNH NVDA JNJ XOM PG V JPM WMT KO"

# Fetch market data and use only the adjusted close
markets_ohlcv = yf.download(yf_tickers, start='2012-08-01', end='2022-08-02')
markets = markets_ohlcv['Adj Close'].drop(index='2022-08-02')
markets.tail()

[*********************100%***********************]  42 of 42 completed


Unnamed: 0_level_0,AAPL,AMZN,AUD=X,CAD=X,CL=F,CNY=X,ES=F,EUR=X,GC=F,HG=F,...,ZS=F,ZT=F,^DJI,^DJT,^FTSE,^HSI,^IXIC,^N225,^RUT,^VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-26,151.600006,114.809998,1.4386,1.28507,94.980003,6.7495,3923.25,0.978,1717.699951,3.3825,...,1532.75,104.976562,31761.539062,13614.139648,7306.299805,20905.880859,11562.570312,27655.210938,1805.25,24.690001
2022-07-27,156.789993,120.970001,1.43939,1.28737,97.260002,6.7623,4024.5,0.98721,1719.099976,3.4295,...,1578.75,105.101562,32197.589844,13847.009766,7348.200195,20670.039062,12032.419922,27715.75,1848.339966,23.24
2022-07-28,157.350006,122.279999,1.429613,1.28171,96.419998,6.7574,4073.5,0.9795,1750.300049,3.475,...,1609.25,105.28125,32529.630859,14275.299805,7345.299805,20622.679688,12162.589844,27815.480469,1873.030029,22.33
2022-07-29,162.509995,134.949997,1.4284,1.2807,98.620003,6.7458,4133.5,0.98113,1762.900024,3.584,...,1637.0,105.230469,32845.128906,14609.0,7423.399902,20156.509766,12390.69043,27801.640625,1885.22998,21.33
2022-08-01,161.509995,135.389999,1.43447,1.28156,93.889999,6.7432,4120.5,0.9796,1769.0,3.552,...,1594.25,105.203125,32798.398438,14634.089844,7413.399902,20165.839844,12368.980469,27993.349609,1883.310059,22.84


In [72]:
markets.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2609 entries, 2012-08-01 to 2022-08-01
Freq: B
Data columns (total 42 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2516 non-null   float64
 1   AMZN    2516 non-null   float64
 2   AUD=X   2604 non-null   float64
 3   CAD=X   2604 non-null   float64
 4   CL=F    2516 non-null   float64
 5   CNY=X   2604 non-null   float64
 6   ES=F    2516 non-null   float64
 7   EUR=X   2604 non-null   float64
 8   GC=F    2514 non-null   float64
 9   HG=F    2515 non-null   float64
 10  HKD=X   2604 non-null   float64
 11  JNJ     2516 non-null   float64
 12  JPM     2516 non-null   float64
 13  JPY=X   2605 non-null   float64
 14  KE=F    2514 non-null   float64
 15  KO      2516 non-null   float64
 16  MSFT    2516 non-null   float64
 17  NG=F    2516 non-null   float64
 18  NQ=F    2516 non-null   float64
 19  NVDA    2516 non-null   float64
 20  PG      2516 non-null   float64
 21  SI=F    251

Format column Names

In [73]:
yf_columns = [
    'stk_aapl', 'stk_amzn', 'usd_aud', 'usd_cad', 'fut_crude', 'usd_cny', 'fut_sp',
    'usd_eur', 'fut_gold', 'fut_copper', 'usd_hkd', 'stk_jnj', 'stk_jpm', 'usd_jpy', 'fut_wheat', 'stk_ko', 'stk_msft', 
    'fut_natgas', 'fut_nasdaq', 'stk_nvda', 'stk_pg', 'fut_silver', 'stk_tsla', 'stk_unh', 'stk_visa', 'stk_wmt', 'stk_xom',
    'fut_dji', 'fut_us_tbond', 'fut_corn', 'fut_5yr_tnote', 'fut_10yr_tnote', 'fut_soybean', 'fut_2yr_tnote',
    'idx_dji', 'idx_djt', 'idx_ftse', 'idx_hsi', 'idx_ixic', 'idx_n225', 'idx_rut', 'idx_vix'
]

markets.columns = yf_columns
markets = markets.reindex(columns=np.sort(yf_columns))

display(markets, markets.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2609 entries, 2012-08-01 to 2022-08-01
Freq: B
Data columns (total 42 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fut_10yr_tnote  2514 non-null   float64
 1   fut_2yr_tnote   2516 non-null   float64
 2   fut_5yr_tnote   2516 non-null   float64
 3   fut_copper      2515 non-null   float64
 4   fut_corn        2512 non-null   float64
 5   fut_crude       2516 non-null   float64
 6   fut_dji         2516 non-null   float64
 7   fut_gold        2514 non-null   float64
 8   fut_nasdaq      2516 non-null   float64
 9   fut_natgas      2516 non-null   float64
 10  fut_silver      2513 non-null   float64
 11  fut_soybean     2514 non-null   float64
 12  fut_sp          2516 non-null   float64
 13  fut_us_tbond    2513 non-null   float64
 14  fut_wheat       2514 non-null   float64
 15  idx_dji         2516 non-null   float64
 16  idx_djt         2516 non-null   float64
 17  idx_fts

Unnamed: 0_level_0,fut_10yr_tnote,fut_2yr_tnote,fut_5yr_tnote,fut_copper,fut_corn,fut_crude,fut_dji,fut_gold,fut_nasdaq,fut_natgas,...,stk_unh,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,134.156250,110.273438,124.546875,3.3765,800.50,88.910004,12923.0,1603.699951,2625.0,3.171,...,43.832554,29.716944,58.796192,57.188240,0.953740,1.00320,6.3618,0.81340,7.75376,78.120003
2012-08-02,134.625000,110.281250,124.695312,3.2925,794.00,87.129997,12831.0,1587.400024,2618.5,2.920,...,43.738403,30.138948,59.139599,56.510452,0.955110,1.00472,6.3688,0.81690,7.75500,78.379997
2012-08-03,133.906250,110.257812,124.382812,3.3710,810.00,91.400002,13055.0,1606.000000,2671.0,2.877,...,44.174934,30.537619,59.538925,57.609364,0.956130,1.00705,6.3676,0.82100,7.75426,78.220001
2012-08-06,134.109375,110.265625,124.531250,3.3930,803.00,92.199997,13067.0,1612.900024,2686.5,2.908,...,43.652809,30.549292,59.323292,57.543556,0.947060,1.00057,6.3715,0.80460,7.75430,78.610001
2012-08-07,133.578125,110.218750,124.273438,3.4450,796.00,93.669998,13119.0,1609.699951,2710.0,2.964,...,44.970951,30.402399,59.091671,57.852821,0.946700,1.00060,6.3740,0.80710,7.75500,78.199997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-26,119.859375,104.976562,112.906250,3.3825,597.00,94.980003,31732.0,1717.699951,12112.5,8.993,...,531.590027,212.490005,121.980003,89.629997,1.438600,1.28507,6.7495,0.97800,7.84913,136.431000
2022-07-27,120.421875,105.101562,113.296875,3.4295,600.25,97.260002,32172.0,1719.099976,12619.0,8.687,...,534.609985,210.470001,126.589996,91.570000,1.439390,1.28737,6.7623,0.98721,7.84926,136.970001
2022-07-28,120.984375,105.281250,113.718750,3.4750,615.00,96.419998,32490.0,1750.300049,12737.5,8.134,...,541.489990,211.350006,129.750000,92.639999,1.429613,1.28171,6.7574,0.97950,7.84910,136.110992
2022-07-29,121.140625,105.230469,113.726562,3.5840,616.25,98.620003,32825.0,1762.900024,12971.5,8.229,...,542.340027,212.110001,132.050003,96.930000,1.428400,1.28070,6.7458,0.98113,7.84950,134.397003


None

Check for nulls

In [74]:
markets[markets.isna().any(axis=1)]

Unnamed: 0_level_0,fut_10yr_tnote,fut_2yr_tnote,fut_5yr_tnote,fut_copper,fut_corn,fut_crude,fut_dji,fut_gold,fut_nasdaq,fut_natgas,...,stk_unh,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-27,133.812500,110.210938,124.468750,3.4845,794.50,95.470001,13107.0,1672.400024,2782.75,2.653,...,46.742744,29.814077,58.214840,58.109558,0.961090,0.99208,6.3548,0.79972,7.75610,78.730003
2012-09-03,,,,,,,,,,,...,,,,,0.972940,0.98600,6.3486,0.79510,7.75590,78.290001
2012-09-17,133.312500,110.218750,124.437500,3.8100,748.00,96.620003,13541.0,1767.699951,2852.00,2.865,...,46.820354,31.264378,59.411243,60.871326,0.948890,0.97159,6.3153,0.76240,7.75106,78.333000
2012-10-01,133.578125,110.265625,124.664062,3.7930,756.75,92.480003,13437.0,1780.500000,2788.25,3.480,...,48.521973,31.932312,59.459442,60.798496,0.965680,0.98420,6.2848,0.78070,7.75330,77.959000
2012-10-02,133.640625,110.265625,124.703125,3.8070,758.25,91.889999,13411.0,1772.699951,2795.25,3.531,...,48.814175,31.761814,59.218544,60.745502,0.965160,0.98190,6.2848,0.77590,7.75420,78.041000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-03,119.046875,105.808594,112.742188,4.4775,727.00,118.870003,32888.0,1845.400024,12551.00,8.523,...,483.884369,212.649994,125.320000,99.089996,1.375705,1.25698,6.6592,0.93014,7.84476,129.917999
2022-06-20,116.515625,104.695312,110.882812,3.9650,,110.269997,30148.0,,11420.75,6.694,...,,,,,1.438228,1.30005,6.7157,0.95316,7.84968,135.276001
2022-07-01,119.265625,105.187500,112.835938,3.6190,754.50,108.430000,31061.0,1798.900024,11611.25,5.730,...,517.400024,199.179993,122.629997,87.550003,1.449780,1.28737,6.6982,0.95441,7.84710,135.785995
2022-07-04,,,,,,,,,,,...,,,,,1.467180,1.28860,6.7005,0.95841,7.84680,135.042999


#### 😱 *null values!*

Since we have existing data alongside of nulls, we want to fill the gaps rather than delete existing data

We will use `df.ffill()` to forward-fill the nulls 

(Back-filling would introduce leakage from future data, which we don't want)

In [75]:
markets = markets.ffill()
markets[markets.isna().any(axis=1)]

Unnamed: 0_level_0,fut_10yr_tnote,fut_2yr_tnote,fut_5yr_tnote,fut_copper,fut_corn,fut_crude,fut_dji,fut_gold,fut_nasdaq,fut_natgas,...,stk_unh,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


Check shape and dates

In [76]:
display(markets.shape, markets)

(2609, 42)

Unnamed: 0_level_0,fut_10yr_tnote,fut_2yr_tnote,fut_5yr_tnote,fut_copper,fut_corn,fut_crude,fut_dji,fut_gold,fut_nasdaq,fut_natgas,...,stk_unh,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,134.156250,110.273438,124.546875,3.3765,800.50,88.910004,12923.0,1603.699951,2625.0,3.171,...,43.832554,29.716944,58.796192,57.188240,0.953740,1.00320,6.3618,0.81340,7.75376,78.120003
2012-08-02,134.625000,110.281250,124.695312,3.2925,794.00,87.129997,12831.0,1587.400024,2618.5,2.920,...,43.738403,30.138948,59.139599,56.510452,0.955110,1.00472,6.3688,0.81690,7.75500,78.379997
2012-08-03,133.906250,110.257812,124.382812,3.3710,810.00,91.400002,13055.0,1606.000000,2671.0,2.877,...,44.174934,30.537619,59.538925,57.609364,0.956130,1.00705,6.3676,0.82100,7.75426,78.220001
2012-08-06,134.109375,110.265625,124.531250,3.3930,803.00,92.199997,13067.0,1612.900024,2686.5,2.908,...,43.652809,30.549292,59.323292,57.543556,0.947060,1.00057,6.3715,0.80460,7.75430,78.610001
2012-08-07,133.578125,110.218750,124.273438,3.4450,796.00,93.669998,13119.0,1609.699951,2710.0,2.964,...,44.970951,30.402399,59.091671,57.852821,0.946700,1.00060,6.3740,0.80710,7.75500,78.199997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-26,119.859375,104.976562,112.906250,3.3825,597.00,94.980003,31732.0,1717.699951,12112.5,8.993,...,531.590027,212.490005,121.980003,89.629997,1.438600,1.28507,6.7495,0.97800,7.84913,136.431000
2022-07-27,120.421875,105.101562,113.296875,3.4295,600.25,97.260002,32172.0,1719.099976,12619.0,8.687,...,534.609985,210.470001,126.589996,91.570000,1.439390,1.28737,6.7623,0.98721,7.84926,136.970001
2022-07-28,120.984375,105.281250,113.718750,3.4750,615.00,96.419998,32490.0,1750.300049,12737.5,8.134,...,541.489990,211.350006,129.750000,92.639999,1.429613,1.28171,6.7574,0.97950,7.84910,136.110992
2022-07-29,121.140625,105.230469,113.726562,3.5840,616.25,98.620003,32825.0,1762.900024,12971.5,8.229,...,542.340027,212.110001,132.050003,96.930000,1.428400,1.28070,6.7458,0.98113,7.84950,134.397003


## Join DataFrames

In [77]:
X = technicals.join(bonds, on=technicals.index)
X = X.join(markets, on=X.index)
X

Unnamed: 0_level_0,close,volume,trend,rsi,rsi_fast_k,rsi_fast_d,williams_r,macd,macd_signal,macd_hist,...,stk_unh,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,1375.319946,4440920000,,,,,,,,,...,43.832554,29.716944,58.796192,57.188240,0.953740,1.00320,6.3618,0.81340,7.75376,78.120003
2012-08-02,1365.000000,4193740000,,,,,,,,,...,43.738403,30.138948,59.139599,56.510452,0.955110,1.00472,6.3688,0.81690,7.75500,78.379997
2012-08-03,1390.989990,3751170000,,,,,,,,,...,44.174934,30.537619,59.538925,57.609364,0.956130,1.00705,6.3676,0.82100,7.75426,78.220001
2012-08-06,1394.229980,3122050000,,,,,,,,,...,43.652809,30.549292,59.323292,57.543556,0.947060,1.00057,6.3715,0.80460,7.75430,78.610001
2012-08-07,1401.349976,3682490000,,,,,,,,,...,44.970951,30.402399,59.091671,57.852821,0.946700,1.00060,6.3740,0.80710,7.75500,78.199997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-26,3921.050049,3083420000,3871.253815,49.880434,0.000000,32.941861,-31.418430,12.041694,-6.988644,19.030338,...,531.590027,212.490005,121.980003,89.629997,1.438600,1.28507,6.7495,0.97800,7.84913,136.431000
2022-07-27,4023.610107,3584170000,3880.207597,55.308438,100.000000,42.153586,-5.015708,20.063656,-1.578184,21.641839,...,534.609985,210.470001,126.589996,91.570000,1.439390,1.28737,6.7623,0.98721,7.84926,136.970001
2022-07-28,4072.429932,3882850000,3893.329737,57.608834,100.000000,66.666667,-1.824344,30.014481,4.740349,25.274132,...,541.489990,211.350006,129.750000,92.639999,1.429613,1.28171,6.7574,0.97950,7.84910,136.110992
2022-07-29,4130.290039,3817740000,3908.661969,60.166728,100.000000,100.000000,-2.355495,42.084288,12.209137,29.875151,...,542.340027,212.110001,132.050003,96.930000,1.428400,1.28070,6.7458,0.98113,7.84950,134.397003


## Add Primitive Variables

In [78]:
X['day_of_week'] = X.index.dayofweek
# X['week_of_year'] = X.index.isocalendar().week


### DataFrame Overview

In [79]:
display(X.shape, X)

(2516, 71)

Unnamed: 0_level_0,close,volume,trend,rsi,rsi_fast_k,rsi_fast_d,williams_r,macd,macd_signal,macd_hist,...,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,1375.319946,4440920000,,,,,,,,,...,29.716944,58.796192,57.188240,0.953740,1.00320,6.3618,0.81340,7.75376,78.120003,2
2012-08-02,1365.000000,4193740000,,,,,,,,,...,30.138948,59.139599,56.510452,0.955110,1.00472,6.3688,0.81690,7.75500,78.379997,3
2012-08-03,1390.989990,3751170000,,,,,,,,,...,30.537619,59.538925,57.609364,0.956130,1.00705,6.3676,0.82100,7.75426,78.220001,4
2012-08-06,1394.229980,3122050000,,,,,,,,,...,30.549292,59.323292,57.543556,0.947060,1.00057,6.3715,0.80460,7.75430,78.610001,0
2012-08-07,1401.349976,3682490000,,,,,,,,,...,30.402399,59.091671,57.852821,0.946700,1.00060,6.3740,0.80710,7.75500,78.199997,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-26,3921.050049,3083420000,3871.253815,49.880434,0.000000,32.941861,-31.418430,12.041694,-6.988644,19.030338,...,212.490005,121.980003,89.629997,1.438600,1.28507,6.7495,0.97800,7.84913,136.431000,1
2022-07-27,4023.610107,3584170000,3880.207597,55.308438,100.000000,42.153586,-5.015708,20.063656,-1.578184,21.641839,...,210.470001,126.589996,91.570000,1.439390,1.28737,6.7623,0.98721,7.84926,136.970001,2
2022-07-28,4072.429932,3882850000,3893.329737,57.608834,100.000000,66.666667,-1.824344,30.014481,4.740349,25.274132,...,211.350006,129.750000,92.639999,1.429613,1.28171,6.7574,0.97950,7.84910,136.110992,3
2022-07-29,4130.290039,3817740000,3908.661969,60.166728,100.000000,100.000000,-2.355495,42.084288,12.209137,29.875151,...,212.110001,132.050003,96.930000,1.428400,1.28070,6.7458,0.98113,7.84950,134.397003,4


Check for null values

In [80]:
X[X.isnull().any(axis=1)]

Unnamed: 0_level_0,close,volume,trend,rsi,rsi_fast_k,rsi_fast_d,williams_r,macd,macd_signal,macd_hist,...,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,1375.319946,4440920000,,,,,,,,,...,29.716944,58.796192,57.188240,0.95374,1.00320,6.3618,0.81340,7.75376,78.120003,2
2012-08-02,1365.000000,4193740000,,,,,,,,,...,30.138948,59.139599,56.510452,0.95511,1.00472,6.3688,0.81690,7.75500,78.379997,3
2012-08-03,1390.989990,3751170000,,,,,,,,,...,30.537619,59.538925,57.609364,0.95613,1.00705,6.3676,0.82100,7.75426,78.220001,4
2012-08-06,1394.229980,3122050000,,,,,,,,,...,30.549292,59.323292,57.543556,0.94706,1.00057,6.3715,0.80460,7.75430,78.610001,0
2012-08-07,1401.349976,3682490000,,,,,,,,,...,30.402399,59.091671,57.852821,0.94670,1.00060,6.3740,0.80710,7.75500,78.199997,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-05-15,1658.780029,3657440000,1608.362800,69.573602,100.000000,86.823873,-3.228837,22.125109,17.492980,4.632129,...,42.715210,65.285728,61.617729,1.00920,1.01820,6.1328,0.77297,7.76166,102.209999,2
2013-05-16,1650.469971,3513130000,1610.892102,66.465804,21.236852,73.745617,-13.738966,22.469553,18.488295,3.981258,...,42.254601,64.173943,61.253067,1.01030,1.01550,6.1359,0.77625,7.76170,102.258003,3
2013-05-17,1667.469971,3440710000,1612.486809,69.408407,86.227776,69.154876,-0.000000,23.839479,19.558531,4.280948,...,43.375595,63.658901,61.968903,1.01770,1.01850,6.1390,0.77620,7.76376,102.135002,4
2016-11-11,2164.449951,4988050000,2129.989310,55.938610,88.300571,96.100190,-18.120085,0.461721,-5.982808,6.444529,...,78.827934,63.429581,64.587990,1.31430,1.34704,6.8017,0.91830,7.75590,106.856003,4


One final forward-fill

In [81]:
X = X.ffill()
X[X.isnull().any(axis=1)]

Unnamed: 0_level_0,close,volume,trend,rsi,rsi_fast_k,rsi_fast_d,williams_r,macd,macd_signal,macd_hist,...,stk_visa,stk_wmt,stk_xom,usd_aud,usd_cad,usd_cny,usd_eur,usd_hkd,usd_jpy,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-08-01,1375.319946,4440920000,,,,,,,,,...,29.716944,58.796192,57.188240,0.95374,1.00320,6.3618,0.81340,7.75376,78.120003,2
2012-08-02,1365.000000,4193740000,,,,,,,,,...,30.138948,59.139599,56.510452,0.95511,1.00472,6.3688,0.81690,7.75500,78.379997,3
2012-08-03,1390.989990,3751170000,,,,,,,,,...,30.537619,59.538925,57.609364,0.95613,1.00705,6.3676,0.82100,7.75426,78.220001,4
2012-08-06,1394.229980,3122050000,,,,,,,,,...,30.549292,59.323292,57.543556,0.94706,1.00057,6.3715,0.80460,7.75430,78.610001,0
2012-08-07,1401.349976,3682490000,,,,,,,,,...,30.402399,59.091671,57.852821,0.94670,1.00060,6.3740,0.80710,7.75500,78.199997,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-05-13,1633.770020,2910600000,1603.001356,65.056930,60.471618,52.803112,-3.704289,19.026607,15.289300,3.737306,...,41.973171,64.173943,60.847847,1.00150,1.01210,6.1317,0.77129,7.76040,101.922997,0
2013-05-14,1650.339966,3457790000,1605.692330,68.136064,100.000000,75.716774,-1.033466,20.517537,16.334948,4.182590,...,42.299248,64.402832,61.529945,1.00290,1.01020,6.1366,0.76994,7.76084,101.667000,1
2013-05-15,1658.780029,3657440000,1608.362800,69.573602,100.000000,86.823873,-3.228837,22.125109,17.492980,4.632129,...,42.715210,65.285728,61.617729,1.00920,1.01820,6.1328,0.77297,7.76166,102.209999,2
2013-05-16,1650.469971,3513130000,1610.892102,66.465804,21.236852,73.745617,-13.738966,22.469553,18.488295,3.981258,...,42.254601,64.173943,61.253067,1.01030,1.01550,6.1359,0.77625,7.76170,102.258003,3


All of the data are joined, and there are no null values.

In [83]:
# Export to CSV
# X.to_csv('../csv/initial_variables.csv')

# END