In [1]:
# pip install pandas_ta

In [2]:
import pandas as pd
import numpy as np
import pandas_ta as pta

In [3]:
es_df = pd.read_csv('ES_intraday.csv')
wec_df = pd.read_csv('WEC_intraday.csv')

In [4]:
es_df.set_index('time',inplace=True)
wec_df.set_index('time',inplace=True)

es_df.sort_index(inplace=True)
wec_df.sort_index(inplace=True)

In [5]:
es_df['vwap'] = (es_df.volume*(es_df.high+es_df.low)/2).cumsum() / es_df.volume.cumsum()
wec_df['vwap'] = (wec_df.volume*(wec_df.high+es_df.low)/2).cumsum() / wec_df.volume.cumsum()

#ma5
es_df['SMA(5)'] = es_df.close.rolling(5).mean()
wec_df['SMA(5)'] = wec_df.close.rolling(5).mean()

#ma10
es_df['SMA(10)'] = es_df.close.rolling(10).mean()
wec_df['SMA(10)'] = wec_df.close.rolling(10).mean()

es_df['12dayEWM'] = es_df.close.ewm(span=5, adjust=False).mean()
wec_df['12dayEWM'] = wec_df.close.ewm(span=5, adjust=False).mean()

#macd
es_df['MACD'] = pta.macd(es_df['close']).iloc[:,0]
wec_df['MACD'] = pta.macd(wec_df['close']).iloc[:,0]

In [6]:
#rsi
#Might have to adjust the length
es_df["rsi"] = pta.rsi(es_df['close'], length = 12)
wec_df["rsi"] = pta.rsi(wec_df['close'], length = 12)

#Mom - also might need to adjust the shift
es_df["rolling"] = es_df.close.shift(12)
es_df["mom"] = es_df["close"] - es_df["rolling"]
es_df.drop(["rolling"], axis=1, inplace=True)
wec_df["rolling"] = wec_df.close.shift(12)
wec_df["mom"] = wec_df["close"] - wec_df["rolling"]
wec_df.drop(["rolling"], axis=1, inplace=True)

#mfi
def mfi(h, l, o, c, v, n=12):
  typical_price = (h+l+c)/3
  money_flow = typical_price*v
  mf_sign = np.where(typical_price > typical_price.shift(1),1,-1)
  signed_mf = money_flow * mf_sign

  mf_ave_gain = signed_mf.rolling(n).apply(lambda x: ((x>0)*x).sum(), raw = True)
  mf_ave_loss = signed_mf.rolling(n).apply(lambda x: ((x<0)*x).sum(), raw = True)
  return (100-(100/(1+mf_ave_gain / abs(mf_ave_loss)))).to_numpy()

es_df["mfi"] = mfi(es_df.high,es_df.low,es_df.open,es_df.close,es_df.volume)
wec_df["mfi"] = mfi(wec_df.high,wec_df.low,wec_df.open,wec_df.close,wec_df.volume)

**Combining Features**

In [7]:
ratio_features = ['vwap','SMA(5)','SMA(10)','12dayEWM','rsi'] #exlude momentum, MFI and MACD as taking their ratio is also weird
diff_features = ['MACD','mom','mfi']
ratio_features_df = es_df[ratio_features]/wec_df[ratio_features]
diff_features_df = es_df[diff_features] - wec_df[diff_features]
diff_features_df['spread'] = wec_df['close'] - es_df['close']

features_df = pd.concat([ratio_features_df,diff_features_df],axis=1)

In [8]:
features_df.shape

(4181, 9)

In [9]:
features_df.dropna(inplace = True)
features_df

Unnamed: 0_level_0,vwap,SMA(5),SMA(10),12dayEWM,rsi,MACD,mom,mfi,spread
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-11-13 10:00:00,0.947119,0.897453,0.899334,0.898887,1.064583,-0.043296,-0.449802,35.545069,9.775620
2020-11-13 11:00:00,0.947180,0.898634,0.899741,0.900333,1.132370,-0.003602,0.461106,32.276849,9.473539
2020-11-13 12:00:00,0.947164,0.900476,0.900057,0.901297,1.159534,0.029941,-0.013803,21.229298,9.443415
2020-11-13 13:00:00,0.947214,0.901967,0.900010,0.901770,1.162830,0.053314,0.520326,19.808220,9.475573
2020-11-13 14:00:00,0.947256,0.902490,0.900092,0.902230,1.154855,0.073625,0.357355,19.637644,9.442908
...,...,...,...,...,...,...,...,...,...
2022-10-28 13:00:00,0.948541,0.837141,0.838733,0.836955,0.914318,-0.267499,-1.990000,0.492807,14.850000
2022-10-28 14:00:00,0.948539,0.836728,0.838468,0.837909,0.968452,-0.254640,0.225000,9.559848,14.535000
2022-10-28 15:00:00,0.948534,0.837559,0.838659,0.838728,0.973852,-0.245484,-0.322900,15.006942,14.582900
2022-10-28 16:00:00,0.948511,0.838191,0.838129,0.838419,0.953725,-0.257618,-0.860000,7.968697,14.860000


In [10]:
### Don't forget to normalize these features if your model is sensitive.

In [11]:
#generate label for training
import numpy as np
def gen_labels(df,t,threshold):
    # Calculate % return on spread t hours later
    df['forward_return'] = df['spread'].diff(periods=t)/df['spread']
    
    #If the return is more than x%, we should have bought, and hence the label is (1)
    #If return is less than x%, we should have sold, and hence label is (-1)
    #If in between, do nothing (0)
    df['output'] = np.select([df['forward_return'] > threshold ,df['forward_return'] < -threshold],[1,-1])
    return df

df = gen_labels(features_df,t=24,threshold=0.02)
df

Unnamed: 0_level_0,vwap,SMA(5),SMA(10),12dayEWM,rsi,MACD,mom,mfi,spread,forward_return,output
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-11-13 10:00:00,0.947119,0.897453,0.899334,0.898887,1.064583,-0.043296,-0.449802,35.545069,9.775620,,0
2020-11-13 11:00:00,0.947180,0.898634,0.899741,0.900333,1.132370,-0.003602,0.461106,32.276849,9.473539,,0
2020-11-13 12:00:00,0.947164,0.900476,0.900057,0.901297,1.159534,0.029941,-0.013803,21.229298,9.443415,,0
2020-11-13 13:00:00,0.947214,0.901967,0.900010,0.901770,1.162830,0.053314,0.520326,19.808220,9.475573,,0
2020-11-13 14:00:00,0.947256,0.902490,0.900092,0.902230,1.154855,0.073625,0.357355,19.637644,9.442908,,0
...,...,...,...,...,...,...,...,...,...,...,...
2022-10-28 13:00:00,0.948541,0.837141,0.838733,0.836955,0.914318,-0.267499,-1.990000,0.492807,14.850000,0.071380,1
2022-10-28 14:00:00,0.948539,0.836728,0.838468,0.837909,0.968452,-0.254640,0.225000,9.559848,14.535000,0.054696,1
2022-10-28 15:00:00,0.948534,0.837559,0.838659,0.838728,0.973852,-0.245484,-0.322900,15.006942,14.582900,0.050257,1
2022-10-28 16:00:00,0.948511,0.838191,0.838129,0.838419,0.953725,-0.257618,-0.860000,7.968697,14.860000,0.081427,1


In [12]:
#train and test data
split = round(0.8*len(df))
train, test = df[:split],df[split:]
x_train = train[['vwap','SMA(5)','SMA(10)','12dayEWM','rsi','MACD','mom','mfi','spread']]
y_train = train[['output']]

x_test = test[['vwap','SMA(5)','SMA(10)','12dayEWM','rsi','MACD','mom','mfi','spread']]
y_test = test[['output']]

In [13]:
df.dropna(inplace=True)
df.to_csv('combined_features.csv')