In [1]:
# pip install pandas_ta

In [None]:
import pandas as pd
import numpy as np
import pandas_ta as pta

In [None]:
ibm_df = pd.read_csv('/content/gdrive/MyDrive/IS460 ML Project/trading_models/ibm.csv')
hp_df = pd.read_csv('/content/gdrive/MyDrive/IS460 ML Project/trading_models/hp.csv')

In [None]:
ibm_df.set_index('time',inplace=True)
hp_df.set_index('time',inplace=True)

ibm_df.sort_index(inplace=True)
hp_df.sort_index(inplace=True)

In [None]:
ibm_df['vwap'] = (ibm_df.volume*(ibm_df.high+ibm_df.low)/2).cumsum() / ibm_df.volume.cumsum()
hp_df['vwap'] = (hp_df.volume*(hp_df.high+ibm_df.low)/2).cumsum() / hp_df.volume.cumsum()

#ma5
ibm_df['SMA(5)'] = ibm_df.close.rolling(5).mean()
hp_df['SMA(5)'] = hp_df.close.rolling(5).mean()

#ma10
ibm_df['SMA(10)'] = ibm_df.close.rolling(10).mean()
hp_df['SMA(10)'] = hp_df.close.rolling(10).mean()

ibm_df['12dayEWM'] = ibm_df.close.ewm(span=5, adjust=False).mean()
hp_df['12dayEWM'] = hp_df.close.ewm(span=5, adjust=False).mean()

#macd
ibm_df['MACD'] = pta.macd(ibm_df['close']).iloc[:,0]
hp_df['MACD'] = pta.macd(hp_df['close']).iloc[:,0]

In [None]:
#rsi
#Might have to adjust the length
ibm_df["rsi"] = pta.rsi(ibm_df['close'], length = 12)
hp_df["rsi"] = pta.rsi(hp_df['close'], length = 12)

#Mom - also might need to adjust the shift
ibm_df["rolling"] = ibm_df.close.shift(12)
ibm_df["mom"] = ibm_df["close"] - ibm_df["rolling"]
ibm_df.drop(["rolling"], axis=1, inplace=True)
hp_df["rolling"] = hp_df.close.shift(12)
hp_df["mom"] = hp_df["close"] - hp_df["rolling"]
hp_df.drop(["rolling"], axis=1, inplace=True)

#mfi
def mfi(h, l, o, c, v, n=12):
  typical_price = (h+l+c)/3
  money_flow = typical_price*v
  mf_sign = np.where(typical_price > typical_price.shift(1),1,-1)
  signed_mf = money_flow * mf_sign

  mf_ave_gain = signed_mf.rolling(n).apply(lambda x: ((x>0)*x).sum(), raw = True)
  mf_ave_loss = signed_mf.rolling(n).apply(lambda x: ((x<0)*x).sum(), raw = True)
  return (100-(100/(1+mf_ave_gain / abs(mf_ave_loss)))).to_numpy()

ibm_df["mfi"] = mfi(ibm_df.high,ibm_df.low,ibm_df.open,ibm_df.close,ibm_df.volume)
hp_df["mfi"] = mfi(hp_df.high,hp_df.low,hp_df.open,hp_df.close,hp_df.volume)

**Combining Features**

In [None]:
ratio_features = ['vwap','SMA(5)','SMA(10)','12dayEWM','rsi'] #exlude momentum, MFI and MACD as taking their ratio is also weird
diff_features = ['MACD','mom','mfi']
ratio_features_df = ibm_df[ratio_features]/hp_df[ratio_features]
diff_features_df = ibm_df[diff_features] - hp_df[diff_features]
diff_features_df['spread'] = ibm_df['close'] - hp_df['close']

features_df = pd.concat([ratio_features_df,diff_features_df],axis=1)

In [None]:
features_df.shape

(3595, 9)

In [None]:
features_df.dropna(inplace = True)
features_df

Unnamed: 0_level_0,vwap,SMA(5),SMA(10),12dayEWM,rsi,MACD,mom,mfi,spread
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-10-04 08:00:00,1.659676,4.768259,4.786339,4.771101,1.107056,0.867368,0.543952,43.744151,102.684738
2021-10-04 09:00:00,1.659675,4.766889,4.776943,4.769641,1.032303,0.822687,0.844880,42.875897,102.612395
2021-10-04 10:00:00,1.660783,4.769485,4.775439,4.779400,1.121594,0.918496,2.310488,42.838238,104.369796
2021-10-04 11:00:00,1.664391,4.774242,4.771703,4.777858,1.000573,0.913061,0.528564,24.307401,103.491902
2021-10-04 12:00:00,1.664180,4.777072,4.769049,4.770852,0.963529,0.904516,0.586155,11.604788,103.568593
...,...,...,...,...,...,...,...,...,...
2022-09-23 13:00:00,1.579399,4.908354,4.900728,4.905692,1.341466,-0.357752,-2.098000,-26.144407,97.885000
2022-09-23 14:00:00,1.579414,4.905133,4.904967,4.903628,1.290696,-0.426738,-2.333600,-34.984305,97.701400
2022-09-23 15:00:00,1.579429,4.894920,4.903911,4.890514,1.061713,-0.552944,-3.514100,-43.456076,96.735900
2022-09-23 16:00:00,1.579460,4.882692,4.900211,4.873021,0.829592,-0.594141,-2.135000,-22.911079,97.380000


In [None]:
# path = '/content/gdrive/MyDrive/IS460 ML Project/trading_models/'
# ibm_df.to_csv(path+'ibm_features.csv')
# hp_df.to_csv(path+'hp_features.csv')
# features_df.to_csv(path+'combined_features.csv')

In [None]:
### Don't forget to normalize these features if your model is sensitive.

In [None]:
#generate label for training
import numpy as np
def gen_labels(df,t,threshold):
    # Calculate % return on spread t hours later
    df['forward_return'] = df['spread'].diff(periods=t)/df['spread']
    
    #If the return is more than x%, we should have bought, and hence the label is (1)
    #If return is less than x%, we should have sold, and hence label is (-1)
    #If in between, do nothing (0)
    df['output'] = np.select([df['forward_return'] > threshold ,df['forward_return'] < -threshold],[1,-1])
    return df

df = gen_labels(features_df,t=24,threshold=0.02)
df

Unnamed: 0_level_0,vwap,SMA(5),SMA(10),12dayEWM,rsi,MACD,mom,mfi,spread,forward_return,output
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-10-04 08:00:00,1.659676,4.768259,4.786339,4.771101,1.107056,0.867368,0.543952,43.744151,102.684738,,0
2021-10-04 09:00:00,1.659675,4.766889,4.776943,4.769641,1.032303,0.822687,0.844880,42.875897,102.612395,,0
2021-10-04 10:00:00,1.660783,4.769485,4.775439,4.779400,1.121594,0.918496,2.310488,42.838238,104.369796,,0
2021-10-04 11:00:00,1.664391,4.774242,4.771703,4.777858,1.000573,0.913061,0.528564,24.307401,103.491902,,0
2021-10-04 12:00:00,1.664180,4.777072,4.769049,4.770852,0.963529,0.904516,0.586155,11.604788,103.568593,,0
...,...,...,...,...,...,...,...,...,...,...,...
2022-09-23 13:00:00,1.579399,4.908354,4.900728,4.905692,1.341466,-0.357752,-2.098000,-26.144407,97.885000,-0.028656,-1
2022-09-23 14:00:00,1.579414,4.905133,4.904967,4.903628,1.290696,-0.426738,-2.333600,-34.984305,97.701400,-0.033097,-1
2022-09-23 15:00:00,1.579429,4.894920,4.903911,4.890514,1.061713,-0.552944,-3.514100,-43.456076,96.735900,-0.040152,-1
2022-09-23 16:00:00,1.579460,4.882692,4.900211,4.873021,0.829592,-0.594141,-2.135000,-22.911079,97.380000,-0.034042,-1


In [None]:
#train and test data
split = round(0.8*len(df))
train, test = df[:split],df[split:]
x_train = train[['vwap','SMA(5)','SMA(10)','12dayEWM','rsi','MACD','mom','mfi','spread']]
y_train = train[['output']]

x_test = test[['vwap','SMA(5)','SMA(10)','12dayEWM','rsi','MACD','mom','mfi','spread']]
y_test = test[['output']]
