### Imports

In [3]:
from backtesting.test import EURUSD, GOOG



### Process parquet

In [4]:
import pandas as pd
import pyarrow.parquet as pq

DATA_PATH='/home/bkadmin/datasets/spatialedge-hackathon-H1/v1/' 
FROM_DATE='20200101'
TO_DATE='20221231'
SYMBOL = 'EURCHF'
TIMEFRAME = 'H1'

def read_and_process_parquet(data_path, from_date, to_date, symbol, timeframe):
    partition = ['symbol','timeframe', 'date', 'date']
    operator = ['=', '=', '>=', '<=']
    params = [symbol, timeframe, from_date, to_date]
            
    dataset = pq.ParquetDataset(data_path, filters=list(zip(partition, operator, params)), use_legacy_dataset=True)
    table = dataset.read()
    df = table.to_pandas()

    df['date'] = df['date'].astype(str)
    df['time'] = df['time'].astype(str)

    df['datetime'] = df['date'] + ' ' + df['time']
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y%m%d %H:%M:%S')
    df.set_index('datetime', inplace=True)

    df.drop(['time', 'symbol', 'timeframe', 'date'], axis=1, inplace=True)
    df = df.sort_values('datetime')
    df.fillna(method='ffill', inplace=True)

    return df 

data = read_and_process_parquet(DATA_PATH, FROM_DATE, TO_DATE, SYMBOL, TIMEFRAME)

  dataset = pq.ParquetDataset(data_path, filters=list(zip(partition, operator, params)), use_legacy_dataset=True)
  df.fillna(method='ffill', inplace=True)


### Create helper functions

In [5]:
#data = EURUSD.copy()

# Relative Strength Index (RSI)
#close = data.Close.values


def RSI(values, period=168):
    delta = pd.Series(values).diff()
    gain = delta.where(delta>0.0)
    #gain.fillna(0, inplace=True) 
    loss = -delta.where(delta<0.0)
    #loss.fillna(0, inplace=True) #TODO: instead of replacing with 0, remove those vaues
    loss
    avg_gain = gain.rolling(period, min_periods=1).mean() # average gain over 7 day period (168hr) 
    avg_gain.fillna(0, inplace=True)
    avg_loss = loss.rolling(period, min_periods=1).mean() # average loss over 7 day period (168hr)
    avg_loss.fillna(0.000000001, inplace=True)
    rs = (avg_gain/avg_loss)
    rsi = 100-(100/(1+rs))
   
    return rsi

def SMA(values, n):
    
    """
    Return simple moving average of `values`, at
    each step taking into account `n` previous values.
    """
    return pd.Series(values).rolling(n).mean()

def Stochastic_Oscillator(values, period=168):
    '''Calculates stochastic oscillator'''
    time_period = period
    data = pd.Series(values)
    if len(values) < time_period:
        time_period = len(values)
    lowest_close_in_period = data.rolling(time_period, min_periods=1).min()
    highest_close_in_period = data.rolling(time_period, min_periods=1).max()
    so = ((data-lowest_close_in_period)/(highest_close_in_period-lowest_close_in_period))*100
    return so


### Create features

In [6]:
data.columns = ['Open','High','Low','Close','Volume']
rsi = RSI(data.Close, 168)
so14 = Stochastic_Oscillator(data.Close, 14)
so28 = Stochastic_Oscillator(data.Close, 28)
sma10 = SMA(data.Close, 10)
sma20 = SMA(data.Close, 20)
sma50 = SMA(data.Close, 50)

data['X_RSI'] = rsi
data['X_SO14'] = so14
data['X_SO28'] = so28
data['X_DELTA_SO14'] = (so14-so28)/data.Close.values
data['X_SMA10'] = sma10
data['X_SMA20'] = sma20
data['X_SMA50'] = sma50
data['X_DELTA_SMA10'] = (sma10-sma20)/data.Close.values
data['X_DELTA_SMA20'] = (sma50-sma20)/data.Close.values
data.dropna().astype(float)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,X_RSI,X_SO14,X_SO28,X_DELTA_SO14,X_SMA10,X_SMA20,X_SMA50,X_DELTA_SMA10,X_DELTA_SMA20
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-05 23:00:00,1.083520,1.084500,1.083520,1.084370,7853.820027,45.114428,42.018779,53.130930,-10.247564,1.084604,1.084215,1.085165,0.000359,0.000876
2020-01-06 00:00:00,1.084365,1.084800,1.084185,1.084580,7548.190011,44.614496,51.877934,61.100569,-8.503416,1.084551,1.084262,1.085151,0.000267,0.000820
2020-01-06 01:00:00,1.084600,1.084680,1.083815,1.084115,8137.510020,44.783102,30.046948,43.453510,-12.366365,1.084596,1.084269,1.085118,0.000301,0.000783
2020-01-06 02:00:00,1.084130,1.084495,1.084040,1.084200,6787.410002,44.058286,31.961259,46.679317,-13.575039,1.084606,1.084271,1.085076,0.000309,0.000742
2020-01-06 03:00:00,1.084205,1.084500,1.084035,1.084430,9166.750024,43.655247,43.099274,55.407970,-11.350383,1.084585,1.084344,1.085041,0.000222,0.000643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-30 17:00:00,0.988540,0.988855,0.987415,0.987485,14612.490284,51.597549,79.827916,81.076233,-1.264138,0.985817,0.984923,0.984949,0.000905,0.000026
2022-12-30 18:00:00,0.987475,0.988600,0.987295,0.988225,18293.070081,51.559651,93.977055,93.977055,0.000000,0.986004,0.985122,0.985008,0.000893,-0.000115
2022-12-30 19:00:00,0.988245,0.989325,0.987905,0.989110,12581.919959,51.822234,100.000000,100.000000,0.000000,0.986545,0.985374,0.985099,0.001184,-0.000277
2022-12-30 20:00:00,0.989075,0.990020,0.988700,0.989885,10301.150102,52.079265,100.000000,100.000000,0.000000,0.987182,0.985668,0.985181,0.001530,-0.000492


### Create matrix

In [7]:
import numpy as np

def get_X(data):
    return data.filter(like='X').values

def get_y(data):
    y = data.Close.pct_change(48).shift(-48)
    y[y.between(-.004,.004)] = 0
    y[y > 0] = 1
    y[y < 0] = -1
    return y

def get_clean_Xy(df):
    X = get_X(df)
    y = get_y(df).values
    isnan = np.isnan(y)
    X = X[~isnan]
    y = y[~isnan]
    return X, y



### Test model

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

X, y = get_clean_Xy(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .5, random_state=0)
model = DecisionTreeClassifier()
#pd.DataFrame(X)
model.fit(X,y)
y_pred = model.predict(X_test)
#_ = pd.DataFrame({'y_true': y_test, 'y_pred': y_pred}).plot(figsize=(15, 2), alpha=.7)
print('Classification accuracy: ', np.mean(y_test == y_pred))


Classification accuracy:  0.8625414837811798


### Strategy based on Relative Strength Index

In [16]:
from backtesting import Strategy, Backtest
from backtesting.lib import crossover

N_TRAIN = 1464
    
class RulesBasedStrategy(Strategy):
    # TODO: add rules based
    # The SMA periods
    n1 = 10
    n2 = 20

    def init(self):
        self.sma1 = self.I(SMA, self.data.Close, self.n1)
        self.sma2 = self.I(SMA, self.data.Close, self.n2)
        self.rsi = self.I(RSI, self.data.Close, 23) # Calculate 1 day rsi
        self.stochastic_oscillator = self.I(Stochastic_Oscillator, self.data.Close, 168)

    def next(self):
        # If all indicators say you should buy, buy
        if self.rsi < 30 or self.stochastic_oscillator< 20 or crossover(self.sma1, self.sma2):
            self.position.close()
            self.buy()
        
        # If all indicators say you should sell, sell
        elif self.rsi > 70 or self.stochastic_oscillator > 80 or crossover(self.sma2, self.sma1):
            self.position.close()
            self.sell()

CASH = 1000000        

class MLStrategy(Strategy):
    

    def init(self):
        self.clf = DecisionTreeClassifier()
        df = self.data.df.iloc[:N_TRAIN]
        X,y = get_clean_Xy(df)
        self.clf.fit(X,y)
        self.tpo = False
        self.I(get_y, self.data.df, name='y_true')
        self.forecasts = self.I(lambda: np.repeat(np.nan, len(self.data)), name = 'forecast')

    def next(self):
        if len(self.data)< N_TRAIN: return

        high, low, close = self.data.High, self.data.Low, self.data.Close
        curr_time = self.data.index[-1]
        
        forecast = self.clf.predict(get_X(self.data.df.iloc[-1:]))

        self.forecasts[-1] = forecast

        if not self.tpo:
            if forecast == 1:
                self.position.close()
                self.buy()
            elif forecast == -1:
                self.position.close()
                self.sell()
        
        if self.equity > CASH*1.1: # If equity is ever 10% higher than starting cash, close position and cease trading
            self.position.close()
            self.tpo=True


### Backtest

In [17]:

bt = Backtest(data, MLStrategy, cash=10_000, commission=0.002)
stats = bt.run()
stats
#bt.plot()

Start                     2020-01-01 22:00:00
End                       2022-12-30 21:00:00
Duration                   1093 days 23:00:00
Exposure Time [%]                    62.41858
Equity Final [$]                     0.947719
Equity Peak [$]                       10000.0
Return [%]                         -99.990523
Buy & Hold Return [%]               -8.773813
Return (Ann.) [%]                  -91.699428
Volatility (Ann.) [%]                2.229837
Sharpe Ratio                              0.0
Sortino Ratio                             0.0
Calmar Ratio                              0.0
Max. Drawdown [%]                  -99.990536
Avg. Drawdown [%]                  -99.990536
Max. Drawdown Duration     1003 days 08:00:00
Avg. Drawdown Duration     1003 days 08:00:00
# Trades                                 4963
Win Rate [%]                         1.108201
Best Trade [%]                        1.47974
Worst Trade [%]                     -1.246734
Avg. Trade [%]                    