In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

forex_df = pd.read_pickle("../data/GBP_JPY_H1.pkl")

In [2]:
forex_df.columns

Index(['time', 'volume', 'mid_o', 'mid_h', 'mid_l', 'mid_c', 'bid_o', 'bid_h',
       'bid_l', 'bid_c', 'ask_o', 'ask_h', 'ask_l', 'ask_c'],
      dtype='object')

In [3]:
forex_df.drop(['bid_o', 'bid_h','bid_l', 'bid_c', 'ask_o','ask_h', 'ask_l', 'ask_c'], axis=1, inplace=True)

In [4]:
def apply_direction(df):
    direction = df.mid_c - df.mid_o
    direction = [1 if x >= 0 else -1 for x in direction]
    df['direction'] = direction

In [5]:
def create_lagged_features(df, max_lag, forecast_horizon):
    for lag in range(1, max_lag + 1):
        df[f'mid_c_lag_{lag}'] = df['mid_c'].shift(lag)

    # Create a target variable, shifting 'mid_c' back by the forecast horizon
    df['future_mid_c'] = df['mid_c'].shift(-forecast_horizon)
    
    # Remove rows with NaN values that result from shifting
    df.dropna(inplace=True)

In [6]:
apply_direction(forex_df)
create_lagged_features(forex_df, 5, 3)

In [7]:
backtest_split = len(forex_df) - int(len(forex_df)*0.5)

# DataFrame used for training, hyperparameter tuning, and testing prediction accuracy
df = forex_df.iloc[:backtest_split]

# DataFrame used for backtesting trading
backtest_df = forex_df.iloc[backtest_split:]

In [8]:
def tune_catboost_hyperparams(X, y, param_grid, cv=3, n_iter=50, plot=True):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create a CatBoostRegressor instance
    model = CatBoostRegressor(loss_function='RMSE', eval_metric='RMSE', verbose=0)
    
    # Setup the GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=1)
    
    # Fit the model
    grid_search.fit(X_train, y_train)
    
    # Best model
    best_model = grid_search.best_estimator_
    
    return best_model

In [9]:
# Assuming 'Price' is the target variable and the rest are features
X = df.drop(['time','mid_c'], axis=1)
y = df['mid_c']

# Split the data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Example parameter grid
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 10]
}

# Tuning hyperparameters
best_model = tune_catboost_hyperparams(X, y, param_grid)

# Optionally, you can print the best parameters
print("Best Parameters:", best_model.get_params())

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'iterations': 200, 'learning_rate': 0.1, 'depth': 10, 'loss_function': 'RMSE', 'verbose': 0, 'eval_metric': 'RMSE'}


In [10]:
# Make predictions
predictions = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error on Test Set: {mse}')

Mean Squared Error on Test Set: 0.03073726112092706


In [11]:
# Define thresholds
threshold_pct = 0.01  # 1% change
stop_loss_pct = 0.005  # 0.5%
take_profit_pct = 0.015  # 1.5%

# Calculate signals
backtest_df['price_change'] = (backtest_df['future_mid_c'] - backtest_df['mid_c']) / backtest_df['mid_c']

# Generate trading signals
backtest_df['signal'] = np.where(backtest_df['price_change'] > threshold_pct, 'Buy',
                        np.where(backtest_df['price_change'] < -threshold_pct, 'Sell', 'Hold'))

# Calculate stop loss and take profit prices for each trade
backtest_df['stop_loss'] = np.where(backtest_df['signal'] == 'Buy', backtest_df['mid_c'] * (1 - stop_loss_pct),
                           np.where(backtest_df['signal'] == 'Sell', backtest_df['mid_c'] * (1 + stop_loss_pct), np.nan))

backtest_df['take_profit'] = np.where(backtest_df['signal'] == 'Buy', backtest_df['mid_c'] * (1 + take_profit_pct),
                             np.where(backtest_df['signal'] == 'Sell', backtest_df['mid_c'] * (1 - take_profit_pct), np.nan))

backtest_df = backtest_df[['time', 'mid_c', 'future_mid_c', 'signal', 'stop_loss', 'take_profit']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  backtest_df['price_change'] = (backtest_df['future_mid_c'] - backtest_df['mid_c']) / backtest_df['mid_c']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  backtest_df['signal'] = np.where(backtest_df['price_change'] > threshold_pct, 'Buy',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  backtest_df['s

In [12]:
backtest_df[backtest_df.signal != 'Hold']

Unnamed: 0,time,mid_c,future_mid_c,signal,stop_loss,take_profit
18785,2019-01-15 17:00:00+00:00,138.286,140.006,Buy,137.59457,140.36029
18786,2019-01-15 18:00:00+00:00,137.906,139.792,Buy,137.21647,139.97459
19737,2019-03-12 08:00:00+00:00,147.306,145.301,Sell,148.04253,145.09641
19738,2019-03-12 09:00:00+00:00,147.070,145.572,Sell,147.80535,144.86395
22383,2019-08-13 10:00:00+00:00,127.158,128.767,Buy,126.52221,129.06537
...,...,...,...,...,...,...
30107,2020-11-09 09:00:00+00:00,136.174,138.244,Buy,135.49313,138.21661
30108,2020-11-09 10:00:00+00:00,136.306,138.228,Buy,135.62447,138.35059
30109,2020-11-09 11:00:00+00:00,137.228,138.616,Buy,136.54186,139.28642
30583,2020-12-07 05:00:00+00:00,139.652,138.102,Sell,140.35026,137.55722


In [13]:
cumulative_gains = 0

for index, row in backtest_df.iterrows():
    entry_price = row['mid_c']
    gain = 0
    if row['signal'] == 'Buy':
        # Check if take profit or stop loss would have been hit first
        if row['take_profit'] <= row['future_mid_c']:
            gain = row['take_profit'] - entry_price
        elif row['stop_loss'] >= row['future_mid_c']:
            gain = row['stop_loss'] - entry_price
        else:
            gain = row['future_mid_c'] - entry_price  # close at future price if neither is hit
    elif row['signal'] == 'Sell':
        if row['take_profit'] >= row['future_mid_c']:
            gain = entry_price - row['take_profit']
        elif row['stop_loss'] <= row['future_mid_c']:
            gain = entry_price - row['stop_loss']
        else:
            gain = entry_price - row['future_mid_c']  # close at future price if neither is hit
    
    # Update cumulative gains
    cumulative_gains += gain

# Print the result
print(f'Cumulative gains: {cumulative_gains}')

Cumulative gains: 117.009585
