In [2]:
# IMPORTING LIBRARIES 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier 
import sklearn.datasets
import sklearn.preprocessing
import sklearn.random_projection
import sklearn.neighbors
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, RocCurveDisplay


In [3]:
# Tunable parameters shared by the feature-engineering, labelling and backtest cells
n = 5  # number of previous rows for which lagged price / ratio columns are created
profit_taking = 0.0035  # take-profit level as a fraction of the Open; defines the target and the simulated exit
p = 0.27  # minimum predicted probability of class 1 for a day to count as an entered trade

In [4]:
# Load the cleaned FX dataset from disk and report its (rows, columns) shape
fx_csv_path = "data/final_clean_FX_data.csv"
df_original = pd.read_csv(fx_csv_path)
print(df_original.shape)

(5262, 11)


In [5]:
# Feature engineering: lagged price features plus the take-profit target.
# Rows are ordered by Date first so that every shift()/rolling() below only looks backwards.
# NOTE(review): assumes "Date" sorts chronologically (ISO-formatted strings or datetimes) - confirm.
df_original = df_original.sort_values(by="Date", ascending=True)
# Work on a copy: a plain `df = df_original` makes both names refer to the same frame,
# so every column added below would also be written into df_original.
df = df_original.copy()

# Same-day ratios relative to the Open. They only serve as the basis for the lagged
# columns below and are dropped again at the end of this cell.
df["c_over_o"] = (df["Close"] - df["Open"]) / df["Open"]
df["h_over_o"] = (df["High"]  - df["Open"]) / df["Open"]
df["l_over_o"] = (df["Low"]   - df["Open"]) / df["Open"]
df["range"] = (df["High"] - df["Low"]) / df["Open"]


# Lagged copies of the raw prices and of the ratios for each of the previous n rows
for before in range(1, n+1):
    df[f"Close_{before}_before"] = df["Close"].shift(before)
    df[f"Open_{before}_before"] = df["Open"].shift(before)
    df[f"High_{before}_before"] = df["High"].shift(before)
    df[f"Low_{before}_before"] = df["Low"].shift(before)
    df[f"c_over_o_lag{before}"] = df["c_over_o"].shift(before)
    df[f"h_over_o_lag{before}"] = df["h_over_o"].shift(before)
    df[f"l_over_o_lag{before}"] = df["l_over_o"].shift(before)

# Previous-row range and its 5-row rolling mean
df["range_lag1"] = df["range"].shift(1)
df["range_5d"] = df["range_lag1"].rolling(5).mean()


# Log returns of the Close, and rolling std (vol_*) / rolling mean (mom_*) of the 1-row return
df["ret_1d"] = np.log(df["Close"]) - np.log(df["Close_1_before"])
df["ret_5d"] = np.log(df["Close"]) - np.log(df["Close_5_before"])
df["vol_5d"]  = df["ret_1d"].rolling(5).std()
df["vol_10d"] = df["ret_1d"].rolling(10).std()
df["mom_5d"] = df["ret_1d"].rolling(5).mean()
df["mom_10d"] = df["ret_1d"].rolling(10).mean()


feature_cols = [
    "ret_1d", "ret_5d",
    "vol_5d", "vol_10d",
    "mom_5d", "mom_10d",
    "range_5d"
]

# The return-based columns use the current row's Close, so move them back one row
# to keep same-day information out of the features.
# NOTE(review): range_5d is built from range_lag1, so after this shift it is lagged
# twice - confirm that is intended.
df[feature_cols] = df[feature_cols].shift(1)

# Target is 1 when the row's High reaches the take-profit level above its Open.
# `>=` is used so the label agrees with the take-profit checks in the backtest cell,
# which all treat High == level as a hit.
df["target"] = (df["High"] >= (profit_taking+1)*df["Open"]).astype(int)
# Snapshot taken before dropna(), so it still contains the rows removed from df below
target = df[["target"]]
df = df.dropna()
df = df.drop(columns=['c_over_o', 'h_over_o', 'l_over_o', 'range'])
print(df.shape)

(3338, 55)


In [6]:
# Chronological train / validation / test partition on the Date column
in_train = (df['Date'] >= '2013-01-01') & (df['Date'] < '2020-01-01')
in_val = (df['Date'] >= '2020-01-01') & (df['Date'] < '2022-01-01')
in_test = df['Date'] >= '2022-01-01'

train_df = df[in_train]
val_df = df[in_val]
test_df = df[in_test]

# Columns that are not model inputs: the label, the date and the same-day prices
non_feature_cols = ['target', 'Date', 'Close', 'Open', 'High', 'Low']

x_train, y_train = train_df.drop(columns=non_feature_cols), train_df['target']
x_val, y_val = val_df.drop(columns=non_feature_cols), val_df['target']
x_test, y_test = test_df.drop(columns=non_feature_cols), test_df['target']

print(test_df.shape)

(1011, 55)


In [7]:
# Standardise the features: the scaler is fitted on the training split only and
# then applied unchanged to the validation and test splits.
standardize = StandardScaler(with_mean=True, with_std=True)
standardize.fit(x_train)


def _as_scaled_frame(frame):
    """Apply the fitted scaler to `frame`, keeping its column labels and row index."""
    return pd.DataFrame(
        standardize.transform(frame),
        columns=frame.columns,
        index=frame.index
    )


x_train = _as_scaled_frame(x_train)
x_val = _as_scaled_frame(x_val)
x_test = _as_scaled_frame(x_test)

# pca
#pca = sklearn.decomposition.PCA(n_components=15)
#pca.fit(x_train)
#x_train = pca.transform(x_train)
#x_test = pca.transform(x_test)
#x_val = pca.transform(x_val)

In [8]:
# Shape of each feature matrix, in train / val / test order
for split_features in (x_train, x_val, x_test):
    print(split_features.shape)

# Class balance of the target in each split, in the same order
for split_labels in (y_train, y_val, y_test):
    print(split_labels.value_counts(normalize=True))

(1804, 49)
(523, 49)
(1011, 49)
target
0    0.546009
1    0.453991
Name: proportion, dtype: float64
target
0    0.629063
1    0.370937
Name: proportion, dtype: float64
target
0    0.695351
1    0.304649
Name: proportion, dtype: float64


In [9]:
# MODEL TRAINING
# MLP with two hidden layers (16 and 8 units); random_state is fixed so the fit is repeatable
mlp_settings = dict(
    hidden_layer_sizes=(16, 8),
    activation="relu",
    alpha=0.001,
    max_iter=2000,
    random_state=42,
)
model1 = MLPClassifier(**mlp_settings)
model1.fit(x_train, y_train)

# Column 1 of predict_proba holds the predicted probability of target == 1
train_probs = model1.predict_proba(x_train)[:, 1]
val_probs = model1.predict_proba(x_val)[:, 1]

train_auc = roc_auc_score(y_train, train_probs)
val_auc = roc_auc_score(y_val, val_probs)

print(f"Train ROC AUC: {train_auc:.4f}")
print(f"Val   ROC AUC: {val_auc:.4f}")

# Held-out test split, scored and reported after train / validation
test_probs = model1.predict_proba(x_test)[:, 1]
test_auc = roc_auc_score(y_test, test_probs)
print(f"Test ROC AUC: {test_auc:.4f}")


Train ROC AUC: 0.7754
Val   ROC AUC: 0.6112
Test ROC AUC: 0.6166


In [21]:
import numpy as np
from sklearn.metrics import roc_auc_score

# ======================================================
# SETUP
# ======================================================

# Private copy of the test split so the columns added in this cell stay out of test_df
df_test = test_df.copy()

# ======================================================
# MODEL PROBABILITIES
# ======================================================

X_test = df_test.drop(columns=["target", "Date", "Close", "Open", "High", "Low"])
# StandardScaler.transform returns a bare ndarray. The model was fitted on a DataFrame,
# so re-attach the column labels and index (as the scaling cell does); otherwise
# scikit-learn warns about missing feature names and cannot validate the column names.
X_test = pd.DataFrame(
    standardize.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Column 1 of predict_proba holds the predicted probability of target == 1
probs = model1.predict_proba(X_test)[:, 1]

auc = roc_auc_score(df_test["target"], probs)
print(f"ROC-AUC: {auc:.3f}")

# ======================================================
# HIT RATES
# ======================================================

# Share of all test days on which the target is 1
base_hit_rate = df_test["target"].mean()
# A trade is entered on every day whose predicted probability reaches the threshold p
entered = probs >= p
# Share of entered days on which the target is 1 (NaN when no day is entered)
hit_rate_trades = df_test.loc[entered, "target"].mean()
# Fraction of test days that are entered
coverage = entered.mean()

print("\n===== HIT RATES =====")
print("Base hit rate:", base_hit_rate)
print("Hit rate on trades:", hit_rate_trades)
print("Coverage:", coverage)

# ======================================================
# RETURNS (EXECUTION ONLY)
# ======================================================

# Open-to-close return of each day; used as the trade result when the take-profit is not reached
df_test["ret_oc"] = (df_test["Close"] - df_test["Open"]) / df_test["Open"]

# ======================================================
# COMPOUNDED RETURNS
# ======================================================

def simulate_compounded_equity(df, enter_mask, profit_taking, start_equity=1.0):
    """Compound per-row trade returns into an equity curve.

    For every row where ``enter_mask`` is truthy, equity is multiplied by
    ``1 + r``. ``r`` is ``profit_taking`` when the row's High is at least
    ``(1 + profit_taking) * Open``; otherwise it is the row's open-to-close
    return. Rows that are not entered leave the equity unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Open" and "High". The open-to-close return is read from
        "ret_oc" when present, otherwise computed from "Close" and "Open".
    enter_mask : iterable of bool-like
        One flag per row of ``df``, in row order.
    profit_taking : float
        Take-profit level as a fraction of the Open.
    start_equity : float, default 1.0
        Equity before the first row.

    Returns
    -------
    (equity, equity_curve) : tuple
        Final equity and a list with the equity after each row.

    Raises
    ------
    ValueError
        If ``enter_mask`` does not have exactly one entry per row of ``df``
        (pairing them with zip would silently drop the surplus).
    """
    take = np.asarray(list(enter_mask), dtype=bool)
    if take.shape[0] != len(df):
        raise ValueError(
            f"enter_mask has {take.shape[0]} entries but df has {len(df)} rows"
        )
    if len(df) == 0:
        return start_equity, []

    open_px = df["Open"].to_numpy(dtype=float)
    high_px = df["High"].to_numpy(dtype=float)
    if "ret_oc" in df.columns:
        ret_oc = df["ret_oc"].to_numpy(dtype=float)
    else:
        ret_oc = (df["Close"].to_numpy(dtype=float) - open_px) / open_px

    tp_hit = high_px >= (1 + profit_taking) * open_px
    trade_ret = np.where(tp_hit, profit_taking, ret_oc)
    # Rows that are not entered contribute a neutral factor of 1
    growth = np.where(take, 1 + trade_ret, 1.0)

    # Seeding the cumulative product with start_equity keeps the multiplication
    # order identical to a row-by-row loop.
    curve = np.cumprod(np.concatenate(([float(start_equity)], growth)))[1:]
    return float(curve[-1]), curve.tolist()


# Baseline strategy: enter on every test day
baseline_enter = np.full(len(df_test), True)

baseline_final, baseline_curve = simulate_compounded_equity(df_test, baseline_enter, profit_taking)
# Model strategy: enter only on the days flagged in `entered`
model_final, model_curve = simulate_compounded_equity(df_test, entered, profit_taking)

# Equity starts at 1, so final equity minus 1 is the compounded return
baseline_comp_return, model_comp_return = baseline_final - 1, model_final - 1
# Relative difference between the model's and the baseline's final equity
compounded_above_baseline = model_final / baseline_final - 1

print("\n===== COMPOUNDED PERFORMANCE =====")
for summary_line in (
    f"Baseline final equity:     {baseline_final:.4f}",
    f"Model final equity:        {model_final:.4f}",
    f"Baseline compounded return: {baseline_comp_return:.2%}",
    f"Model compounded return:    {model_comp_return:.2%}",
    f"Compounded ABOVE baseline:  {compounded_above_baseline * 100:.2f}%",
):
    print(summary_line)

# ======================================================
# NON-COMPOUNDED RETURNS (PURE .sum() OF %)
# ======================================================

# Model P&L: per-day return is 0 unless the day is entered
df_pnl = df_test.copy()
df_pnl["enter"] = entered.astype(int)

pnl_took_trade = df_pnl["enter"] == 1
pnl_tp_level = (1 + profit_taking) * df_pnl["Open"]
pnl_tp_hit = df_pnl["High"] >= pnl_tp_level
pnl_tp_missed = df_pnl["High"] < pnl_tp_level

# Entered + take-profit reached -> profit_taking; entered + not reached -> open-to-close return
df_pnl["daily_return"] = np.select(
    [(pnl_took_trade & pnl_tp_hit).to_numpy(), (pnl_took_trade & pnl_tp_missed).to_numpy()],
    [profit_taking, df_pnl["ret_oc"].to_numpy()],
    default=0.0,
)

model_non_comp_return = df_pnl["daily_return"].sum()
model_trades = df_pnl["enter"].sum()

# ======================================================
# BASELINE NON-COMPOUNDED
# ======================================================

# Baseline enters every day, so each row gets either the take-profit or its open-to-close return
df_base = df_test.copy()
base_tp_hit = df_base["High"] >= (1 + profit_taking) * df_base["Open"]
df_base["daily_return"] = np.where(base_tp_hit, profit_taking, df_base["ret_oc"])

baseline_non_comp_return = df_base["daily_return"].sum()
baseline_trades = len(df_base)

# Difference of the two simple sums of daily returns
non_comp_above_baseline = model_non_comp_return - baseline_non_comp_return

# ======================================================
# FINAL SUMMARY (PRINT EVERYTHING)
# ======================================================

print("\n===== NON-COMPOUNDED PERFORMANCE (.sum OF RETURNS) =====")
for summary_line in (
    f"Baseline non-compounded return: {baseline_non_comp_return:.4%}",
    f"Model non-compounded return:    {model_non_comp_return:.4%}",
    f"NON-COMPOUNDED ABOVE BASELINE:  {non_comp_above_baseline * 100:.4f}%",
):
    print(summary_line)

print("\n===== TRADE COUNTS =====")
print(f"Baseline trades: {baseline_trades}")
print(f"Model trades:    {model_trades}")



ROC-AUC: 0.617

===== HIT RATES =====
Base hit rate: 0.304648862512364
Hit rate on trades: 0.45634920634920634
Coverage: 0.24925816023738873

===== COMPOUNDED PERFORMANCE =====
Baseline final equity:     0.9579
Model final equity:        1.1418
Baseline compounded return: -4.21%
Model compounded return:    14.18%
Compounded ABOVE baseline:  19.19%

===== NON-COMPOUNDED PERFORMANCE (.sum OF RETURNS) =====
Baseline non-compounded return: -3.8226%
Model non-compounded return:    13.4102%
NON-COMPOUNDED ABOVE BASELINE:  17.2328%

===== TRADE COUNTS =====
Baseline trades: 1011
Model trades:    252


