In [372]:
# IMPORTING LIBRARIES 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier 
import sklearn.datasets
import sklearn.preprocessing
import sklearn.random_projection
import sklearn.neighbors
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import classification_report

In [373]:
# define patterns
n = 5
profit_taking = 0.0035
p = 0.65

In [374]:
#READ IN DATA
df_original = pd.read_csv("data/final_clean_FX_data.csv")

In [375]:
#feature engineer
df_original = df_original.sort_values(by =  "Date", ascending=True)
df = df_original
df["c_over_o"] = (df["Close"] - df["Open"]) / df["Open"]
df["h_over_o"] = (df["High"]  - df["Open"]) / df["Open"]
df["l_over_o"] = (df["Low"]   - df["Open"]) / df["Open"]
df["range"] = (df["High"] - df["Low"]) / df["Open"]


# CREATE PRIOR n DAYS FEATURE
for before in range(1, n+1):
    df[f"Close_{before}_before"] = df["Close"].shift(before)
    df[f"Open_{before}_before"] = df["Open"].shift(before)
    df[f"High_{before}_before"] = df["High"].shift(before)
    df[f"Low_{before}_before"] = df["Low"].shift(before)
    df[f"c_over_o_lag{before}"] = df["c_over_o"].shift(before)
    df[f"h_over_o_lag{before}"] = df["h_over_o"].shift(before)
    df[f"l_over_o_lag{before}"] = df["l_over_o"].shift(before)

#feature enginnering
df["range_lag1"] = df["range"].shift(1)
df["range_5d"] = df["range_lag1"].rolling(5).mean()


df["ret_1d"] = np.log(df["Close"]) - np.log(df["Close_1_before"])
df["ret_5d"] = np.log(df["Close"]) - np.log(df["Close_5_before"])
df["vol_5d"]  = df["ret_1d"].rolling(5).std()
df["vol_10d"] = df["ret_1d"].rolling(10).std()
df["mom_5d"] = df["ret_1d"].rolling(5).mean()
df["mom_10d"] = df["ret_1d"].rolling(10).mean()


feature_cols = [
    "ret_1d", "ret_5d",
    "vol_5d", "vol_10d",
    "mom_5d", "mom_10d", 
    "range_5d"
]

df[feature_cols] = df[feature_cols].shift(1)

df["target"] = (df["High"] > (profit_taking+1)*df["Open"]).astype(int)
target = df[["target"]]
df = df.dropna()
df = df.drop(columns=['c_over_o', 'h_over_o', 'l_over_o', 'range'])

In [376]:
# make splits
train_df   = df[(df['Date'] >= '2007-01-01') & (df['Date'] < '2017-01-01')]
val_df   = df[(df['Date'] >= '2017-01-01') & (df['Date'] < '2020-01-01')]
test_df  = df[df['Date'] >= '2020-01-01']

x_train = train_df.drop(columns=['target', 'Date', 'Close', 'Open', 'High', 'Low'])
y_train = train_df['target']

x_val = val_df.drop(columns=['target', 'Date', 'Close', 'Open', 'High', 'Low'])
y_val = val_df['target']

x_test = test_df.drop(columns=['target', 'Date', 'Close', 'Open', 'High', 'Low'])
y_test = test_df['target']


In [377]:
#scale data

standardize = StandardScaler(with_mean=True, with_std=True)
standardize.fit(x_train)
x_train = pd.DataFrame(
    standardize.transform(x_train),
    columns=x_train.columns,
    index=x_train.index
)
x_val = pd.DataFrame(
    standardize.transform(x_val),
    columns=x_val.columns,
    index=x_val.index
)
x_test = pd.DataFrame(
    standardize.transform(x_test),
    columns=x_test.columns,
    index=x_test.index
)

# pca
pca = sklearn.decomposition.PCA(n_components=10)
pca.fit(x_train)
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)
x_val = pca.transform(x_val)

In [378]:
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)
print(y_train.value_counts(normalize=True))
print(y_val.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

(1022, 10)
(782, 10)
(1534, 10)
target
1    0.520548
0    0.479452
Name: proportion, dtype: float64
target
1    0.524297
0    0.475703
Name: proportion, dtype: float64
target
0    0.605606
1    0.394394
Name: proportion, dtype: float64


In [419]:
# MODEL TRAINING
model1 = MLPClassifier(
    hidden_layer_sizes=(9, ),
    activation="tanh",
    alpha=0.01,
    max_iter=2000,
    random_state=42
)

model1.fit(x_train, y_train)

validation_accuracy = model1.score(x_val, y_val)
print(f"validation_accuracy={validation_accuracy:0.4f}")
train_accuracy = model1.score(x_train, y_train)
print(f"train_accuracy={train_accuracy:0.4f}")
y_pred = model1.predict(x_train)

print("training")
print(classification_report(y_train, y_pred))

y_pred_val = model1.predict(x_val)
print("validation")
print(classification_report(y_val, y_pred_val))

if False:
    model1.fit(x_train, y_train)
    test_accuracy = model1.score(x_test, y_test)
    print(f"test_accuracy={test_accuracy}")

validation_accuracy=0.5217
train_accuracy=0.6800
training
              precision    recall  f1-score   support

           0       0.67      0.65      0.66       490
           1       0.69      0.71      0.70       532

    accuracy                           0.68      1022
   macro avg       0.68      0.68      0.68      1022
weighted avg       0.68      0.68      0.68      1022

validation
              precision    recall  f1-score   support

           0       0.49      0.11      0.18       372
           1       0.53      0.90      0.66       410

    accuracy                           0.52       782
   macro avg       0.51      0.50      0.42       782
weighted avg       0.51      0.52      0.43       782



In [420]:
#eval NN

## cleaning data
X_test = test_df.drop(columns=["target", "Date", "Close", "Open", "High", "Low"])
y_test = test_df["target"]
X_test = standardize.transform(X_test)
X_test = pca.transform(X_test)


## probabilities
probs = model1.predict_proba(X_test)[:, 1]

# probability distribution
print("Probability summary:")
print(pd.Series(probs).describe())

# ROC-accuracy
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y_test, probs)
print(f"ROC-AUC: {auc:.3f}")


# Quintile hit-rate 
df_eval = test_df.copy()
df_eval["prob"] = probs

df_eval["quintile"] = pd.qcut(df_eval["prob"], 5, labels=False)

quintile_perf = (
    df_eval
    .groupby("quintile")["target"]
    .mean()
)

print("TP hit-rate by probability quintile:")
print(quintile_perf)

# threshold-based trading behavior
df_eval["enter"] = (df_eval["prob"] >= p).astype(int)

num_trades = df_eval["enter"].sum()
trade_hit_rate = df_eval.loc[df_eval["enter"] == 1, "target"].mean()
base_hit_rate = df_eval["target"].mean()

print(f"Base hit rate: {base_hit_rate:.3f}")
print(f"Trades taken: {num_trades}")
print(f"Hit rate on trades: {trade_hit_rate:.3f}")


precision = precision_score(
    y_test,
    df_eval["enter"]
)

print(f"Precision (hit rate on entered days): {precision:.3f}")

coverage = df_eval["enter"].mean()

print(f"Coverage (fraction of days entered): {coverage:.3f}")

## model peformance
df_pnl = df_eval.copy()

# Open â†’ Close return
df_pnl["ret_oc"] = (df_pnl["Close"] - df_pnl["Open"]) / df_pnl["Open"]

df_pnl["daily_return"] = 0.0

# If we enter and TP hits
df_pnl.loc[
    (df_pnl["enter"] == 1) & (df_pnl["target"] == 1),
    "daily_return"
] = profit_taking

# If we enter and TP does NOT hit
df_pnl.loc[
    (df_pnl["enter"] == 1) & (df_pnl["target"] == 0),
    "daily_return"
] = df_pnl["ret_oc"]

# Compound equity
df_pnl["equity_model"] = (1 + df_pnl["daily_return"]).cumprod()

model_final_equity = df_pnl["equity_model"].iloc[-1]
print(f"Final equity (model strategy): {model_final_equity:.3f}")


#baseline performance

df_base = test_df.copy()

df_base["ret_oc"] = (df_base["Close"] - df_base["Open"]) / df_base["Open"]

df_base["daily_return"] = df_base["ret_oc"]

# Override with TP when hit
df_base.loc[
    df_base["target"] == 1,
    "daily_return"
] = profit_taking

df_base["equity_base"] = (1 + df_base["daily_return"]).cumprod()

base_final_equity = df_base["equity_base"].iloc[-1]
print(f"Final equity (enter every day): {base_final_equity:.3f}")

relative_performance = model_final_equity / base_final_equity
print(f"Relative performance (model / baseline): {relative_performance:.3f}x")


Probability summary:
count    1534.000000
mean        0.651195
std         0.137817
min         0.155653
25%         0.570482
50%         0.663555
75%         0.764091
max         0.917800
dtype: float64
ROC-AUC: 0.547
TP hit-rate by probability quintile:
quintile
0    0.328990
1    0.358306
2    0.408497
3    0.426710
4    0.449511
Name: target, dtype: float64
Base hit rate: 0.394
Trades taken: 808
Hit rate on trades: 0.432
Precision (hit rate on entered days): 0.432
Coverage (fraction of days entered): 0.527
Final equity (model strategy): 1.143
Final equity (enter every day): 0.930
Relative performance (model / baseline): 1.230x




In [422]:
ps = np.linspace(0.5, 0.9, 41)
results = []

df_test = test_df.copy()
df_test["prob"] = test_probs
df_test["ret_oc"] = (df_test["Close"] - df_test["Open"]) / df_test["Open"]

base_hit_rate = df_test["target"].mean()

for p_try in ps:
    df_tmp = df_test.copy()

    df_tmp["enter"] = (df_tmp["prob"] >= p_try).astype(int)

    # Hit rate on trades
    if df_tmp["enter"].sum() > 0:
        hit_rate = df_tmp.loc[df_tmp["enter"] == 1, "target"].mean()
    else:
        hit_rate = np.nan

    coverage = df_tmp["enter"].mean()

    # PnL
    df_tmp["daily_return"] = 0.0
    df_tmp.loc[
        (df_tmp["enter"] == 1) & (df_tmp["target"] == 1),
        "daily_return"
    ] = profit_taking

    df_tmp.loc[
        (df_tmp["enter"] == 1) & (df_tmp["target"] == 0),
        "daily_return"
    ] = df_tmp["ret_oc"]

    equity = (1 + df_tmp["daily_return"]).cumprod().iloc[-1]

    results.append((p_try, equity, coverage, hit_rate, base_hit_rate))

results_df = (
    pd.DataFrame(
        results,
        columns=["p", "equity", "coverage", "hit_rate", "base_hit_rate"]
    )
    .sort_values("equity", ascending=False)
)

print(results_df.head(10))



       p    equity  coverage  hit_rate  base_hit_rate
15  0.65  1.114219  0.318123  0.463115       0.394394
16  0.66  1.111255  0.294654  0.471239       0.394394
22  0.72  1.106928  0.165580  0.500000       0.394394
17  0.67  1.100885  0.265971  0.468137       0.394394
21  0.71  1.100550  0.177314  0.485294       0.394394
23  0.73  1.081943  0.150587  0.493506       0.394394
18  0.68  1.081194  0.244459  0.466667       0.394394
14  0.64  1.079314  0.344198  0.454545       0.394394
20  0.70  1.077152  0.196219  0.468439       0.394394
24  0.74  1.072766  0.131682  0.485149       0.394394


In [371]:
#lower profit taking harder to find signal - lots of class imbalance
#8, tanh, 0.01 solida, [ =0.65
