### Using single threshold

In [1]:
import pandas as pd
import pandas_ta as ta
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from bayes_opt import BayesianOptimization
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Load data
def load_data(filepath):
    df = pd.read_csv(filepath, parse_dates=['datetime'], index_col='datetime')
    return df

In [3]:
# Feature engineering
def feature_engineering(df):
    df['SMA_20'] = df.ta.sma(length=20)
    df['RSI_14'] = df.ta.rsi(length=14)
    df['rsi_mom'] = df['RSI_14'] - df['RSI_14'].shift(5)
    macd = df.ta.macd()
    df['MACD'] = macd['MACD_12_26_9']
    df['MACD_signal'] = macd['MACDs_12_26_9']
    df['MACD_hist'] = df['MACD'] - df['MACD_signal']
    df['ATR_14'] = df.ta.atr(length=14)
    df['volume_SMA_20'] = df['volume'].rolling(20).mean()
    df['OBV'] = ta.obv(df['close'], df['volume'])
    df['ROC_10'] = df['close'].pct_change(10)
    df['lagged_return'] = df['close'].pct_change().shift(1)
    bbands = df.ta.bbands()
    df['BB_upper'] = bbands['BBU_5_2.0']
    df['BB_lower'] = bbands['BBL_5_2.0']
    df['momentum'] = df['close'].diff(5)
    df['volatility'] = df['close'].rolling(20).std()
    df['vol_ratio'] = df['volatility'] / df['volatility'].shift(5)
    df['trend'] = df['close'] / df['SMA_20']
    df['acceleration'] = df['close'].diff(5).diff(5)
    df['rsi_div'] = df['RSI_14'].diff()
    df['vol_spike'] = np.where(df['volume_SMA_20'] == 0, 0, df['volume'] / df['volume_SMA_20'])
    df['price_vol_ratio'] = np.where(df['volume'] == 0, 0, df['close'] / df['volume'])
    df['mom_div'] = df['momentum'] - df['momentum'].shift(5)
    df.dropna(inplace=True)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    return df

In [4]:
# Define target
def define_target(df):
    df['target'] = np.where(df['close'].shift(-1) > df['close'] * 1.005, 1, 0)
    return df

In [5]:
# Undersample majority class
def undersample_majority(X, y, target_size=60000):
    rus = RandomUnderSampler(sampling_strategy={0: target_size, 1: np.sum(y == 1)}, random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    return X_res, y_res

In [6]:
# Optimize xgoost
def optimize_xgboost(X_train, y_train, X_val, y_val):
    def xgb_eval(max_depth, learning_rate, colsample_bytree, subsample, gamma, min_child_weight, num_boost_round):
        params = {
            'max_depth': int(max_depth),
            'learning_rate': learning_rate,
            'colsample_bytree': colsample_bytree,
            'subsample': subsample,
            'gamma': gamma,
            'min_child_weight': min_child_weight,
            'eval_metric': 'logloss',
            'scale_pos_weight': 3.0, 
            'objective': 'binary:logistic'
        }
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        model = xgb.train(params, dtrain, num_boost_round=int(num_boost_round), evals=[(dval, 'val')], 
                          early_stopping_rounds=10, verbose_eval=False)
        return -model.best_score
    
    param_bounds = {
        'max_depth': (3, 6),
        'learning_rate': (0.05, 0.5),
        'colsample_bytree': (0.6, 1.0),
        'subsample': (0.6, 1.0),
        'gamma': (0, 5),
        'min_child_weight': (5, 10),
        'num_boost_round': (50, 300)
    }
    optimizer = BayesianOptimization(f=xgb_eval, pbounds=param_bounds, random_state=42)
    optimizer.maximize(n_iter=30)
    
    best_params = optimizer.max['params']
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['num_boost_round'] = int(best_params['num_boost_round'])
    best_params['min_child_weight'] = int(best_params['min_child_weight'])
    best_params['eval_metric'] = 'logloss'
    best_params['scale_pos_weight'] = 3.0 
    best_params['objective'] = 'binary:logistic'
    return best_params

In [7]:
# Train model
def train_model(X_train, y_train, X_val, y_val, best_params):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    params = {k: v for k, v in best_params.items() if k != 'num_boost_round'}
    print("Training with parameters:", params)
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=best_params['num_boost_round'],
        evals=[(dval, 'val')],
        early_stopping_rounds=10,
        verbose_eval=True
    )
    joblib.dump(model, "optimized_xgboost.joblib")
    print("Model saved successfully.")
    return model

In [8]:
# Evaluate with single threshold
def evaluate_model(model, best_params, X_train, y_train, X_val, y_val, X_test, y_test):
    params = {k: v for k, v in best_params.items() if k not in ['eval_metric', 'num_boost_round', 'objective']}
    xgb_clf = XGBClassifier(**params, n_estimators=model.best_iteration + 1)
    xgb_clf.fit(X_train, y_train)
    calibrated = CalibratedClassifierCV(xgb_clf, method='isotonic', cv='prefit')
    calibrated.fit(X_val, y_val)
    dtest = xgb.DMatrix(X_test)
    y_probs_raw = np.clip(model.predict(dtest), 0, 1)
    y_probs_calibrated = calibrated.predict_proba(X_test)[:, 1]
    
    print("Raw probabilities (sample):", y_probs_raw[:10])
    print("Calibrated probabilities (sample):", y_probs_calibrated[:10])
    
    lr = LogisticRegression(class_weight={0: 1, 1: 1}, penalty='l1', solver='liblinear')
    lr.fit(X_train, y_train)
    y_probs_lr = lr.predict_proba(X_test)[:, 1]
    y_probs_ensemble = 0.9 * y_probs_calibrated + 0.1 * y_probs_lr
    
    threshold = 0.38
    y_pred = (y_probs_ensemble > threshold).astype(int)
    print(f"\nThreshold: {threshold}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

In [9]:

# Main execution
df = load_data("D:/try-quant/processed_data_version_7.csv")
df = feature_engineering(df)
df = define_target(df)

feature_cols = [col for col in df.columns if col not in ['target', 'datetime']]
X = df[feature_cols]
y = df['target']

print("Original class distribution:", np.bincount(y))

X_res, y_res = undersample_majority(X, y, target_size=60000)
print("Class distribution after undersampling:", np.bincount(y_res))

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_res)

X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y_res, test_size=0.2, shuffle=True, random_state=42)
X_train_orig, X_val, y_train_orig, y_val = train_test_split(X_temp, y_temp, test_size=0.25, shuffle=True, random_state=42)

print("Train (before SMOTE) class distribution:", np.bincount(y_train_orig))
print("Validation class distribution:", np.bincount(y_val))
print("Test class distribution:", np.bincount(y_test))

smote = SMOTE(sampling_strategy={0: 36044, 1: 28000}, random_state=42)
X_train, y_train = smote.fit_resample(X_train_orig, y_train_orig)

print("Train class distribution (after SMOTE):", np.bincount(y_train))

best_params = optimize_xgboost(X_train, y_train, X_val, y_val)
model = train_model(X_train, y_train, X_val, y_val, best_params)
evaluate_model(model, best_params, X_train, y_train, X_val, y_val, X_test, y_test)

Original class distribution: [83371 10459]
Class distribution after undersampling: [60000 10459]
Train (before SMOTE) class distribution: [36044  6231]
Validation class distribution: [11959  2133]
Test class distribution: [11997  2095]
Train class distribution (after SMOTE): [36044 28000]
|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | num_bo... | subsample |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m-0.5336  [39m | [39m0.7498   [39m | [39m4.754    [39m | [39m0.3794   [39m | [39m4.796    [39m | [39m5.78     [39m | [39m89.0     [39m | [39m0.6232   [39m |
| [35m2        [39m | [35m-0.509   [39m | [35m0.9465   [39m | [35m3.006    [39m | [35m0.3686   [39m | [35m3.062    [39m | [35m9.85     [39m | [35m258.1    [39m | [35m0.6849   [39m |
| [39m3        [39m | [39m-0.5358  [39m | [39m0.6727   [39m | [39m0.917    [39m | 



Raw probabilities (sample): [0.09093307 0.41279334 0.02464595 0.6985987  0.3689653  0.28824
 0.25989565 0.35912743 0.03618515 0.48203564]
Calibrated probabilities (sample): [0.06832298 0.17791411 0.02919708 0.32114881 0.16505441 0.12702702
 0.14692983 0.16505441 0.02919708 0.23225152]

Threshold: 0.38
Accuracy: 0.8420
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     11997
           1       0.42      0.17      0.24      2095

    accuracy                           0.84     14092
   macro avg       0.64      0.56      0.57     14092
weighted avg       0.80      0.84      0.81     14092

