### Using single threshold

In [31]:
import pandas as pd
import pandas_ta as ta
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from bayes_opt import BayesianOptimization
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

In [32]:
# Load data
def load_data(filepath):
    df = pd.read_csv(filepath, parse_dates=['datetime'], index_col='datetime')
    return df

In [33]:
# Feature engineering (add interaction features for open, high, low, close)
def feature_engineering(df):
    # Interaction features for open, high, low, close
    df['high_low_spread'] = df['high'] - df['low']
    df['open_close_diff'] = df['close'] - df['open']
    
    # Existing features
    df['volume_SMA_20'] = df['volume'].rolling(20).mean()  # Needed for Volume_ratio
    df['RSI_14'] = ta.rsi(df['close'], length=14)
    df['macd'] = ta.macd(df['close'], fast=12, slow=26, signal=9)['MACD_12_26_9']
    df['MACD_signal'] = ta.macd(df['close'], fast=12, slow=26, signal=9)['MACDs_12_26_9']
    df['ATR_14'] = ta.atr(df['high'], df['low'], df['close'], length=14)
    df['OBV'] = ta.obv(df['close'], df['volume'])
    df['ROC_10'] = ta.roc(df['close'], length=10)
    df['price_delta'] = df['close'].diff()
    df['volatility'] = df['close'].rolling(20).std()
    df['RSI_lag_1'] = df['RSI_14'].shift(1)
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['ATR_ratio'] = df['ATR_14'] / df['close']
    df['Volume_ratio'] = df['volume'] / df['volume_SMA_20']
    df['trend'] = df['close'].rolling(20).mean() / df['close']
    df['price_vol_ratio'] = np.where(df['volume'] == 0, 0, df['close'] / df['volume'])
    df['momentum'] = df['close'].diff(5)  # Needed for mom_div
    df['mom_div'] = df['momentum'] - df['momentum'].shift(5)
    
    # Drop unnecessary columns, but keep open, high, low, close
    df.drop(columns=['momentum', 'volume_SMA_20', 
                     'bollinger_high', 'bollinger_low', 'bollinger_mavg', 'minute', 'close_lag_1'], 
            inplace=True, errors='ignore')
    
    df.dropna(inplace=True)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    return df

In [16]:
# Define target
def define_target(df):
    df['target'] = np.where(df['close'].shift(-1) > df['close'] * 1.005, 1, 0)
    return df

In [34]:
# Undersample majority class
def undersample_majority(X, y, target_size=60000):
    rus = RandomUnderSampler(sampling_strategy={0: target_size, 1: np.sum(y == 1)}, random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    return X_res, y_res

In [35]:
# Optimize XGBoost
def optimize_xgboost(X_train, y_train, X_val, y_val):
    def xgb_eval(max_depth, learning_rate, colsample_bytree, subsample, gamma, min_child_weight, num_boost_round):
        params = {
            'max_depth': int(max_depth),
            'learning_rate': learning_rate,
            'colsample_bytree': colsample_bytree,
            'subsample': subsample,
            'gamma': gamma,
            'min_child_weight': min_child_weight,
            'eval_metric': 'logloss',
            'scale_pos_weight': 4.0,  # Adjusted
            'objective': 'binary:logistic'
        }
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        model = xgb.train(params, dtrain, num_boost_round=int(num_boost_round), evals=[(dval, 'val')], 
                          early_stopping_rounds=10, verbose_eval=False)
        return -model.best_score
    
    param_bounds = {
        'max_depth': (3, 8),
        'learning_rate': (0.05, 0.5),
        'colsample_bytree': (0.6, 1.0),
        'subsample': (0.6, 1.0),
        'gamma': (0, 5),
        'min_child_weight': (5, 10),
        'num_boost_round': (50, 500)
    }
    optimizer = BayesianOptimization(f=xgb_eval, pbounds=param_bounds, random_state=42)
    optimizer.maximize(n_iter=30)
    
    best_params = optimizer.max['params']
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['num_boost_round'] = int(best_params['num_boost_round'])
    best_params['min_child_weight'] = int(best_params['min_child_weight'])
    best_params['eval_metric'] = 'logloss'
    best_params['scale_pos_weight'] = 4.0
    best_params['objective'] = 'binary:logistic'
    return best_params

In [36]:
# Train model
def train_model(X_train, y_train, X_val, y_val, best_params):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    params = {k: v for k, v in best_params.items() if k != 'num_boost_round'}
    print("Training with parameters:", params)
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=best_params['num_boost_round'],
        evals=[(dval, 'val')],
        early_stopping_rounds=10,
        verbose_eval=True
    )
    joblib.dump(model, "optimized_xgboost.joblib")
    print("Model saved successfully.")
    return model

In [37]:
# Evaluate with new threshold
def evaluate_model(model, best_params, X_train, y_train, X_val, y_val, X_test, y_test, feature_cols):
    params = {k: v for k, v in best_params.items() if k not in ['eval_metric', 'num_boost_round', 'objective']}
    xgb_clf = XGBClassifier(**params, n_estimators=model.best_iteration + 1)
    xgb_clf.fit(X_train, y_train)
    dtest = xgb.DMatrix(X_test)
    y_probs_raw = np.clip(model.predict(dtest), 0, 1)
    
    print("Raw probabilities (sample):", y_probs_raw[:10])
    
    lr = LogisticRegression(class_weight={0: 1, 1: 1}, penalty='l1', solver='liblinear')
    lr.fit(X_train, y_train)
    y_probs_lr = lr.predict_proba(X_test)[:, 1]
    y_probs_ensemble = 0.6 * y_probs_raw + 0.4 * y_probs_lr
    
    threshold = 0.6  # Increased to 0.6
    y_pred = (y_probs_ensemble > threshold).astype(int)
    print(f"\nThreshold: {threshold}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))
    
    # Print feature importances as text
    importance = model.get_score(importance_type='weight')
    print("\nFeature Importances:")
    for i, col in enumerate(feature_cols):
        score = importance.get(f"f{i}", 0)
        print(f"{col} (f{i}): {score}")

In [38]:

# Main execution
df = load_data("D:/try-quant/processed_data_version_7.csv")
df = feature_engineering(df)
df = define_target(df)

feature_cols = [col for col in df.columns if col not in ['target', 'datetime']]
print("Feature columns:", feature_cols)

X = df[feature_cols]
y = df['target']

print("Original class distribution:", np.bincount(y))

X_res, y_res = undersample_majority(X, y, target_size=60000)
print("Class distribution after undersampling:", np.bincount(y_res))

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_res)

X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y_res, test_size=0.2, shuffle=True, random_state=42)
X_train_orig, X_val, y_train_orig, y_val = train_test_split(X_temp, y_temp, test_size=0.25, shuffle=True, random_state=42)

print("Train (before SMOTE) class distribution:", np.bincount(y_train_orig))
print("Validation class distribution:", np.bincount(y_val))
print("Test class distribution:", np.bincount(y_test))

smote = SMOTE(sampling_strategy={0: 36044, 1: 30000}, random_state=42)
X_train, y_train = smote.fit_resample(X_train_orig, y_train_orig)

print("Train class distribution (after SMOTE):", np.bincount(y_train))

best_params = optimize_xgboost(X_train, y_train, X_val, y_val)
model = train_model(X_train, y_train, X_val, y_val, best_params)
evaluate_model(model, best_params, X_train, y_train, X_val, y_val, X_test, y_test, feature_cols)

Feature columns: ['open', 'high', 'low', 'close', 'volume', 'SMA_20', 'RSI_14', 'MACD', 'ATR_14', 'OBV', 'ROC_10', 'price_delta', 'volatility', 'RSI_lag_1', 'hour', 'dayofweek', 'ATR_ratio', 'Volume_ratio', 'MACD_signal', 'high_low_spread', 'open_close_diff', 'macd', 'trend', 'price_vol_ratio', 'mom_div']
Original class distribution: [83371 10459]
Class distribution after undersampling: [60000 10459]
Train (before SMOTE) class distribution: [36044  6231]
Validation class distribution: [11959  2133]
Test class distribution: [11997  2095]
Train class distribution (after SMOTE): [36044 30000]
|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | num_bo... | subsample |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m-0.5766  [39m | [39m0.7498   [39m | [39m4.754    [39m | [39m0.3794   [39m | [39m5.993    [39m | [39m5.78     [39m | [39m120.2    [39m | [39m