In [17]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor,XGBClassifier,XGBRanker
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score
import joblib 


In [33]:
PROCESSED_DATA_OUTPUT_DIR="../data/processed"

In [34]:
df=pd.read_parquet(f"{PROCESSED_DATA_OUTPUT_DIR}/swing_trading_model_data.parquet")

In [35]:
df.head()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,sp500_Open,sp500_High,sp500_Low,...,ema_8_21_cross,stochastic_k,stochastic_d,bollinger_percent_b,roc_21d,obv_scaled,atr_14d,target_5d,target_regression,target_binary
0,2015-02-04,A,36.067509,36.186722,35.599812,35.865757,2567400.0,2048.860107,2054.73999,2036.719971,...,0,88.990128,19.966251,0.667705,-0.023176,0.224158,0.860475,0.005625,0.005625,False
1,2015-02-05,A,35.966644,36.516872,35.966644,36.250927,1826300.0,2043.449951,2063.550049,2043.449951,...,0,65.595771,41.798319,0.526968,-0.017337,-0.28762,0.843563,0.012396,0.012396,False
2,2015-02-06,A,36.260101,36.507705,35.975815,36.076691,2697200.0,2062.280029,2072.399902,2049.969971,...,1,84.862251,62.67653,0.64711,0.008933,0.1709,0.817903,0.02059,0.02059,True
3,2015-02-09,A,35.975816,36.159227,35.719044,35.801579,3586100.0,2053.469971,2056.159912,2041.880005,...,1,76.146826,79.81605,0.632199,-0.009068,-0.391981,0.779768,0.03791,0.03791,True
4,2015-02-10,A,36.049176,36.461845,35.563138,36.379311,1408600.0,2049.379883,2070.860107,2048.620117,...,1,62.385517,75.534949,0.570313,-0.045243,-1.291251,0.73449,0.047139,0.047139,True


In [36]:
df.columns

Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume',
       'return_5d', 'rsi_14d', 'daily_return', 'volatility_10d',
       'volatility_20d', 'sp500_return_5d', 'relative_strength_5d',
       'macd_histogram', 'ema_8d', 'ema_21d', 'ema_8_21_cross', 'stochastic_k',
       'stochastic_d', 'bollinger_percent_b', 'roc_21d', 'obv_scaled',
       'atr_14d', 'target_5d', 'target_regression', 'target_binary'],
      dtype='object')

In [42]:
feature_cols = ['return_5d', 'rsi_14d', 'volatility_10d', 'volatility_20d', 
                   'sp500_return_5d', 'relative_strength_5d',
                   "stochastic_k","stochastic_d",'ema_8_21_cross','ema_21d',
                   'ema_8d','macd_histogram','obv_scaled','atr_14d','bollinger_percent_b','roc_21d',
                ]

target=["target_regression"]


In [8]:
def create_time_splits(df):
    df_copy=df.copy()
    df_copy['year'] = df_copy['Date'].dt.year
    #year split
    train = df_copy[df_copy['Date'] < "2022-06-06"]
    val   = df_copy[(df_copy['Date'] >= "2022-06-06") & (df_copy['Date'] < "2025-06-06")]
    test  = df_copy[df_copy['Date'] >= "2025-06-06"]
    
    
    return train,val,test

In [38]:
train, val, test = create_time_splits(df)
print(f"Train: {len(train):,} rows ({train['Date'].min()} to {train['Date'].max()})")
print(f"Val: {len(val):,} rows ({val['Date'].min()} to {val['Date'].max()})")
print(f"Test: {len(test):,} rows ({test['Date'].min()} to {test['Date'].max()})")

# Check target distribution
print(f"\nTarget distribution:")
print(f"Train: {train['target_binary'].mean():.3f}")
print(f"Val: {val['target_binary'].mean():.3f}")
print(f"Test: {test['target_binary'].mean():.3f}")

Train: 889,440 rows (2015-02-04 00:00:00 to 2022-06-03 00:00:00)
Val: 377,045 rows (2022-06-06 00:00:00 to 2025-06-05 00:00:00)
Test: 35,991 rows (2025-06-06 00:00:00 to 2025-09-18 00:00:00)

Target distribution:
Train: 0.500
Val: 0.499
Test: 0.499


In [43]:
train,val,test=create_time_splits(df)
X_train,y_train=train[feature_cols],train['target_regression']
X_val,y_val=val[feature_cols],val['target_regression']
X_test,y_test=test[feature_cols],test['target_regression']

print(f"Training samples: {len(X_train):,}")
print(f"Validation samples: {len(X_val):,}")
print(f"Test samples: {len(X_test):,}")

Training samples: 889,440
Validation samples: 377,045
Test samples: 35,991


In [44]:
reg_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.05,
    random_state=42
)

print("Training regression model...")
reg_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=10
)

Training regression model...
[0]	validation_0-rmse:0.04201
[10]	validation_0-rmse:0.04208
[20]	validation_0-rmse:0.04218
[30]	validation_0-rmse:0.04227
[40]	validation_0-rmse:0.04234
[50]	validation_0-rmse:0.04239
[60]	validation_0-rmse:0.04243
[70]	validation_0-rmse:0.04248
[80]	validation_0-rmse:0.04253
[90]	validation_0-rmse:0.04258
[100]	validation_0-rmse:0.04261
[110]	validation_0-rmse:0.04265
[120]	validation_0-rmse:0.04267
[130]	validation_0-rmse:0.04270
[140]	validation_0-rmse:0.04272
[150]	validation_0-rmse:0.04274
[160]	validation_0-rmse:0.04277
[170]	validation_0-rmse:0.04280
[180]	validation_0-rmse:0.04282
[190]	validation_0-rmse:0.04285
[200]	validation_0-rmse:0.04286
[210]	validation_0-rmse:0.04289
[220]	validation_0-rmse:0.04292
[230]	validation_0-rmse:0.04295
[240]	validation_0-rmse:0.04296
[250]	validation_0-rmse:0.04297
[260]	validation_0-rmse:0.04298
[270]	validation_0-rmse:0.04299
[280]	validation_0-rmse:0.04300
[290]	validation_0-rmse:0.04302
[300]	validation_0-rms

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [45]:
y_val_pred=reg_model.predict(X_val)
mse=mean_squared_error(y_val,y_val_pred)
r2=r2_score(y_val,y_val_pred)
print(f"Validation MSE: {mse:.6f}")
print(f"Validation R²: {r2:.4f}")

Validation MSE: 0.001899
Validation R²: -0.0758


In [46]:
# What is the model learning?
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': reg_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

Feature Importance:
                 feature  importance
4        sp500_return_5d    0.151042
8         ema_8_21_cross    0.089440
3         volatility_20d    0.088115
2         volatility_10d    0.078166
15               roc_21d    0.067672
7           stochastic_d    0.063115
1                rsi_14d    0.060394
11        macd_histogram    0.053465
14   bollinger_percent_b    0.051201
0              return_5d    0.050322
6           stochastic_k    0.048036
5   relative_strength_5d    0.044034
12            obv_scaled    0.042720
13               atr_14d    0.042185
10                ema_8d    0.036891
9                ema_21d    0.033201


In [13]:
# Check these:
print(y_train.describe())
print(f"Mean: {y_train.mean():.4f}")
print(f"Std: {y_train.std():.4f}")
print(f"% positive: {(y_train > 0).mean():.1%}")

count    960962.000000
mean          0.003100
std           0.040655
min          -0.122154
25%          -0.017606
50%           0.003715
75%           0.024431
max           0.128970
Name: target_regression, dtype: float64
Mean: 0.0031
Std: 0.0407
% positive: 54.9%


In [48]:
# Pick a random day from validation
sample_date = val['Date'].unique()[6]
day_data = val[val['Date'] == sample_date].copy()

# Get predictions for that day
day_predictions = reg_model.predict(day_data[feature_cols])
day_data['predicted_return'] = day_predictions

# Sort by prediction and actual
day_data_sorted = day_data.sort_values('predicted_return', ascending=False)

print(f"\nTop 10 predicted stocks for {sample_date}:")
print(day_data_sorted[['Ticker', 'predicted_return', 'target_regression']].head(10))

print(f"\nActual top 10 performers that day:")
print(day_data.nlargest(10, 'target_regression')[['Ticker', 'predicted_return', 'target_regression']])


Top 10 predicted stocks for 2022-06-14 00:00:00:
        Ticker  predicted_return  target_regression
962074    PLTR          0.050695           0.128970
690546    KLAC          0.047374           0.025184
611645    IDXX          0.045539           0.012523
920777     OXY          0.041961          -0.101788
102605     APP          0.041908           0.111592
1114463    TDG          0.038928          -0.003942
323972    DASH          0.038071           0.114437
1248452    WDC          0.037245          -0.075548
105276    APTV          0.036741          -0.041719
571225     HLT          0.036036          -0.059762

Actual top 10 performers that day:
        Ticker  predicted_return  target_regression
962074    PLTR          0.050695           0.128970
131994      BA         -0.016006           0.122790
823240    MRNA         -0.040017           0.119365
323972    DASH          0.038071           0.114437
619660    INCY          0.007234           0.114385
419335    EPAM         -0.0299

In [230]:
correlation=val['return_5d'].corr(val['target_regression'])
print(f"Overall correlation between past 5d return and future 5d return: {correlation:.4f}")
correlation=val['relative_strength_5d'].corr(val['target_regression'])
print(f"Overall correlation between relative_strength 5d return and future 5d return: {correlation:.4f}")

Overall correlation between past 5d return and future 5d return: -0.0217
Overall correlation between relative_strength 5d return and future 5d return: -0.0217


In [112]:
y_pred_test=reg_model.predict(X_test)
direction_pred=np.mean(np.sign(y_pred_test)==np.sign(y_test))
y_pred_val=reg_model.predict(X_val)
direction_pred_test=np.mean(np.sign(y_pred_test)==np.sign(y_test))
direction_pred_val=np.mean(np.sign(y_pred_val)==np.sign(y_val))
print(f"Directional Accuracy {direction_pred_test:.3f}% test")
print(f"Directional Accuracy {direction_pred_val:.3f}% val")


Directional Accuracy 0.519% test
Directional Accuracy 0.520% val


In [None]:
#since top 5 stock prediction has too many outliers
#i will train a binary classification model to see what stock consistenly outperform the market
#its accuracy must be higher than 50% ie winrate 
#i will then add other metrics 

In [49]:
train,val,test=create_time_splits(df)

X_train,y_train=train[feature_cols],train['target_binary']
X_val,y_val=val[feature_cols],val['target_binary']
X_test,y_test=test[feature_cols],test['target_binary']

model_clas=XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    objective='binary:logistic'
)

model_clas.fit(X_train,y_train,eval_set=[(X_val,y_val)])

y_pred=model_clas.predict(X_val)
y_pred_prob=model_clas.predict_proba(X_val)[:,1]

y_pred_test=model_clas.predict(X_test)
y__test_pred_prob=model_clas.predict_proba(X_test)[:,1]

accuracy=accuracy_score(y_val,y_pred)
print(f"Accuracy val: {accuracy:.3f}")

accuracy_test=accuracy_score(y_test,y_pred_test)
print(f"Accuracy Test: {accuracy_test:.3f}")

baseline=max(y_val.mean(),1-y_val.mean())
print(f"Baseline: {baseline:.3f}")



[0]	validation_0-logloss:0.69314
[1]	validation_0-logloss:0.69314
[2]	validation_0-logloss:0.69314
[3]	validation_0-logloss:0.69314
[4]	validation_0-logloss:0.69312
[5]	validation_0-logloss:0.69312
[6]	validation_0-logloss:0.69311
[7]	validation_0-logloss:0.69310
[8]	validation_0-logloss:0.69309
[9]	validation_0-logloss:0.69309
[10]	validation_0-logloss:0.69310
[11]	validation_0-logloss:0.69310
[12]	validation_0-logloss:0.69309
[13]	validation_0-logloss:0.69309
[14]	validation_0-logloss:0.69310
[15]	validation_0-logloss:0.69310
[16]	validation_0-logloss:0.69310
[17]	validation_0-logloss:0.69310
[18]	validation_0-logloss:0.69310
[19]	validation_0-logloss:0.69311
[20]	validation_0-logloss:0.69311
[21]	validation_0-logloss:0.69311
[22]	validation_0-logloss:0.69311
[23]	validation_0-logloss:0.69311
[24]	validation_0-logloss:0.69311
[25]	validation_0-logloss:0.69312
[26]	validation_0-logloss:0.69311
[27]	validation_0-logloss:0.69311
[28]	validation_0-logloss:0.69312
[29]	validation_0-loglos

In [50]:
print("Target distribution:")
print(f"Class 0: {y_train.mean():.3f}")
print(f"Class 1: {1 - y_train.mean():.3f}")

Target distribution:
Class 0: 0.500
Class 1: 0.500


In [108]:
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}") 
print(f"y_pred shape: {y_pred.shape}")
print(f"val data shape: {val.shape}")


X_val shape: (249819, 5)
y_val shape: (249819,)
y_pred shape: (249819,)
val data shape: (249819, 23)


In [252]:
# 1. Check feature importance
importance = model_clas.feature_importances_
feature_imp = pd.DataFrame({'feature': X_train.columns, 'importance': importance})
print(feature_imp.sort_values('importance', ascending=False).head(10))

# 2. Test trading performance
test_data = val.copy()
test_data['pred'] = y_pred
test_data['pred_prob'] = model_clas.predict_proba(X_val)[:,1]

# Returns when model predicts 1 vs 0
returns_1 = test_data[test_data['pred'] == 1]['target_5d'].mean()
returns_0 = test_data[test_data['pred'] == 0]['target_5d'].mean()
print(f"Returns when predicted ↑: {returns_1:.4f}")
print(f"Returns when predicted ↓: {returns_0:.4f}")
print(f"Spread: {returns_1 - returns_0:.4f}")


                 feature  importance
10                ema_8d    0.090333
3         volatility_20d    0.089532
8         ema_8_21_cross    0.086613
5   relative_strength_5d    0.086132
9                ema_21d    0.085660
2         volatility_10d    0.084528
1                rsi_14d    0.084524
11        macd_histogram    0.083874
7           stochastic_d    0.081490
0              return_5d    0.078756
Returns when predicted ↑: 0.0056
Returns when predicted ↓: -0.0033
Spread: 0.0089


In [110]:
#XGBRanker to rank top stocks
#evaluate on market performance vs model picked stock performance
#check annual return
#if model outperforms the market you have a usuable model


In [51]:
# Sort all data by Date first
train,val,test=create_time_splits(df)
train = train.sort_values('Date')
val = val.sort_values('Date') 
test = test.sort_values('Date')

# Then get group sizes
train_group = [len(group) for date, group in train.groupby("Date")]
val_group = [len(group) for date, group in val.groupby("Date")]
test_group = [len(group) for date, group in test.groupby("Date")]

# Re-extract features after sorting
X_train, y_train = train[feature_cols], train['target_regression']
X_val, y_val = val[feature_cols], val['target_regression']
X_test,y_test=test[feature_cols], test["target_regression"]

model_rank = XGBRanker(
    objective="rank:pairwise",
    learning_rate=0.05,
    n_estimators=150,
    max_depth=6,
    random_state=42
)

model_rank.fit(
    X_train, y_train,
    group=train_group,
    verbose=True
)

0,1,2
,objective,'rank:pairwise'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [52]:
def evaluate_rank_pred_correct(df,model,risk_free_rate):
    strategy_returns = []
    market_returns = []
    
    # Get all unique dates sorted
    all_dates = sorted(df['Date'].unique())
    
    # Trade every 5 days to avoid overlapping positions
    for i in range(0, len(all_dates), 5):
        if i >= len(all_dates): break
            
        trade_date = all_dates[i]
        day_data = df[df['Date'] == trade_date]
        
        if len(day_data) < 15: continue
        
        #predict on this 
        X=day_data[feature_cols]
        #predict on the days data
        y_pred=model.predict(X)
        day_data=day_data.copy()
        day_data['pred_score']=y_pred
        # Your model's picks (top 50 by prediction score)
        top_50_pred = day_data.nlargest(15, 'pred_score')
        
        # Calculate 5-day returns for your picks
        pred_tickers = set(top_50_pred['Ticker'])

        actual_ret_data = day_data[day_data['Ticker'].isin(pred_tickers)]
        if len(actual_ret_data) > 0:
            actual_ret = np.mean(actual_ret_data["target_regression"])
            strategy_returns.append(actual_ret)
        
        #market return
        market_return=np.mean(day_data["target_regression"])
        market_returns.append(market_return)
    
    # Convert to annual returns
    trades_per_year = 252 / 5  # ~50 trades per year
    
    # Strategy performance
    avg_strategy_return = np.mean(strategy_returns)
    annual_strategy_return = (1 + avg_strategy_return) ** trades_per_year - 1
    
    # Market performance  
    avg_market_return = np.mean(market_returns)
    annual_market_return = (1 + avg_market_return) ** trades_per_year - 1
    
    # Excess return (alpha)
    excess_return = annual_strategy_return - annual_market_return
        # Sharpe ratio
    annual_risk_free_rate = 0.02
    per_period_risk_free_rate = annual_risk_free_rate / trades_per_year
    
    # Calculate excess returns per trade
    excess_returns = [r - per_period_risk_free_rate for r in strategy_returns]
    
    # Calculate average and standard deviation of excess returns
    avg_excess_return = np.mean(excess_returns)
    std_dev_excess_return = np.std(excess_returns)
    
    if std_dev_excess_return == 0:
        sharpe_ratio = 0.0
    else:
        # Calculate the Sharpe Ratio for the period
        sharpe_ratio = avg_excess_return / std_dev_excess_return
    
    # Annualize the Sharpe Ratio
    annualized_sharpe_ratio = sharpe_ratio * np.sqrt(trades_per_year)

    
    print(f"Strategy Average 5-day Return: {avg_strategy_return:.3%}")
    print(f"Market Average 5-day Return: {avg_market_return:.3%}")
    print(f"Annual Strategy Return: {annual_strategy_return:.1%}")
    print(f"Annual Market Return: {annual_market_return:.1%}")
    print(f"Excess Return (Alpha): {excess_return:.1%}")
    print(f"Sharpe Ratio: {annualized_sharpe_ratio}")

In [53]:
def calculate_hit_rate(df, model, top_n=15):
    """What % of time do top picks beat market average?"""
    hit_rates = []
    win_rates=[]
    for date in df['Date'].unique():
        day_data = df[df['Date'] == date].copy()
        if len(day_data) < top_n: continue
            
        X = day_data[feature_cols]
        day_data['pred_score'] = model.predict(X)
        
        top_stocks = day_data.nlargest(top_n, 'pred_score')
        market_avg = day_data['target_regression'].mean()
        
        hit_rate = (top_stocks['target_regression'] > 0).mean()
        win_rate=(top_stocks['target_regression']> market_avg).mean()
        hit_rates.append(hit_rate)
        win_rates.append(win_rate)
    
    print(f"Hit rate {np.mean(hit_rates)*100:.4f} %")
    print(f"Win rate {np.mean(win_rates)*100:.4f} %")

In [54]:
train,test,val=create_time_splits(df)
test_and_val=pd.concat([test,val])
evaluate_rank_pred_correct(test_and_val,model_rank,0)
calculate_hit_rate(val,model_rank)


Strategy Average 5-day Return: 0.647%
Market Average 5-day Return: 0.287%
Annual Strategy Return: 38.4%
Annual Market Return: 15.5%
Excess Return (Alpha): 22.9%
Sharpe Ratio: 1.412094561404039
Hit rate 56.2963 %
Win rate 50.7407 %


In [None]:
def simple_time_split_tuning(train_data, val_data, feature_cols):
    """Simple tuning using your existing train/val split"""
    
    # Prepare training data with groups
    X_train = train_data[feature_cols]
    y_train = train_data['target_regression']
    train_groups = [len(group) for _, group in train_data.groupby('Date')]
    val_groups = [len(group) for _, group in val_data.groupby('Date')]
    X_val = val_data[feature_cols]
    y_val = val_data['target_regression']
    
    param_combinations = [
        # Baseline style
        {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300},
        {'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 300, 'min_child_weight': 1},
        {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 150},
        {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'min_child_weight': 5},

        # Try deeper vs shallower trees
        {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 300, 'min_child_weight': 3},
        {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 400, 'min_child_weight': 1},

        # Add subsampling for regularization
        {'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 500, 'subsample': 0.8, 'colsample_bytree': 0.8},
        {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 250, 'subsample': 0.7, 'colsample_bytree': 0.9},

        # Smaller learning rate + more estimators
        {'learning_rate': 0.005, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 0.9, 'colsample_bytree': 0.9},
        {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 800, 'subsample': 0.8, 'colsample_bytree': 0.8},

        # Try different min_child_weight
        {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 400, 'min_child_weight': 10, 'subsample': 0.8},
        {'learning_rate': 0.03, 'max_depth': 7, 'n_estimators': 300, 'min_child_weight': 2, 'colsample_bytree': 0.7},

        # Very aggressive learning rate
        {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8, 'colsample_bytree': 0.8}
    ]

    
    best_ic = -np.inf
    best_params = None
    
    for params in param_combinations:
        model = XGBRanker(
            objective='rank:pairwise',
            **params,
            random_state=42,
        )
        
        model.fit(
            X_train, y_train, 
            group=train_groups, 
            verbose=False, 
        )

        # Evaluate using Information Coefficient
        val_pred = model.predict(X_val)
        ic = np.corrcoef(val_pred, y_val)[0,1]
        
        print(f"Params: {params} -> IC: {ic:.4f}")
        
        if ic > best_ic:
            best_ic = ic
            best_params = params
    
    return best_params, best_ic

In [57]:
best_params, best_ic = simple_time_split_tuning(train, val, feature_cols)
print(f"Best parameters: {best_params}")
print(f"Best IC: {best_ic:.4f}")


Params: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300} -> IC: 0.0801
Params: {'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 300, 'min_child_weight': 1} -> IC: 0.0860
Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'min_child_weight': 5} -> IC: 0.0821
Params: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 300, 'min_child_weight': 3} -> IC: 0.0887
Params: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 400, 'min_child_weight': 1} -> IC: 0.0805
Params: {'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 500, 'subsample': 0.8, 'colsample_bytree': 0.8} -> IC: 0.0829
Params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 250, 'subsample': 0.7, 'colsample_bytree': 0.9} -> IC: 0.0894
Params: {'learning_rate': 0.005, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 0.9, 'colsample_bytree': 0.9} -> IC: 0.0749
Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 800, 'subsample': 0.8, 'colsample_bytree': 0.8} -> 

In [58]:
best_model=XGBRanker(objective="rank:pairwise", 
                     eval_metric="ndcg@15",
                     learning_rate=best_params['learning_rate'],
                     max_depth=best_params['max_depth'],
                     n_estimators=best_params['n_estimators'],
                     random_state=42)

X_train=train[feature_cols]
y_train=train['target_regression']
train_group=[len(group) for _, group in train.groupby("Date")]
best_model.fit(X_train,y_train,group=train_group, verbose=10,           
               )


0,1,2
,objective,'rank:pairwise'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [59]:
test_and_val=pd.concat([test,val])
evaluate_rank_pred_correct(test_and_val,best_model,0.02)
calculate_hit_rate(test_and_val,best_model)

Strategy Average 5-day Return: 0.700%
Market Average 5-day Return: 0.287%
Annual Strategy Return: 42.1%
Annual Market Return: 15.5%
Excess Return (Alpha): 26.6%
Sharpe Ratio: 1.2498298501366074
Hit rate 54.2384 %
Win rate 51.6364 %


In [60]:
joblib.dump(best_model,"../models/xgbranker.pkl")

['../models/xgbranker.pkl']

In [None]:
# Strategy Average 5-day Return: 0.530%
# Market Average 5-day Return: 0.287%
# Annual Strategy Return: 30.5%
# Annual Market Return: 15.5%
# Excess Return (Alpha): 15.0%
# Sharpe Ratio: 1.225653425559378
# Hit rate 53.4788 %
# Win rate 50.1414 %

In [61]:
# See which features drive the big wins
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance)

                 feature  importance
8         ema_8_21_cross    0.200609
4        sp500_return_5d    0.147741
3         volatility_20d    0.097837
15               roc_21d    0.063803
2         volatility_10d    0.063528
14   bollinger_percent_b    0.042767
7           stochastic_d    0.042434
1                rsi_14d    0.041203
6           stochastic_k    0.041113
11        macd_histogram    0.040363
5   relative_strength_5d    0.039242
0              return_5d    0.038556
10                ema_8d    0.036978
13               atr_14d    0.036137
12            obv_scaled    0.034311
9                ema_21d    0.033376
