In [19]:
# Column names handed to the model pipeline as its numerical feature set
# (see the get_pipeline(..., numerical_features=features) call further down).
# Order is preserved exactly as originally written.
# NOTE(review): presumably every name matches a column produced by
# create_advanced_features_gen2 — confirm.
features = [
    'lead_speed_diff',
    'hp_advantage_seen',
    'mons_revealed_diff',
    'team_status_diff',
    'end_boost_diff',
    'total_damage_dealt',
    'total_healing_done',
    'status_turns',
    'first_faint_turn',
    'total_stats_diff',
    'damage_diff_turn10',
    'damage_diff_turn20',
    'damage_diff_turn25',
    'damage_diff_turn30',
    'hp_trend_diff',
    'feat_switch_diff',
    'feat_aggression_diff',
    'hp_diff_std',
    'hp_diff_range',
    'momentum_shift_turn',
    'comeback_score',
    'early_sustain',
    'status_balance',
    'boost_volatility',
    'boost_trend',
    'move_power_diff',
    'move_diversity_diff',
    'stall_ratio',
    'aggression_index',
    'stats_speed_interaction',
    'hp_vs_stats_ratio',
    'damage_ratio_turn25_30',
    'damage_ratio_turn20_25',
    'damage_ratio_turn10_20',
    'damage_ratio_turn10_30',
    'atk_def_ratio_p1',
    'atk_def_ratio_p2',
    'hp_speed_interaction_lead',
    'hp_def_ratio_p1',
    'hp_def_ratio_p2',
    'p1_hp_mean',
    'p2_hp_mean',
    'hp_diff_mean',
    'hp_diff_last',
    'p1_boost_mean',
    'p2_boost_mean',
    'boost_diff_mean',
    'p1_status_total',
    'p2_status_total',
    'momentum_flips',
    'p1_aggression',
    'p2_aggression',
    'aggression_diff',
    'feat_team_emb_sim',
    'lead_type_adv',
    'meta_diff',
    'feat_status_diff_inflicted',
    'status_setup_diff',
]


In [None]:
from main import load_data
from Features.features_olya import create_advanced_features_gen2
import os
import pandas as pd
from utils.load_json import load_jsonl
from sklearn.model_selection import train_test_split

# Hold-out configuration: fraction of rows reserved for validation and the
# seed that makes the split repeatable.
test_size = 0.2
random_state = 42

# Raw train / test tables as returned by the project loader.
train_df, test_df = load_data()

# Run the same feature generator over both tables (train first, then test).
X_train_features, X_test_features = (
    create_advanced_features_gen2(frame) for frame in (train_df, test_df)
)

# Target series, indexed by battle_id.
y_train = train_df.set_index('battle_id')['player_won']

# Stratified train/validation split.
# train_test_split pairs X and y rows by position, not by index label.
# NOTE(review): assumes create_advanced_features_gen2 returns one row per
# battle in the same order as train_df — confirm.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_features,
    y_train,
    stratify=y_train,
    random_state=random_state,
    test_size=test_size,
)

print("Shapes:")
print(*(part.shape for part in (X_train_split, X_val_split, y_train_split, y_val_split)))


# Now you can create pipelines and call optimizers




✓ train.jsonl loaded successfully. Shape: (10000, 5)
✓ test.jsonl loaded successfully. Shape: (5000, 4)


Generating advanced features:   0%|          | 0/10000 [00:00<?, ?it/s]

Generating advanced features:   0%|          | 0/5000 [00:00<?, ?it/s]

Shapes:
(8000, 84) (2000, 84) (8000,) (2000,)


In [None]:
from Models.pipeline import get_pipeline

# Reference notes on the model keys accepted by get_pipeline. The string below
# is a bare expression: it is never assigned or displayed by this cell.
"""
Available models and recommended scaler usage:

1. Logistic Regression ('logistic')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: C, penalty ('l1', 'l2'), class_weight

2. Random Forest ('random_forest')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features

3. XGBoost ('xgboost')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma

4. LightGBM ('lightgbm')
   - Recommended scaler: StandardScaler (default 'auto')
   - Key parameters: n_estimators, num_leaves, learning_rate, max_depth, feature_fraction, bagging_fraction, min_child_samples, lambda_l1, lambda_l2

5. CatBoost ('catboost')
   - Recommended scaler: StandardScaler (default 'auto')
   - Key parameters: depth, learning_rate, iterations, l2_leaf_reg, random_seed, task_type

6. Gradient Boosting ('gradient_boost')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, learning_rate, min_samples_split, min_samples_leaf, subsample
"""

# XGBoost pipeline over the hand-picked `features` columns.
# The scaler argument is the *string* 'false', not the boolean False.
# NOTE(review): presumably get_pipeline interprets 'false' as "no scaling" —
# confirm against Models.pipeline.
pipeline_xgb = get_pipeline(
    'xgboost',
    scaler='false',
    numerical_features=features,
)


In [29]:
from sklearn.metrics import accuracy_score

# Fit the XGBoost pipeline on the training split exactly as produced by
# train_test_split. The former drop of 'p1_seen_pokemons' / 'p2_seen_pokemons'
# had already been commented out, which left two no-op self-assignments and
# comments claiming the data was "cleaned"; both are removed so the cell says
# what it actually does.
# NOTE(review): presumably the pipeline only consumes the columns passed as
# numerical_features, which is why extra columns in X do not matter — confirm.
pipeline_xgb.fit(X_train_split, y_train_split)

# Hard class predictions on the validation split (reused by the report cell).
val_preds = pipeline_xgb.predict(X_val_split)

# Fraction of validation rows predicted correctly.
val_acc = accuracy_score(y_val_split, val_preds)
print(f"Validation Accuracy: {val_acc:.4f}")


Validation Accuracy: 0.8150


In [30]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-row probability taken from column index 1 of predict_proba.
# NOTE(review): presumably column 1 is the positive (True) class — confirm
# against the fitted classifier's classes_.
val_probs = pipeline_xgb.predict_proba(X_val_split)[:, 1]

# Per-class precision/recall/F1 followed by the confusion matrix, both computed
# from the hard predictions (val_preds) made in the previous cell.
print(
    classification_report(y_val_split, val_preds),
    confusion_matrix(y_val_split, val_preds),
    sep="\n",
)


              precision    recall  f1-score   support

       False       0.81      0.83      0.82      1000
        True       0.82      0.80      0.81      1000

    accuracy                           0.81      2000
   macro avg       0.82      0.81      0.81      2000
weighted avg       0.82      0.81      0.81      2000

[[829 171]
 [199 801]]


In [31]:
from paramethers.cat_grid import param_grid as catboost_param_grid
from paramethers.gb_grid import param_grid as gradientboost_param_grid
from paramethers.lgb_grid import param_grid as lightgbm_param_grid
from paramethers.log_grid import param_grid as logistic_param_grid
from paramethers.rf_grid import param_grid as randomforest_param_grid
from paramethers.xgb_grid import param_grid as xgboost_param_grid


from optimisers.gridsearch_optimizer import run_grid_search
from optimisers.optuna_optimizer import optimize_optuna
from optimisers.randomsearch_optimizer import run_random_search


In [32]:
# Display the XGBoost search space imported from paramethers.xgb_grid.
# Keys carry the 'classifier__' prefix, i.e. they address the pipeline step
# named 'classifier'; each value is the list of candidate settings.
xgboost_param_grid

{'classifier__max_depth': [3, 4, 5, 6, 7, 8],
 'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
 'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
 'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
 'classifier__min_child_weight': [1, 2, 3, 5, 7, 10],
 'classifier__gamma': [0, 0.5, 1, 2, 3, 5],
 'classifier__reg_alpha': [0.001, 0.01, 0.1, 1, 5, 10],
 'classifier__reg_lambda': [0.001, 0.01, 0.1, 1, 5, 10],
 'classifier__n_estimators': [100, 200, 300, 400, 500]}

In [33]:
# Hyper-parameter search for the XGBoost pipeline with Optuna (50 trials).
#
# The first argument is a pipeline *factory*. It previously returned the one
# shared, already-fitted `pipeline_xgb` object on every call, so any parameter
# changes or refits done by the optimizer would leak into `pipeline_xgb` and
# make the earlier validation cells non-reproducible on re-run. The factory now
# builds a fresh pipeline per call, using the same arguments as the original
# `pipeline_xgb` definition (keep the two in sync).
# NOTE(review): optimize_optuna's implementation is not visible here;
# presumably it calls the factory for each trial and scores on the validation
# split passed below — confirm.
# NOTE(review): the search is scored on the same validation split used for the
# accuracy reported above, so best_score is an optimistic estimate.
best_params, best_score = optimize_optuna(
    lambda: get_pipeline('xgboost', numerical_features=features, scaler='false'),
    X_train_split,
    y_train_split,
    X_val_split,
    y_val_split,
    xgboost_param_grid,
    n_trials=50
)


[19:32:13] INFO: Starting Optuna optimization for 50 trials...
[I 2025-11-14 19:32:13,133] A new study created in memory with name: no-name-fda70c6c-a6c7-474c-8fd0-3020ceeb4c47
[19:32:14] INFO: Trial 1/50 - Accuracy: 0.8160 - Params: {'classifier__max_depth': 3, 'classifier__learning_rate': 0.01, 'classifier__subsample': 0.7, 'classifier__colsample_bytree': 1.0, 'classifier__min_child_weight': 1, 'classifier__gamma': 2, 'classifier__reg_alpha': 10, 'classifier__reg_lambda': 0.1, 'classifier__n_estimators': 500}
[I 2025-11-14 19:32:14,605] Trial 0 finished with value: 0.816 and parameters: {'classifier__max_depth': 3, 'classifier__learning_rate': 0.01, 'classifier__subsample': 0.7, 'classifier__colsample_bytree': 1.0, 'classifier__min_child_weight': 1, 'classifier__gamma': 2, 'classifier__reg_alpha': 10, 'classifier__reg_lambda': 0.1, 'classifier__n_estimators': 500}. Best is trial 0 with value: 0.816.
[19:32:15] INFO: Trial 2/50 - Accuracy: 0.8125 - Params: {'classifier__max_depth': 7,