In [1]:
from main import load_data
from Features.features_olya import create_advanced_features_gen2
import os
import pandas as pd
from utils.load_json import load_jsonl
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

test_size=0.2
random_state=42
train_df, test_df = load_data()

# Subset already created
fraction_to_use = 1.0
train_df_subset = train_df.sample(frac=fraction_to_use, random_state=42).reset_index(drop=True)
test_df_subset = test_df.sample(frac=fraction_to_use, random_state=42).reset_index(drop=True)

# Feature engineering
X_train_features = create_advanced_features_gen2(train_df_subset)
X_test_features = create_advanced_features_gen2(test_df_subset)


# Target
y_train = train_df_subset.set_index('battle_id')['player_won']
y_test = test_df_subset.set_index('battle_id')

# Train/validation split on the subset ONLY
from sklearn.model_selection import train_test_split

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_features,  # must match y_train
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

print("Shapes:")
print(X_train_split.shape, X_val_split.shape, y_train_split.shape, y_val_split.shape)


✓ Local environment detected. Loading data from: Data
✓ train.jsonl loaded successfully. Shape: (10000, 5)
✓ test.jsonl loaded successfully. Shape: (5000, 4)


Generating advanced features:   0%|          | 0/10000 [00:00<?, ?it/s]

Generating advanced features:   0%|          | 0/5000 [00:00<?, ?it/s]

Shapes:
(8000, 110) (2000, 110) (8000,) (2000,)


In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import numpy as np

# --- Expand Pokémon columns ---
def expand_seen_pokemon_features(df, prefix='p1_seen_pokemons'):
    if prefix not in df.columns:
        return df
    num_pokemon = len(df[prefix].iloc[0])
    expanded = pd.DataFrame(
        df[prefix].tolist(),
        index=df.index,
        columns=[f"{prefix}_{i}" for i in range(num_pokemon)]
    )
    df = df.drop(columns=[prefix])
    return pd.concat([df, expanded], axis=1)

for col_prefix in ['p1_seen_pokemons', 'p2_seen_pokemons']:
    X_train_split = expand_seen_pokemon_features(X_train_split, col_prefix)
    X_val_split = expand_seen_pokemon_features(X_val_split, col_prefix)
    X_test_features = expand_seen_pokemon_features(X_test_features, col_prefix)

# --- Existing feature groups ---
lead_embedding_columns = [
    'p1_lead_hp','p1_lead_atk','p1_lead_def','p1_lead_spa','p1_lead_spd','p1_lead_spe',
    'p2_lead_hp','p2_lead_atk','p2_lead_def','p2_lead_spa','p2_lead_spd','p2_lead_spe',
    'p1_team_emb_sum_hp', 'p1_team_emb_sum_atk', 'p1_team_emb_sum_def', 'p1_team_emb_sum_spa',
    'p1_team_emb_sum_spd', 'p1_team_emb_sum_spe',
    'p1_team_emb_mean_hp', 'p1_team_emb_mean_atk', 'p1_team_emb_mean_def', 'p1_team_emb_mean_spa',
    'p1_team_emb_mean_spd', 'p1_team_emb_mean_spe',
    'p2_team_emb_sum_hp', 'p2_team_emb_sum_atk', 'p2_team_emb_sum_def', 'p2_team_emb_sum_spa',
    'p2_team_emb_sum_spd', 'p2_team_emb_sum_spe',
    'p2_team_emb_mean_hp', 'p2_team_emb_mean_atk', 'p2_team_emb_mean_def', 'p2_team_emb_mean_spa',
    'p2_team_emb_mean_spd', 'p2_team_emb_mean_spe'
]

pokemon_columns = [c for c in X_train_split.columns if c.startswith(('p1_seen_pokemons_', 'p2_seen_pokemons_'))]

core_features = [
    'lead_speed_diff','hp_advantage_seen','mons_revealed_diff','team_status_diff',
    'total_damage_dealt','status_turns','lead_type_adv','meta_diff','hp_trend_diff',
    'aggression_index','hp_diff_std','momentum_shift_turn'
]

correlation_features = [
    'p1_lead_special_total','p2_lead_special_total','special_total_diff',
    'p1_lead_physical_total','p2_lead_physical_total','physical_total_diff',
    'atk_def_ratio_p1','atk_def_ratio_p2',
    'hp_speed_interaction_lead',
    'hp_def_ratio_p1','hp_def_ratio_p2',
    'hp_vs_total_stats_p1','hp_vs_total_stats_p2'
]

# --- Gen3 advanced features ---
advanced_features = [
    # Existing advanced features
    'feat_switch_diff','feat_aggression_diff','comeback_score','damage_ratio_turn25_30',
    'stall_ratio','boost_volatility','boost_trend',
    'p1_hp_min', 'p1_hp_max', 'p1_hp_mean', 'p1_hp_trend',
    'p1_boost_min', 'p1_boost_max', 'p1_boost_mean', 'p1_boost_trend',
    'p1_status_total', 'p1_status_ratio',
    'p2_hp_min', 'p2_hp_max', 'p2_hp_mean', 'p2_hp_trend',
    'p2_boost_min', 'p2_boost_max', 'p2_boost_mean', 'p2_boost_trend',
    'p2_status_total', 'p2_status_ratio',
    'hp_diff_std', 'hp_diff_range', 'momentum_shift_turn',
    'comeback_score', 'early_sustain', 'feat_move_power_diff',
    'status_balance',           # <- newly added
    'move_diversity_diff',      # <- newly added
    'feat_status_diff_inflicted', # <- newly added
]


advanced_features += correlation_features

extra_features = [
    'end_boost_diff',
    'total_healing_done','first_faint_turn',
    'status_setup_diff','total_stats_diff',
    'damage_diff_turn10','damage_diff_turn20','damage_diff_turn25','damage_diff_turn30',
    'boost_volatility',
    'move_power_diff','stats_speed_interaction',
    'hp_vs_stats_ratio','lead_total_stats_p1','lead_total_stats_p2',
    'atk_hp_ratio_p1','atk_hp_ratio_p2','def_hp_ratio_p1','def_hp_ratio_p2',
    'feat_hp_trend_diff','hp_diff_mean',
    'hp_diff_last','boost_diff_mean',
    'momentum_flips','p1_aggression','p2_aggression'
]

advanced_features += extra_features


# Utility to evaluate and print
def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    print(f"{name} Validation Accuracy: {acc:.4f}")
    return acc


In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import numpy as np

# Combine all features you want
all_features = [
    'lead_speed_diff','hp_advantage_seen','mons_revealed_diff','team_status_diff',
    'end_boost_diff','total_damage_dealt','total_healing_done','status_turns',
    'first_faint_turn','lead_type_adv','meta_diff','status_setup_diff','total_stats_diff',
    'damage_diff_turn10','damage_diff_turn20','damage_diff_turn25','damage_diff_turn30',
    'hp_trend_diff','feat_switch_diff','feat_aggression_diff','hp_diff_std','hp_diff_range',
    'momentum_shift_turn','comeback_score','early_sustain','status_balance','boost_volatility',
    'boost_trend','move_power_diff','move_diversity_diff','stall_ratio','aggression_index',
    'stats_speed_interaction','hp_vs_stats_ratio','damage_ratio_turn25_30','damage_ratio_turn20_25',
    'damage_ratio_turn10_20','damage_ratio_turn10_30','p1_lead_special_total','p2_lead_special_total',
    'special_total_diff','p1_lead_physical_total','p2_lead_physical_total','physical_total_diff',
    'atk_def_ratio_p1','atk_def_ratio_p2','hp_speed_interaction_lead','hp_def_ratio_p1','hp_def_ratio_p2',
    'hp_vs_total_stats_p1','hp_vs_total_stats_p2','lead_total_stats_p1','lead_total_stats_p2',
    'atk_hp_ratio_p1','atk_hp_ratio_p2','def_hp_ratio_p1','def_hp_ratio_p2','feat_hp_trend_diff',
    'feat_status_diff_inflicted','p1_hp_mean','p2_hp_mean','hp_diff_mean','hp_diff_last','p1_boost_mean',
    'p2_boost_mean','boost_diff_mean','p1_status_total','p2_status_total','momentum_flips',
    'p1_aggression','p2_aggression','aggression_diff'
] + ['p1_lead_hp','p1_lead_atk','p1_lead_def','p1_lead_spa','p1_lead_spd','p1_lead_spe',
     'p2_lead_hp','p2_lead_atk','p2_lead_def','p2_lead_spa','p2_lead_spd','p2_lead_spe']

# Keep only the features present in the dataframe
existing_features = [f for f in all_features if f in X_train_split.columns]



In [None]:
log_features = existing_features

preprocessor_log = ColumnTransformer(
    [('num', StandardScaler(), log_features)]
)

pipeline_log = Pipeline([
    ('preprocessor', preprocessor_log),
    ('classifier', LogisticRegression(max_iter=8000, solver='saga', random_state=42))
])

param_dist_log = {
    'classifier__C': [10,50,100,150,175,200],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__class_weight': [None, 'balanced']
}

search_log = RandomizedSearchCV(
    pipeline_log, param_distributions=param_dist_log,
    n_iter=15, scoring='accuracy', n_jobs=-1, cv=5, random_state=42
)
search_log.fit(X_train_split, y_train_split)

log_best = search_log.best_estimator_
log_acc = accuracy_score(y_val_split, log_best.predict(X_val_split))
print("Logistic Regression")
print("Best Params:", search_log.best_params_)
print(f"Validation Accuracy: {log_acc:.4f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# --- Extract numeric features from preprocessor
num_features = log_best.named_steps['preprocessor'].transformers_[0][2]

# --- Get absolute coefficient values as importances
coefs = log_best.named_steps['classifier'].coef_[0]
feature_importance_df = pd.DataFrame({
    'feature': num_features,
    'importance': np.abs(coefs)
}).sort_values(by='importance', ascending=False)

print("Feature importances (LR):\n", feature_importance_df)

# --- Keep features above threshold (e.g., 0.01)
threshold = 0.01
selected_features = feature_importance_df[feature_importance_df['importance'] > threshold]['feature'].tolist()
print(f"\nKeeping {len(selected_features)} features out of {len(num_features)}")

# --- Build new pipeline with selected features
preprocessor_selected = ColumnTransformer([('num', StandardScaler(), selected_features)])
pipeline_selected = Pipeline([
    ('preprocessor', preprocessor_selected),
    ('classifier', LogisticRegression(max_iter=8000, solver='saga', random_state=42))
])

param_dist_selected = {
    'classifier__C': [10,50,100,150,175],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__class_weight': [None, 'balanced']
}

search_selected = RandomizedSearchCV(
    pipeline_selected,
    param_distributions=param_dist_selected,
    n_iter=15,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42
)

search_selected.fit(X_train_split, y_train_split)

# --- Evaluate
log_best_selected = search_selected.best_estimator_
log_acc_selected = accuracy_score(y_val_split, log_best_selected.predict(X_val_split))

print("Logistic Regression (selected features)")
print("Best Params:", search_selected.best_params_)
print(f"Validation Accuracy: {log_acc_selected:.4f}")
print(f"Accuracy change: {log_acc_selected - log_acc:.4f}")


Feature importances (LR):
                        feature  importance
2           mons_revealed_diff   29.924924
66             p1_status_total   26.829265
3             team_status_diff   15.246327
58  feat_status_diff_inflicted   15.246327
25              status_balance   15.246327
..                         ...         ...
31            aggression_index    0.000000
40          special_total_diff    0.000000
41      p1_lead_physical_total    0.000000
43         physical_total_diff    0.000000
42      p2_lead_physical_total    0.000000

[84 rows x 2 columns]

Keeping 58 features out of 84


In [None]:
rf_features = existing_features

preprocessor_rf = ColumnTransformer([('num', StandardScaler(), rf_features)])

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor_rf),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_dist_rf = {
    'classifier__n_estimators': [200, 300, 400],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 0.5, 0.7]
}

search_rf = RandomizedSearchCV(
    pipeline_rf, param_distributions=param_dist_rf,
    n_iter=15, scoring='accuracy', n_jobs=-1, cv=5, random_state=42
)
search_rf.fit(X_train_split, y_train_split)

rf_best = search_rf.best_estimator_
rf_acc = accuracy_score(y_val_split, rf_best.predict(X_val_split))
print("Random Forest")
print("Best Params:", search_rf.best_params_)
print(f"Validation Accuracy: {rf_acc:.4f}")


Random Forest
Best Params: {'classifier__n_estimators': 300, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'sqrt', 'classifier__max_depth': None}
Validation Accuracy: 0.8350


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# 1️⃣ Extract numeric feature names from ColumnTransformer
num_features = rf_best.named_steps['preprocessor'].transformers_[0][2]

# 2️⃣ Get feature importances
rf_model = rf_best.named_steps['classifier']
importances = rf_model.feature_importances_

feature_importance_df = pd.DataFrame({
    'feature': num_features,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print("Feature importances:\n", feature_importance_df)

# 3️⃣ Select only features above a threshold (e.g., 1% importance)
threshold = 0.01  # adjust based on how aggressive you want to be
selected_features = feature_importance_df[feature_importance_df['importance'] > threshold]['feature'].tolist()

print(f"\nKeeping {len(selected_features)} features out of {len(num_features)}")

# 4️⃣ Build new preprocessor and pipeline with selected features
preprocessor_selected = ColumnTransformer([('num', StandardScaler(), selected_features)])

pipeline_selected = Pipeline([
    ('preprocessor', preprocessor_selected),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_dist_selected = {
    'classifier__n_estimators': [200, 300, 400],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 0.5, 0.7]
}

# 5️⃣ Randomized Search on selected features
search_selected = RandomizedSearchCV(
    pipeline_selected,
    param_distributions=param_dist_selected,
    n_iter=15,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42
)

search_selected.fit(X_train_split, y_train_split)

# 6️⃣ Evaluate
rf_best_selected = search_selected.best_estimator_
rf_acc_selected = accuracy_score(y_val_split, rf_best_selected.predict(X_val_split))

print("Random Forest (selected features)")
print("Best Params:", search_selected.best_params_)
print(f"Validation Accuracy: {rf_acc_selected:.4f}")

# 7️⃣ Compare
print(f"\nAccuracy change: {rf_acc_selected - rf_acc:.4f}")


Feature importances:
                       feature  importance
1           hp_advantage_seen    0.262347
3            team_status_diff    0.195363
2          mons_revealed_diff    0.093715
8               hp_trend_diff    0.074596
9            aggression_index    0.064927
15     damage_ratio_turn25_30    0.042100
4          total_damage_dealt    0.040265
10                hp_diff_std    0.026228
11        momentum_shift_turn    0.025986
5                status_turns    0.025930
14             comeback_score    0.024937
17           boost_volatility    0.016876
0             lead_speed_diff    0.011748
24        physical_total_diff    0.009156
21         special_total_diff    0.008902
18                boost_trend    0.006951
43                p2_lead_spe    0.004982
16                stall_ratio    0.004352
37                p1_lead_spe    0.004030
27  hp_speed_interaction_lead    0.003956
28            hp_def_ratio_p1    0.003612
29            hp_def_ratio_p2    0.003297
25          

In [None]:
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# --- 1. Define Top 50 features (from your previous list) ---
top50_features = [
    'lead_speed_diff', 'total_stats_diff', 'p1_lead_hp', 'p2_lead_hp', 'p1_lead_atk', 'p2_lead_atk',
    'p1_lead_def', 'p2_lead_def', 'p1_lead_spa', 'p2_lead_spa', 'p1_lead_spd', 'p2_lead_spd',
    'p1_lead_spe', 'p2_lead_spe', 'hp_advantage_seen', 'mons_revealed_diff', 'team_status_diff',
    'feat_hp_trend_diff', 'total_damage_dealt', 'damage_diff_turn10', 'damage_diff_turn25', 'damage_diff_turn30',
    'hp_diff_std', 'hp_diff_range', 'comeback_score', 'early_sustain', 'status_turns', 'first_faint_turn',
    'status_balance', 'boost_volatility', 'boost_trend', 'status_setup_diff', 'feat_status_diff_inflicted',
    'feat_switch_diff', 'feat_aggression_diff', 'move_power_diff', 'move_diversity_diff', 'stall_ratio',
    'aggression_index', 'stats_speed_interaction', 'hp_vs_stats_ratio', 'damage_ratio_turn25_30',
    'damage_ratio_turn20_25', 'damage_ratio_turn10_20', 'damage_ratio_turn10_30', 'special_total_diff',
    'physical_total_diff', 'hp_speed_interaction_lead', 'atk_def_ratio_p1', 'atk_def_ratio_p2'
] 
top50_features+= existing_features
# --- 2. Scale numeric features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split[top50_features])
X_val_scaled = scaler.transform(X_val_split[top50_features])

# --- 3. Optuna objective for XGBoost ---
def xgb_objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'n_estimators': 300,
        'random_state': 42,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)
    model.fit(
        X_train_scaled, y_train_split,
        eval_set=[(X_val_scaled, y_val_split)],
        verbose=False
    )

    val_pred = model.predict(X_val_scaled)
    return accuracy_score(y_val_split, val_pred)

# --- 4. Run Optuna study ---
study = optuna.create_study(direction='maximize')
study.optimize(xgb_objective, n_trials=40)

# --- 5. Best XGBoost model ---
best_xgb_params = study.best_params
best_xgb = XGBClassifier(**best_xgb_params, n_estimators=300, 
                         random_state=42, use_label_encoder=False, eval_metric='logloss')
best_xgb.fit(X_train_scaled, y_train_split,
             eval_set=[(X_val_scaled, y_val_split)],
             verbose=False)

# --- 6. Evaluate ---
print("Best XGBoost Accuracy:", accuracy_score(y_val_split, best_xgb.predict(X_val_scaled)))


[I 2025-11-15 18:03:40,510] A new study created in memory with name: no-name-1040c3e1-74a1-45ff-a367-90e3feff92c4
[I 2025-11-15 18:03:41,678] Trial 0 finished with value: 0.82 and parameters: {'max_depth': 4, 'learning_rate': 0.20591800753925607, 'subsample': 0.7963113978206501, 'colsample_bytree': 0.7117088662531388}. Best is trial 0 with value: 0.82.
[I 2025-11-15 18:03:43,155] Trial 1 finished with value: 0.833 and parameters: {'max_depth': 5, 'learning_rate': 0.06399734459170783, 'subsample': 0.8429877457601078, 'colsample_bytree': 0.705090901396117}. Best is trial 1 with value: 0.833.
[I 2025-11-15 18:03:44,231] Trial 2 finished with value: 0.8295 and parameters: {'max_depth': 3, 'learning_rate': 0.08260070486689643, 'subsample': 0.9148641683315223, 'colsample_bytree': 0.809509968960166}. Best is trial 1 with value: 0.833.
[I 2025-11-15 18:03:46,035] Trial 3 finished with value: 0.83 and parameters: {'max_depth': 6, 'learning_rate': 0.11329452138859876, 'subsample': 0.939421652236

[I 2025-11-15 18:05:18,411] Trial 33 finished with value: 0.8385 and parameters: {'max_depth': 7, 'learning_rate': 0.01822834684910135, 'subsample': 0.7892078213369778, 'colsample_bytree': 0.8241649308875245}. Best is trial 21 with value: 0.84.
[I 2025-11-15 18:05:21,202] Trial 34 finished with value: 0.8345 and parameters: {'max_depth': 7, 'learning_rate': 0.014887936148241007, 'subsample': 0.8151710637023682, 'colsample_bytree': 0.8109633746603037}. Best is trial 21 with value: 0.84.
[I 2025-11-15 18:05:29,874] Trial 35 finished with value: 0.837 and parameters: {'max_depth': 7, 'learning_rate': 0.018383287559310076, 'subsample': 0.7887893158592039, 'colsample_bytree': 0.8904479797874435}. Best is trial 21 with value: 0.84.
[I 2025-11-15 18:05:32,400] Trial 36 finished with value: 0.8365 and parameters: {'max_depth': 7, 'learning_rate': 0.03306796311363276, 'subsample': 0.8246649379917339, 'colsample_bytree': 0.7434808983336767}. Best is trial 21 with value: 0.84.
[I 2025-11-15 18:05

Best XGBoost Accuracy: 0.84


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

# --- define features ---
lgb_features = existing_features

# --- preprocessing ---
preprocessor_lgb = ColumnTransformer([
    ('num', StandardScaler(), lgb_features)
])

# --- base model ---
lgb_model = LGBMClassifier(
    boosting_type='dart',
    random_state=42
)

# --- pipeline for hyperparameter search ---
pipeline_lgb = Pipeline([
    ('preprocessor', preprocessor_lgb),
    ('classifier', lgb_model)
])

# --- hyperparameter space ---
param_dist_lgb = {
    'classifier__num_leaves': [31, 63, 127],
    'classifier__learning_rate': [0.03, 0.05, 0.1],
    'classifier__n_estimators': [300, 500],
    'classifier__max_depth': [10, -1],
    'classifier__feature_fraction': [0.8, 0.9, 1.0],
    'classifier__bagging_fraction': [0.8, 1.0]
}

# --- randomized search ---
search_lgb = RandomizedSearchCV(
    pipeline_lgb,
    param_distributions=param_dist_lgb,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42
)

# --- fit search ---
search_lgb.fit(X_train_split, y_train_split)

# --- extract best preprocessor and model ---
best_preprocessor = search_lgb.best_estimator_.named_steps['preprocessor']
best_model_params = search_lgb.best_params_
best_model = LGBMClassifier(
    boosting_type='dart',
    random_state=42,
    n_estimators=best_model_params['classifier__n_estimators'],
    num_leaves=best_model_params['classifier__num_leaves'],
    learning_rate=best_model_params['classifier__learning_rate'],
    max_depth=best_model_params['classifier__max_depth'],
    feature_fraction=best_model_params['classifier__feature_fraction'],
    bagging_fraction=best_model_params['classifier__bagging_fraction']
)

# --- preprocess data manually ---
X_train_scaled = best_preprocessor.transform(X_train_split)
X_val_scaled = best_preprocessor.transform(X_val_split)

# --- train with early stopping ---
best_model.fit(
    X_train_scaled, y_train_split,
    eval_set=[(X_val_scaled, y_val_split)],
    eval_metric='logloss',
)

# --- evaluate ---
lgb_preds = best_model.predict(X_val_scaled)
lgb_acc = accuracy_score(y_val_split, lgb_preds)
print("LightGBM")
print("Best Params:", best_model_params)
print(f"Validation Accuracy: {lgb_acc:.4f}")


[LightGBM] [Info] Number of positive: 4000, number of negative: 4000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6544
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 4000, number of negative: 4000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6544
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM
Best Params: {'classifier__num_leaves': 31, 'classifier__n_estimators': 300, 'classifier__max_depth': -1, 'classifier__



In [None]:
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# --- 1. Define all features for CatBoost ---
cat_features = existing_features

# Fill NaNs with 0 to prevent fold mismatch issues
X_train_cat = X_train_split[cat_features].fillna(0)
X_val_cat   = X_val_split[cat_features].fillna(0)

# --- 2. CatBoost model ---
cat_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    task_type='CPU',
    early_stopping_rounds=20
)

# --- 3. Optional hyperparameter search ---
from sklearn.model_selection import RandomizedSearchCV

param_dist_cat = {
    'depth': [6, 8, 10],
    'learning_rate': [0.03, 0.05, 0.1],
    'iterations': [300, 500, 800],
    'l2_leaf_reg': [3, 5, 7]
}

search_cat = RandomizedSearchCV(
    cat_model,
    param_distributions=param_dist_cat,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42
)

# --- 4. Fit search, pass eval_set directly ---
search_cat.fit(
    X_train_cat, y_train_split,
    eval_set=(X_val_cat, y_val_split)
)

cat_best = search_cat.best_estimator_
cat_acc = accuracy_score(y_val_split, cat_best.predict(X_val_cat))
print("CatBoost Best Accuracy:", cat_acc)
print("Best Params:", search_cat.best_params_)

# --- 5. Ensemble with your other models ---
from sklearn.ensemble import VotingClassifier




CatBoost Best Accuracy: 0.8355
Best Params: {'learning_rate': 0.03, 'l2_leaf_reg': 3, 'iterations': 800, 'depth': 8}


In [None]:
# --- 5. Ensemble with your other models ---
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(
    estimators=[
        ('xgb', best_xgb),
        ('lgb', best_model),
        ('cat', cat_best),
        ('rf', rf_best),
        ('log', log_best)
    ],
    voting='soft',
    weights=[3, 1, 3, 1, 3]
)

# --- 6. Fit ensemble ---
ensemble.fit(X_train_cat, y_train_split)
y_pred_ens = ensemble.predict(X_val_cat)
ens_acc = accuracy_score(y_val_split, y_pred_ens)
print(f"\nEnsemble Validation Accuracy: {ens_acc:.4f}")


In [None]:
# --- 8. Predict ---
test_pred = ensemble.predict(X_test_features[existing_features])

# Convert to int if needed
test_pred_int = test_pred.astype(int)

# --- 9. Create submission DataFrame ---
submission = pd.DataFrame({
    'battle_id': test_df_subset['battle_id'],
    'player_won': test_pred_int
})

# --- 10. Save to CSV ---
submission.to_csv('submission_olya.csv', index=False)
print("Submission saved to submission.csv")

In [None]:
final_model = ensemble

from Submission.submit import save_submission

save_submission(X_test_features[existing_features], final_model, name = 'olya_model')