In [1]:
# Columns handed to both model pipelines as their `categorical_features`.
categorical_features = ["p1_lead_name", "p2_lead_name"]

# Numeric columns handed to the logistic-regression pipeline.
logreg_numeric_features = [
    "lead_speed_diff", "hp_advantage_seen", "mons_revealed_diff",
    "team_status_diff", "end_boost_diff", "num_turns",
]

# Numeric columns handed to the XGBoost pipeline.
xgb_numeric_features = [
    "lead_type_adv", "lead_atk_diff", "lead_bulk_diff",
    "p1_team_avg_speed", "p1_team_avg_bulk",
    "p1_meta_threat_count", "p2_lead_is_meta_threat",
    "p1_lead_stay_duration", "p2_lead_forced_out",
    "first_ko_turn", "setup_advantage", "key_attack_adv",
    "weighted_status_diff",
]

In [None]:
from main import load_data
from Features.features_denise import create_specialist_features
import os
import pandas as pd
from utils.load_json import load_jsonl
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Hold-out configuration for the train/validation split below.
test_size = 0.2
random_state = 42

# The project loader returns the raw train and test frames.
train_df, test_df = load_data()

# Feature engineering: the same transformation is applied to both frames.
X_train_features = create_specialist_features(train_df)
X_test_features = create_specialist_features(test_df)

# Target: look up `player_won` by `battle_id`, in the row order of the
# engineered training features.
labels_by_battle = train_df.set_index('battle_id')['player_won']
y_train = labels_by_battle.loc[X_train_features.index]

# Stratified train/validation split of the engineered training data.
split_parts = train_test_split(
    X_train_features,
    y_train,
    test_size=test_size,
    random_state=random_state,
    stratify=y_train,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

print("Shapes:")
print(*(part.shape for part in split_parts))


# Now you can create pipelines and call optimizers



  from .autonotebook import tqdm as notebook_tqdm


Riga 4877 rimossa con successo.
âœ“ train.jsonl loaded successfully. Shape: (9996, 5)
âœ“ test.jsonl loaded successfully. Shape: (5000, 4)


Analisi 'Specialist': 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 9996/9996 [00:02<00:00, 3713.12it/s]
Analisi 'Specialist': 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [00:00<00:00, 5487.47it/s]

Shapes:
(7996, 21) (2000, 21) (7996,) (2000,)





In [3]:
from Models.pipeline import get_pipeline

"""
Available models and recommended scaler usage:

1. Logistic Regression ('logistic')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: C, penalty ('l1', 'l2'), class_weight

2. Random Forest ('random_forest')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features

3. XGBoost ('xgboost')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma

4. LightGBM ('lightgbm')
   - Recommended scaler: StandardScaler (default 'auto')
   - Key parameters: n_estimators, num_leaves, learning_rate, max_depth, feature_fraction, bagging_fraction, min_child_samples, lambda_l1, lambda_l2

5. CatBoost ('catboost')
   - Recommended scaler: StandardScaler (default 'auto')
   - Key parameters: depth, learning_rate, iterations, l2_leaf_reg, random_seed, task_type

6. Gradient Boosting ('gradient_boost')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, learning_rate, min_samples_split, min_samples_leaf, subsample
"""



"\nAvailable models and recommended scaler usage:\n\n1. Logistic Regression ('logistic')\n   - Recommended scaler: RobustScaler (default 'auto')\n   - Key parameters: C, penalty ('l1', 'l2'), class_weight\n\n2. Random Forest ('random_forest')\n   - Recommended scaler: RobustScaler (default 'auto')\n   - Key parameters: n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features\n\n3. XGBoost ('xgboost')\n   - Recommended scaler: RobustScaler (default 'auto')\n   - Key parameters: n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma\n\n4. LightGBM ('lightgbm')\n   - Recommended scaler: StandardScaler (default 'auto')\n   - Key parameters: n_estimators, num_leaves, learning_rate, max_depth, feature_fraction, bagging_fraction, min_child_samples, lambda_l1, lambda_l2\n\n5. CatBoost ('catboost')\n   - Recommended scaler: StandardScaler (default 'auto')\n   - Key parameters: depth, learning_rate, iterations, l2_leaf_reg, random_seed, task_type\n\n6. Gradi

In [4]:
from paramethers.cat_grid import param_grid as catboost_param_grid
from paramethers.gb_grid import param_grid as gradientboost_param_grid
from paramethers.lgb_grid import param_grid as lightgbm_param_grid
from paramethers.log_grid import param_grid as logistic_param_grid
from paramethers.rf_grid import param_grid as randomforest_param_grid
from paramethers.xgb_grid import param_grid as xgboost_param_grid


from optimisers.gridsearch_optimizer import run_grid_search
from optimisers.optuna_optimizer import optimize_optuna
from optimisers.randomsearch_optimizer import run_random_search


In [5]:

# Logistic Regression pipeline: built by the project helper from the
# logistic-regression numeric columns plus the shared categorical columns.
pipeline_logreg = get_pipeline(
    model_name='logistic',
    numerical_features=logreg_numeric_features,
    categorical_features=categorical_features,
    scaler='standard'  # or 'auto'
)

# Override the classifier's hyperparameters through the estimator API rather
# than by assigning attributes directly: `set_params` rejects unknown parameter
# names, so a typo raises instead of silently adding an unused attribute.
# The trailing `;` stops Jupyter from echoing the returned pipeline.
pipeline_logreg.set_params(
    classifier__C=10,
    classifier__penalty='l1',
);



In [6]:
# XGBoost pipeline: built by the project helper from the XGBoost numeric
# columns plus the shared categorical columns.
# NOTE(review): the scaler option is the string 'false', not the boolean
# False -- confirm this is the spelling get_pipeline expects.
pipeline_xgb = get_pipeline(
    model_name='xgboost',
    numerical_features=xgb_numeric_features,
    categorical_features=categorical_features,
    scaler='false'  # skip scaling for XGB numeric features
)

# Set XGB hyperparameters through the estimator API in a single call instead
# of assigning attributes one by one on the classifier step.
# The trailing `;` stops Jupyter from echoing the returned pipeline.
pipeline_xgb.set_params(
    classifier__n_estimators=600,
    classifier__learning_rate=0.03,
    classifier__max_depth=3,
    classifier__subsample=0.8,
    classifier__colsample_bytree=0.9,
    classifier__gamma=0.5,
);


In [7]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Search space for tuning the stacking ensemble.
# Keys use scikit-learn's nested `<name>__<step>__<param>` syntax:
#   * `final_estimator` is the meta-model of the StackingClassifier,
#   * `logreg` / `xgb` are the estimator names given in the StackingClassifier,
#   * `classifier` is the model step inside each base pipeline.
param_grid = {
    # Meta-model (final_estimator)
    "final_estimator__C": [0.1, 1, 10, 100],

    # 'logreg' base estimator
    "logreg__classifier__C": [0.1, 1, 10],
    "logreg__classifier__penalty": ["l1", "l2"],

    # 'xgb' base estimator
    "xgb__classifier__n_estimators": [200, 400, 600],
    "xgb__classifier__max_depth": [3, 4, 5],
    "xgb__classifier__learning_rate": [0.01, 0.03, 0.1],
}

# NOTE(review): this module-level instance is not used by the factory below
# (the factory builds its own meta-model); it is kept only so the name stays
# defined for any code that still references it.
meta_model = LogisticRegression(C=1.0, random_state=42)


def stacking_clf_factory():
    """Build a fresh, unfitted stacking ensemble of the two base pipelines.

    The ensemble stacks the XGBoost pipeline (named ``'xgb'``) and the
    logistic-regression pipeline (named ``'logreg'``) under a logistic
    regression meta-model, using 5-fold internal cross-validation to produce
    the meta-features.

    Each call clones the module-level ``pipeline_xgb`` and ``pipeline_logreg``
    so the returned ensemble owns independent copies. Without the clone every
    ensemble would share the same pipeline objects, and a nested
    ``set_params('xgb__classifier__...')`` on one ensemble (for example during
    hyperparameter search) would also change the global pipelines and every
    other ensemble built by this factory.

    Returns:
        StackingClassifier: a new, unfitted ensemble carrying the
        hyperparameters the base pipelines have at call time.
    """
    # Local import so this cell does not depend on an extra top-level import.
    from sklearn.base import clone

    return StackingClassifier(
        estimators=[
            ('xgb', clone(pipeline_xgb)),
            ('logreg', clone(pipeline_logreg)),
        ],
        final_estimator=LogisticRegression(C=1.0, random_state=42),
        passthrough=False,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )

# Build the ensemble that is cross-validated and then fitted below.
stacking_clf = stacking_clf_factory()

# 5-fold cross-validated accuracy of the whole stacking ensemble on the
# engineered training features.
print("🚀 Starting Cross-Validation for the Meta-Model...")
scores = cross_val_score(
    stacking_clf,
    X_train_features,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# "\n" (not "\\n"): the escaped form printed a literal backslash-n instead of
# a blank line before the results header.
print("\n--- ✅Cross-Validation Results ---")
print(f"Accuracy (mean): {scores.mean() * 100:.2f}%")
print(f"Standard Deviation: {scores.std() * 100:.2f}%")

# Fit on the full training set; as the last expression of the cell, the
# fitted estimator is also displayed.
stacking_clf.fit(X_train_features, y_train)



ðŸš€ Starting Cross-Validation for the Meta-Model...


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.1s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.3s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.2s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   32.2s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   33.2s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   33.2s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   34.4s finished


\n--- âœ…Cross-Validation Results ---
Accuracy (mean): 84.14%
Standard Deviation: 0.83%


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.6s finished


0,1,2
,estimators,"[('xgb', ...), ('logreg', ...)]"
,final_estimator,LogisticRegre...ndom_state=42)
,cv,5
,stack_method,'auto'
,n_jobs,-1
,passthrough,False
,verbose,1

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,8000

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [None]:
# final_model = stacking_clf

# from Submission.submit import save_submission
# import pandas as pd

# save_submission(X_test_features, final_model)

[INFO] Submission created: submissions/StackingClassifier_20251114_230408.csv


'submissions/StackingClassifier_20251114_230408.csv'

In [None]:
# Tune the stacking ensemble with the project's Optuna helper.
# It is given the ensemble factory, the train and validation splits, the
# search space defined in `param_grid`, and the number of trials; it returns
# three values, unpacked here as the best pipeline, its parameters and score.
# NOTE(review): the saved output of this cell reports "5 trials" while the
# code requests 50, so that output comes from an earlier run -- re-run to
# refresh it.
best_pipeline, best_params, best_score = optimize_optuna(
    stacking_clf_factory,
    X_train_split,
    y_train_split,
    X_val_split,
    y_val_split,
    param_grid,
    n_trials=50,
    # cv = 5,
    # verbose = 2
)


[23:04:58] INFO: Starting Optuna optimization for 5 trials...
[I 2025-11-14 23:04:58,327] A new study created in memory with name: no-name-af26be2a-91e7-4f29-889b-b91e8fa7ee32
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished
[23:05:02] INFO: Trial 1/5 - Accuracy: 0.8135 - Params: {'final_estimator__C': 1, 'logreg__classifier__C': 10, 'logreg__classifier__penalty': 'l2', 'xgb__classifier__n_estimators': 400, 'xgb__classifier__max_depth': 5, 'xgb__classifier__learning_rate': 0.01}
[I 2025-11-14 23:05:02,094] Trial 0 finished with value: 0.8135 and parameters: {'final_estimator__C': 1, 'logreg__classifier__C': 10, 'logreg__classifier__penalty': 'l2', 'xgb__classifier__n_estimators': 400, 'xgb__classifier__max_depth': 5, 'xgb__classifier

In [10]:
# Use the pipeline returned by the Optuna search as the final model.
# Requires the Optuna cell above to have been run in this kernel session.
final_model = best_pipeline

from Submission.submit import save_submission


# Write the submission for the engineered test features with the final model.
# The saved output shows the helper logging and returning the CSV path, which
# the cell displays because this call is its last expression.
save_submission(X_test_features, final_model)

[INFO] Submission created: submissions/StackingClassifier_20251114_230610.csv


'submissions/StackingClassifier_20251114_230610.csv'