In [1]:
from main import load_data
from Features.features_kayo import create_advanced_features
import os
import pandas as pd
from utils.load_json import load_jsonl
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

test_size=0.2
random_state=42
train_df, test_df = load_data()

# Feature engineering
X_train_features = create_advanced_features(train_df)
X_test_features = create_advanced_features(test_df)

# Target
y_train = train_df.set_index('battle_id')['player_won'].loc[X_train_features.index]

# Train/val split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_features,
    y_train,
    test_size=test_size,
    random_state=random_state,
    stratify=y_train
)

print("Shapes:")
print(X_train_split.shape, X_val_split.shape, y_train_split.shape, y_val_split.shape)

  from .autonotebook import tqdm as notebook_tqdm


✓ Local environment detected. Loading data from: Data
Riga 4877 rimossa con successo.
✓ train.jsonl loaded successfully. Shape: (9996, 5)
✓ test.jsonl loaded successfully. Shape: (5000, 4)


Creazione features: 100%|██████████| 9996/9996 [00:02<00:00, 3345.43it/s]
Creazione features: 100%|██████████| 5000/5000 [00:01<00:00, 4569.09it/s]


Shapes:
(7996, 26) (2000, 26) (7996,) (2000,)


In [2]:
from Models.pipeline import get_pipeline

"""
Available models and recommended scaler usage:

1. Logistic Regression ('logistic')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: C, penalty ('l1', 'l2'), class_weight

2. Random Forest ('random_forest')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features

3. XGBoost ('xgboost')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma

4. LightGBM ('lightgbm')
   - Recommended scaler: StandardScaler (default 'auto')
   - Key parameters: n_estimators, num_leaves, learning_rate, max_depth, feature_fraction, bagging_fraction, min_child_samples, lambda_l1, lambda_l2

5. CatBoost ('catboost')
   - Recommended scaler: StandardScaler (default 'auto')
   - Key parameters: depth, learning_rate, iterations, l2_leaf_reg, random_seed, task_type

6. Gradient Boosting ('gradient_boost')
   - Recommended scaler: RobustScaler (default 'auto')
   - Key parameters: n_estimators, max_depth, learning_rate, min_samples_split, min_samples_leaf, subsample
"""

"\nAvailable models and recommended scaler usage:\n\n1. Logistic Regression ('logistic')\n   - Recommended scaler: RobustScaler (default 'auto')\n   - Key parameters: C, penalty ('l1', 'l2'), class_weight\n\n2. Random Forest ('random_forest')\n   - Recommended scaler: RobustScaler (default 'auto')\n   - Key parameters: n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features\n\n3. XGBoost ('xgboost')\n   - Recommended scaler: RobustScaler (default 'auto')\n   - Key parameters: n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma\n\n4. LightGBM ('lightgbm')\n   - Recommended scaler: StandardScaler (default 'auto')\n   - Key parameters: n_estimators, num_leaves, learning_rate, max_depth, feature_fraction, bagging_fraction, min_child_samples, lambda_l1, lambda_l2\n\n5. CatBoost ('catboost')\n   - Recommended scaler: StandardScaler (default 'auto')\n   - Key parameters: depth, learning_rate, iterations, l2_leaf_reg, random_seed, task_type\n\n6. Gradi

In [3]:
object_cols = ['p1_seen_pokemons', 'p2_seen_pokemons', 'p1_moves_used', 'p2_moves_used']


all_categorical_features = ['p1_lead_name', 'p2_lead_name']

kayo_features_list = [
    col for col in X_train_split.columns 
    if col not in object_cols and col not in all_categorical_features
]

print(f"Total de colunas no DataFrame (sem as de objeto): {len(kayo_features_list) + len(all_categorical_features)}")
print(f"Encontradas {len(kayo_features_list)} features numéricas.")
print(f"Encontradas {len(all_categorical_features)} features categóricas.")

pipeline_logistic = get_pipeline(
    model_name='logistic',
    numerical_features=kayo_features_list,        
    categorical_features=all_categorical_features,
    scaler='robust' 
)

Total de colunas no DataFrame (sem as de objeto): 26
Encontradas 24 features numéricas.
Encontradas 2 features categóricas.


In [4]:
# Correct way to change 'C' for LogisticRegression inside a pipeline
pipeline_logistic.set_params(classifier__C=1.0)


0,1,2
,steps,"[('preprocessor', ...), ('remove_constant_features', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,threshold,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,8000


In [5]:
# Train the model
pipeline_logistic.fit(X_train_split, y_train_split)

# Predict on validation set
val_preds = pipeline_logistic.predict(X_val_split)

# Evaluate
from sklearn.metrics import accuracy_score
val_acc = accuracy_score(y_val_split, val_preds)

print(f"Validation Accuracy: {val_acc:.4f}")

Validation Accuracy: 0.8050


In [6]:
from sklearn.metrics import classification_report, confusion_matrix

# Probabilities
val_probs = pipeline_logistic.predict_proba(X_val_split)[:,1]  # for binary classification

# Detailed metrics
print(classification_report(y_val_split, val_preds))
print(confusion_matrix(y_val_split, val_preds))


              precision    recall  f1-score   support

       False       0.80      0.81      0.81      1000
        True       0.81      0.80      0.80      1000

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.80      2000
weighted avg       0.81      0.81      0.80      2000

[[806 194]
 [196 804]]


In [7]:
param_grid = {
    'selectkbest__k': [40, 50, 60],
    'classifier__penalty': ['l1'],
    'classifier__C': [100, 500, 1000],
    'classifier__solver': ['liblinear'] 
}

In [8]:
logistic_param_grid = param_grid

In [9]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

# Get existing preprocessor and classifier from your pipeline
pipeline_logistic_base = get_pipeline(
    model_name='logistic',
    numerical_features=kayo_features_list,
    categorical_features=all_categorical_features,
    scaler='robust' 
)

preprocessor = pipeline_logistic_base.named_steps['preprocessor']
classifier = pipeline_logistic_base.named_steps['classifier']

# Rebuild pipeline with SelectKBest
tuning_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selectkbest', SelectKBest(score_func=f_classif)),
    ('classifier', classifier)
])

In [10]:
from paramethers.log_grid import param_grid as logistic_param_grid
from optimisers.gridsearch_optimizer import run_grid_search

# 3. Run Grid Search with the tuning pipeline
best_pipeline, best_params, best_score = run_grid_search(
    tuning_pipeline,  
    X_train_features, 
    y_train,          
    param_grid=logistic_param_grid,
    cv=10,
)

# 4. Display results
print("\n--- Risultati GridSearchCV ---")
print(f"Migliori parametri trovati: {best_params}")
print(f"Migliore Accuracy (media CV): {best_score:.4f}")

Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV 7/10] END classifier__C=1.0, classifier__class_weight=None, classifier__penalty=l1, classifier__solver=liblinear;, score=0.806 total time=  18.1s
[CV 3/10] END classifier__C=1.0, classifier__class_weight=None, classifier__penalty=l1, classifier__solver=liblinear;, score=0.832 total time=  18.2s
[CV 9/10] END classifier__C=1.0, classifier__class_weight=None, classifier__penalty=l1, classifier__solver=liblinear;, score=0.803 total time=  18.5s
[CV 10/10] END classifier__C=1.0, classifier__class_weight=None, classifier__penalty=l1, classifier__solver=liblinear;, score=0.842 total time=  18.6s
[CV 6/10] END classifier__C=1.0, classifier__class_weight=None, classifier__penalty=l1, classifier__solver=liblinear;, score=0.813 total time=  18.8s
[CV 8/10] END classifier__C=1.0, classifier__class_weight=None, classifier__penalty=l1, classifier__solver=liblinear;, score=0.841 total time=  18.4s
[CV 1/10] END classifier__C=1.0, cla

KeyboardInterrupt: 

In [12]:
final_model = best_pipeline

from Submission.submit import save_submission
import pandas as pd

save_submission(X_test_features, final_model)

[INFO] Submission created: submissions/Pipeline_20251115_030152.csv


'submissions/Pipeline_20251115_030152.csv'