In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file under the read-only Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv


In [2]:
# Run-wide settings read by the cells below.
seed = 1      # random_state for the CV splitter and the XGBoost models
folds = 5     # number of StratifiedKFold splits
k = 3         # cutoff used when scoring MAP@k
trial = 150   # number of Optuna trials (passed as n_trials)

In [3]:
# Competition data (playground-series-s5e6).
train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')

# Separate "Fertilizer Prediction" dataset, loaded from its own input directory.
train_org = pd.read_csv('/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv')

In [4]:
# Stack the competition training rows (minus their `id` column) on top of the
# extra dataset. ignore_index=True gives the combined frame a fresh 0..n-1
# index; without it both sources keep their own 0-based labels, so the same
# label appears twice and any label-based lookup or alignment is ambiguous.
train_new = pd.concat(
    [train.drop(columns='id'), train_org],
    axis=0,
    ignore_index=True,
)

In [5]:
# Split the combined frame into features and the target column.
X = train_new.drop(columns=['Fertilizer Name'])
y = train_new['Fertilizer Name']

# Fertilizer label -> integer class id used for training.
class_mapping = {
    '14-35-14': 0,
    '10-26-26': 1,
    '17-17-17': 2,
    '28-28': 3,
    '20-20': 4,
    'DAP': 5,
    'Urea': 6,
}
# Inverse lookup (class id -> label), derived from class_mapping so the two
# dictionaries cannot drift apart.
rev_class_mapping = {class_id: label for label, class_id in class_mapping.items()}

# Series.map turns any label that is missing from class_mapping into NaN.
# Stop here and name the offending labels rather than letting them through.
unmapped_labels = [label for label in y.dropna().unique() if label not in class_mapping]
if unmapped_labels:
    raise ValueError(f"Unmapped 'Fertilizer Name' values: {unmapped_labels}")

y = y.map(class_mapping)

In [6]:
def trial_domain_features(df, n_threshold=20, p_threshold=10, k_threshold=15):
    """Return a copy of ``df`` with three boolean nutrient-deficiency flags.

    Args:
        df: DataFrame containing the columns ``Nitrogen``, ``Phosphorous``
            and ``Potassium``.
        n_threshold: ``N_deficient`` is True where ``Nitrogen`` is below this.
        p_threshold: ``P_deficient`` is True where ``Phosphorous`` is below this.
        k_threshold: ``K_deficient`` is True where ``Potassium`` is below this.

    Returns:
        A new DataFrame with the added (or overwritten) columns
        ``N_deficient``, ``P_deficient`` and ``K_deficient``. The input frame
        is left unmodified.
    """
    # DataFrame.assign already works on a copy, so a single copy is made
    # instead of copying on entry and again on return.
    return df.assign(
        N_deficient=df['Nitrogen'] < n_threshold,
        P_deficient=df['Phosphorous'] < p_threshold,
        K_deficient=df['Potassium'] < k_threshold,
    )

# Rebind X to the feature frame with the three deficiency flag columns added.
X = trial_domain_features(X)

In [7]:
# All column labels of X at this point (the deficiency flags included).
cols = X.columns

In [8]:
# Cast every column listed in `cols` to pandas' category dtype; the XGBoost
# models below are constructed with enable_categorical=True.
# NOTE(review): `cols` is all of X.columns, so the numeric nutrient columns and
# the boolean deficiency flags are cast as well — confirm this is intended.
X[cols] = X[cols].astype('category')

In [None]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import optuna
from optuna.pruners import HyperbandPruner, MedianPruner
from optuna.samplers import TPESampler
from sklearn.metrics import make_scorer

def mapk_score(y_true, y_score, k=3):
    """Mean average precision at ``k`` for single-label predictions.

    Args:
        y_true: True class index for each sample.
        y_score: 2-D array-like of per-class scores, one row per sample;
            column positions are compared against the values in ``y_true``.
        k: Number of top-ranked classes considered per sample.

    Returns:
        The mean over samples of ``1 / rank`` of the true class within the
        top ``k`` columns (a sample contributes 0 when its true class is not
        among them).
    """
    # Column indices ordered from highest to lowest score, truncated to k.
    ranked = np.argsort(y_score, axis=1)
    top_k = ranked[:, ::-1][:, :k]

    # Hits at 0-based position `pos` each contribute 1 / (pos + 1).
    weighted_hits = sum(
        (top_k[:, pos] == y_true).sum() / (pos + 1)
        for pos in range(k)
    )
    return weighted_hits / len(y_score)

def xgb_objective(trial):
    """Optuna objective: cross-validated MAP@k of an XGBoost classifier.

    Samples one hyperparameter set from ``trial``, fits a model on each
    stratified fold of the notebook-level ``X`` / ``y`` with early stopping on
    that fold's validation split, and scores the validation predictions with
    ``mapk_score``. Also reads the notebook-level ``seed``, ``folds`` and ``k``.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate scores.

    Returns:
        Mean MAP@k across all folds.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner asks to stop the trial
            after an intermediate report.
    """
    params = {
        # --- fixed settings ---
        'tree_method': 'hist',
        # GPU execution is requested through 'device' alone. The legacy
        # 'predictor': 'gpu_predictor' entry is omitted because XGBoost
        # releases that accept 'device' select GPU prediction through it.
        'device': 'cuda',
        'random_state': seed,
        'objective': 'multi:softprob',
        'num_class': len(np.unique(y)),
        'verbosity': 0,
        'eval_metric': 'mlogloss',
        'enable_categorical': True,
        # grow_policy is fixed to depthwise, so max_leaves is always set to 0
        # alongside it (previously done in an always-true `if`).
        'grow_policy': 'depthwise',
        'max_leaves': 0,
        # Stop a fit once the eval_set metric has not improved for 100 rounds.
        'early_stopping_rounds': 100,

        # --- searched hyperparameters (broad ranges) ---
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 0, 2),

        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),

        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),

        'max_bin': trial.suggest_categorical('max_bin', [64, 128, 256]),
        'max_cat_threshold': trial.suggest_int('max_cat_threshold', 4, 32),
        'max_cat_to_onehot': trial.suggest_int('max_cat_to_onehot', 2, 6),
    }

    scores = []
    skf = StratifiedKFold(random_state=seed, n_splits=folds, shuffle=True)

    # Folds are numbered from 1 so the fold number doubles as the pruning step.
    for fold, (train_index, valid_index) in enumerate(skf.split(X, y), 1):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Create model
        model = xgb.XGBClassifier(**params)

        # Fit with early stopping monitored on this fold's validation split
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )

        # Get predictions and calculate MAP@k
        probas = model.predict_proba(X_valid)
        score = mapk_score(y_valid, probas, k=k)
        scores.append(score)

        # Report the running mean for pruning once at least 2 folds are done
        if fold >= 2:
            intermediate_score = np.mean(scores)
            trial.report(intermediate_score, fold)

            # Check if trial should be pruned
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

    return np.mean(scores)

# sampler = TPESampler(
#     n_startup_trials=30,
#     n_ei_candidates=100,
#     multivariate=True,
#     group=True,
#     warn_independent_sampling=False
# )

# pruner = MedianPruner(
#     n_startup_trials=100,      # Let the first n_startup_trials trials complete fully to establish a median baseline
#     n_warmup_steps=2,         # After the startup trials, start pruning after 2 folds (not before)
#     interval_steps=1          # Check for pruning after every fold (once warmup is done)
# )
# pruner = HyperbandPruner(
#     min_resource=2,
#     max_resource=folds,
#     reduction_factor=3,
#     bootstrap_count=5
# )

# Maximise the objective's return value (mean MAP@k across folds).
# The sampler is seeded from the notebook-level `seed` so the sequence of
# suggested hyperparameters is repeatable across runs; previously the sampler
# was created without a seed.
xgb_study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(n_startup_trials=10, seed=seed),
    pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=2),
    study_name='xgb_optimization'
)

# `trial` here is the trial-count setting from the config cell.
xgb_study.optimize(xgb_objective, n_trials=trial, show_progress_bar=True)

[I 2025-06-15 18:34:41,172] A new study created in memory with name: xgb_optimization


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-06-15 18:41:17,575] Trial 0 finished with value: 0.3016996078431372 and parameters: {'n_estimators': 3944, 'learning_rate': 0.22879932221072577, 'max_depth': 28, 'min_child_weight': 3.960708205594546, 'gamma': 1.4015859765469676, 'subsample': 0.7567931196448453, 'colsample_bytree': 0.868376974866955, 'reg_alpha': 0.32845445325463185, 'reg_lambda': 3.2568651335593497, 'max_bin': 256, 'max_cat_threshold': 31, 'max_cat_to_onehot': 3}. Best is trial 0 with value: 0.3016996078431372.


In [None]:
# Summarise the best trial of `xgb_study` (the study created by the
# optimisation cell). The result is stored as `best_trial` rather than `trial`
# so the integer `trial` setting used for n_trials is not overwritten.
best_trial = xgb_study.best_trial
print("Best trial:")
# The objective returns mean MAP@k, so label the value accordingly.
print(f"  MAP@{k}: {best_trial.value:.5f}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Train final model
# Start from the tuned hyperparameters and explicitly disable early stopping.
best_params = xgb_study.best_params.copy()
best_params.update({'early_stopping_rounds': None})