In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
/kaggle/input/optimal-fertilizers-features/training_data.csv
/kaggle/input/optimal-fertilizers-features/test_data.csv


In [2]:
# Shared configuration: random seed and number of cross-validation folds.
# Both are used as default arguments by the CV helpers defined later.
seed = 1
folds = 5

In [3]:
# Feature tables read from a separate dataset (presumably precomputed/engineered
# features for the competition's train and test rows -- confirm against the dataset).
X_new = pd.read_csv('/kaggle/input/optimal-fertilizers-features/training_data.csv')
test_new = pd.read_csv('/kaggle/input/optimal-fertilizers-features/test_data.csv')

# Competition training file; only the target column is taken from it here.
train_org = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
y = train_org['Fertilizer Name']

# X_new and y come from different files and are paired purely by row position
# further down, so fail fast if the row counts disagree.
if len(X_new) != len(train_org):
    raise ValueError(
        f'Feature table has {len(X_new)} rows but the training labels have '
        f'{len(train_org)} rows; they must align row-for-row.'
    )

In [4]:
# Fertilizer label -> integer class code used as the training target.
class_mapping = {
    '14-35-14': 0,
    '10-26-26': 1,
    '17-17-17': 2,
    '28-28': 3,
    '20-20': 4,
    'DAP': 5,
    'Urea': 6,
}

# Integer class code -> fertilizer label. Derived from class_mapping so the
# two tables cannot drift apart when a class is added or renumbered.
rev_class_mapping = {code: label for label, code in class_mapping.items()}

# Encode from the raw column rather than from `y` itself: `y = y.map(...)`
# would turn every value into NaN if this cell were executed a second time.
y = train_org['Fertilizer Name'].map(class_mapping)

# Series.map yields NaN for labels missing from the mapping; surface that
# immediately instead of letting NaN targets reach the models.
if y.isna().any():
    unknown_labels = sorted(
        train_org.loc[y.isna(), 'Fertilizer Name'].astype(str).unique()
    )
    raise ValueError(f'Labels missing from class_mapping: {unknown_labels}')

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

def mapk_score(y_true, y_score, k=3):
    """Mean average precision at ``k`` for single-label multiclass targets.

    Each row of ``y_score`` is ranked from highest to lowest score. A row
    contributes ``1 / rank`` (rank starting at 1) if its true label appears
    among the top ``k`` column indices, and 0 otherwise. The result is the
    mean contribution over all rows.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels, expressed as column indices of ``y_score``.
    y_score : array-like of shape (n_samples, n_classes)
        Per-class scores (e.g. probabilities).
    k : int, default=3
        Number of top-ranked predictions considered per row. Values larger
        than ``n_classes`` behave like ``n_classes``.

    Returns
    -------
    float
        The MAP@k value.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k}')

    # Compare positionally: drop any pandas index carried by the inputs.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # argsort is ascending, so the last k columns reversed are the k best
    # classes in best-first order.
    top_k = np.argsort(y_score, axis=1)[:, -k:][:, ::-1]

    total = 0.0
    # top_k may have fewer than k columns when k exceeds the number of classes.
    for rank in range(top_k.shape[1]):
        total += (top_k[:, rank] == y_true).sum() / (rank + 1)

    return total / len(y_score)

def cv_score_dict(X, y, model_dict, folds=folds, seed=seed, k=3):
    """Cross-validate several models and return each one's mean MAP@k.

    Every model is evaluated on the same shuffled, stratified folds. A fresh
    clone is fitted per fold, so the estimators in ``model_dict`` are left
    untouched (consistent with ``cv_score``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected with ``.iloc``.
    y : pandas.Series
        Targets aligned positionally with ``X``. They are compared against
        ``predict_proba`` column indices, so this assumes labels are encoded
        as those column positions (e.g. 0..n_classes-1).
    model_dict : dict
        Mapping of display name -> unfitted estimator exposing ``fit`` and
        ``predict_proba``.
    folds : int
        Number of stratified folds.
    seed : int
        Random state for the fold shuffling.
    k : int, default=3
        Cut-off passed to ``mapk_score``.

    Returns
    -------
    dict
        Mapping of model name -> mean MAP@k across folds.
    """
    skf = StratifiedKFold(random_state=seed, n_splits=folds, shuffle=True)
    results = {}

    for name, model in model_dict.items():
        print(f'current model being processed :{name}')
        scores = []
        for i, (train_index, valid_index) in enumerate(skf.split(X, y), 1):
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            # Fit a clone so no state leaks between folds or back to the caller.
            clf = clone(model)
            clf.fit(X_train, y_train)

            # Score the probabilities directly; this avoids relying on
            # make_scorer's `needs_proba` flag, which newer scikit-learn
            # releases no longer accept.
            score = mapk_score(y_valid, clf.predict_proba(X_valid), k=k)
            print(f'Score for {name} on fold {i} is {score}')
            scores.append(score)

        mean_score = np.mean(scores)
        results[name] = mean_score
        # Report the mean across folds, not the last fold's score.
        print(f'Mean score for {name} is {mean_score}')

    return results

def cv_score(X, y, model, folds=folds, seed=seed, k=3):
    """Return the mean MAP@k of ``model`` over shuffled stratified folds.

    A fresh clone of ``model`` is fitted on each training split and scored
    on the matching validation split with ``mapk_score``. The mean is also
    printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected with ``.iloc``.
    y : pandas.Series
        Targets aligned positionally with ``X``. They are compared against
        ``predict_proba`` column indices, so this assumes labels are encoded
        as those column positions (e.g. 0..n_classes-1).
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict_proba``; it is
        cloned, never fitted in place.
    folds : int
        Number of stratified folds.
    seed : int
        Random state for the fold shuffling.
    k : int, default=3
        Cut-off passed to ``mapk_score``.

    Returns
    -------
    float
        Mean MAP@k across the folds.
    """
    skf = StratifiedKFold(random_state=seed, n_splits=folds, shuffle=True)

    scores = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        clf = clone(model)
        clf.fit(X_train, y_train)

        # Score the probabilities directly instead of going through
        # make_scorer(needs_proba=True): that flag is not accepted by newer
        # scikit-learn releases, and make_scorer is not imported in this cell.
        scores.append(mapk_score(y_valid, clf.predict_proba(X_valid), k=k))

    mean_score = np.mean(scores)
    print(f'Mean CV score: {mean_score:.4f}')
    return mean_score

In [6]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tqdm.notebook import tqdm, trange  

def hill_climbing_forward_selection_continuous(X, y, model, foundational_features,
                                             folds=5, seed=1, k=3, iterations=2,
                                             verbose=True):
    """Greedy forward feature selection driven by cross-validated MAP@k.

    Starting from ``foundational_features``, each pass scores every remaining
    column of ``X`` when added to the current set (via ``cv_score``) and keeps
    the single best one if it strictly improves the current score. Passes
    repeat until no candidate improves. The selected set, score and remaining
    candidates persist across the outer ``iterations``.

    Note: each iteration already runs passes until nothing improves, so if
    ``cv_score`` is deterministic a later iteration is not expected to add
    anything and will trigger the early stop.

    Parameters
    ----------
    X : pandas.DataFrame
        Table holding the foundational and candidate feature columns.
    y : pandas.Series
        Targets, forwarded to ``cv_score``.
    model : estimator
        Estimator forwarded to ``cv_score``.
    foundational_features : iterable
        Column names that are always part of the feature set.
    folds, seed, k :
        Forwarded to ``cv_score``.
    iterations : int, default=2
        Maximum number of outer iterations.
    verbose : bool, default=True
        Print progress and show the per-feature progress bar.

    Returns
    -------
    tuple
        ``(features, score, history)``: the final feature list, its CV score,
        and a dict keyed ``'iter_<n>'`` with the per-iteration state
        (``features``, ``score``, ``added``, ``n_features``, ``improvement``).

    Raises
    ------
    ValueError
        If any foundational feature is not a column of ``X``.
    """
    all_features = list(X.columns)
    history = {}

    # Work on a list copy: accepts any iterable and never mutates the caller's.
    current_features = list(foundational_features)  # Grows continuously

    missing = set(current_features) - set(all_features)
    if missing:
        raise ValueError(f"Foundational features missing in X: {missing}")

    baseline_score = cv_score(X[current_features], y, model, folds, seed, k)

    # These persist across ALL iterations
    remaining_features = [f for f in all_features if f not in current_features]
    current_score = baseline_score

    if verbose:
        print(f"Starting hill climbing with {len(current_features)} foundational features")
        print(f"   Baseline MAP@{k}: {baseline_score:.5f}")
        print(f"   Total features available: {len(all_features)}")
        print(f"   Features to evaluate: {len(remaining_features)}")
        print("-" * 60)

    # Outer loop - each iteration continues from where the last left off
    for i in trange(iterations, desc="Iterations", total=iterations, position=0):
        if verbose:
            print(f"\nITERATION {i+1}/{iterations}")
            print(f"   Starting with {len(current_features)} features")
            print(f"   Current MAP@{k}: {current_score:.5f}")
            print(f"   Remaining to test: {len(remaining_features)}")
            print("-" * 30)

        added_in_iteration = []

        # Forward selection pass - continues from current state
        improved = True
        while improved and remaining_features:
            improved = False
            best_candidate = None
            best_candidate_score = current_score

            feature_progress = tqdm(remaining_features,
                                  desc=f"Iter {i+1} Features",
                                  leave=False,
                                  disable=not verbose,
                                  position=1)

            # Evaluate all remaining candidates
            for feature in feature_progress:
                candidate_set = current_features + [feature]
                candidate_score = cv_score(X[candidate_set], y, model, folds, seed, k)

                if candidate_score > best_candidate_score:
                    best_candidate_score = candidate_score
                    best_candidate = feature

            # Add the winner, if any. best_candidate is only set when a
            # candidate strictly beat current_score. Compare against None
            # rather than truthiness so falsy column names (0, '') still count.
            if best_candidate is not None:
                current_features.append(best_candidate)
                remaining_features.remove(best_candidate)
                added_in_iteration.append(best_candidate)
                improvement = best_candidate_score - current_score
                current_score = best_candidate_score
                improved = True

                if verbose:
                    print(f"➕ Added '{best_candidate}' | Δ+{improvement:.6f} | New MAP@{k}: {current_score:.6f}")
            else:
                if verbose:
                    print("No improving features found in this pass")

        # Record iteration history
        history[f'iter_{i+1}'] = {
            'features': current_features.copy(),
            'score': current_score,
            'added': added_in_iteration,
            'n_features': len(current_features),
            'improvement': current_score - baseline_score
        }

        if verbose:
            print(f"\nITERATION {i+1} SUMMARY:")
            print(f"   Features added this iteration: {len(added_in_iteration)}")
            print(f"   Total features now: {len(current_features)}")
            print(f"   Current MAP@{k}: {current_score:.5f}")
            print(f"   Total improvement: {current_score - baseline_score:+.5f}")
            if added_in_iteration:
                print(f"   Added this iteration: {', '.join(added_in_iteration)}")
            print("-" * 60)

        # Early stopping if no features were added
        if not added_in_iteration:
            if verbose:
                print(f"🛑 EARLY STOPPING: No features added in iteration {i+1}")
                print("   Algorithm has converged!")
            break

    if verbose:
        print("\n🏆 FINAL RESULTS:")
        print(f"   Total features: {len(current_features)}")
        print(f"   Final MAP@{k}: {current_score:.5f}")
        print(f"   Total improvement: {current_score - baseline_score:+.5f}")
        print(f"   Final feature set: {current_features}")

    return current_features, current_score, history


# Columns every candidate feature set starts from
# ('Temparature' sic -- presumably matches the CSV header; verify).
foundational = [
    'Temparature',
    'Humidity',
    'Moisture',
    'Soil Type',
    'Crop Type',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
]

# Estimator handed to the feature-selection routine.
model = xgb.XGBClassifier(
    random_state=seed,
    tree_method='hist',
    device='cuda',
)
# model = lgb.LGBMClassifier(random_state=seed, device='gpu', verbose=0)

# Run feature selection
# best_features, best_score, history = hill_climbing_forward_selection_continuous(
#     X_new.iloc[:100], y.iloc[:100], model, foundational_features=foundational,
#     folds=folds, seed=seed, k=3, iterations=2, verbose=True
# )

In [7]:
# best_features

In [8]:
# best_score

In [9]:
# history

In [10]:
# from sklearn.feature_selection import RFECV
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import make_scorer
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

# mapk_scorer = make_scorer(
#     mapk_score, 
#     needs_proba=True,
#     greater_is_better=True,
#     k=3
# )

# # model = XGBClassifier(random_state=seed, tree_method='hist', device='cuda')
# model = LGBMClassifier(random_state=seed, device='gpu')
# # model = CatBoostClassifier(random_state=seed, task_type='GPU')

# cv = StratifiedKFold(
#     n_splits=folds,
#     shuffle=True,
#     random_state=seed
# )

# rfecv = RFECV(
#     estimator=model,
#     step=1,
#     cv=cv,
#     scoring=mapk_scorer,
#     n_jobs=1,
#     verbose=1,
# )

# rfecv.fit(X_new, y)

# print('Optimal number of features :', rfecv.n_features_)
# print('Best CV Score :', max(rfecv.cv_results_['mean_test_score']))
# print('Selected features :', X_new.columns[rfecv.support_])

In [11]:
# cols = X_new.columns[rfecv.support_]

In [12]:
# Hard-coded feature set used to train the final model.
best_features = [
    # the eight foundational columns
    'Temparature',
    'Humidity',
    'Moisture',
    'Soil Type',
    'Crop Type',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
    # additional columns
    'Phosphorous_mean',
    'Potassium_std',
    'Moisture_mean',
    'Potassium_mean',
    'Nitrogen_deficient',
    'Temparature_std',
    'Potassium_deficient',
    'Potassium_median',
    'Temparature_mean',
]

In [13]:
import xgboost as xgb
import catboost as cb 
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier


# Candidate models keyed by display name (the shape cv_score_dict expects).
model_dict = {
    # XGBoost's logging parameter is `verbosity`; `verbose` is not an
    # XGBClassifier argument and would be forwarded as an unknown booster
    # parameter, which triggers a "Parameters ... are not used" warning.
    'xgb_1': xgb.XGBClassifier(random_state=seed, tree_method='hist', device='cuda', verbosity=0),
    'lgbm_1': lgb.LGBMClassifier(random_state=seed, device='gpu', verbose=0),
    'cat_1': cb.CatBoostClassifier(random_state=seed, task_type='GPU', verbose=0)
    # 'etr_1': ExtraTreesClassifier(random_state=seed, n_jobs=-1),
    # 'lgb_1': lgb.LGBMClassifier(random_state=seed),
    # 'hgb_1': HistGradientBoostingClassifier(random_state=seed, verbose=0,)
}

# results = cv_score(X_new[best_features], y, model)

In [14]:
from xgboost import XGBClassifier
# Final model: fit on the complete training table, restricted to the
# hand-picked feature columns.
xgbc = XGBClassifier(
    random_state=seed,
    tree_method='hist',
    device='cuda',
)
xgbc.fit(X_new[best_features], y)



In [15]:
# Load the competition's sample submission; its 'Fertilizer Name' column is
# overwritten with the model's predictions before the file is written out.
sample_sub = pd.read_csv('/kaggle/input/playground-series-s5e6/sample_submission.csv')

In [16]:
# Class probabilities for every test row (one column per class).
preds = xgbc.predict_proba(test_new[best_features])

# Negating the scores makes argsort rank best-first; keep the three best columns.
top3_indices = np.argsort(-preds, axis=1)[:, :3]

# Translate class codes back to fertilizer names, preserving rank order.
top3_labels = [
    [rev_class_mapping[class_idx] for class_idx in ranked_row]
    for ranked_row in top3_indices
]

# Each submission cell holds the three names separated by single spaces.
formatted_predictions = [' '.join(label_triplet) for label_triplet in top3_labels]

sample_sub['Fertilizer Name'] = formatted_predictions
sample_sub.to_csv('submission.csv', index=False)

In [17]:
sample_sub

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 10-26-26 28-28
1,750001,17-17-17 20-20 10-26-26
2,750002,14-35-14 10-26-26 28-28
3,750003,14-35-14 10-26-26 17-17-17
4,750004,20-20 28-28 10-26-26
...,...,...
249995,999995,14-35-14 17-17-17 28-28
249996,999996,14-35-14 17-17-17 20-20
249997,999997,14-35-14 10-26-26 DAP
249998,999998,28-28 17-17-17 10-26-26


In [18]:
# # Use the SAME model instance that RFECV used
# same_model = rfecv.estimator_

# def cv_score_same_model(X, y, model, folds=folds, seed=seed, k=3):
#     skf = StratifiedKFold(random_state=seed, n_splits=folds, shuffle=True)
    
#     scores = []
#     print('Using the exact same model instance as RFECV')
    
#     for i, (train_index, valid_index) in enumerate(skf.split(X, y), 1):
#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
#         model.fit(X_train, y_train)
#         probas = model.predict_proba(X_valid)
#         score = mapk_score(y_valid, probas, k=k)
#         print(f'Score on fold {i} is {score}')
#         scores.append(score)
    
#     mean_score = np.mean(scores)
#     print(f'Mean score is {mean_score}')
#     return mean_score

# # Test with the same model RFECV used
# result = cv_score_same_model(X_new[cols], y, same_model)