In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the competition input folder.
input_root = '/kaggle/input/playground-series-s5e6'
for root_dir, _subdirs, file_names in os.walk(input_root):
    for full_path in (os.path.join(root_dir, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import optuna
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global RNG so NumPy-driven randomness repeats between runs
np.random.seed(42)

print("Libraries imported successfully!")
# Report the versions of the two libraries whose APIs this notebook leans on
for lib_label, lib_module in (("XGBoost", xgb), ("Optuna", optuna)):
    print(f"{lib_label} version: {lib_module.__version__}")


In [None]:
# MAP@3 implementation from Kaggle forum
def apk(actual, predicted, k=3):
    """Average precision at k.

    Only the first ``k`` entries of ``predicted`` are considered. A prediction
    counts as a hit when it appears in ``actual`` and has not already appeared
    earlier in the ranking. Returns 0.0 when ``actual`` is empty; otherwise the
    accumulated precision divided by ``min(len(actual), k)``.
    """
    ranked = predicted[:k] if len(predicted) > k else predicted

    hit_count = 0.0
    precision_sum = 0.0
    for position, candidate in enumerate(ranked):
        if candidate not in actual:
            continue
        if candidate in ranked[:position]:
            # repeated prediction: only its first occurrence may score
            continue
        hit_count += 1.0
        precision_sum += hit_count / (position + 1.0)

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """Mean average precision at k.

    Pairs each entry of ``actual`` with the entry of ``predicted`` at the same
    position, scores the pair with ``apk`` and returns the mean of the scores.
    """
    per_row_scores = []
    for truth, ranking in zip(actual, predicted):
        per_row_scores.append(apk(truth, ranking, k))
    return np.mean(per_row_scores)

def map3_score_from_proba(y_true, y_pred_proba):
    """Calculate MAP@3 from probability predictions.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class index of every sample (one label per sample).
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Per-class scores; column ``j`` is the score of class index ``j``.

    Returns
    -------
    float
        Mean over samples of ``1 / rank`` of the true class among the (up to)
        three highest-scoring classes, or 0 when it is not among them. With a
        single true label per sample this equals ``apk([label], top3, k=3)``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred_proba`` describe different sample counts.
    """
    proba = np.asarray(y_pred_proba)
    labels = np.asarray(y_true).reshape(-1, 1)
    if labels.shape[0] != proba.shape[0]:
        raise ValueError(
            f"y_true has {labels.shape[0]} samples but y_pred_proba has {proba.shape[0]}"
        )

    # Get top 3 predictions (highest score first)
    top3_indices = np.argsort(proba, axis=1)[:, ::-1][:, :3]

    # The argsort output holds distinct class indices, so at most one column per
    # row matches the label; weight that column by the reciprocal of its rank.
    # Done on whole arrays because this metric is evaluated on every CV fold.
    hits = top3_indices == labels
    rank_weights = 1.0 / np.arange(1, top3_indices.shape[1] + 1)

    return np.mean((hits * rank_weights).sum(axis=1))

print("MAP@3 evaluation functions defined!")


In [None]:
# Load data
DATA_DIR = '/kaggle/input/playground-series-s5e6'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")
print(f"Target distribution:")
print(train_df['Fertilizer Name'].value_counts(normalize=True).sort_index())

# Display basic info
print("\nTraining data info:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
train_df.info()
print("\nFirst few rows:")
print(train_df.head())


In [None]:
def create_ultra_competitive_features(df):
    """
    Advanced feature engineering based on Kaggle forum intelligence and our analysis

    Works on a copy of ``df``; the caller's frame is not modified. Every feature
    group is added only when the columns it needs are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. A misspelled ``'Temparature'`` column, if present,
        is renamed to ``'Temperature'``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.

    Notes
    -----
    NOTE(review): the ``*_cat`` columns use ``pd.cut(..., bins=20)``, so the bin
    edges come from the min/max of whichever frame is being transformed. Two
    frames with different value ranges therefore get different bins for the
    same raw value; confirm train/test/external ranges agree, or switch to
    shared edges.
    """
    df = df.copy()

    # Fix column name typo in dataset - 'Temparature' should be 'Temperature'
    if 'Temparature' in df.columns:
        df = df.rename(columns={'Temparature': 'Temperature'})

    # 1. CATEGORICAL VERSIONS OF ALL NUMERICAL FEATURES (PROVEN +0.006)
    # This is the single most important technique from the forum
    numerical_cols = ['Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Phosphorous', 'Potassium']
    for col in numerical_cols:
        if col in df.columns:  # Safety check
            # Create categorical bins (integer bin index per row)
            df[f'{col}_cat'] = pd.cut(df[col], bins=20, labels=False, duplicates='drop')

    # 2. CONSTANT FEATURE (PROVEN +0.005)
    # Simple but effective technique
    df['const'] = 1

    # 3. ENVIRONMENTAL FEATURES (PROVEN)
    # env_max is specifically mentioned as effective
    env_cols = [col for col in ['Temperature', 'Humidity', 'Moisture'] if col in df.columns]
    if len(env_cols) >= 2:
        df['env_max'] = df[env_cols].max(axis=1)
        df['env_min'] = df[env_cols].min(axis=1)
        df['env_range'] = df['env_max'] - df['env_min']
        df['climate_comfort'] = df[env_cols].mean(axis=1)

    if 'Temperature' in df.columns and 'Humidity' in df.columns:
        df['temp_humidity_index'] = df['Temperature'] * df['Humidity'] / 100

    # 4. NPK RATIOS (HIDDEN SIGNAL - CRITICAL)
    # Forum emphasizes ratios over absolute values
    epsilon = 1e-8  # Avoid division by zero
    npk_cols = ['Nitrogen', 'Phosphorous', 'Potassium']

    if all(col in df.columns for col in npk_cols):
        df['N_P_ratio'] = df['Nitrogen'] / (df['Phosphorous'] + epsilon)
        df['N_K_ratio'] = df['Nitrogen'] / (df['Potassium'] + epsilon)
        df['P_K_ratio'] = df['Phosphorous'] / (df['Potassium'] + epsilon)
        df['Total_NPK'] = df['Nitrogen'] + df['Phosphorous'] + df['Potassium']
        df['NPK_balance'] = df[npk_cols].std(axis=1)

        # Clip extreme ratios for stability (mentioned in forum); a zero
        # denominator otherwise yields values around 1e8 * numerator.
        ratio_cols = ['N_P_ratio', 'N_K_ratio', 'P_K_ratio']
        for col in ratio_cols:
            df[col] = np.clip(df[col], 0, 100)

        # NPK dominance features (share of each nutrient in the total)
        df['N_dominance'] = df['Nitrogen'] / (df['Total_NPK'] + epsilon)
        df['P_dominance'] = df['Phosphorous'] / (df['Total_NPK'] + epsilon)
        df['K_dominance'] = df['Potassium'] / (df['Total_NPK'] + epsilon)

    # 5. TEMPERATURE SUITABILITY (PROVEN DOMAIN KNOWLEDGE FEATURE)
    if 'Temperature' in df.columns and 'Crop Type' in df.columns:
        crop_temp_map = {
            'Sugarcane': (26, 35), 'Maize': (25, 32), 'Wheat': (20, 30),
            'Paddy': (25, 35), 'Cotton': (25, 35), 'Tobacco': (20, 30),
            'Barley': (15, 25), 'Millets': (25, 35), 'Pulses': (20, 30),
            'Oil seeds': (20, 30), 'Ground Nuts': (25, 32)
        }
        default_low, default_high = 25, 32  # range for crops missing from the map

        # Look up each row's (low, high) range with whole-column operations
        # instead of a row-wise df.apply, which costs one Python call per row.
        crop_names = df['Crop Type'].astype(object)
        temp_low = crop_names.map(
            {crop: bounds[0] for crop, bounds in crop_temp_map.items()}
        ).fillna(default_low)
        temp_high = crop_names.map(
            {crop: bounds[1] for crop, bounds in crop_temp_map.items()}
        ).fillna(default_high)

        # 1 when low <= Temperature <= high (both ends inclusive), else 0
        df['temp_suitability'] = df['Temperature'].between(temp_low, temp_high).astype(int)

    # 6. CROP-SOIL INTERACTIONS (HIGH IMPORTANCE FROM OUR ANALYSIS)
    if 'Crop Type' in df.columns and 'Soil Type' in df.columns:
        df['Crop_Soil_combo'] = df['Crop Type'].astype(str) + '_' + df['Soil Type'].astype(str)

    # 7. ADDITIONAL ADVANCED FEATURES
    # Environmental stress indicators (absolute distance from a fixed reference)
    if 'Temperature' in df.columns:
        df['temp_stress'] = np.abs(df['Temperature'] - 30)  # 30°C as optimal
    if 'Humidity' in df.columns:
        df['humidity_stress'] = np.abs(df['Humidity'] - 60)  # 60% as optimal
    if 'Moisture' in df.columns:
        df['moisture_stress'] = np.abs(df['Moisture'] - 45)  # 45% as optimal

    # Nutrient efficiency features
    if 'Nitrogen' in df.columns and 'Temperature' in df.columns:
        df['N_efficiency'] = df['Nitrogen'] / (df['Temperature'] + epsilon)
    if 'Phosphorous' in df.columns and 'Humidity' in df.columns:
        df['P_efficiency'] = df['Phosphorous'] / (df['Humidity'] + epsilon)
    if 'Potassium' in df.columns and 'Moisture' in df.columns:
        df['K_efficiency'] = df['Potassium'] / (df['Moisture'] + epsilon)

    return df

print("Advanced feature engineering function defined!")
print("Key techniques implemented:")
print("✓ Categorical versions of all numerical features (+0.006)")
print("✓ Constant feature (+0.005)")
print("✓ Environmental max feature (proven)")
print("✓ NPK ratios (hidden signal)")
print("✓ Temperature suitability (domain knowledge)")
print("✓ Crop-soil interactions (high importance)")
print("✓ Additional advanced features")


In [None]:
# Apply feature engineering
print("Applying ultra-competitive feature engineering...")

# Split the raw training frame into target and predictors
y_train = train_df['Fertilizer Name']
X_train = train_df.drop(columns=['Fertilizer Name', 'id'], errors='ignore')

# Build the engineered feature frame
X_train_engineered = create_ultra_competitive_features(X_train)

n_raw_features = X_train.shape[1]
n_engineered_features = X_train_engineered.shape[1]
print(f"Original features: {n_raw_features}")
print(f"Engineered features: {n_engineered_features}")
print(f"Features added: {n_engineered_features - n_raw_features}")

# Encode target variable as integer class indices
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

class_names = label_encoder.classes_
print(f"\nTarget classes: {len(class_names)}")
print(f"Target mapping: { {name: idx for idx, name in enumerate(class_names)} }")

# Handle categorical variables for XGBoost: cast the string columns to category
categorical_cols = ['Crop Type', 'Soil Type', 'Crop_Soil_combo']
for cat_col in (c for c in categorical_cols if c in X_train_engineered.columns):
    X_train_engineered[cat_col] = X_train_engineered[cat_col].astype('category')

print(f"\nFinal training data shape: {X_train_engineered.shape}")
print("Feature engineering completed successfully!")


In [None]:
# CORRECTED: Load original dataset from the correct path
#
# Outputs of this cell: X_orig_engineered (features) and y_orig_encoded (encoded
# target). Both stay None whenever the external data is missing or unusable, and
# downstream cells treat None as "no original data".
X_orig_engineered = None
y_orig_encoded = None

try:
    print("Attempting to load original dataset...")
    # The original dataset is actually in the same competition folder or external datasets
    # Based on forum discussions, there should be an original 100-sample dataset

    # Candidate locations, tried in order; the first readable file wins
    original_paths = [
        '/kaggle/input/playground-series-s5e6/original_dataset.csv',  # Sometimes included in competition
        '/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv',  # External dataset if added
        '/kaggle/input/fertilizer-recommendation/Fertilizer Prediction.csv',
        '/kaggle/input/fertilizer-dataset/Fertilizer Prediction.csv',
        '/kaggle/input/fertilizer-prediction-original/Fertilizer Prediction.csv'
    ]

    original_df = None
    for path in original_paths:
        try:
            original_df = pd.read_csv(path)
            print(f"✓ Original dataset loaded from: {path}")
            print(f"Original dataset shape: {original_df.shape}")
            break
        except FileNotFoundError:
            continue
        except Exception as e:
            print(f"Error trying {path}: {e}")
            continue

    if original_df is None:
        print("⚠ Original dataset not found in any expected location")
        print("Continuing without original dataset - this is normal if not added as external data")
        print("To use original dataset, add it as an external dataset in Kaggle")
    else:
        # Process original dataset
        print("Processing original dataset...")

        # Headers of an external CSV may carry stray whitespace; strip it so the
        # column names can match the competition columns exactly.
        original_df.columns = original_df.columns.str.strip()

        # Check if it has the same structure as competition data
        print(f"Original dataset columns: {list(original_df.columns)}")

        # Remove id column if it exists, keep target
        id_cols = ['id', 'Id', 'ID']
        original_df = original_df.drop(columns=[c for c in id_cols if c in original_df.columns])

        # Separate features and target
        target_col = 'Fertilizer Name'
        if target_col not in original_df.columns:
            print(f"⚠ Target column '{target_col}' not found in original dataset")
            print(f"Available columns: {list(original_df.columns)}")
        else:
            X_orig = original_df.drop([target_col], axis=1)
            y_orig = original_df[target_col]

            print(f"Original features shape: {X_orig.shape}")
            print(f"Original target shape: {y_orig.shape}")
            print(f"Original target classes: {y_orig.unique()}")

            # Apply same feature engineering
            orig_features = create_ultra_competitive_features(X_orig)

            # The frame is later concatenated with training folds, so it must
            # provide exactly the training feature set.
            missing_cols = [c for c in X_train_engineered.columns if c not in orig_features.columns]
            if missing_cols:
                print(f"⚠ Original dataset lacks training features: {missing_cols}")
                print("Skipping original dataset to keep training features consistent")
            else:
                orig_features = orig_features[list(X_train_engineered.columns)].copy()

                # Handle categorical variables: reuse the *training* category
                # dtypes. A separate astype('category') would build its own
                # category set, and pd.concat of categoricals with different
                # categories degrades the column to object dtype. Values unseen
                # in training become NaN (missing).
                categorical_cols = ['Crop Type', 'Soil Type', 'Crop_Soil_combo']
                for col in categorical_cols:
                    if col in orig_features.columns:
                        orig_features[col] = orig_features[col].astype(X_train_engineered[col].dtype)

                # Encode target (make sure all classes exist in label_encoder)
                try:
                    orig_target = label_encoder.transform(y_orig)
                except ValueError as e:
                    print(f"⚠ Target encoding failed: {e}")
                    print("This might be due to new classes in original dataset")
                    print(f"Competition classes: {label_encoder.classes_}")
                    print(f"Original classes: {y_orig.unique()}")
                else:
                    # Publish the results only once every step has succeeded
                    X_orig_engineered = orig_features
                    y_orig_encoded = orig_target
                    print(f"✓ Original dataset processed successfully: {X_orig_engineered.shape}")

except Exception as e:
    # Best effort: the external data is optional, so report and carry on without it
    print(f"⚠ Unexpected error loading original dataset: {e}")
    X_orig_engineered = None
    y_orig_encoded = None

print(f"Original dataset integration: {'✓ Success' if X_orig_engineered is not None else '✗ Not available'}")


In [None]:
# Data expansion technique (proven from forum)
def expand_training_data(X, y, expansion_factor=2):
    """
    Stack ``expansion_factor`` identical copies of the training data.

    Returns ``(X_expanded, y_expanded)``: the feature frame repeated end to end
    with a fresh 0..n-1 index, and the target values repeated in the same order
    as a NumPy array. The inputs themselves are left untouched.
    """
    print(f"Expanding training data by factor of {expansion_factor}...")

    repeats = range(expansion_factor)
    feature_copies = [X.copy() for _ in repeats]
    target_copies = [y.copy() for _ in repeats]

    X_expanded = pd.concat(feature_copies, ignore_index=True)
    y_expanded = np.concatenate(target_copies)

    print(f"Original size: {len(X)} -> Expanded size: {len(X_expanded)}")
    return X_expanded, y_expanded

# Apply aggressive data expansion (4x as requested): every training row and its
# label end up in the expanded set four times.
X_train_expanded, y_train_expanded = expand_training_data(
    X=X_train_engineered,
    y=y_train_encoded,
    expansion_factor=4,
)

print("Aggressive 4x data expansion completed!")


In [None]:
# Feature Selection - Remove noisy features, keep top performers
def select_top_features(X, y, n_features=50, use_gpu=True):
    """
    Rank features with a quick XGBoost model and keep the best ``n_features``.

    Fits a 100-tree classifier on ``(X, y)`` and orders the columns of ``X`` by
    the model's ``feature_importances_`` (highest first).

    Returns ``(top_features, importance_df)``: the names of the first
    ``n_features`` columns in that order, and the full sorted table with
    ``feature`` / ``importance`` columns.
    """
    print(f"Performing feature selection to keep top {n_features} features...")

    # Settings of the throw-away ranking model
    selector_params = dict(
        objective='multi:softprob',
        num_class=7,
        n_estimators=100,  # Fast for feature selection
        max_depth=6,
        random_state=42,
        verbosity=0,
        enable_categorical=True,  # needed for the category-dtype columns
        tree_method='gpu_hist' if use_gpu else 'hist',
    )
    # Device-specific option: pin GPU 0, or use every CPU core
    if use_gpu:
        selector_params['gpu_id'] = 0
    else:
        selector_params['n_jobs'] = -1
    temp_model = xgb.XGBClassifier(**selector_params)

    print("Training feature selection model...")
    temp_model.fit(X, y)

    # Importance table, most important feature first
    importance_df = pd.DataFrame(
        {'feature': X.columns, 'importance': temp_model.feature_importances_}
    ).sort_values('importance', ascending=False)

    top_features = importance_df['feature'].head(n_features).tolist()

    print(f"Selected top {len(top_features)} features:")
    print("Top 10:", top_features[:10])

    return top_features, importance_df

# Check GPU availability for feature selection
# Constructing an XGBClassifier only stores its parameters and never touches the
# device, so it cannot reveal a missing GPU. Fit a single tree on a tiny dataset
# instead: if GPU training is unusable, that is where the error is raised.
try:
    test_model = xgb.XGBClassifier(
        tree_method='gpu_hist', gpu_id=0, enable_categorical=True,
        n_estimators=1, verbosity=0
    )
    test_model.fit(np.array([[0.0], [1.0], [0.0], [1.0]]), np.array([0, 1, 0, 1]))
    use_gpu_fs = True
    print("✓ Using GPU for feature selection")
except Exception:
    # Any failure of the probe (library error, missing CUDA build, ...) means
    # GPU training cannot be relied on here; fall back to the CPU.
    use_gpu_fs = False
    print("⚠ Using CPU for feature selection")

# Apply feature selection with fixed categorical handling
top_features, feature_importance_df = select_top_features(
    X_train_expanded,
    y_train_expanded,
    n_features=50,
    use_gpu=use_gpu_fs,
)
# Keep only the selected columns, ordered from most to least important
X_train_selected = X_train_expanded.loc[:, top_features]

n_before, n_after = X_train_expanded.shape[1], X_train_selected.shape[1]
print(f"Feature selection completed: {n_before} -> {n_after} features")


In [None]:
# Enhanced Optuna optimization with original dataset integration
def create_enhanced_xgboost_objective(X, y, X_orig=None, y_orig=None, n_splits=3, use_gpu=True):
    """
    Enhanced Optuna objective function with original dataset integration
    Uses MAP@3 as the optimization metric

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; may contain repeated rows (e.g. after data expansion).
    y : array-like
        Integer-encoded target aligned with the rows of ``X``.
    X_orig, y_orig : optional
        Extra data appended to every training fold only, never to validation.
        ``X_orig`` must contain every column of ``X``.
    n_splits : int
        Number of stratified folds.
    use_gpu : bool
        Selects ``tree_method='gpu_hist'`` with ``gpu_id=0``, else ``'hist'``.

    Returns
    -------
    callable
        ``objective(trial) -> float``: mean validation MAP@3 over the folds.

    Notes
    -----
    * Folds are assigned per distinct feature row, so every copy of a repeated
      row lands in the same fold. Splitting over the rows directly would put
      copies of the same sample in both training and validation, which inflates
      the score and defeats early stopping.
    * ``early_stopping_rounds`` is given to the estimator constructor, the form
      supported from xgboost 1.6 on; newer releases no longer accept it as a
      ``fit()`` argument.
    """
    y = np.asarray(y)

    have_orig = X_orig is not None and y_orig is not None
    if have_orig:
        # Same columns, same order as X, so the concat below cannot add columns
        X_orig = X_orig[list(X.columns)]

    # Group identical feature rows via a per-row hash, then stratify the
    # distinct rows (labelled by their first occurrence) into folds. The seed is
    # fixed, so the folds are identical for every trial and are built once.
    row_hashes = pd.util.hash_pandas_object(X, index=False).to_numpy()
    _, first_rows, group_of_row = np.unique(row_hashes, return_index=True, return_inverse=True)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = []
    for _, val_groups in skf.split(np.zeros((len(first_rows), 1)), y[first_rows]):
        group_is_val = np.zeros(len(first_rows), dtype=bool)
        group_is_val[val_groups] = True
        row_is_val = group_is_val[group_of_row]
        folds.append((np.flatnonzero(~row_is_val), np.flatnonzero(row_is_val)))

    def objective(trial):
        # Suggest hyperparameters
        params = {
            'objective': 'multi:softprob',
            'num_class': 7,
            'eval_metric': 'mlogloss',
            'tree_method': 'gpu_hist' if use_gpu else 'hist',
            'enable_categorical': True,
            'random_state': 42,
            'verbosity': 0,
            'early_stopping_rounds': 50,  # monitored on the fold's eval_set

            # Hyperparameters to optimize
            'max_depth': trial.suggest_int('max_depth', 4, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        }

        if use_gpu:
            params['gpu_id'] = 0
        else:
            params['n_jobs'] = -1

        cv_scores = []
        for train_idx, val_idx in folds:
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y[train_idx], y[val_idx]

            # Add original dataset ONLY to training fold (CRITICAL)
            if have_orig:
                X_tr = pd.concat([X_tr, X_orig], ignore_index=True)
                y_tr = np.concatenate([y_tr, y_orig])

            # Train model
            model = xgb.XGBClassifier(**params)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            # Predict and calculate MAP@3
            y_pred_proba = model.predict_proba(X_val)
            map3_score = map3_score_from_proba(y_val, y_pred_proba)
            cv_scores.append(map3_score)

        return np.mean(cv_scores)

    return objective

print("Optuna objective function created!")
print("Will optimize XGBoost hyperparameters using MAP@3 metric")


In [None]:
# Run Optuna optimization
print("Starting Optuna hyperparameter optimization...")
print("This will take some time but will find the best parameters automatically")

# Check if GPU is available
# Constructing an XGBClassifier only stores its parameters and never touches the
# device, so probe by fitting a single tree on a tiny dataset: if GPU training
# is unusable, the fit is where the error is raised.
try:
    test_model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0, n_estimators=1, verbosity=0)
    test_model.fit(np.array([[0.0], [1.0], [0.0], [1.0]]), np.array([0, 1, 0, 1]))
    use_gpu = True
    print("✓ GPU detected and will be used for training")
except Exception:
    # Any failure of the probe means GPU training cannot be relied on here
    use_gpu = False
    print("⚠ GPU not available, using CPU training")

# Create enhanced objective function with all improvements
objective = create_enhanced_xgboost_objective(
    X_train_selected,  # feature-selected data
    y_train_expanded,
    X_orig=X_orig_engineered,  # original dataset (None when unavailable)
    y_orig=y_orig_encoded,
    n_splits=3,  # Fast CV for optimization
    use_gpu=use_gpu,
)

# Create study: seeded TPE sampler, maximising the objective's return value
tpe_sampler = optuna.samplers.TPESampler(seed=42)
median_pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler, pruner=median_pruner)

# Run optimization
n_trials = 10  # Reduced to avoid overfitting
print(f"Running {n_trials} optimization trials...")
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

# Get best parameters
best_params, best_score = study.best_params, study.best_value

print(f"\n🏆 Optimization completed!")
print(f"Best MAP@3 score: {best_score:.6f}")
print(f"Best parameters:")
for param_name, chosen_value in best_params.items():
    print(f"  {param_name}: {chosen_value}")


In [None]:
# Detailed cross-validation with best parameters
print("Performing detailed cross-validation with optimized parameters...")

# Create final model configuration: fixed settings first, tuned values applied
# afterwards so that Optuna's choices win on any shared key.
final_params = dict(
    objective='multi:softprob',
    num_class=7,
    eval_metric='mlogloss',
    tree_method='gpu_hist' if use_gpu else 'hist',
    enable_categorical=True,
    random_state=42,
    verbosity=0,
)
final_params.update(best_params)

# Device-specific option: pin GPU 0, or use every CPU core
device_key, device_value = ('gpu_id', 0) if use_gpu else ('n_jobs', -1)
final_params[device_key] = device_value

# Enhanced Multi-Seed Ensemble Cross-Validation
def multi_seed_cv(X, y, X_orig=None, y_orig=None, params=None, seeds=[42, 123, 456], n_splits=10):
    """
    Multi-seed ensemble cross-validation for better stability

    For every seed, runs a stratified ``n_splits``-fold CV with that seed used
    both for the split and as the model's ``random_state``, and collects
    out-of-fold (OOF) class probabilities. The per-seed OOF matrices are then
    averaged into an ensemble.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; may contain repeated rows (e.g. after data expansion).
    y : array-like
        Integer-encoded target aligned with the rows of ``X``.
    X_orig, y_orig : optional
        Extra data appended to every training fold only, never to validation.
        ``X_orig`` must contain every column of ``X``.
    params : dict, optional
        ``XGBClassifier`` keyword arguments (``None`` means library defaults).
        The dict passed in is not modified.
    seeds : sequence of int
        One CV run per seed.
    n_splits : int
        Number of folds per run.

    Returns
    -------
    tuple
        ``(ensemble_oof, all_oof_predictions, ensemble_cv_score)``: averaged OOF
        probabilities, the list of per-seed OOF matrices, and the MAP@3 of the
        averaged probabilities.

    Notes
    -----
    * Folds are assigned per distinct feature row, so every copy of a repeated
      row lands in the same fold; otherwise copies of a validation sample would
      also be in the training fold and the scores would be inflated.
    * ``early_stopping_rounds`` is set on the estimator (supported from
      xgboost 1.6 on); newer releases no longer accept it as a ``fit()`` argument.
    """
    y = np.asarray(y)
    base_params = dict(params or {})

    have_orig = X_orig is not None and y_orig is not None
    if have_orig:
        # Same columns, same order as X, so the concat below cannot add columns
        X_orig = X_orig[list(X.columns)]

    # Identify groups of identical feature rows once; each seed only reshuffles
    # how the groups are distributed over the folds.
    row_hashes = pd.util.hash_pandas_object(X, index=False).to_numpy()
    _, first_rows, group_of_row = np.unique(row_hashes, return_index=True, return_inverse=True)
    group_labels = y[first_rows]
    group_placeholder = np.zeros((len(first_rows), 1))

    all_oof_predictions = []
    all_cv_scores = []

    for seed_idx, seed in enumerate(seeds):
        print(f"\n🌱 Training with seed {seed} ({seed_idx+1}/{len(seeds)})")

        # Update params with current seed
        current_params = base_params.copy()
        current_params['random_state'] = seed
        current_params.setdefault('early_stopping_rounds', 100)

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        cv_scores = []
        oof_predictions = None  # allocated once the number of classes is known

        for fold, (_, val_groups) in enumerate(skf.split(group_placeholder, group_labels)):
            # Translate the group-level split back to row positions
            group_is_val = np.zeros(len(first_rows), dtype=bool)
            group_is_val[val_groups] = True
            row_is_val = group_is_val[group_of_row]
            train_idx = np.flatnonzero(~row_is_val)
            val_idx = np.flatnonzero(row_is_val)

            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y[train_idx], y[val_idx]

            # Add original dataset ONLY to training fold
            if have_orig:
                X_tr = pd.concat([X_tr, X_orig], ignore_index=True)
                y_tr = np.concatenate([y_tr, y_orig])

            # Train model
            model = xgb.XGBClassifier(**current_params)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            # Predict
            y_pred_proba = model.predict_proba(X_val)
            if oof_predictions is None:
                # Size the buffer from the model output instead of assuming 7 classes
                oof_predictions = np.zeros((len(X), y_pred_proba.shape[1]))
            oof_predictions[val_idx] = y_pred_proba

            # Calculate MAP@3
            map3_score = map3_score_from_proba(y_val, y_pred_proba)
            cv_scores.append(map3_score)

        seed_cv_score = np.mean(cv_scores)
        print(f"Seed {seed} CV MAP@3: {seed_cv_score:.6f}")

        all_oof_predictions.append(oof_predictions)
        all_cv_scores.append(seed_cv_score)

    # Ensemble predictions (average across seeds)
    ensemble_oof = np.mean(all_oof_predictions, axis=0)
    ensemble_cv_score = map3_score_from_proba(y, ensemble_oof)

    print(f"\n🏆 Multi-Seed Ensemble Results:")
    print(f"Individual seed scores: {[f'{score:.6f}' for score in all_cv_scores]}")
    print(f"Ensemble MAP@3: {ensemble_cv_score:.6f}")
    print(f"Mean individual: {np.mean(all_cv_scores):.6f}")
    print(f"Ensemble improvement: +{ensemble_cv_score - np.mean(all_cv_scores):.6f}")

    return ensemble_oof, all_oof_predictions, ensemble_cv_score

# Run multi-seed ensemble CV
cv_seeds = [42, 123, 456, 789, 999]  # 5 seeds for robust ensemble
ensemble_oof, individual_oofs, ensemble_score = multi_seed_cv(
    X=X_train_selected,
    y=y_train_expanded,
    X_orig=X_orig_engineered,
    y_orig=y_orig_encoded,
    params=final_params,
    seeds=cv_seeds,
    n_splits=5,  # Reduced splits due to multi-seed approach
)


In [None]:
# Train Multi-Seed Ensemble Models
print("Training multi-seed ensemble models on complete dataset...")

final_models = []
seeds = [42, 123, 456, 789, 999]

# The training matrix is the same for every seed, so assemble it once instead of
# copying and concatenating the full dataset inside the loop.
X_train_final = X_train_selected
y_train_final = y_train_expanded

if X_orig_engineered is not None and y_orig_encoded is not None:
    # Add original dataset to training, restricted to the selected columns so
    # the concat cannot introduce extra (NaN-filled) features
    X_train_final = pd.concat(
        [X_train_final, X_orig_engineered[list(X_train_selected.columns)]],
        ignore_index=True
    )
    y_train_final = np.concatenate([y_train_final, y_orig_encoded])

# NOTE(review): these fits use the tuned n_estimators as-is with no eval_set, so
# no early stopping applies here, unlike in the cross-validation runs.
for seed in seeds:
    print(f"Training model with seed {seed}...")

    # Update params with current seed (final_params itself is left untouched)
    current_params = {**final_params, 'random_state': seed}

    # Train model
    model = xgb.XGBClassifier(**current_params)
    model.fit(X_train_final, y_train_final)
    final_models.append(model)

print(f"✓ Multi-seed ensemble training completed! ({len(final_models)} models)")

# Feature importance analysis (using first model)
feature_importance = final_models[0].feature_importances_
feature_names = X_train_selected.columns
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
    # Renumber after sorting so that index label i means "rank i + 1"
    .reset_index(drop=True)
)

print(f"\n📊 Top 20 Most Important Features:")
print(importance_df.head(20).to_string(index=False))

# Plot feature importance
# The plotted slice gets its own name: `top_features` already holds the list of
# selected feature names that later cells use to pick the test columns.
top20_importance = importance_df.head(20)
plt.figure(figsize=(12, 8))
plt.barh(range(len(top20_importance)), top20_importance['importance'])
plt.yticks(range(len(top20_importance)), top20_importance['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 20 Feature Importances - Ultra-Competitive XGBoost Model')
plt.gca().invert_yaxis()  # most important feature at the top
plt.tight_layout()
plt.show()

# Verify our key engineered features are important
key_features = ['const', 'env_max', 'temp_suitability', 'N_P_ratio', 'Total_NPK']
print(f"\n🔍 Importance of Key Engineered Features:")
for feature in key_features:
    matches = importance_df.index[importance_df['feature'] == feature]
    if len(matches):
        position = matches[0]
        importance = importance_df.loc[position, 'importance']
        rank = position + 1  # valid because the index was reset after sorting
        print(f"  {feature}: {importance:.4f} (rank #{rank})")
    else:
        print(f"  {feature}: Not found in features")

In [None]:
# Prepare test data and make predictions
print("Preparing test data...")

# Apply same feature engineering to test data
X_test = test_df.drop('id', axis=1, errors='ignore')
X_test_engineered = create_ultra_competitive_features(X_test)

# Handle categorical variables: reuse the *training* category dtypes so that a
# given string maps to the same category code in train and test. A separate
# astype('category') derives the categories from the test values alone. Values
# unseen in training become NaN (missing).
for col in categorical_cols:
    if col in X_test_engineered.columns and col in X_train_engineered.columns:
        X_test_engineered[col] = X_test_engineered[col].astype(X_train_engineered[col].dtype)

# Select same features as training, in the same order. The columns are read from
# the training matrix itself rather than from a separately kept name list.
X_test_selected = X_test_engineered[list(X_train_selected.columns)]

print(f"Test data shape after engineering and selection: {X_test_selected.shape}")

# Make ensemble predictions with calibration
print("Making ensemble predictions...")

# Collect one probability matrix per trained model
n_models = len(final_models)
all_test_predictions = []
for model_number, fitted_model in enumerate(final_models, start=1):
    print(f"Predicting with model {model_number}/{n_models}...")
    all_test_predictions.append(fitted_model.predict_proba(X_test_selected))

# Ensemble average: element-wise mean over the models
test_probabilities = np.mean(all_test_predictions, axis=0)

# Post-processing calibration for better MAP@3
from sklearn.calibration import CalibratedClassifierCV
print("Applying probability calibration...")

# Simple temperature scaling calibration
def calibrate_probabilities(probs, temperature=1.0):
    """Rescale each row of ``probs`` by a temperature and renormalise it.

    Every entry is offset by 1e-8, its logarithm is divided by ``temperature``
    and exponentiated again; each row is then divided by its sum so that it
    adds up to 1.
    """
    tempered = np.exp(np.log(probs + 1e-8) / temperature)
    row_totals = np.sum(tempered, axis=1, keepdims=True)
    return tempered / row_totals

# Find optimal temperature using ensemble OOF predictions
def find_optimal_temperature(oof_probs, y_true):
    """Pick the temperature whose rescaled probabilities score the best MAP@3.

    Starts from temperature 1.0 scored on the unmodified ``oof_probs``, then
    tries ``np.arange(0.5, 2.0, 0.1)`` and keeps a candidate only when its score
    is strictly higher than the best so far. Returns ``(best_temp, best_score)``.

    NOTE(review): the rescaling is applied element-wise with one temperature per
    call, so it should not reorder the classes within a row; MAP@3 depends only
    on that order, so large gains from this search are not expected.
    """
    best_temp, best_score = 1.0, map3_score_from_proba(y_true, oof_probs)

    for candidate_temp in np.arange(0.5, 2.0, 0.1):
        candidate_score = map3_score_from_proba(
            y_true, calibrate_probabilities(oof_probs, candidate_temp)
        )
        if candidate_score > best_score:
            best_temp, best_score = candidate_temp, candidate_score

    return best_temp, best_score

optimal_temp, calibrated_score = find_optimal_temperature(ensemble_oof, y_train_expanded)
print(f"Optimal temperature: {optimal_temp:.2f}")
print(f"Calibrated score: {calibrated_score:.6f} (vs original: {ensemble_score:.6f})")

# Apply calibration to test predictions
test_probabilities = calibrate_probabilities(test_probabilities, optimal_temp)

# Get top 3 predictions for each sample (for MAP@3), best class first
top3_predictions = np.argsort(test_probabilities, axis=1)[:, ::-1][:, :3]

# Convert back to original fertilizer names. All indices are decoded in a single
# inverse_transform call and reshaped back to (n_samples, 3); decoding one
# element per call means three encoder calls for every test row.
top3_fertilizer_names = (
    label_encoder.inverse_transform(top3_predictions.ravel())
    .reshape(top3_predictions.shape)
    .tolist()
)

print("✓ Predictions completed!")

# Create submission file
print("Creating submission file...")

# MAP@3 submissions carry all (up to three) predictions for an id in ONE
# 'Fertilizer Name' cell, separated by spaces and ordered best first, rather
# than in one column per rank.
submission = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': [' '.join(names) for names in top3_fertilizer_names],
})

# Guard against format drift: the layout must match the provided sample file
assert list(submission.columns) == list(sample_submission.columns), (
    f"submission columns {list(submission.columns)} != "
    f"sample columns {list(sample_submission.columns)}"
)
assert len(submission) == len(sample_submission), "submission row count differs from sample"

# Display first few predictions
print("\n📋 First 10 predictions:")
print(submission.head(10))

# Save submission to Kaggle working directory
submission_filename = '/kaggle/working/ultra_competitive_submission.csv'
submission.to_csv(submission_filename, index=False)
print(f"\n💾 Submission saved as: {submission_filename}")

# Also save to current directory for backup
submission.to_csv('ultra_competitive_submission.csv', index=False)
print("💾 Backup submission saved to current directory")

# Summary statistics
CHAMPION_SCORE = 0.383  # reference leaderboard score used in the comparison below

print(f"\n📈 Ultra-Competitive Model Performance Summary:")
print(f"  • Best Optuna MAP@3: {best_score:.6f}")
print(f"  • Multi-Seed Ensemble MAP@3: {ensemble_score:.6f}")
print(f"  • Calibrated MAP@3: {calibrated_score:.6f}")
# Count the features from the training matrix the models were actually fitted on
print(f"  • Features used: {X_train_selected.shape[1]} (selected from {X_train_engineered.shape[1]})")
print(f"  • Training samples: {len(y_train_expanded):,} (4x expanded)")
print(f"  • Original dataset: {'✓ Integrated' if X_orig_engineered is not None else '✗ Not found'}")
print(f"  • Ensemble models: {len(final_models)}")
print(f"  • Test predictions: {len(submission):,}")

print(f"\n🏆 All High-Impact Techniques Applied:")
print(f"  ✅ Categorical features (+0.006)")
print(f"  ✅ Constant feature (+0.005)")
print(f"  ✅ NPK ratios (hidden signal)")
print(f"  ✅ 4x data expansion")
print(f"  ✅ Feature selection (top 50)")
print(f"  ✅ Multi-seed ensemble (5 models)")
print(f"  ✅ Original dataset integration")
print(f"  ✅ Probability calibration")

print(f"\n🎯 Expected Leaderboard Performance:")
print(f"  • Champion score to beat: {CHAMPION_SCORE}")
print(f"  • Our calibrated score: {calibrated_score:.6f}")
print(f"  • Gap to champion: {CHAMPION_SCORE - calibrated_score:.6f}")
print(f"  • Expected LB performance: {calibrated_score:.3f} - {calibrated_score + 0.005:.3f}")

# Verdict tiers, checked from the highest threshold down
if calibrated_score >= CHAMPION_SCORE:
    print(f"  🏆 LIKELY TO BEAT CHAMPION SCORE!")
elif calibrated_score >= 0.380:
    print(f"  🥈 VERY COMPETITIVE - Close to champion!")
elif calibrated_score >= 0.375:
    print(f"  🥉 STRONG PERFORMANCE - Top tier submission!")
else:
    print(f"  📈 GOOD PERFORMANCE - Room for improvement")
