In [None]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from autogluon.tabular import TabularPredictor
import time
import gc

# Seed NumPy's legacy global RNG. This only affects code that draws from
# np.random.*; the StratifiedKFold splitters below pass their own random_state.
np.random.seed(42)

print("✅ Libraries imported - AutoGluon + Hill Climbing approach")
try:
    import autogluon
    print(f"AutoGluon version: {autogluon.__version__}")
except (ImportError, AttributeError):
    # Only the two expected failures are tolerated (package missing, or the
    # top-level package exposing no __version__); a bare except would also
    # swallow KeyboardInterrupt / SystemExit.
    print("AutoGluon version: Available")
print("🎯 Ready for ensemble modeling with automated optimization")


In [None]:
# MAP@3 evaluation functions
def apk(actual, predicted, k=3):
    """
    Average precision at ``k`` for a single query.

    Parameters
    ----------
    actual : list
        Relevant items for the query (order is irrelevant).
    predicted : sequence
        Ranked predictions, best first; only the first ``k`` are scored.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision-at-hit over the first ``k`` predictions divided by
        ``min(len(actual), k)``, or 0.0 when ``actual`` is empty. A prediction
        repeated at a later rank is only credited once.
    """
    ranked = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    hit_count = 0.0
    for position, candidate in enumerate(ranked):
        if candidate not in actual:
            continue
        # Skip duplicates of something already seen at a better rank.
        if candidate in ranked[:position]:
            continue
        hit_count += 1.0
        precision_sum += hit_count / (position + 1.0)

    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Mean average precision at ``k`` over paired queries.

    ``actual`` and ``predicted`` are iterated in lockstep with ``zip``, so the
    shorter of the two decides how many queries are scored. Each pair is scored
    with ``apk`` and the per-query scores are averaged with ``np.mean``.
    """
    per_query_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_query_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_query_scores)

def map3_score_from_proba(y_true, y_pred_proba):
    """
    MAP@3 for single-label targets given a class-probability matrix.

    Parameters
    ----------
    y_true : iterable
        One true class *column index* per row of ``y_pred_proba``.
    y_pred_proba : 2-D array-like
        Scores with one row per sample and one column per class.

    Returns
    -------
    float
        Mean over rows of ``apk([true_index], top-3 column indices, k=3)``.
    """
    # Sort each row ascending, flip to descending, keep the three best columns.
    ranked_desc = np.argsort(y_pred_proba, axis=1)[:, ::-1]
    top3_indices = ranked_desc[:, :3]

    return np.mean([
        apk([true_label], top3_indices[row], k=3)
        for row, true_label in enumerate(y_true)
    ])

# Progress marker: reached only after the metric helpers in this cell are defined.
print("✅ MAP@3 evaluation functions defined")


In [None]:
# Load competition data
train_df = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')

print(f"Training data: {train_df.shape}")
print(f"Test data: {test_df.shape}")

# Load the optional original dataset from the first candidate path that exists.
# Any failure here is non-fatal: original_df simply stays None.
original_df = None
original_paths = [
    '/kaggle/input/fertilizer-recommendation/Fertilizer_Prediction.csv',
    '/kaggle/input/original-fertilizer/Fertilizer_Prediction.csv',
    'datasets/Fertilizer_Prediction.csv'
]
try:
    for path in original_paths:
        if os.path.exists(path):
            original_df = pd.read_csv(path)
            print(f"✅ Original dataset loaded: {original_df.shape}")
            break
    else:
        # for/else: runs only when no candidate path existed.
        print("⚠ Original dataset not found - continuing without it")
except Exception as e:
    print(f"⚠ Could not load original dataset: {e}")

# Fix the column name typo independently in every frame. rename() ignores
# missing keys, so a frame without the typo is left unchanged, and the original
# dataset is corrected even if the competition files were already clean.
column_fixes = {'Temparature': 'Temperature'}
train_df = train_df.rename(columns=column_fixes)
test_df = test_df.rename(columns=column_fixes)
if original_df is not None:
    # Defensive: stray header whitespace would make a column look missing to the
    # feature engineering below, which selects columns by exact name.
    original_df = original_df.rename(columns=str.strip).rename(columns=column_fixes)

print("✅ Data loaded and preprocessed")


In [None]:
def create_structural_features(df):
    """
    Build the engineered feature frame used by the models in this notebook.

    Works on a copy of ``df``. Every feature group is created only when the
    columns it needs are present, so frames with a subset of columns are fine.

    Feature groups
    --------------
    1. ``<col>_cat`` (20-quantile codes) and ``<col>_high`` (above-median flag)
       for the six raw numeric columns; any *other* numeric input column gets
       10-quantile codes in ``<col>_cat``.
    2. ``const``: a column of ones.
    3. ``env_max``, ``temp_humidity_index``, ``climate_comfort``.
    4. NPK ratios, ``Total_NPK``, ``NPK_balance`` and per-fertilizer scores.
    5. ``temp_suitability``: 1 when Temperature lies in the crop's range.
    6. ``<x>_zone`` (5-quantile zone, categories 0..4) and ``<x>_stress`` flags.
    7. ``Crop_Soil_combo`` and ``crop_soil_strength``.

    Columns ending in ``_cat``/``_zone``/``_high``/``_stress``/``_suitability``
    and the string columns are returned with ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame (no target column).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.

    Notes
    -----
    NOTE(review): quantile edges and medians are computed from ``df`` itself,
    so calling this separately on train, test and the original dataset can
    assign different codes to the same raw value. Confirm whether shared bin
    edges are wanted.
    """
    df = df.copy()

    def _quantile_zone(values, n_zones=5):
        # Integer codes survive duplicate quantile edges; fixed labels would
        # raise when qcut drops an edge and fewer than n_zones bins remain.
        codes = pd.qcut(values, q=n_zones, labels=False, duplicates='drop')
        codes = codes.fillna(-1).astype(int).to_numpy()  # -1 marks missing
        return pd.Categorical.from_codes(codes, categories=list(range(n_zones)), ordered=True)

    # ===== TECHNIQUE 1: ALL FEATURES AS CATEGORICAL =====
    numerical_cols = ['Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Phosphorous', 'Potassium']
    for col in numerical_cols:
        if col in df.columns:
            # Quantile-based (not equal-width) binning
            df[f'{col}_cat'] = pd.qcut(df[col], q=20, labels=False, duplicates='drop')

            # Threshold pattern: above the column median
            df[f'{col}_high'] = (df[col] > df[col].median()).astype(int)

    # Bin every *other* numeric column too. The raw columns handled above and
    # their derived _cat/_high columns are skipped so the 20-bin codes are not
    # overwritten with 10-bin ones and binary flags are not re-binned.
    already_handled = set(numerical_cols) | {'const'}
    other_numerical = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col not in already_handled and not col.endswith(('_cat', '_high'))
    ]
    for col in other_numerical:
        try:
            df[f'{col}_cat'] = pd.qcut(df[col], q=10, labels=False, duplicates='drop')
        except (ValueError, TypeError):
            # Fall back to equal-width bins when quantile binning is impossible
            df[f'{col}_cat'] = pd.cut(df[col], bins=10, labels=False)

    # ===== TECHNIQUE 2: CONSTANT FEATURE =====
    df['const'] = 1

    # ===== TECHNIQUE 3: ENVIRONMENTAL AGGREGATES =====
    if all(col in df.columns for col in ['Temperature', 'Humidity', 'Moisture']):
        df['env_max'] = df[['Temperature', 'Humidity', 'Moisture']].max(axis=1)
        df['temp_humidity_index'] = df['Temperature'] * df['Humidity'] / 100
        df['climate_comfort'] = (df['Temperature'] + df['Humidity'] + df['Moisture']) / 3

    # ===== TECHNIQUE 4: NPK CHEMISTRY + BALANCE =====
    epsilon = 1e-8  # keeps the ratios finite when a denominator is zero
    npk_cols = ['Nitrogen', 'Phosphorous', 'Potassium']

    if all(col in df.columns for col in npk_cols):
        # Basic ratios
        df['N_P_ratio'] = df['Nitrogen'] / (df['Phosphorous'] + epsilon)
        df['N_K_ratio'] = df['Nitrogen'] / (df['Potassium'] + epsilon)
        df['P_K_ratio'] = df['Phosphorous'] / (df['Potassium'] + epsilon)
        df['Total_NPK'] = df['Nitrogen'] + df['Phosphorous'] + df['Potassium']

        # Row-wise spread of the three nutrients
        df['NPK_balance'] = df[npk_cols].std(axis=1)

        # Closeness of the (unclipped) ratios to hand-picked per-fertilizer targets
        df['NPK_17_17_17_score'] = 1 / (1 + np.abs(df['N_P_ratio'] - 1) + np.abs(df['N_K_ratio'] - 1))
        df['NPK_28_28_score'] = 1 / (1 + np.abs(df['N_P_ratio'] - 1))
        df['NPK_10_26_26_score'] = 1 / (1 + np.abs(df['N_P_ratio'] - 0.38) + np.abs(df['P_K_ratio'] - 1))
        df['NPK_20_20_score'] = 1 / (1 + np.abs(df['N_P_ratio'] - 1))
        df['NPK_14_35_14_score'] = 1 / (1 + np.abs(df['N_P_ratio'] - 0.4) + np.abs(df['N_K_ratio'] - 1))
        df['DAP_score'] = 1 / (1 + np.abs(df['N_P_ratio'] - 0.78))
        df['Urea_score'] = df['Nitrogen'] / (df['Total_NPK'] + epsilon)

        # Clip extreme ratios only after the scores above have used the raw values
        for col in ['N_P_ratio', 'N_K_ratio', 'P_K_ratio']:
            df[col] = np.clip(df[col], 0, 10)

    # ===== TECHNIQUE 5: TEMPERATURE SUITABILITY =====
    if 'Temperature' in df.columns and 'Crop Type' in df.columns:
        crop_temp_map = {
            'Sugarcane': (26, 35), 'Maize': (25, 32), 'Wheat': (20, 30),
            'Paddy': (25, 35), 'Cotton': (25, 35), 'Tobacco': (20, 30)
        }
        default_low, default_high = 20, 32  # range used for crops not in the map

        # Vectorised lookup of each row's range instead of a row-wise apply.
        # astype(object) so a categorical crop column maps like plain strings.
        crop = df['Crop Type'].astype(object)
        low = crop.map({name: rng[0] for name, rng in crop_temp_map.items()}).fillna(default_low).astype(float)
        high = crop.map({name: rng[1] for name, rng in crop_temp_map.items()}).fillna(default_high).astype(float)
        df['temp_suitability'] = df['Temperature'].between(low, high).astype(int)

    # ===== TECHNIQUE 6: ENVIRONMENTAL CATEGORIZATION =====
    if 'Temperature' in df.columns:
        df['temp_zone'] = _quantile_zone(df['Temperature'])
        df['temp_stress'] = ((df['Temperature'] < 20) | (df['Temperature'] > 35)).astype(int)

    if 'Humidity' in df.columns:
        df['humidity_zone'] = _quantile_zone(df['Humidity'])
        df['humidity_stress'] = ((df['Humidity'] < 40) | (df['Humidity'] > 80)).astype(int)

    if 'Moisture' in df.columns:
        df['moisture_zone'] = _quantile_zone(df['Moisture'])
        df['moisture_stress'] = ((df['Moisture'] < 30) | (df['Moisture'] > 70)).astype(int)

    # ===== TECHNIQUE 7: CROP-SOIL INTERACTION =====
    if 'Crop Type' in df.columns and 'Soil Type' in df.columns:
        df['Crop_Soil_combo'] = df['Crop Type'].astype(str) + '_' + df['Soil Type'].astype(str)

        # Hand-assigned compatibility weights; unlisted combinations get 0.5
        crop_soil_strength = {
            'Maize_Loamy': 1.0, 'Sugarcane_Black': 1.0, 'Cotton_Black': 1.0,
            'Paddy_Clayey': 1.0, 'Wheat_Loamy': 1.0, 'Tobacco_Red': 1.0,
            'Maize_Black': 0.8, 'Sugarcane_Red': 0.8, 'Cotton_Red': 0.8,
            'Paddy_Loamy': 0.8, 'Wheat_Black': 0.8, 'Tobacco_Loamy': 0.8
        }
        df['crop_soil_strength'] = df['Crop_Soil_combo'].map(crop_soil_strength).fillna(0.5)

    # Give every binned, flag and string column the category dtype so
    # downstream models treat them as categorical features.
    categorical_suffixes = ('_cat', '_zone', '_high', '_stress', '_suitability')
    for col in [c for c in df.columns if c.endswith(categorical_suffixes)]:
        df[col] = df[col].astype('category')

    for col in ['Crop Type', 'Soil Type', 'Crop_Soil_combo']:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df

# Progress marker: reached only after create_structural_features is defined.
print("✅ Structural feature engineering function defined")


In [None]:
def add_target_encoding_cv(X, y, feature_col, n_folds=5):
    """
    Add an out-of-fold mean-target encoding of ``feature_col``.

    For each stratified fold, the per-category mean of ``y`` is computed on the
    other folds and written to the held-out rows, so no row is encoded with its
    own target value. Categories unseen in the fitting folds receive the global
    mean of ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing ``feature_col``. Not modified; rows are paired
        with ``y`` by position, so any index is accepted.
    y : array-like
        Numeric target, one value per row of ``X``.
    feature_col : str
        Column to encode. Its dtype is left unchanged in the result.
    n_folds : int
        Number of StratifiedKFold splits (shuffled, random_state=42).

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with ``f"{feature_col}_target_encoded"`` appended.

    Notes
    -----
    NOTE(review): the caller in this notebook passes label-encoded class ids as
    ``y``, so the "mean target" depends on the arbitrary ordering of the class
    ids. Confirm that this is intended.
    """
    X = X.copy()
    encoded_col = f'{feature_col}_target_encoded'

    # Work on positional arrays so a non-default index on X or y cannot
    # misalign rows and targets.
    keys = X[feature_col]
    if keys.dtype.name == 'category':
        keys = keys.astype(str)  # map by value rather than by category code
    keys = keys.to_numpy()
    targets = np.asarray(y, dtype=float)
    global_mean = targets.mean()
    encoded = np.full(len(X), global_mean, dtype=float)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    for train_idx, val_idx in skf.split(X, y):
        # Per-category target mean from the fitting folds only
        fold_means = pd.Series(targets[train_idx]).groupby(keys[train_idx]).mean()

        # Held-out rows: unseen categories fall back to the global mean
        encoded[val_idx] = (
            pd.Series(keys[val_idx]).map(fold_means).fillna(global_mean).to_numpy()
        )

    X[encoded_col] = encoded
    return X

# Progress marker: reached only after add_target_encoding_cv is defined.
print("✅ Target encoding function defined")


In [None]:
# Apply structural feature engineering
print("Applying structural feature engineering...")

# Separate the target from the model inputs; the id column is not a feature.
y_train_raw = train_df['Fertilizer Name']
X_train_raw = train_df.drop(columns=['id', 'Fertilizer Name'])
X_test_raw = test_df.drop(columns=['id'])

# Engineer train first, then test (same order as the frames are listed).
X_train_engineered = create_structural_features(X_train_raw)
X_test_engineered = create_structural_features(X_test_raw)

print(f"Features: {X_train_raw.shape[1]} -> {X_train_engineered.shape[1]}")

# Integer-encode the target; `le` is kept so labels can be mapped back later.
le = LabelEncoder().fit(y_train_raw)
y_train_encoded = le.transform(y_train_raw)

print(f"Target classes: {len(le.classes_)}")
print(f"Classes: {le.classes_}")

# The original dataset is optional; both names stay None when it is unusable.
X_orig_engineered, y_orig_encoded = None, None

if original_df is not None:
    print("\nProcessing original dataset...")

    orig_target = original_df['Fertilizer Name']
    orig_features = original_df.drop(columns=['Fertilizer Name'])

    # Keep only rows whose fertilizer also occurs in the competition target,
    # otherwise le.transform below would fail on unseen labels.
    known_fertilizers = set(le.classes_)
    mask = orig_target.isin(known_fertilizers)

    if not mask.any():
        print("No overlapping fertilizer types found in original dataset")
    else:
        orig_features_filtered = orig_features.loc[mask]
        orig_target_filtered = orig_target.loc[mask]

        X_orig_engineered = create_structural_features(orig_features_filtered)
        y_orig_encoded = le.transform(orig_target_filtered)

        print(f"Original dataset processed: {len(X_orig_engineered)} valid samples")

print("\n✅ Feature engineering completed")


In [None]:
# Target encoding for Crop_Soil_combo:
#   - training rows get out-of-fold values from add_target_encoding_cv
#   - test / original rows get the per-category mean over ALL training rows
if 'Crop_Soil_combo' in X_train_engineered.columns:
    print("Adding CV-based target encoding for Crop_Soil_combo...")

    # Full-train encoding map. Built before the CV step and keyed by the string
    # value of the combo so categorical dtypes do not matter. The target Series
    # shares the index of the key Series, so groupby pairs rows by label rather
    # than relying on index labels happening to equal row positions.
    crop_soil_str = X_train_engineered['Crop_Soil_combo'].astype(str)
    encoding_map = (
        pd.Series(y_train_encoded, index=crop_soil_str.index)
        .groupby(crop_soil_str)
        .mean()
    )
    global_target_mean = float(np.mean(y_train_encoded))  # fallback for unseen combos

    # Apply CV-based target encoding to training data
    X_train_engineered = add_target_encoding_cv(
        X_train_engineered,
        pd.Series(y_train_encoded),
        'Crop_Soil_combo'
    )

    # Test rows: full-train map, unseen combos fall back to the global mean
    test_crop_soil_str = X_test_engineered['Crop_Soil_combo'].astype(str)
    X_test_engineered['Crop_Soil_combo_target_encoded'] = (
        test_crop_soil_str.map(encoding_map).fillna(global_target_mean)
    )

    # Original dataset rows are encoded the same way as test rows
    if X_orig_engineered is not None:
        orig_crop_soil_str = X_orig_engineered['Crop_Soil_combo'].astype(str)
        X_orig_engineered['Crop_Soil_combo_target_encoded'] = (
            orig_crop_soil_str.map(encoding_map).fillna(global_target_mean)
        )

    # Make sure Crop_Soil_combo is categorical in every frame after the
    # encoding step, whatever dtype the encoder handed back.
    X_train_engineered['Crop_Soil_combo'] = X_train_engineered['Crop_Soil_combo'].astype('category')
    X_test_engineered['Crop_Soil_combo'] = X_test_engineered['Crop_Soil_combo'].astype('category')
    if X_orig_engineered is not None:
        X_orig_engineered['Crop_Soil_combo'] = X_orig_engineered['Crop_Soil_combo'].astype('category')

    print(f"✅ Target encoding added (NO LEAKAGE): {X_train_engineered.shape[1]} features")

print(f"\nFinal feature count: {X_train_engineered.shape[1]}")
print(f"Sample feature names: {list(X_train_engineered.columns[:10])}")


In [None]:
# No feature-selection step is run in this notebook: every engineered column
# is kept, which also avoids selecting features with knowledge of the target.

print(f"✅ Keeping all {len(X_train_engineered.columns)} engineered features (no selection to avoid leakage)")
print(f"Features include: {X_train_engineered.columns[:10].tolist()}...")


In [None]:
def hill_climbing_optimization(X_train, y_train_raw, X_orig=None, y_orig_raw=None, max_iterations=3):
    """
    Greedy ("hill climbing") feature search scored with AutoGluon cross-validation.

    Each iteration scores one candidate feature set with 3-fold stratified CV
    (MAP@3 via ``map3_score_from_proba``). A candidate that beats the best score
    so far becomes the new best. The next candidate is always built from the
    current best set with ``generate_hill_climbing_features``, so a rejected
    feature family is not carried into later candidates.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Engineered training features.
    y_train_raw : pandas.Series
        Class labels for ``X_train`` (indexed positionally with ``.iloc``).
    X_orig : pandas.DataFrame, optional
        Extra training rows appended (twice) to every training fold. They are
        never used for validation.
    y_orig_raw : array-like, optional
        Labels for ``X_orig``, one per row and in the same order.
    max_iterations : int
        Number of candidate feature sets to evaluate.

    Returns
    -------
    (pandas.DataFrame, float)
        The best-scoring version of ``X_train`` and its mean CV MAP@3.

    Raises
    ------
    ValueError
        If ``X_orig`` and ``y_orig_raw`` are both given but differ in length;
        appending them would silently pair rows with the wrong labels.
    """
    import shutil  # only needed here, to delete per-fold predictor directories

    has_original = X_orig is not None and y_orig_raw is not None
    if has_original:
        if len(X_orig) != len(y_orig_raw):
            raise ValueError(
                f"X_orig has {len(X_orig)} rows but y_orig_raw has {len(y_orig_raw)} labels; "
                "they must be aligned row-for-row"
            )
        # Positional copy of the labels so concat below ignores any index.
        y_orig_labels = pd.Series(np.asarray(y_orig_raw))
    else:
        y_orig_labels = None

    def evaluate_autogluon(X_train_iter, y_train_iter, iteration=0, X_orig_iter=None):
        """Return the mean 3-fold MAP@3 of an AutoGluon ensemble on this feature set."""

        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # 3-fold for speed
        cv_scores = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_iter, y_train_iter)):
            X_tr, X_val = X_train_iter.iloc[train_idx], X_train_iter.iloc[val_idx]
            y_tr, y_val = y_train_iter.iloc[train_idx], y_train_iter.iloc[val_idx]

            # Data expansion: the training fold is repeated 3x. It happens
            # inside the fold so validation rows are never duplicated into
            # the training data.
            # NOTE(review): duplicated rows may land on both sides of
            # AutoGluon's internal bagging splits - confirm this is acceptable.
            X_tr_expanded = pd.concat([X_tr, X_tr, X_tr], ignore_index=True)
            y_tr_expanded = pd.concat([y_tr, y_tr, y_tr], ignore_index=True)

            # Append the original data (2x) when available
            if X_orig_iter is not None and y_orig_labels is not None:
                shared_cols = [col for col in X_tr_expanded.columns if col in X_orig_iter.columns]
                X_orig_subset = X_orig_iter[shared_cols]

                X_tr_expanded = pd.concat(
                    [X_tr_expanded, X_orig_subset, X_orig_subset], ignore_index=True
                )
                y_tr_expanded = pd.concat(
                    [y_tr_expanded, y_orig_labels, y_orig_labels], ignore_index=True
                )

            predictor_path = f'./autogluon_models_iter{iteration}_fold{fold}'

            try:
                predictor = TabularPredictor(
                    label='target',
                    path=predictor_path,
                    eval_metric='accuracy',  # AutoGluon doesn't have MAP@3, use accuracy
                    verbosity=0
                )

                # Prepare training data
                train_data = X_tr_expanded.copy()
                train_data['target'] = y_tr_expanded.reset_index(drop=True)

                # AutoGluon fit with ensemble focus
                predictor.fit(
                    train_data,
                    time_limit=180,  # 3 minutes per fold for speed
                    presets='best_quality',  # Enable advanced ensembling
                    auto_stack=True,  # Enable stacking
                    num_bag_folds=3,  # Bagging for robustness
                    num_stack_levels=1,  # One level of stacking
                    hyperparameters={
                        'GBM': [  # 'GBM' is AutoGluon's LightGBM model key
                            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
                            {},  # Default LightGBM
                            {'boosting': 'dart', 'ag_args': {'name_suffix': 'DART'}},
                        ],
                        'CAT': {},  # CatBoost
                        'NN_TORCH': [{'num_epochs': 50}],  # Neural network
                        'FASTAI': {},  # FastAI neural networks
                    },
                    excluded_model_types=['KNN']  # Exclude slow models
                )

                # Predict on validation fold
                predictions = predictor.predict_proba(X_val.copy())

                # Map each true label to its column in the probability matrix.
                # Re-encoding y_val with a fresh LabelEncoder would only match
                # if both orderings happened to agree and every class occurred
                # in the fold.
                if hasattr(predictions, 'columns'):
                    class_order = list(predictions.columns)
                    pred_proba = predictions.to_numpy()
                else:
                    class_order = list(predictor.class_labels)
                    pred_proba = np.asarray(predictions)
                class_to_col = {label: pos for pos, label in enumerate(class_order)}
                # -1 never equals a column index, so unknown labels score 0.
                y_val_idx = np.array([class_to_col.get(label, -1) for label in y_val])

                score = map3_score_from_proba(y_val_idx, pred_proba)
                cv_scores.append(score)

                print(f"    Fold {fold}: MAP@3 = {score:.6f}")

                # Cleanup to save memory
                predictor.delete_models(models_to_keep=[], dry_run=False)

            except Exception as e:
                # Best effort: a failed fold is reported and scored 0.0 so the
                # search continues (this lowers the iteration's mean score).
                print(f"    Error in fold {fold}: {e}")
                cv_scores.append(0.0)

            finally:
                # ignore_errors=True already makes rmtree non-raising
                shutil.rmtree(predictor_path, ignore_errors=True)
                gc.collect()

        mean_score = np.mean(cv_scores)
        print(f"  Iteration {iteration}: Mean CV MAP@3 = {mean_score:.6f}")
        return mean_score

    # Hill climbing iterations
    print("🔥 Starting Hill Climbing Optimization with AutoGluon")

    best_score = 0.0
    best_features = X_train.copy()
    best_orig = X_orig if has_original else None
    current_features = X_train.copy()
    current_orig = best_orig

    for iteration in range(max_iterations):
        print(f"\n=== HILL CLIMBING ITERATION {iteration + 1}/{max_iterations} ===")

        # Evaluate current feature set
        current_score = evaluate_autogluon(current_features, y_train_raw, iteration, current_orig)

        if current_score > best_score:
            best_score = current_score
            # Safe without a copy: candidates are never mutated in place.
            best_features = current_features
            best_orig = current_orig
            print(f"✅ New best score: {best_score:.6f}")
        else:
            print(f"⚪ No improvement: {current_score:.6f} <= {best_score:.6f}")

        # Next candidate = best set so far + this iteration's feature family.
        # The same features are added to the original rows so they are not
        # NaN-filled when appended to the training folds.
        if iteration < max_iterations - 1:
            print(f"🔧 Generating new features for iteration {iteration + 2}...")
            current_features = generate_hill_climbing_features(best_features, iteration)
            if best_orig is not None:
                current_orig = generate_hill_climbing_features(best_orig, iteration)

    print(f"\n🏆 Hill Climbing completed! Best MAP@3: {best_score:.6f}")
    return best_features, best_score

def generate_hill_climbing_features(df, iteration):
    """
    Return a copy of ``df`` with the extra features for one hill-climbing step.

    Parameters
    ----------
    df : pandas.DataFrame
        Current feature frame; not modified.
    iteration : int
        0 adds NPK and environmental polynomial/interaction features,
        1 adds crop temperature suitability and an NPK-spread score.
        Any other value returns an unchanged copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns appended. Every feature is computed
        row by row from that row's own values (no frame-level statistics), so
        the same call can be applied to train, test or any other frame.
    """
    df = df.copy()

    if iteration == 0:
        # Iteration 1: Add polynomial interactions
        print("  Adding polynomial NPK interactions...")
        if all(col in df.columns for col in ['Nitrogen', 'Phosphorous', 'Potassium']):
            nitrogen, phosphorous, potassium = df['Nitrogen'], df['Phosphorous'], df['Potassium']
            df['NPK_polynomial'] = (nitrogen ** 2) + (phosphorous ** 2) + (potassium ** 2)
            df['NPK_log_sum'] = np.log1p(nitrogen + phosphorous + potassium)
            # Values below 1 are floored at 1 so the reciprocals stay finite
            df['NPK_harmonic_mean'] = 3 / (
                1 / np.maximum(nitrogen, 1)
                + 1 / np.maximum(phosphorous, 1)
                + 1 / np.maximum(potassium, 1)
            )

        # Environmental interactions
        if all(col in df.columns for col in ['Temperature', 'Humidity', 'Moisture']):
            df['env_polynomial'] = (df['Temperature'] ** 2) + (df['Humidity'] ** 2) + (df['Moisture'] ** 2)
            df['temp_humidity_interaction'] = df['Temperature'] * df['Humidity'] / 1000

    elif iteration == 1:
        # Iteration 2: Add advanced domain features
        print("  Adding advanced agricultural features...")

        # Closeness of the temperature to a per-crop optimum (27 for other crops)
        if 'Crop Type' in df.columns and 'Temperature' in df.columns:
            crop_optimal_temps = {
                'Sugarcane': 30, 'Maize': 28, 'Wheat': 25,
                'Paddy': 30, 'Cotton': 30, 'Tobacco': 25
            }
            # Vectorised lookup; astype(object) lets a categorical crop column
            # be mapped and default-filled like plain strings.
            optimal = df['Crop Type'].astype(object).map(crop_optimal_temps).fillna(27).astype(float)
            df['advanced_crop_suitability'] = 1 / (1 + (df['Temperature'] - optimal).abs())

        # Score that decays with the row-wise spread of N, P and K
        if all(col in df.columns for col in ['Nitrogen', 'Phosphorous', 'Potassium']):
            df['fertilizer_effectiveness'] = np.exp(-0.1 * df[['Nitrogen', 'Phosphorous', 'Potassium']].std(axis=1))

    return df

# Progress marker: reached only after both hill-climbing functions are defined.
print("✅ Hill climbing optimization functions defined")


In [None]:
# Run Hill Climbing Optimization with AutoGluon
print("🚀 Starting Hill Climbing Optimization with AutoGluon Ensemble...")
print("Multiple algorithms + iterative feature engineering for maximum performance")

# Labels for the original rows must match X_orig_engineered row-for-row.
# X_orig_engineered only holds rows with a known fertilizer, so the labels are
# rebuilt from y_orig_encoded (filtered the same way) instead of taking the
# unfiltered original_df column.
if X_orig_engineered is not None and y_orig_encoded is not None:
    y_orig_labels = pd.Series(le.inverse_transform(y_orig_encoded), name='Fertilizer Name')
else:
    y_orig_labels = None

best_features, best_score = hill_climbing_optimization(
    X_train_engineered,
    y_train_raw,  # Use raw string labels for AutoGluon
    X_orig_engineered,
    y_orig_labels,
    max_iterations=3
)

print(f"\n🏆 Hill Climbing Optimization completed!")
print(f"Best CV MAP@3: {best_score:.6f}")
print(f"Final feature count: {best_features.shape[1]}")

# Report against the two reference scores used throughout this notebook
if best_score > 0.38:
    print(f"🎯 EXCELLENT: Achieved target 0.38+ with {best_score:.6f}!")
elif best_score > 0.33:
    print(f"✅ SUCCESS: Beat 0.33 baseline with {best_score:.6f}!")
else:
    print(f"⚠ Score {best_score:.6f} - AutoGluon ensemble should improve performance")


In [None]:
# Train final AutoGluon ensemble with best features
print("🏗️ Training final AutoGluon ensemble with optimized features...")

# Prepare final training data with the best feature set
X_train_final = best_features.copy()
y_train_final = y_train_raw.copy()

# Data expansion: competition rows are repeated 3x
X_train_expanded = pd.concat([X_train_final, X_train_final, X_train_final], ignore_index=True)
y_train_expanded = pd.concat([y_train_final, y_train_final, y_train_final], ignore_index=True)

# Add original dataset with 2x multiplication
if X_orig_engineered is not None and y_orig_encoded is not None:
    # best_features may contain columns added during hill climbing. Recreate
    # them on the original rows so those rows are not NaN-filled by concat.
    # generate_hill_climbing_features defines features for steps 0 and 1 only.
    X_orig_final = X_orig_engineered
    if any(col not in X_orig_final.columns for col in X_train_final.columns):
        for step in range(2):
            X_orig_final = generate_hill_climbing_features(X_orig_final, step)

    orig_features = [col for col in X_train_final.columns if col in X_orig_final.columns]
    X_orig_subset = X_orig_final[orig_features]
    # Labels rebuilt from y_orig_encoded so they line up with the filtered rows
    # of X_orig_engineered (original_df may contain rows that were dropped).
    y_orig_subset = pd.Series(le.inverse_transform(y_orig_encoded), name=y_train_final.name)

    # 2x multiplication
    X_orig_2x = pd.concat([X_orig_subset, X_orig_subset], ignore_index=True)
    y_orig_2x = pd.concat([y_orig_subset, y_orig_subset], ignore_index=True)

    # Combine
    X_train_expanded = pd.concat([X_train_expanded, X_orig_2x], ignore_index=True)
    y_train_expanded = pd.concat([y_train_expanded, y_orig_2x], ignore_index=True)

print(f"Final training data: {X_train_expanded.shape}")

# Prepare AutoGluon training data (label column is named 'target')
final_train_data = X_train_expanded.copy()
final_train_data['target'] = y_train_expanded.reset_index(drop=True)

# Train final AutoGluon ensemble with maximum quality
final_predictor = TabularPredictor(
    label='target',
    path='./autogluon_final_ensemble',
    eval_metric='accuracy',
    verbosity=1
)

print("🎯 Training final ensemble with maximum quality settings...")
final_predictor.fit(
    final_train_data,
    time_limit=600,  # 10 minutes for final model
    presets='best_quality',
    auto_stack=True,
    num_bag_folds=5,  # More bagging for final model
    num_stack_levels=2,  # Deeper stacking
    hyperparameters={
        'GBM': [  # 'GBM' is AutoGluon's LightGBM model key
            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
            {},  # Default LightGBM
            {'boosting': 'dart', 'ag_args': {'name_suffix': 'DART'}},
            {'boosting': 'goss', 'ag_args': {'name_suffix': 'GOSS'}},
        ],
        'CAT': {},  # CatBoost
        'NN_TORCH': [{'num_epochs': 100, 'learning_rate': 0.01}],
        'FASTAI': {},
        'RF': [{'n_estimators': 300}],  # Random Forest
    },
    excluded_model_types=['KNN', 'LR']  # Exclude simple models
)

print("✅ Final AutoGluon ensemble trained")

# Show model leaderboard
leaderboard = final_predictor.leaderboard(silent=True)
print(f"\n📊 AutoGluon Model Leaderboard (Top 5):")
print(leaderboard.head())


In [None]:
# Generate test predictions with AutoGluon ensemble
print("🔮 Generating test predictions with AutoGluon ensemble...")

# The model was trained on best_features' columns. Some of them may have been
# added during hill climbing and are then missing from X_test_engineered, so
# recreate them here. generate_hill_climbing_features defines features for
# steps 0 and 1 only.
X_test_final = best_features.columns  # feature names the final model expects
X_test_augmented = X_test_engineered
if any(col not in X_test_augmented.columns for col in X_test_final):
    for step in range(2):
        X_test_augmented = generate_hill_climbing_features(X_test_augmented, step)
missing_features = [col for col in X_test_final if col not in X_test_augmented.columns]
if missing_features:
    raise KeyError(f"Test data is missing model features: {missing_features}")
X_test_processed = X_test_augmented[list(X_test_final)].copy()

# Generate ensemble predictions
test_predictions = final_predictor.predict_proba(X_test_processed)

# Take class names from the probability columns so names and columns always
# agree; fall back to the predictor's class list for plain arrays.
if hasattr(test_predictions, 'columns'):
    class_names = list(test_predictions.columns)
    test_proba = test_predictions.to_numpy()
else:
    class_names = list(final_predictor.class_labels)
    test_proba = np.asarray(test_predictions)

# Get top 3 predictions for each sample (highest probability first)
top3_indices = np.argsort(test_proba, axis=1)[:, ::-1][:, :3]

# Convert indices to space-separated fertilizer names
predictions = [
    ' '.join(str(class_names[idx]) for idx in row)
    for row in top3_indices
]

submission = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Sample predictions:")
print(submission.head())

# Save submission
submission.to_csv('submission_autogluon.csv', index=False)
print("\n✅ Submission saved as 'submission_autogluon.csv'")

# Show prediction distribution (how often each class appears in any top-3 slot)
all_predictions = []
for pred in predictions:
    all_predictions.extend(pred.split())

pred_counts = pd.Series(all_predictions).value_counts()
print(f"\n📊 Prediction distribution:")
print(pred_counts)

# Feature importance from best model. This needs labeled rows, so a sample of
# the labeled training frame is used; the unlabeled test frame cannot be scored.
try:
    importance_sample = final_train_data.sample(
        n=min(5000, len(final_train_data)), random_state=42
    )
    importance = final_predictor.feature_importance(importance_sample)
    print(f"\n🎯 Top 10 Most Important Features:")
    print(importance.head(10))
except Exception as e:
    # Importance is a diagnostic only; report why it failed and continue.
    print(f"\n⚠ Feature importance not available: {e}")

print(f"\n🏆 Expected performance: {best_score:.6f} MAP@3")
print(f"🔥 AutoGluon Ensemble Advantages:")
print(f"   ✅ Multiple algorithms: XGBoost, LightGBM, CatBoost, Neural Networks")
print(f"   ✅ Automated stacking and blending")
print(f"   ✅ Hill climbing feature optimization")
print(f"   ✅ Advanced categorical handling")

if best_score > 0.38:
    print("🎯 EXCELLENT: Target 0.38+ achieved with AutoGluon ensemble!")
elif best_score > 0.33:
    print("✅ SUCCESS: Beat 0.33 baseline with ensemble power!")
else:
    print("⚠ Score below target - but ensemble should provide significant boost")
