In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for filename in full_paths:
        print(filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/optimal-fertilizers-features/training_data.csv
/kaggle/input/optimal-fertilizers-features/test_data.csv
/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv


In [2]:
# !pip install autofeat

In [3]:
# Files from the playground-series-s5e6 input directory.
train, test, sample_sub = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e6/train.csv',
        '/kaggle/input/playground-series-s5e6/test.csv',
        '/kaggle/input/playground-series-s5e6/sample_submission.csv',
    )
]

# Feature tables from the optimal-fertilizers-features input directory.
X_new, test_new = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/optimal-fertilizers-features/training_data.csv',
        '/kaggle/input/optimal-fertilizers-features/test_data.csv',
    )
]

# Extra labelled rows from the fertilizer-prediction input directory.
train_org = pd.read_csv('/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv')

In [4]:
# Stack the competition rows (without `id`) on top of the rows from
# 'Fertilizer Prediction.csv'. Both frames come straight from read_csv and
# therefore each carry their own 0-based index; ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of duplicated labels.
train_new = pd.concat([train.drop(columns='id'), train_org], ignore_index=True)

In [5]:
# Preview the first three rows of the combined training frame.
train_new.head(3)

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,27,69,65,Sandy,Millets,30,6,18,28-28
2,29,63,32,Sandy,Millets,24,12,16,17-17-17


In [6]:
# Global experiment settings: RNG seed and number of CV folds.
seed = 1
folds = 10

# Seed Python's and NumPy's global random number generators.
random.seed(seed)
np.random.seed(seed)

# Environment switches. PYTHONHASHSEED is read by the interpreter at start-up,
# so setting it here only influences processes launched afterwards.
os.environ.update({
    'PYTHONHASHSEED': str(seed),
    'OMP_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
})

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import make_scorer

def mapk_score(y_true, y_score, k=3):
    """Mean average precision at ``k`` for single-label classification.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class of every sample, given as the column index of that class
        in ``y_score``.
    y_score : array-like of shape (n_samples, n_classes)
        Per-class scores (e.g. predicted probabilities); larger means more
        likely.
    k : int, default=3
        How many of the highest-scoring classes count as predictions for a
        sample. Values larger than ``n_classes`` are capped.

    Returns
    -------
    float
        Mean over samples of ``1 / rank`` when the true class is ranked
        within the top ``k`` (rank 1 = highest score), and 0 otherwise.
    """
    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_score)
    k = min(k, y_score.shape[1])

    # Column indices ordered by descending score, truncated to the best k.
    top_k = np.argsort(y_score, axis=1)[:, ::-1][:, :k]

    # A hit at 0-based position r is worth 1 / (r + 1).
    hits = top_k == y_true[:, None]
    rank_weights = 1.0 / np.arange(1, k + 1)
    return float((hits * rank_weights).sum() / len(y_score))

def cv_score(X, y, model_dict, folds=folds, seed=seed, k=3):
    """Cross-validate several classifiers with the MAP@k metric.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned with ``X``. The values are compared against the column
        positions of ``predict_proba`` by ``mapk_score``, so they must be
        integer class codes that match those positions.
    model_dict : dict
        Maps a display name to an estimator exposing ``fit`` and
        ``predict_proba``. Each estimator is fitted in place on every fold.
    folds : int
        Number of stratified folds.
    seed : int
        Random state used to shuffle the folds.
    k : int, default=3
        Cut-off forwarded to ``mapk_score``.

    Returns
    -------
    dict
        Maps each model name to its mean MAP@k over the folds.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    results = {}

    for name, model in model_dict.items():
        print(f'current model being processed :{name}')
        scores = []
        for i, (train_index, valid_index) in enumerate(skf.split(X, y), 1):
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            model.fit(X_train, y_train)

            # Score the raw probabilities directly. This avoids
            # make_scorer(needs_proba=True), which newer scikit-learn releases
            # no longer accept and which reduces binary problems to a single
            # column that mapk_score cannot rank.
            probas = model.predict_proba(X_valid)
            score = mapk_score(y_valid, probas, k=k)

            print(f'Score for {name} on fold {i} is {score}')
            scores.append(score)

        results[name] = np.mean(scores)
        print(f'Mean score for {name} is {results[name]}')

    return results

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

class IntenseTargetEncoder(BaseEstimator, TransformerMixin):
    """
    Target encoder for multiclass classification with multiple encoding strategies.

    The target is label-encoded to integer class codes and, for every
    categorical column (plus a limited number of pairwise interactions),
    per-category statistics of those codes are attached as new numeric
    columns named ``"<column>_target_<method>"``.

    Category values are always compared as strings, both when the statistics
    are learned and when they are looked up, so numeric or categorical-dtype
    columns are matched the same way as plain string columns.

    ``fit_transform`` returns out-of-fold encodings for the training data and
    produces the same set of encoded columns as ``transform``.

    NOTE(review): noise (``noise_level``) is added by ``transform`` only, not
    to the out-of-fold output of ``fit_transform``, and it is drawn from
    NumPy's global RNG, so it is reproducible only when the caller seeds
    ``np.random``.
    """

    # Aggregations selectable through ``encoding_methods``; each receives the
    # grouped target (a pandas SeriesGroupBy) and returns one value per category.
    _AGGREGATIONS = {
        'mean': lambda grouped: grouped.mean(),
        'count': lambda grouped: grouped.count(),
        'std': lambda grouped: grouped.std().fillna(0),
        'median': lambda grouped: grouped.median(),
        'min': lambda grouped: grouped.min(),
        'max': lambda grouped: grouped.max(),
        'nunique': lambda grouped: grouped.nunique(),
    }

    def __init__(self, categorical_cols=None, smoothing=1.0, min_samples_leaf=1, 
                 noise_level=0.01, cv_folds=5, random_state=42, 
                 encoding_methods=('mean', 'count', 'std', 'median')):
        """
        Parameters:
        -----------
        categorical_cols : list, columns to encode (None = infer object/category
            columns at fit time)
        smoothing : float, smoothing parameter for regularization of the mean
        min_samples_leaf : int, minimum samples to consider for encoding
        noise_level : float, standard deviation of the Gaussian noise added in
            transform (0 disables it)
        cv_folds : int, cross-validation folds for out-of-fold encoding
        random_state : int, seed for the fold shuffling in fit_transform
        encoding_methods : sequence of str, types of encodings to apply; any of
            'mean', 'count', 'std', 'median', 'min', 'max', 'nunique'
        """
        self.categorical_cols = categorical_cols
        self.smoothing = smoothing
        self.min_samples_leaf = min_samples_leaf
        self.noise_level = noise_level
        self.cv_folds = cv_folds
        self.random_state = random_state
        # The default is an immutable tuple so instances never share (and
        # cannot accidentally mutate) a common default list.
        self.encoding_methods = encoding_methods
        self.label_encoder = LabelEncoder()
        self.target_encodings = {}
        self.global_stats = {}

    @staticmethod
    def _stringify_categoricals(df):
        """Convert categorical-dtype columns of ``df`` to strings, in place."""
        for col in df.columns:
            if hasattr(df[col], 'cat'):  # only categorical Series expose .cat
                df[col] = df[col].astype(str)
        return df

    def _stats_from_keys(self, keys, target_values):
        """Aggregate ``target_values`` per string-normalised category key."""
        temp_df = pd.DataFrame({
            # Plain arrays: rows are paired by position, never by index label.
            'category': np.asarray(keys.astype(str), dtype=object),
            'target': np.asarray(target_values),
        })
        grouped = temp_df.groupby('category')['target']
        return {
            method: aggregate(grouped)
            for method, aggregate in self._AGGREGATIONS.items()
            if method in self.encoding_methods
        }

    def _get_target_stats(self, df, col, target_values):
        """Calculate the requested statistics of the target per category of ``df[col]``.

        Returns a dict mapping method name to a Series indexed by the
        category value rendered as a string.
        """
        return self._stats_from_keys(df[col], target_values)

    def _smooth_encoding(self, category_stats, global_mean, count):
        """Shrink a category mean towards the global mean, weighted by its count."""
        return (category_stats * count + global_mean * self.smoothing) / (count + self.smoothing)

    def _add_noise(self, encoded_values):
        """Add zero-mean Gaussian noise with std ``noise_level`` (global NumPy RNG)."""
        if self.noise_level > 0:
            noise = np.random.normal(0, self.noise_level, len(encoded_values))
            return encoded_values + noise
        return encoded_values

    def _create_interaction_features(self, df, col, target_values):
        """Target statistics for ``col`` combined with up to two other categoricals.

        Returns a dict mapping ``"<col>_<other>"`` to the statistics of the
        concatenated key ``"<col value>_<other value>"``.
        """
        interactions = {}
        other_cats = [c for c in self.categorical_cols if c != col and c in df.columns]

        for other_col in other_cats[:2]:  # Limit to 2 interactions to prevent explosion
            interaction_col = f"{col}_{other_col}"
            combined_keys = df[col].astype(str) + "_" + df[other_col].astype(str)
            interactions[interaction_col] = self._stats_from_keys(combined_keys, target_values)

        return interactions

    def _interaction_keys(self, df):
        """List ``(name, key Series)`` for every fitted interaction usable on ``df``.

        Pairs are formed between each categorical column and the next two in
        ``categorical_cols`` order; only pairs learned during fit are kept.
        """
        present = [c for c in self.categorical_cols if c in df.columns]
        pairs = []
        for i, col1 in enumerate(present):
            for col2 in present[i+1:i+3]:  # Limit interactions
                interaction_name = f"{col1}_{col2}"
                if interaction_name in self.target_encodings:
                    keys = df[col1].astype(str) + "_" + df[col2].astype(str)
                    pairs.append((interaction_name, keys))
        return pairs

    def fit(self, X, y):
        """Learn full-data encodings used by ``transform``.

        Fits the label encoder on ``y``, infers ``categorical_cols`` when it is
        None, and stores per-category statistics in ``target_encodings`` and
        global fallbacks in ``global_stats``. Returns ``self``.
        """
        df = X.copy()

        # Encode target variable
        y_encoded = self.label_encoder.fit_transform(y)

        # Determine categorical columns if not specified
        if self.categorical_cols is None:
            self.categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

        # Start clean so a refit never keeps encodings from an earlier fit.
        self.target_encodings = {}
        self.global_stats = {}

        # Global statistics, used as fallbacks for rare/unseen categories.
        global_functions = {
            'mean': lambda values: values.mean(),
            'std': lambda values: values.std(),
            'median': np.median,
            'count': len,
        }
        for method in self.encoding_methods:
            if method in global_functions:
                self.global_stats[method] = global_functions[method](y_encoded)

        for col in self.categorical_cols:
            print(f"Encoding column: {col}")
            self.target_encodings[col] = {}

            # Full dataset encoding for transform
            col_stats = self._get_target_stats(df, col, y_encoded)
            if 'count' in col_stats:
                counts = col_stats['count']
            else:
                counts = df[col].astype(str).value_counts()

            for method in self.encoding_methods:
                if method not in col_stats:
                    continue

                encoded = {}
                for category, category_stat in col_stats[method].items():
                    category_count = counts.get(category, 1)

                    if category_count >= self.min_samples_leaf:
                        if method == 'mean':
                            # Only the mean is smoothed towards the global mean.
                            encoded[category] = self._smooth_encoding(
                                category_stat, self.global_stats['mean'], category_count
                            )
                        else:
                            encoded[category] = category_stat
                    else:
                        # Use global statistic for rare categories
                        encoded[category] = self.global_stats.get(method, category_stat)

                self.target_encodings[col][method] = encoded

            # Interaction features (stored unsmoothed)
            interactions = self._create_interaction_features(df, col, y_encoded)
            for interaction_name, interaction_stats in interactions.items():
                self.target_encodings[interaction_name] = {
                    method: dict(interaction_stats[method])
                    for method in self.encoding_methods
                    if method in interaction_stats
                }

        return self

    def _encode_into(self, df, name, keys):
        """Add the fitted ``"<name>_target_<method>"`` columns for ``keys`` to ``df``."""
        for method in self.encoding_methods:
            if method in self.target_encodings[name]:
                # Unseen categories fall back to the global statistic (0 when
                # no global value exists for the method).
                default_value = self.global_stats.get(method, 0)
                encoded = keys.map(self.target_encodings[name][method])
                encoded = encoded.fillna(default_value).astype(float)
                df[f"{name}_target_{method}"] = self._add_noise(encoded)

    def transform(self, X):
        """Return a copy of ``X`` with the fitted encodings appended.

        Categorical-dtype columns are converted to strings in the returned
        frame. Columns listed in ``categorical_cols`` but absent from ``X``
        are skipped.
        """
        df = self._stringify_categoricals(X.copy())

        for col in self.categorical_cols:
            if col not in df.columns:
                continue
            self._encode_into(df, col, df[col].astype(str))

        # Interaction features
        for interaction_name, keys in self._interaction_keys(df):
            self._encode_into(df, interaction_name, keys)

        return df

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return out-of-fold encodings for ``X``.

        Each row is encoded with statistics computed on the other
        ``cv_folds - 1`` stratified folds, so a row never sees its own target.
        The encoded columns (single columns and interactions) are the same
        ones ``transform`` produces. Within a fold, unseen categories receive
        the fold's mean target for ``'mean'`` and 0 for every other method.
        """
        # Fit on the full data first: this resolves categorical_cols and
        # records exactly which encoded columns transform() will emit.
        self.fit(X, y)

        df = self._stringify_categoricals(X.copy())
        y_encoded = self.label_encoder.transform(y)

        # Everything that gets encoded: single columns, then interactions.
        plan = []
        for col in self.categorical_cols:
            print(f"CV Encoding column: {col}")
            plan.append((col, df[col].astype(str)))
        plan.extend(self._interaction_keys(df))

        # One zero-initialised output buffer per encoded column, created in
        # the same order transform() adds its columns.
        oof = {}
        for name, _ in plan:
            for method in self.encoding_methods:
                if method in self.target_encodings[name]:
                    oof[f"{name}_target_{method}"] = np.zeros(len(df), dtype=float)

        # Cross-validation encoding to prevent overfitting
        skf = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)
        for train_idx, val_idx in skf.split(df, y_encoded):
            y_fold = y_encoded[train_idx]
            fold_mean = y_fold.mean()

            for name, keys in plan:
                fold_stats = self._stats_from_keys(keys.iloc[train_idx], y_fold)
                val_keys = keys.iloc[val_idx]

                for method in self.encoding_methods:
                    out_col = f"{name}_target_{method}"
                    if method not in fold_stats or out_col not in oof:
                        continue
                    default_value = fold_mean if method == 'mean' else 0
                    encoded = val_keys.map(dict(fold_stats[method]))
                    oof[out_col][val_idx] = encoded.fillna(default_value).astype(float).to_numpy()

        for out_col, values in oof.items():
            df[out_col] = values

        return df


def create_target_encoded_features(X_train, y_train, X_test=None, 
                                 categorical_cols=None, cv_folds=5, random_state=42):
    """
    Build an IntenseTargetEncoder with fixed settings and encode the data.

    The training frame is encoded with ``fit_transform``; when ``X_test`` is
    given it is encoded with ``transform``.

    Returns ``(X_train_encoded, X_test_encoded, encoder)`` when ``X_test`` is
    provided, otherwise ``(X_train_encoded, encoder)``.
    """
    encoder_settings = dict(
        categorical_cols=categorical_cols,
        smoothing=1.0,
        min_samples_leaf=1,
        noise_level=0.01,
        cv_folds=cv_folds,
        random_state=random_state,
        encoding_methods=['mean', 'count', 'std', 'median', 'nunique'],
    )
    encoder = IntenseTargetEncoder(**encoder_settings)

    # Training data goes through the cross-validated path.
    train_encoded = encoder.fit_transform(X_train, y_train)

    if X_test is None:
        return train_encoded, encoder

    test_encoded = encoder.transform(X_test)
    return train_encoded, test_encoded, encoder


# Analysis and visualization functions
def analyze_target_encoding_quality(X_original, X_encoded, y, categorical_cols, encoder=None):
    """
    Print a text report describing the columns added by target encoding.

    Parameters
    ----------
    X_original : pandas.DataFrame
        Frame before encoding.
    X_encoded : pandas.DataFrame
        Frame after encoding; columns absent from ``X_original`` are treated
        as the new encoded features.
    y : array-like
        Target values, used only for the correlation section.
    categorical_cols : iterable of str
        Columns to report on; names missing from ``X_original`` are skipped.
    encoder : object, optional
        The correlation section is printed only when this object has a
        non-None ``label_encoder`` attribute. The encoder itself is not used.

    Returns
    -------
    pandas.DataFrame
        ``X_encoded``, unchanged.
    """
    print("="*80)
    print("TARGET ENCODING QUALITY ANALYSIS")
    print("="*80)
    
    # Basic info
    print(f"Original dataset shape: {X_original.shape}")
    print(f"Encoded dataset shape: {X_encoded.shape}")
    print(f"New features added: {X_encoded.shape[1] - X_original.shape[1]}")
    print()
    
    # Show new columns (sorted once so every later listing is deterministic)
    new_cols = sorted(set(X_encoded.columns) - set(X_original.columns))
    
    print("NEW ENCODED FEATURES:")
    for col in new_cols:
        print(f"  - {col}")
    print()
    
    # Analyze each categorical column's encoding
    for cat_col in categorical_cols:
        if cat_col not in X_original.columns:
            continue
            
        print(f"ANALYSIS FOR '{cat_col}':")
        print("-" * 50)
        
        # Original column info
        unique_vals = X_original[cat_col].nunique()
        print(f"Unique categories: {unique_vals}")
        print(f"Value counts:")
        print(X_original[cat_col].value_counts().head())
        print()
        
        # Show encoded features for this column
        encoded_features = [col for col in new_cols if col.startswith(f"{cat_col}_target_")]
        
        for enc_col in encoded_features:
            enc_values = X_encoded[enc_col]
            print(f"  {enc_col}:")
            print(f"    Range: [{enc_values.min():.4f}, {enc_values.max():.4f}]")
            print(f"    Mean: {enc_values.mean():.4f}")
            print(f"    Std: {enc_values.std():.4f}")
            print(f"    Unique values: {enc_values.nunique()}")
        print()
    
    # Show correlation with target (if possible)
    if hasattr(encoder, 'label_encoder') and encoder.label_encoder is not None:
        try:
            # Integer codes in sorted-label order, the same coding a label
            # encoder produces.
            y_numeric = np.unique(np.asarray(y).ravel(), return_inverse=True)[1]
            
            print("CORRELATION WITH TARGET (top 10 features):")
            print("-" * 50)
            correlations = []
            for col in new_cols:
                corr = np.corrcoef(X_encoded[col], y_numeric)[0, 1]
                if not np.isnan(corr):
                    correlations.append((col, abs(corr)))
            
            correlations.sort(key=lambda x: x[1], reverse=True)
            for col, corr in correlations[:10]:
                print(f"  {col}: {corr:.4f}")
        except Exception as exc:
            # Best effort: the report must not fail because of the optional
            # correlation section, but the reason is shown, not hidden.
            print(f"Could not calculate correlations with target: {exc}")
    
    print("="*80)
    return X_encoded

def compare_categorical_vs_encoded(X_original, X_encoded, categorical_col, target_col_prefix):
    """
    Print, per category of ``categorical_col``, summary statistics of the
    matching ``"<categorical_col>_target_*"`` columns of ``X_encoded``.

    ``target_col_prefix`` is accepted but not used.

    Returns a frame holding the categorical column next to its encoded
    columns, or None when the column or its encodings are missing.
    """
    if categorical_col not in X_original.columns:
        print(f"Column {categorical_col} not found in original data")
        return None

    prefix = f"{categorical_col}_target_"
    encoded_cols = [name for name in X_encoded.columns if name.startswith(prefix)]
    if len(encoded_cols) == 0:
        print(f"No encoded columns found for {categorical_col}")
        return None

    print(f"COMPARISON FOR '{categorical_col}':")
    print("="*60)

    # Categorical column side by side with its encodings (index-aligned).
    comparison_df = X_original[[categorical_col]].assign(
        **{name: X_encoded[name] for name in encoded_cols}
    )

    # One summary section per category value.
    for category, rows in comparison_df.groupby(categorical_col):
        print(f"\nCategory: '{category}' (n={len(rows)})")
        print("-" * 30)
        for name in encoded_cols:
            column = rows[name]
            low, high = column.min(), column.max()
            print(f"  {name}:")
            print(f"    Mean: {column.mean():.4f}")
            print(f"    Std: {column.std():.4f}")
            print(f"    Range: [{low:.4f}, {high:.4f}]")

    return comparison_df

def visualize_encoding_distribution(X_encoded, categorical_cols, save_plots=False):
    """
    Draw histograms of the encoded features of each categorical column.

    For every name in ``categorical_cols`` one figure is created with a
    histogram per ``"<name>_target_*"`` column of ``X_encoded``; names
    without encoded columns are skipped.

    Parameters
    ----------
    X_encoded : pandas.DataFrame
        Frame containing the encoded columns.
    categorical_cols : iterable of str
        Categorical column names whose encodings are plotted.
    save_plots : bool, default=False
        When True each figure is also written to
        ``"<name>_encoding_distribution.png"`` in the working directory.

    Prints a message and returns without plotting when matplotlib or seaborn
    cannot be imported.
    """
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        print("Matplotlib/Seaborn not available for plotting")
        return

    # Set style
    plt.style.use('default')
    sns.set_palette("husl")

    for cat_col in categorical_cols:
        encoded_features = [col for col in X_encoded.columns if col.startswith(f"{cat_col}_target_")]

        if not encoded_features:
            continue

        n_features = len(encoded_features)
        # squeeze=False always yields a 2-D array of axes, so flattening is
        # valid for any feature count (a single feature still gets a 2x1 grid).
        fig, axes = plt.subplots(2, (n_features + 1) // 2, figsize=(15, 8), squeeze=False)
        axes = axes.flatten()
        fig.suptitle(f'Distribution of Encoded Features for {cat_col}', fontsize=16)

        for ax, enc_col in zip(axes, encoded_features):
            ax.hist(X_encoded[enc_col], bins=30, alpha=0.7, edgecolor='black')
            ax.set_title(enc_col.replace(f"{cat_col}_target_", "").upper())
            ax.set_xlabel('Encoded Value')
            ax.set_ylabel('Frequency')

        # Hide empty subplots
        for ax in axes[n_features:]:
            ax.set_visible(False)

        plt.tight_layout()

        if save_plots:
            plt.savefig(f'{cat_col}_encoding_distribution.png', dpi=300, bbox_inches='tight')

        plt.show()
        # Release the figure so a long column list does not accumulate open figures.
        plt.close(fig)

# Updated convenience function with analysis
def create_target_encoded_features_with_analysis(X_train, y_train, X_test=None, 
                                               categorical_cols=None, cv_folds=5, 
                                               random_state=seed, analyze=True):
    """
    Convenience function to create target encoded features with quality analysis.

    Builds an IntenseTargetEncoder with fixed settings, encodes ``X_train``
    with ``fit_transform`` and, when ``analyze`` is True, prints the report of
    ``analyze_target_encoding_quality`` for the training data.

    Returns ``(X_train_encoded, X_test_encoded, encoder)`` when ``X_test`` is
    given, otherwise ``(X_train_encoded, encoder)``.
    """
    print("Creating target encoded features...")
    
    encoder = IntenseTargetEncoder(
        categorical_cols=categorical_cols,
        smoothing=1.0,
        min_samples_leaf=1,
        noise_level=0.01,
        cv_folds=cv_folds,
        random_state=random_state,
        encoding_methods=['mean', 'count', 'std', 'median', 'nunique']
    )
    
    # Fit and transform training data with CV
    X_train_encoded = encoder.fit_transform(X_train, y_train)
    
    if analyze:
        # Analyze encoding quality (the function returns the encoded frame unchanged)
        X_train_encoded = analyze_target_encoding_quality(
            X_train, X_train_encoded, y_train, 
            encoder.categorical_cols, encoder
        )
    
    # Transform test data if provided
    if X_test is not None:
        X_test_encoded = encoder.transform(X_test)
        return X_train_encoded, X_test_encoded, encoder
    
    return X_train_encoded, encoder


def cv_score_with_target_encoding(X, y, model_dict, categorical_cols=None,
                                  folds=folds, seed=seed, k=3):
    """
    Cross-validation with target encoding fitted inside every fold.

    NOTE(review): this body previously sat, unreachable, after the final
    ``return`` of ``create_target_encoded_features_with_analysis``; its
    original ``def`` line was missing, so the name and signature used here
    are reconstructed from the variables the body reads. Confirm them.

    For each model and each stratified fold, an IntenseTargetEncoder is
    fitted on the fold's training rows only and applied to both the training
    and the validation rows, so validation targets never influence the
    encodings. The frames passed to the model are those returned by
    ``transform`` and therefore still contain the original categorical
    columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, including the raw categorical columns.
    y : pandas.Series
        Integer class codes matching the ``predict_proba`` column positions.
    model_dict : dict
        Maps a display name to an estimator with ``fit`` / ``predict_proba``.
    categorical_cols : list or None
        Columns to target-encode (None lets the encoder infer them).
    folds, seed : int
        Number of folds and shuffle seed.
    k : int, default=3
        Cut-off forwarded to ``mapk_score``.

    Returns
    -------
    dict
        Maps each model name to its mean MAP@k over the folds.
    """
    skf = StratifiedKFold(random_state=seed, n_splits=folds, shuffle=True)
    results = {}
    
    for name, model in model_dict.items():
        print(f'Current model being processed: {name}')
        scores = []
        
        for i, (train_index, valid_index) in enumerate(skf.split(X, y), 1):
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
            # Apply target encoding within each fold
            encoder = IntenseTargetEncoder(
                categorical_cols=categorical_cols,
                cv_folds=3,
                random_state=seed
            )
            
            # Fit encoder on training data and transform both sets
            encoder.fit(X_train, y_train)
            X_train_encoded = encoder.transform(X_train)
            X_valid_encoded = encoder.transform(X_valid)
            
            # Train model
            model.fit(X_train_encoded, y_train)
            
            # Score the predicted probabilities directly with MAP@k
            probas = model.predict_proba(X_valid_encoded)
            score = mapk_score(y_valid, probas, k=k)
            print(f'Score for {name} on fold {i} is {score}')
            scores.append(score)
        
        results[name] = np.mean(scores)
        print(f'Mean score for {name} is {results[name]}')
    
    return results


# Example usage for your fertilizer dataset
def apply_target_encoding_to_fertilizer_data(df, target_col='Fertilizer Name'):
    """
    Target-encode the 'Soil Type' and 'Crop Type' columns of a fertilizer frame.

    ``df`` is split into features and ``target_col``; the features are encoded
    through ``create_target_encoded_features`` (5 folds, random_state 42) and
    a short summary of the column changes is printed.

    Returns ``(X_encoded, y, encoder)``.
    """
    # Columns treated as categorical for this dataset.
    categorical_cols = ['Soil Type', 'Crop Type']

    # Target first, then everything else as features.
    y = df[target_col]
    X = df.drop(columns=target_col)

    X_encoded, encoder = create_target_encoded_features(
        X, y,
        categorical_cols=categorical_cols,
        cv_folds=5,
        random_state=42,
    )

    original_names = X.columns.tolist()
    encoded_names = X_encoded.columns.tolist()
    added_names = [col for col in X_encoded.columns if col not in X.columns]
    print("Original features:", original_names)
    print("Encoded features:", encoded_names)
    print("New features added:", added_names)

    return X_encoded, y, encoder

In [9]:
# Fertilizer label -> integer class code, and the inverse lookup.
class_mapping = dict(zip(
    ['14-35-14', '10-26-26', '17-17-17', '28-28', '20-20', 'DAP', 'Urea'],
    range(7),
))
rev_class_mapping = {code: label for label, code in class_mapping.items()}

# Competition-only data: features without id/target, target as class codes.
X = train.drop(columns=['id', 'Fertilizer Name'])
y = train['Fertilizer Name'].map(class_mapping)

# Combined data (train_new): same split.
X_full = train_new.drop(columns=['Fertilizer Name'])
y_full = train_new['Fertilizer Name'].map(class_mapping)

In [10]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Two independent encoders: `le` is fitted on X, `le_2` on X_full.
le, le_2 = OrdinalEncoder(), OrdinalEncoder()

# Columns holding string / categorical values in the competition frame.
obj_dtypes = X.select_dtypes(include=['object', 'category']).columns

# Cast those columns to pandas `category` in every frame before encoding.
for frame in (X, X_full, test):
    frame[obj_dtypes] = frame[obj_dtypes].astype('category')

# Replace the categorical columns with their ordinal codes.
X[obj_dtypes] = le.fit_transform(X[obj_dtypes])
X_full[obj_dtypes] = le_2.fit_transform(X_full[obj_dtypes])
# encoded_df = pd.DataFrame(encoded_features, 
#                          columns=le.get_feature_names_out(obj_dtypes),
#                          index=X.index)
# X = pd.concat([X, encoded_df], axis=1)
# X.drop(columns=obj_dtypes, inplace=True)

In [11]:
# Display the categories learned by the encoder fitted on X_full (one array per encoded column).
le_2.categories_

[array(['Black', 'Clayey', 'Loamy', 'Red', 'Sandy'], dtype=object),
 array(['Barley', 'Cotton', 'Ground Nuts', 'Maize', 'Millets', 'Oil seeds',
        'Paddy', 'Pulses', 'Sugarcane', 'Tobacco', 'Wheat'], dtype=object)]

In [12]:
# Encode the test categoricals with `le_2`, the encoder fitted on X_full, so codes match that frame.
test[obj_dtypes] = le_2.transform(test[obj_dtypes])

In [13]:
# from autofeat import AutoFeatRegressor

# af = AutoFeatRegressor(feateng_steps=2, verbose=1, n_jobs=1)
# X_full = af.fit_transform(X_full, y_full)
# test_new = af.transform(test.drop(columns='id'))

In [14]:
# from sklearn.preprocessing import OrdinalEncoder

# def artificial_features(df):
#     df = df.copy()
    
#     # Nutrient Ratios
#     df['N_P_ratio'] = df['Nitrogen'] / df['Phosphorous'].replace(0,1)
#     df['P_K_ratio'] = df['Phosphorous'] / df['Potassium'].replace(0,1)
#     df['N_K_ratio'] = df['Nitrogen'] / df['Potassium'].replace(0,1)
#     df['NPK_sum'] = df['Nitrogen'] + df['Phosphorous'] + df['Potassium']
#     df['Nitrogen_deficient'] = (df['Nitrogen'] < 20).astype(int)
#     df['Phosphorus_deficient'] = (df['Phosphorous'] < 15).astype(int)
#     df['Potassium_deficient'] = (df['Potassium'] < 10).astype(int)
    
#     # 3. Polynomial Terms
#     df['Temp_squared'] = df['Temparature'] ** 2
#     df['Humidity_squared'] = df['Humidity'] ** 2
#     df['Moisture_squared'] = df['Moisture'] ** 2
    
#     # 4. Interaction Terms
#     df['Temp_Humidity'] = df['Temparature'] * df['Humidity']
#     df['Temp_Moisture'] = df['Temparature'] * df['Moisture']
#     df['Humidity_Moisture'] = df['Humidity'] * df['Moisture']
#     df['Temp_Humidity_Moisture'] = df['Temparature'] * df['Humidity'] * df['Moisture']
#     df['N_Humidity'] = df['Nitrogen'] * df['Humidity']
#     df['P_Moisture'] = df['Phosphorous'] * df['Moisture']
#     df['K_Temp'] = df['Potassium'] * df['Temparature']
    
#     # 5. Binned Features
#     df['Temp_bin'] = pd.cut(df['Temparature'], bins=5, labels=[0, 1, 2, 3, 4])
#     df['Humidity_bin'] = pd.cut(df['Humidity'], bins=5, labels=[0, 1, 2, 3, 4])
#     df['Moisture_bin'] = pd.cut(df['Moisture'], bins=5, labels=[0, 1, 2, 3, 4])
    
#     # 6. Aggregations by Crop Type
#     crop_stats = df.groupby('Crop Type')[['Nitrogen', 'Phosphorous', 'Potassium']].agg(['mean', 'median', 'std'])
#     crop_stats.columns = ['_'.join(col).strip() for col in crop_stats.columns]
#     df = df.merge(crop_stats, on='Crop Type', how='left')
    
#     # 7. Aggregations by Soil Type
#     soil_stats = df.groupby('Soil Type')[['Moisture', 'Temparature']].agg(['mean', 'median', 'std'])
#     soil_stats.columns = ['_'.join(col).strip() for col in soil_stats.columns]
#     df = df.merge(soil_stats, on='Soil Type', how='left')
    
#     # 8. Soil × Crop Interaction Features (Modified to use Ordinal Encoder)
#     # interaction = df[['Soil Type', 'Crop Type']].astype(str).agg('_'.join, axis=1).to_frame(name='Soil_Crop')
#     # encoder = OrdinalEncoder()
#     # encoded = encoder.fit_transform(interaction)
#     # df['Soil_Crop_encoded'] = encoded.flatten()
    
#     # 9. Derived Indices
#     df['THI'] = (1.8 * df['Temparature'] + 32) - ((100 - df['Humidity']) / 10 * 1.8)  # Temperature-Humidity Index
#     df['NPK_Balance'] = abs(df['N_P_ratio'] - 1) + abs(df['P_K_ratio'] - 1) + abs(df['N_K_ratio'] - 1)  # Nutrient Balance Score
    
#     return df.copy()

# X_full = artificial_features(X_full)
# test_new = artificial_features(test_new)

In [15]:
# interaction = X_new[['Soil Type', 'Crop Type']].astype(str).agg('_'.join, axis=1).to_frame(name='Soil_Crop')
# interaction_test = test_new[['Soil Type', 'Crop Type']].astype(str).agg('_'.join, axis=1).to_frame(name='Soil_Crop')

# le_2 = OrdinalEncoder()
# train_encoded = le_2.fit_transform(interaction)
# test_encoded = le_2.transform(interaction_test)

# X_new['Soil_Crop_encoded'] = train_encoded.flatten()
# test_new['Soil_Crop_encoded'] = test_encoded.flatten()

In [16]:
# X_full.to_csv('training_data.csv',index=False)
# test_new.to_csv('test_data.csv', index=False)

In [17]:
# Wide candidate feature list: the raw measurements plus engineered columns
# (presumably autofeat-style expressions and group statistics — TODO confirm source).
# NOTE: `cols` is reassigned to the eight raw columns in a later cell, and no
# live cell reads this version in between, so this list is currently unused.
cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium',
       'Phosphorous', 'Moisture*Nitrogen**3', 'SoilType**2/Nitrogen',
       'Nitrogen**3/Moisture', 'CropType*Nitrogen**3', 'CropType**2/Moisture',
       'CropType**2/Nitrogen', 'Moisture**3*SoilType', 'CropType**3*Moisture',
       'Phosphorous*Potassium', 'Potassium**3/Nitrogen',
       'CropType*Potassium**2', 'CropType*exp(SoilType)',
       'Nitrogen*sqrt(SoilType)', 'CropType**2*Moisture**3',
       'sqrt(CropType)/Nitrogen', 'Potassium*exp(SoilType)',
       'Humidity**3*SoilType**3', 'CropType*Phosphorous**3',
       'Phosphorous*exp(SoilType)', 'SoilType**3*Temparature**2',
       'sqrt(Phosphorous)/Nitrogen', 'CropType**3*sqrt(Nitrogen)',
       'Nitrogen**3*Phosphorous**2', 'CropType**2*sqrt(SoilType)',
       'CropType*sqrt(Phosphorous)', 'Nitrogen**3*sqrt(Potassium)',
       'sqrt(Nitrogen)*Phosphorous**2', 'Nitrogen**3*sqrt(Phosphorous)',
       'sqrt(Phosphorous)*Potassium**3', 'sqrt(Phosphorous)*log(Nitrogen)',
       'sqrt(Phosphorous)*sqrt(SoilType)', 'Potassium**3/Moisture',
       'exp(SoilType)/Moisture', 'Moisture*sqrt(SoilType)',
       'sqrt(CropType)*Moisture', 'Phosphorous**2/Nitrogen',
       'sqrt(SoilType)/Moisture', 'log(Nitrogen)/Moisture',
       'sqrt(SoilType)/Nitrogen', 'Nitrogen*exp(SoilType)',
       'CropType**3*sqrt(Phosphorous)', 'Nitrogen**3*SoilType',
       'sqrt(Nitrogen)*sqrt(Potassium)', 'N_P_ratio', 'N_K_ratio',
       'Nitrogen_mean', 'Nitrogen_std', 'Phosphorous_mean', 'Phosphorous_std',
       'Potassium_mean', 'Potassium_std', 'Moisture_mean', 'Moisture_std',
       'Temparature_mean', 'Soil_Crop_encoded']

In [18]:
# fertilizers = train["Fertilizer Name"].unique()

# # Step 3: Create transformed data
# transformed_data = []

# for _, row in train.iterrows():
#     group_id = row["id"]  
#     true_fertilizer = row["Fertilizer Name"]  
    
#     for fertilizer in fertilizers:
#         relevance = 1 if fertilizer == true_fertilizer else 0
#         new_row = row.to_dict()
#         new_row["fertilizer"] = fertilizer 
#         new_row["relevance"] = relevance 
#         transformed_data.append(new_row)

# expanded_df = pd.DataFrame(transformed_data)

# features = [col for col in expanded_df.columns if col not in ["Fertilizer Name", "fertilizer", "relevance", "id"]]
# X = expanded_df[features] 
# y = expanded_df["relevance"] 
# groups = expanded_df["id"].value_counts(sort=False).tolist()  

In [19]:
# import xgboost as xgb

# cat_cols = ["Soil Type", "Crop Type"]  # Update with your actual categorical columns
# for col in cat_cols:
#     expanded_df[col] = expanded_df[col].astype("category")
# train_mask = expanded_df["id"].isin(train["id"].sample(frac=0.8, random_state=42))  # 80% train
# val_mask = ~expanded_df["id"].isin(train["id"].sample(frac=0.8, random_state=42))    # 20% val

# X_train, y_train = X[train_mask], y[train_mask]
# X_val, y_val = X[val_mask], y[val_mask]
# groups_train = groups[:int(0.8 * len(groups))]  
# groups_val = groups[int(0.8 * len(groups)):]  

# for col in cat_cols:
#     X_train[col] = X_train[col].astype("category")
#     X_val[col] = X_val[col].astype("category")


# dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
# dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
# dtrain.set_group(groups_train)
# dval.set_group(groups_val)

# # Train with ranking objective
# params = {
#     "objective": "rank:pairwise",  # Ranking objective
#     "eval_metric": "map@3",        # Metric to optimize
#     "eta": 0.05,           # Lower learning rate
#     "max_depth": 6,        # Deeper trees
#     # "lambda": 1.0,         # L2 regularization
#     # "gamma": 0.1,          # Minimum loss reduction for splits
#     "min_child_weight": 1,
#     "subsample": 0.8,      # Prevent overfitting
#     "colsample_bytree": 0.7
# }

# model = xgb.train(
#     params,
#     dtrain,
#     num_boost_round=100,
#     evals=[(dval, "validation")]
# )

In [20]:
# n_features = int(X_train.shape[1])
# n_classes = int(y.nunique())

In [21]:
# class MAPK(tf.keras.metrics.Metric):
#     def __init__(self, k=3, name='mapk', **kwargs):
#         super(MAPK, self).__init__(name=name, **kwargs)
#         self.k = k
#         self.total = self.add_weight(name='total', initializer='zeros')
#         self.count = self.add_weight(name='count', initializer='zeros')

#     def update_state(self, y_true, y_pred, sample_weight=None):
#         # Fix: Keep y_pred as float32 for proper probability handling
#         y_true = tf.cast(y_true, tf.int32)
#         y_pred = tf.cast(y_pred, tf.float32)  # ← Don't cast to int32

#         # Get top-k indices
#         _, top_k_indices = tf.nn.top_k(y_pred, k=self.k)
#         top_k_indices = tf.reverse(top_k_indices, axis=[-1])

#         # Compute AP per sample
#         positions = tf.range(1, self.k + 1, dtype=tf.float32)
#         reciprocal_positions = 1.0 / positions

#         matches = tf.equal(tf.expand_dims(y_true, axis=1), top_k_indices)
#         matches_float = tf.cast(matches, tf.float32)

#         ap_per_sample = tf.reduce_sum(matches_float * reciprocal_positions, axis=1)
#         mean_ap = tf.reduce_mean(ap_per_sample)

#         self.total.assign_add(mean_ap)
#         self.count.assign_add(1.)

#     def result(self):
#         return self.total / self.count

#     def reset_states(self):
#         self.total.assign(0.)  # ← Use assign() instead of assign_add()
#         self.count.assign(0.)

# model = models.Sequential([
#     layers.Dense(512, activation='swish', input_dim=n_features, kernel_initializer='he_normal'),
#     layers.BatchNormalization(),
#     layers.Dropout(0.3),
    
#     layers.Dense(256, activation='swish', kernel_initializer='he_normal'),
#     layers.BatchNormalization(),
#     layers.Dropout(0.2),
    
#     layers.Dense(128, activation='swish', kernel_initializer='he_normal'),
#     layers.BatchNormalization(),
    
#     layers.Dense(n_classes, activation='softmax')
# ])

# optimizer = tf.keras.optimizers.Adam(
#     learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
#         initial_learning_rate=0.001,
#         decay_steps=1000,
#         decay_rate=0.9
#     ),
#     clipnorm=1.0
# )

# model.compile(
#     optimizer=optimizer,
#     loss='sparse_categorical_crossentropy',
#     metrics=[MAPK(k=3)]
# )

# # 4. Essential Callbacks
# callbacks = [
#     tf.keras.callbacks.EarlyStopping(
#         monitor='val_mapk',  # Monitor custom metric
#         mode='max',
#         patience=15,
#         restore_best_weights=True
#     ),
#     tf.keras.callbacks.ReduceLROnPlateau(
#         monitor='val_loss',
#         factor=0.5,
#         patience=5,
#         min_lr=1e-6
#     ),
#     tf.keras.callbacks.TerminateOnNaN()
# ]

In [22]:
# X_train = X_train.astype(np.float32)
# y_train = y_train.astype(np.int32)

# history = model.fit(
#     X_train, y_train,
#     validation_data=(X_valid, y_valid),
#     epochs=200,
#     batch_size=128,
#     callbacks=callbacks,
#     verbose=1
# )

In [23]:
# Active feature set: the eight raw columns only.
# Disabled candidates: Phosphorous_mean, Potassium_std, Moisture_mean,
# Potassium_mean, Nitrogen_deficient, Temparature_std, Potassium_deficient,
# Potassium_median, Temparature_mean.
cols = [
    'Temparature', 'Humidity', 'Moisture',
    'Soil Type', 'Crop Type',
    'Nitrogen', 'Potassium', 'Phosphorous',
]

In [24]:
def trial_domain_features(df, n_threshold=20, p_threshold=10, k_threshold=15):
    """
    Return a copy of ``df`` with boolean nutrient-deficiency flags appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Nitrogen', 'Phosphorous' and 'Potassium'.
        The input frame is not modified.
    n_threshold, p_threshold, k_threshold : numeric, default 20 / 10 / 15
        A row is flagged deficient when the corresponding nutrient value is
        strictly below its threshold.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra columns 'N_deficient', 'P_deficient' and
        'K_deficient' (in that order).

    Notes
    -----
    Other feature ideas (NPK ratios / totals / balance, water- and heat-stress
    flags, crop nutrient-demand interactions, soil-type interactions) are
    currently disabled; only the deficiency flags are produced.
    NOTE(review): a commented-out experiment earlier in this notebook used
    Phosphorous < 15 and Potassium < 10 — confirm which cut-offs are intended.
    """
    # `assign` works on a copy, so the caller's frame (often a column slice)
    # is left untouched and no second defensive copy is needed.
    return df.assign(
        N_deficient=df['Nitrogen'] < n_threshold,
        P_deficient=df['Phosphorous'] < p_threshold,
        K_deficient=df['Potassium'] < k_threshold,
    )

# Build the modelling frames from the pre-computed feature CSVs, restricted to `cols`.
# NOTE(review): `test_new` is overwritten with its own transform, so this cell is not
# idempotent; re-running it after the later cast of these columns to `category` will
# probably fail on the `<` comparisons inside trial_domain_features — confirm.
X_trial = trial_domain_features(X_new[cols])
test_new = trial_domain_features(test_new[cols])

In [25]:
# Every raw feature is treated as categorical for the model.
cat_trial_cols = ['Soil Type', 'Crop Type', 'Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Phosphorous', 'Potassium']

# Cast train and test with ONE shared CategoricalDtype per column. Casting each
# frame independently derives the categories (and therefore the integer codes)
# from that frame's own values, so a value missing from one frame would shift
# the codes and silently misalign train and test for code-based models.
for col in cat_trial_cols:
    shared_dtype = pd.CategoricalDtype(
        pd.concat([X_trial[col], test_new[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    X_trial[col] = X_trial[col].astype(shared_dtype)
    test_new[col] = test_new[col].astype(shared_dtype)

In [26]:
# X_trial.reset_index(drop=True, inplace=True)
# train_new.reset_index(drop=True, inplace=True)

In [27]:
# X_trial['Soil Type'] = train_new['Soil Type'].astype('category')
# X_trial['Crop Type'] = train_new['Crop Type'].astype('category')

In [28]:
# soil_type_dict = {
#     "Black": 0,
#     "Clayey": 1,
#     "Loamy": 2,
#     "Red": 3,
#     "Sandy": 4
# }
# crop_type_dict = {
#     "Barley": 0,
#     "Cotton": 1,
#     "Ground Nuts": 2,
#     "Maize": 3,
#     "Millets": 4,
#     "Oil seeds": 5,
#     "Paddy": 6,
#     "Pulses": 7,
#     "Sugarcane": 8,
#     "Tobacco": 9,
#     "Wheat": 10
# }
# X_trial['Soil Type'] = X_trial["Soil Type"].map(soil_type_dict).astype(int)
# X_trial['Crop Type'] = X_trial['Crop Type'].map(crop_type_dict).astype(int)

In [29]:
# X_trial[cols] = X_trial[cols].astype('float32')

In [30]:
# import seaborn as sns
# sns.histplot(X_trial['Potassium'], kde=True)
# Check the column dtypes of the training frame after the category cast.
X_trial.dtypes

Temparature    category
Humidity       category
Moisture       category
Soil Type      category
Crop Type      category
Nitrogen       category
Potassium      category
Phosphorous    category
N_deficient        bool
P_deficient        bool
K_deficient        bool
dtype: object

In [31]:
# Check that the test frame has the same column dtypes as the training frame.
test_new.dtypes

Temparature    category
Humidity       category
Moisture       category
Soil Type      category
Crop Type      category
Nitrogen       category
Potassium      category
Phosphorous    category
N_deficient        bool
P_deficient        bool
K_deficient        bool
dtype: object

In [32]:
# Number of distinct values per column in the training frame.
X_trial.nunique()

Temparature    14
Humidity       23
Moisture       41
Soil Type       5
Crop Type      11
Nitrogen       39
Potassium      20
Phosphorous    43
N_deficient     2
P_deficient     2
K_deficient     2
dtype: int64

In [33]:
# for col in cat_trial_cols:
#     X_trial[col] = X_trial[col].astype(str)

In [34]:
# Append an all-zero `const` column to both the training and the test frame.
for frame in (X_trial, test_new):
    frame['const'] = 0

In [35]:
def test_multiple_folds(X, y, test, model_dict, folds=folds):
    """
    Cross-validate each model and collect fold-averaged test probabilities.

    For every entry of ``model_dict`` the estimator is cloned per fold, fitted
    on the training part of a shuffled StratifiedKFold split (seeded with the
    notebook-level ``seed``), scored with ``mapk_score(..., k=3)`` on the
    held-out part, and used to predict class probabilities for ``test``.

    Parameters
    ----------
    X, y : pandas objects indexed positionally via ``.iloc``.
    test : frame passed to ``predict_proba`` after each fold.
    model_dict : dict mapping a model name to an unfitted estimator.
    folds : int, number of splits (default: the notebook-level ``folds``).

    Returns
    -------
    (results, test_preds_dict)
        ``results[name]`` is the MAP@3 of the out-of-fold probabilities;
        ``test_preds_dict[name]`` is the mean of the per-fold test probabilities.

    NOTE(review): ``fit`` is called with ``eval_set`` / ``eval_metric``, which
    looks XGBoost-specific — other estimators may reject these keywords; confirm
    before adding different model types to ``model_dict``.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    results, test_preds_dict = {}, {}

    for name, model in model_dict.items():
        n_classes = len(np.unique(y))
        oof_probas = np.zeros((len(X), n_classes))    # out-of-fold predictions
        test_probas = np.zeros((len(test), n_classes))  # running sum over folds
        fold_scores = []

        for fold_no, (fit_rows, hold_rows) in enumerate(splitter.split(X, y), start=1):
            fold_model = clone(model)
            X_hold, y_hold = X.iloc[hold_rows], y.iloc[hold_rows]
            fold_model.fit(
                X.iloc[fit_rows], y.iloc[fit_rows],
                eval_set=[(X_hold, y_hold)],
                eval_metric='mlogloss',
            )

            hold_probas = fold_model.predict_proba(X_hold)
            fold_score = mapk_score(y_hold, hold_probas, k=3)
            fold_scores.append(fold_score)
            print(f'score for fold {fold_no} is : {fold_score}')

            oof_probas[hold_rows] = hold_probas
            test_probas += fold_model.predict_proba(test)

        # Turn the accumulated sum into the per-fold average.
        test_probas /= folds
        oof_score = mapk_score(y, oof_probas, k=3)
        results[name] = oof_score
        test_preds_dict[name] = test_probas

        print('----------')
        print(f'cv mean score : {np.mean(fold_scores)}')
        print(f'oofs mean score : {oof_score}')
        print('----------')

    return results, test_preds_dict
        
        

In [36]:
import xgboost as xgb
import catboost as cb 
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
import warnings 
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# XGBoost hyper-parameters (presumably the result of a tuning run — TODO confirm),
# GPU histogram trees with native categorical support enabled.
params = dict(
    n_estimators=3373,
    learning_rate=0.011259949991593646,
    max_depth=21,
    min_child_weight=9.730207421568215,
    gamma=1.9176916551823702,
    subsample=0.6000599493499921,
    colsample_bytree=0.6991286032909712,
    reg_alpha=3.460930710395595,
    reg_lambda=1.6121748240796503,
    max_bin=128,
    max_cat_threshold=4,
    max_cat_to_onehot=4,
    random_state=seed,
    tree_method='hist',
    device='cuda',
    enable_categorical=True,
)

# Models to cross-validate, keyed by name; only the tuned XGBoost model is active.
model_dict = {
    'xgb_1': xgb.XGBClassifier(**params)
    # Alternatives, disabled:
    # 'xgb_1': xgb.XGBClassifier(random_state=seed, tree_method='hist', device='cuda', enable_categorical=True),
    # 'cb': cb.CatBoostClassifier(random_state=seed, cat_features=cat_trial_cols, task_type='GPU', verbose=0)
    # 'etr_1': ExtraTreesClassifier(random_state=seed, n_jobs=-1),
    # 'lgb_1': lgb.LGBMClassifier(random_state=seed, categorical_feature=['Soil Type', 'Crop Type']),
    # 'hgb_1': HistGradientBoostingClassifier(random_state=seed, verbose=0,)
}

# results = cv_score(X_trial, y_full, model_dict)
# results = cv_score(X_trial, y_full, model_dict)

In [37]:
# Run the cross-validation for every model in `model_dict`; keeps the OOF scores and the
# fold-averaged test probabilities. The per-iteration mlogloss lines below come from the eval_set.
results, test_preds_dict = test_multiple_folds(X_trial, y_full, test_new, model_dict, folds)

[0]	validation_0-mlogloss:1.94573
[1]	validation_0-mlogloss:1.94555
[2]	validation_0-mlogloss:1.94540
[3]	validation_0-mlogloss:1.94522
[4]	validation_0-mlogloss:1.94505
[5]	validation_0-mlogloss:1.94490
[6]	validation_0-mlogloss:1.94472
[7]	validation_0-mlogloss:1.94453
[8]	validation_0-mlogloss:1.94434
[9]	validation_0-mlogloss:1.94417
[10]	validation_0-mlogloss:1.94400
[11]	validation_0-mlogloss:1.94383
[12]	validation_0-mlogloss:1.94366
[13]	validation_0-mlogloss:1.94349
[14]	validation_0-mlogloss:1.94335
[15]	validation_0-mlogloss:1.94320
[16]	validation_0-mlogloss:1.94304
[17]	validation_0-mlogloss:1.94287
[18]	validation_0-mlogloss:1.94270
[19]	validation_0-mlogloss:1.94254
[20]	validation_0-mlogloss:1.94238
[21]	validation_0-mlogloss:1.94223
[22]	validation_0-mlogloss:1.94206
[23]	validation_0-mlogloss:1.94190
[24]	validation_0-mlogloss:1.94176
[25]	validation_0-mlogloss:1.94161
[26]	validation_0-mlogloss:1.94147
[27]	validation_0-mlogloss:1.94134
[28]	validation_0-mlogloss:1.9

In [38]:
# Gather every model's averaged test-probability matrix into one list (dict order).
test_probas = list(test_preds_dict.values())

In [39]:
# Use the XGBoost model's fold-averaged test probabilities for the submission.
preds = test_preds_dict['xgb_1']
# Displayed shape is (rows, classes).
preds.shape

(250000, 7)

In [40]:
# baseline_cols = ['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
#        'Nitrogen', 'Potassium', 'Phosphorous']

In [41]:
# model = xgb.XGBClassifier(random_state=seed, enable_categorical=True, tree_method='hist', device='cuda')
# # model = xgb.XGBClassifier(**params)
# model.fit(X_trial, y_full)

In [42]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores_8, val_scores_8 = learning_curve(
#     model, X_new[baseline_cols], y_full, cv=5)
    
# train_sizes, train_scores_17, val_scores_17 = learning_curve(
#     model, X_new[cols], y_full, cv=5)

# # If 17-feature model shows larger gap between train/val scores,
# # it's likely overfitting

In [43]:
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10, 6))

# # Calculate means and standard deviations
# train_mean_8 = np.mean(train_scores_8, axis=1)
# train_std_8 = np.std(train_scores_8, axis=1)
# val_mean_8 = np.mean(val_scores_8, axis=1)
# val_std_8 = np.std(val_scores_8, axis=1)

# train_mean_17 = np.mean(train_scores_17, axis=1)
# train_std_17 = np.std(train_scores_17, axis=1)
# val_mean_17 = np.mean(val_scores_17, axis=1)
# val_std_17 = np.std(val_scores_17, axis=1)

# # Plot 8-feature model
# plt.plot(train_sizes, train_mean_8, 'o-', color='blue', label='8-feature Train')
# plt.fill_between(train_sizes, train_mean_8 - train_std_8, train_mean_8 + train_std_8, alpha=0.1, color='blue')

# plt.plot(train_sizes, val_mean_8, 'o-', color='orange', label='8-feature Val')
# plt.fill_between(train_sizes, val_mean_8 - val_std_8, val_mean_8 + val_std_8, alpha=0.1, color='orange')

# # Plot 17-feature model
# plt.plot(train_sizes, train_mean_17, 'o--', color='green', label='17-feature Train')
# plt.fill_between(train_sizes, train_mean_17 - train_std_17, train_mean_17 + train_std_17, alpha=0.1, color='green')

# plt.plot(train_sizes, val_mean_17, 'o--', color='red', label='17-feature Val')
# plt.fill_between(train_sizes, val_mean_17 - val_std_17, val_mean_17 + val_std_17, alpha=0.1, color='red')

# plt.title('Learning Curves Comparison')
# plt.xlabel('Training Examples')
# plt.ylabel('MAP@3 Score')
# plt.legend(loc='best')
# plt.grid()
# plt.show()

In [44]:
# preds = model.predict_proba(test_new)

In [45]:
# preds.shape

In [46]:
# Sort class ids by descending probability and keep the best three per row.
top3_indices = np.argsort(-preds, axis=1)[:, :3]

# Translate class ids back to fertilizer names, then join each row with spaces.
top3_labels = [
    [rev_class_mapping[class_id] for class_id in ranked_ids]
    for ranked_ids in top3_indices
]
formatted_predictions = [' '.join(labels) for labels in top3_labels]

# Write the predictions into the sample submission frame and save it.
sample_sub['Fertilizer Name'] = formatted_predictions
sample_sub.to_csv('submission.csv', index=False)

In [47]:
# Inspect the top-3 class ids per test row.
top3_indices

array([[1, 4, 5],
       [2, 6, 3],
       [4, 6, 3],
       ...,
       [5, 1, 6],
       [1, 3, 2],
       [0, 4, 2]])

In [48]:
# Preview the submission frame after the predictions were written into it.
sample_sub

Unnamed: 0,id,Fertilizer Name
0,750000,10-26-26 20-20 DAP
1,750001,17-17-17 Urea 28-28
2,750002,20-20 Urea 28-28
3,750003,14-35-14 10-26-26 17-17-17
4,750004,Urea 20-20 28-28
...,...,...
249995,999995,Urea 14-35-14 28-28
249996,999996,28-28 17-17-17 10-26-26
249997,999997,DAP 10-26-26 Urea
249998,999998,10-26-26 28-28 17-17-17
