In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from collections import Counter
import uuid

In [None]:
# Read the training and test sets from CSV files in the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")



In [None]:
# Preview the first rows of the training data as loaded from disk.
train_df.head()

In [None]:
# Count the missing values in each column of the training data.
train_df.isna().sum()

In [None]:
# Draw the missing-value mask of the training data as a heatmap,
# with row labels and the colour bar switched off.
sns.heatmap(train_df.isna(), cmap='viridis', cbar=False, yticklabels=False)
plt.show()


In [None]:
# 1. Target Class Distribution
# Bars follow the order returned by value_counts() on the target column.
fertilizer_order = train_df['Fertilizer Name'].value_counts().index

plt.figure(figsize=(10, 5))
sns.countplot(data=train_df, x='Fertilizer Name', order=fertilizer_order)
plt.title('Distribution of Fertilizer Types')
plt.xticks(rotation=45)  # tilt the category labels
plt.tight_layout()
plt.show()

In [None]:
# 2. Feature Correlation Heatmap
# Pairwise correlations are computed over the numeric columns only.
corr = train_df.corr(numeric_only=True)

plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", annot=True, fmt=".2f", square=True)
plt.title("Feature Correlation Heatmap")
plt.show()




In [None]:
# Label Encoding Categorical Variables
# Three independent encoders: one per categorical column handled below.
SoilType_encoder, CropType_encoder, FertilizerName_encoder = (
    LabelEncoder() for _ in range(3)
)



In [None]:
# Combine Train and Test for Consistent Encoding
# Each combined series stacks the training values on top of the test values
# for one categorical feature column.
combined_soilType, combined_cropType = (
    pd.concat([train_df[column], test_df[column]])
    for column in ('Soil Type', 'Crop Type')
)


In [None]:
# Fitting the encoders.
# The two feature encoders are fitted on the combined train + test values;
# the target encoder is fitted on the training labels only.
SoilType_encoder.fit(combined_soilType)
CropType_encoder.fit(combined_cropType)
FertilizerName_encoder.fit(train_df['Fertilizer Name'])

In [None]:
# Overwrite the categorical columns with the codes produced by their encoders.
# The two feature columns are encoded in both frames; the target column is
# encoded in the training frame only.
# NOTE(review): this mutates the frames in place, so re-running the cell on
# already-encoded columns is expected to fail — reload the data first.
for column, encoder in (
    ('Soil Type', SoilType_encoder),
    ('Crop Type', CropType_encoder),
):
    train_df[column] = encoder.transform(train_df[column])
    test_df[column] = encoder.transform(test_df[column])

train_df['Fertilizer Name'] = FertilizerName_encoder.transform(train_df['Fertilizer Name'])

In [None]:
# Preview the training data again, now that the categorical columns hold encoder codes.
train_df.head()

In [None]:
# Columns used as model inputs, in the order they appear in the feature matrix.
Feature_columns = [
    'Temparature',
    'Humidity',
    'Moisture',
    'Soil Type',
    'Crop Type',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
]

In [None]:
# Scaling numerical features: create the standardizer that is fitted on the
# training data and applied to both frames further below.
scaler = StandardScaler()


In [None]:
# Columns that get standardized; the two label-encoded columns are not in this list.
numerical_columns = [
    'Temparature',
    'Humidity',
    'Moisture',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
]

In [None]:
# Fit the scaler on the training data only, then apply that same fitted
# transformation to both the training and the test columns.
scaler.fit(train_df[numerical_columns])
train_df[numerical_columns] = scaler.transform(train_df[numerical_columns])
test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])


In [None]:
# Preparing Final Inputs and Targets
# Feature matrix and encoded target as plain NumPy arrays.
X = train_df[Feature_columns].to_numpy()
y = train_df['Fertilizer Name'].to_numpy()


In [None]:
# Test-set feature matrix (same column order as X) and the row ids
# that go into the submission file.
X_test = test_df[Feature_columns].to_numpy()
test_ids = test_df['id'].to_numpy()


In [None]:
def map_at_k(y_true, y_pred, k=3):
    """Mean average precision at ``k`` with a single true label per sample.

    A sample scores ``1 / position`` (1-based) of its true label within the
    first ``k`` predictions, and nothing when the label is not among them.

    Args:
        y_true: Sequence of true labels, one per sample.
        y_pred: Sequence of ranked predictions per sample, best guess first.
        k: Number of leading predictions taken into account.

    Returns:
        The summed per-sample scores divided by ``len(y_true)``, or ``0.0``
        when ``y_true`` is empty.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        return 0.0

    accumulated = 0.0
    for actual, ranked in zip(y_true, y_pred):
        top_k = ranked[:k]
        if actual not in top_k:
            continue
        # Position of the hit within the top-k window, counted from 1.
        position = list(top_k).index(actual) + 1
        accumulated += 1.0 / position
    return accumulated / n_samples



In [None]:
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

# Stratified split to preserve class distribution
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Regularised XGBoost model with early stopping.
# `early_stopping_rounds` is configured on the estimator: passing it to `fit()`
# was deprecated in XGBoost 1.6 and removed in 2.0, where it raises a TypeError.
# The deprecated `use_label_encoder` flag is no longer passed; the labels are
# already integer-encoded by FertilizerName_encoder.
model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(FertilizerName_encoder.classes_),
    eval_metric='mlogloss',
    max_depth=6,               # shallower trees generalize better
    learning_rate=0.05,        # slower learning
    n_estimators=2000,         # upper bound; early stopping decides when to stop
    subsample=0.8,             # row subsampling per tree
    colsample_bytree=0.8,      # random feature subset per tree
    reg_alpha=1,               # L1 regularization
    reg_lambda=1,              # L2 regularization
    early_stopping_rounds=50,  # stop after 50 rounds without mlogloss improvement
    random_state=42
)

# Train; the validation split is monitored for early stopping and the
# evaluation metric is logged every 100 rounds.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# Predict on validation: keep the three most probable classes per row, best first.
# NOTE: the same split drove early stopping, so this score is somewhat optimistic.
val_probs = model.predict_proba(X_val)
val_preds = np.argsort(-val_probs, axis=1)[:, :3]
map_score = map_at_k(y_val, val_preds)
print(f'Validation MAP@3: {map_score:.4f}')

# Predict on test
test_probs = model.predict_proba(X_test)
test_preds = np.argsort(-test_probs, axis=1)[:, :3]

# Convert predictions to fertilizer names, separated by a single space.
# NOTE(review): a single space is assumed to be the submission delimiter (the
# original used two spaces) — confirm against the sample submission file.
predictions = [
    ' '.join(FertilizerName_encoder.inverse_transform(pred))
    for pred in test_preds
]


In [None]:
# Split data. Stratified on the target so this split preserves the class
# distribution, consistent with the stratified split used for the
# early-stopping model earlier in the notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Initialize and train XGBoost model (fixed number of trees, no early stopping)
model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(FertilizerName_encoder.classes_),
    eval_metric='mlogloss',
    max_depth=8,
    learning_rate=0.1,
    n_estimators=500,
    random_state=42
)

model.fit(X_train, y_train)

# Validate model
val_probs = model.predict_proba(X_val)
val_preds = np.argsort(-val_probs, axis=1)[:, :3]  # Top-3 predictions
map_score = map_at_k(y_val, val_preds)
print(f'Validation MAP@3: {map_score:.2f}')

# Predict on test set
test_probs = model.predict_proba(X_test)
test_preds = np.argsort(-test_probs, axis=1)[:, :3]  # Top-3 predictions

# Convert predictions to fertilizer names, separated by a single space.
# NOTE(review): a single space is assumed to be the submission delimiter (the
# original used two spaces) — confirm against the sample submission file.
predictions = [
    ' '.join(FertilizerName_encoder.inverse_transform(pred))
    for pred in test_preds
]

    

In [None]:

# Initialize and train XGBoost model.
# Uses the X_train / X_val split created by the preceding cell.
model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(FertilizerName_encoder.classes_),
    eval_metric='mlogloss',
    max_depth=8,
    learning_rate=0.1,
    n_estimators=500,
    random_state=42
)
model.fit(X_train, y_train)

# Validate model
val_probs = model.predict_proba(X_val)
val_preds = np.argsort(-val_probs, axis=1)[:, :3]  # Top-3 predictions

# Score with the map_at_k helper defined earlier in the notebook. This cell
# used to redefine it, shadowing that helper with a copy that lacked the
# empty-input guard.
map_score = map_at_k(np.array(y_val), val_preds)
print(f'Validation MAP@3: {map_score:.4f}')

# Predict on test set
test_probs = model.predict_proba(X_test)
test_preds = np.argsort(-test_probs, axis=1)[:, :3]  # Top-3 predictions

# Convert predictions to fertilizer names, separated by a single space.
# NOTE(review): a single space is assumed to be the submission delimiter (the
# original used two spaces) — confirm against the sample submission file.
predictions = [
    ' '.join(FertilizerName_encoder.inverse_transform(pred))
    for pred in test_preds
]

In [None]:
# Create submission dataframe
Submission_File= pd.DataFrame({
    'id': test_ids,
    'Fertilizer Name': predictions
})

In [None]:
# Preview the first rows of the submission table before writing it out.
Submission_File.head()

In [None]:
# Write the submission to disk without the DataFrame index, then confirm.
Submission_File.to_csv(path_or_buf='Submission_File.csv', index=False)
print("Submission file created: Submission_File.csv")