In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import xgboost as xgb
import numpy as np
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Load the dataset (relative path, resolved against the current working directory).
data = pd.read_csv('tamil_nadu_production.csv')

# 1. Data Preprocessing
# Keep only rows with a strictly positive Area and Production, then drop every
# row that still has a missing value in any column.
data = data[(data['Area'] > 0) & (data['Production'] > 0)].dropna()

# 2. One-Hot Encoding of Categorical Variables and Normalization of Numerical Variables
categorical_columns = ['District_Name', 'Season']
numeric_columns = ['Crop_Year', 'Area', 'Production']

onehotencoder = OneHotEncoder()
scaler = StandardScaler()
labelencoder = LabelEncoder()

# Categorical columns are one-hot encoded and numeric columns are standardized.
# Any column not listed in either group would be passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', onehotencoder, categorical_columns),
        ('num', scaler, numeric_columns),
    ],
    remainder='passthrough',
)

# Define the features (X) and target (y)
X = data[['District_Name', 'Crop_Year', 'Season', 'Area', 'Production']]
y = data['Crop']

# Fit the preprocessor and the label encoder on the entire dataset
# (no train/test split is made anywhere in this script).
X = preprocessor.fit_transform(X)
y = labelencoder.fit_transform(y)

# The fitted encoder already holds the distinct crop labels, so there is no
# need to re-sort the whole encoded target just to count them.
num_classes = len(labelencoder.classes_)

# 3. Model Training
model = xgb.XGBClassifier(objective='multi:softprob', num_class=num_classes)
model.fit(X, y)

# 4. Model Evaluation
# NOTE(review): these predictions are made on the same rows the model was
# trained on, so any metric computed from them will be optimistic. `y_pred` is
# only consumed by the (currently disabled) classification report below.
y_pred = model.predict(X)

# Print the classification report
#print(classification_report(y, y_pred))

# 5. Make Predictions for Crop Recommendation
new_data = pd.DataFrame({
    'District_Name': ['VIRUDHUNAGAR'],
    'Crop_Year': [2024],
    'Season': ['Whole Year'],
    'Area': [100],
    'Production': [500],
})

# With the default OneHotEncoder settings, transform raises a ValueError if the
# district or season was not present in the data the preprocessor was fitted on.
new_data_encoded = preprocessor.transform(new_data)

predicted_probabilities = model.predict_proba(new_data_encoded)

# Column i of the probability row belongs to encoded label i, and
# labelencoder.classes_[i] is the crop name for that label, so the two can be
# paired directly instead of calling inverse_transform once per class.
crop_probabilities = list(
    zip(labelencoder.classes_, predicted_probabilities[0] * 100)
)

# Highest probability first; ties keep their original (label) order.
sorted_crop_probabilities = sorted(
    crop_probabilities, key=lambda pair: pair[1], reverse=True
)

# Print the top 5 crop probabilities
for crop_name, prob in sorted_crop_probabilities[:5]:
    print(f"{prob:.1f}% probability for {crop_name}")

38.0% probability for Small millets
13.5% probability for Turmeric
13.0% probability for Arecanut
8.1% probability for Gram
6.5% probability for Ragi
