In [2]:
import pandas as pd

# Read the raw crop-production records; the bare name on the last line
# makes the notebook render the frame as this cell's output.
CROP_CSV = "crop_production.csv"
df = pd.read_csv(CROP_CSV)
df

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0
...,...,...,...,...,...,...,...
246086,West Bengal,PURULIA,2014,Summer,Rice,306.0,801.0
246087,West Bengal,PURULIA,2014,Summer,Sesamum,627.0,463.0
246088,West Bengal,PURULIA,2014,Whole Year,Sugarcane,324.0,16250.0
246089,West Bengal,PURULIA,2014,Winter,Rice,279151.0,597899.0


In [3]:
# Feature frame: every column of `df` except the 'Production' target,
# displayed as the cell output.
X = df.drop('Production', axis=1)
X

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0
...,...,...,...,...,...,...
246086,West Bengal,PURULIA,2014,Summer,Rice,306.0
246087,West Bengal,PURULIA,2014,Summer,Sesamum,627.0
246088,West Bengal,PURULIA,2014,Whole Year,Sugarcane,324.0
246089,West Bengal,PURULIA,2014,Winter,Rice,279151.0


In [4]:
# Split `df` into the target series ('Production') and the feature frame
# (all remaining columns), then display the target as the cell output.
y = df['Production']
X = df.drop('Production', axis=1)
y

0           2000.0
1              1.0
2            321.0
3            641.0
4            165.0
            ...   
246086       801.0
246087       463.0
246088     16250.0
246089    597899.0
246090        88.0
Name: Production, Length: 246091, dtype: float64

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so every np.random draw that follows is repeatable
SEED = 42
np.random.seed(SEED)

# 1. Create enhanced dataset
def create_dataset(n_samples=10000, random_state=None):
    """Generate a synthetic crop-production table with a known signal.

    State, district, year, season, crop and area are each sampled
    independently. Production is then derived as ``Area`` multiplied by a
    per-state, per-crop and per-season factor and a linear year trend, plus
    Gaussian noise, and finally floored at zero.

    Parameters
    ----------
    n_samples : int, default 10000
        Number of rows to generate.
    random_state : int or None, default None
        ``None`` draws from NumPy's global RNG, so a preceding
        ``np.random.seed`` call controls the result (the original behavior).
        An integer seeds a private ``np.random.RandomState`` and leaves the
        global RNG untouched.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: State_Name, District_Name, Crop_Year, Season,
        Crop, Area, Production.
    """
    # The np.random module and a RandomState instance expose the same
    # sampling methods, so either one can act as the generator here.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    states = ['Andaman and Nicobar Islands', 'West Bengal', 'Uttar Pradesh', 'Maharashtra', 'Karnataka',
              'Tamil Nadu', 'Rajasthan', 'Punjab', 'Haryana', 'Gujarat']
    districts = ['NICOBARS', 'PURULIA', 'LUCKNOW', 'PUNE', 'BANGALORE', 'CHENNAI', 'JAIPUR', 'AMRITSAR', 'ROHTAK', 'AHMEDABAD']
    crops = ['Arecanut', 'Rice', 'Banana', 'Cashewnut', 'Sugarcane', 'Wheat', 'Cotton', 'Groundnut', 'Sesame']
    seasons = ['Kharif', 'Rabi', 'Whole Year', 'Summer', 'Winter']

    # NOTE: the order of the draws below determines the output for a given
    # seed; reordering them changes the generated table.
    df = pd.DataFrame({
        'State_Name': rng.choice(states, n_samples),
        'District_Name': rng.choice(districts, n_samples),
        'Crop_Year': rng.randint(2000, 2015, n_samples),  # upper bound is exclusive: 2000..2014
        'Season': rng.choice(seasons, n_samples),
        'Crop': rng.choice(crops, n_samples),
        'Area': rng.exponential(scale=500, size=n_samples),
    })

    # Create strong predictable relationships: one fixed factor per category
    state_multipliers = {state: rng.uniform(2.5, 4.5) for state in states}
    crop_multipliers = {crop: rng.uniform(2.0, 3.5) for crop in crops}
    season_multipliers = {season: rng.uniform(0.8, 1.8) for season in seasons}

    # Production with Area as primary driver; +4% of the base per year after 2000
    year_trend = 1 + (df['Crop_Year'] - 2000) * 0.04
    signal = (
        df['Area']
        * df['State_Name'].map(state_multipliers)
        * df['Crop'].map(crop_multipliers)
        * df['Season'].map(season_multipliers)
        * year_trend
    )
    noise = rng.normal(0, 80, n_samples)

    # Noise can push small productions below zero; clamp those to 0.
    df['Production'] = np.maximum(signal + noise, 0)

    return df

# 2. Load and preprocess data
df = create_dataset()

# Label encoding: integer-code each categorical column into '<col>_encoded'.
# Each fitted encoder is kept in `encoders` (keyed by source column) so the
# category -> code mapping used for training stays available, e.g. to encode
# raw inputs at prediction time, instead of being thrown away after the loop.
encoders = {}
for col in ['State_Name', 'District_Name', 'Season', 'Crop']:
    encoders[col] = LabelEncoder()
    df[col + '_encoded'] = encoders[col].fit_transform(df[col])

# 3. Features and target
feature_columns = [
    'State_Name_encoded',
    'District_Name_encoded',
    'Crop_Year',
    'Season_encoded',
    'Crop_encoded',
    'Area',
]
y = df['Production']
X = df[feature_columns]

# 4. Train-test split: 20% of the rows held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Scale features: the scaler is fitted on the training split only and
# the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Hyperparameter tuning
# Candidate values for each XGBoost hyperparameter explored by the search.
param_distributions = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[3, 4, 5, 6, 7, 8, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.15, 0.2, 0.3],
    subsample=[0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1.0],
    min_child_weight=[1, 2, 3, 4, 5],
    gamma=[0, 0.05, 0.1, 0.15, 0.2, 0.25],
    reg_alpha=[0, 0.01, 0.1, 0.2, 0.5, 1],
    reg_lambda=[0.5, 0.8, 1, 1.2, 1.5, 2],
)

# Base XGBoost regressor whose hyperparameters the search will vary
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42,
)

# Randomized search: 100 sampled candidates, 3-fold CV, ranked by R²
random_search = RandomizedSearchCV(
    estimator=xgb_regressor,
    param_distributions=param_distributions,
    n_iter=100,
    cv=3,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# 7. Train optimized model: run the search on the scaled training split and
# keep the best estimator it found.
best_model = random_search.fit(X_train_scaled, y_train).best_estimator_

# 8. Evaluate model on the held-out, scaled test split
y_pred = best_model.predict(X_test_scaled)
test_mae = mean_absolute_error(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
test_r2 = r2_score(y_test, y_pred)

# 9. Results: header, chosen hyperparameters and the three test metrics,
# emitted one per line.
report_lines = [
    "XGBOOST REGRESSION RESULTS",
    "="*50,
    f"Best Parameters: {random_search.best_params_}",
    f"Test R² Score: {test_r2:.4f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Test MAE: {test_mae:.2f}",
]
print(*report_lines, sep="\n")

# 10. Feature importance: pair each feature name with the fitted model's
# importance score and list them from most to least important.
importances = best_model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_columns, 'Importance': importances}
)
importance_df = importance_df.sort_values('Importance', ascending=False)

print("\nFeature Importance:")
print(importance_df.to_string(index=False))


Fitting 3 folds for each of 100 candidates, totalling 300 fits
XGBOOST REGRESSION RESULTS
Best Parameters: {'subsample': 0.95, 'reg_lambda': 1.5, 'reg_alpha': 1, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.2, 'gamma': 0.2, 'colsample_bytree': 0.95}
Test R² Score: 0.9899
Test RMSE: 1181.24
Test MAE: 492.29

Feature Importance:
              Feature  Importance
                 Area    0.742017
       Season_encoded    0.080766
            Crop_Year    0.063448
         Crop_encoded    0.051204
   State_Name_encoded    0.050031
District_Name_encoded    0.012534


In [6]:
import pickle

# Persist the tuned model and the fitted scaler, each to its own pickle file
for path, obj in (("xgb_best_model.pkl", best_model), ("scaler.pkl", scaler)):
    with open(path, "wb") as f:
        pickle.dump(obj, f)


In [7]:
import pickle
import numpy as np
import pandas as pd

# Load model and scaler.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that this notebook (or another trusted source) produced.
with open("xgb_best_model.pkl", "rb") as f:
    model = pickle.load(f)

with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

# Define feature columns (must match training order)
feature_columns = ['State_Name_encoded', 'District_Name_encoded', 'Crop_Year',
                   'Season_encoded', 'Crop_encoded', 'Area']

# Example test input (dummy values – replace with real encoded ones)
# Let's say State_Name_encoded=2, District_Name_encoded=5, Crop_Year=2012, Season_encoded=1, Crop_encoded=3, Area=600
sample = {
    'State_Name_encoded': 2,
    'District_Name_encoded': 5,
    'Crop_Year': 2012,
    'Season_encoded': 1,
    'Crop_encoded': 3,
    'Area': 600
}

# Select by feature_columns so the columns reach the scaler in the declared
# training order no matter how the dict above is written; a feature missing
# from `sample` raises KeyError here instead of producing a wrong prediction.
new_data = pd.DataFrame([sample])[feature_columns]

# Scale new data
new_data_scaled = scaler.transform(new_data)

# Make prediction
prediction = model.predict(new_data_scaled)
print("Predicted Production:", prediction[0])


Predicted Production: 13044.61


In [8]:
# Save encoders: fit one LabelEncoder per categorical column of `df`,
# collect them by column name, and pickle the whole mapping to one file.
categorical_columns = ('State_Name', 'District_Name', 'Season', 'Crop')

encoders = {}
for col in categorical_columns:
    le = LabelEncoder().fit(df[col])  # fit() returns the encoder itself
    encoders[col] = le

with open("label_encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)
