In [1]:
import json
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Read the crop-production CSV into a DataFrame and preview its first rows.
DATA_PATH = 'crop_production.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [2]:
# Print column dtypes and non-null counts, then show summary statistics for
# every column (include='all' covers the categorical columns as well).
df.info()
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  object 
 1   District_Name  246091 non-null  object 
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  object 
 4   Crop           246091 non-null  object 
 5   Area           246091 non-null  float64
 6   Production     242361 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.1+ MB


Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
count,246091,246091,246091.0,246091,246091,246091.0,242361.0
unique,33,646,,6,124,,
top,Uttar Pradesh,BIJAPUR,,Kharif,Rice,,
freq,33306,945,,95951,15104,,
mean,,,2005.643018,,,12002.82,582503.4
std,,,4.952164,,,50523.4,17065810.0
min,,,1997.0,,,0.04,0.0
25%,,,2002.0,,,80.0,88.0
50%,,,2006.0,,,582.0,729.0
75%,,,2010.0,,,4392.0,7023.0


In [3]:
# Keep only fully populated rows. According to the df.info() output above,
# Production is the only column with missing values.
df = df.dropna(axis=0, how='any')

In [4]:
# Re-check row count and non-null counts after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 242361 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     242361 non-null  object 
 1   District_Name  242361 non-null  object 
 2   Crop_Year      242361 non-null  int64  
 3   Season         242361 non-null  object 
 4   Crop           242361 non-null  object 
 5   Area           242361 non-null  float64
 6   Production     242361 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 14.8+ MB


In [5]:
# Preview the first rows of the frame after the missing-value cleanup.
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [6]:
# District_Name is not used as a model feature, so remove it from the frame.
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op instead of raising KeyError because the column is already gone.
df = df.drop(columns='District_Name', errors='ignore')

In [7]:
# Confirm that District_Name is no longer among the columns.
df.head()

Unnamed: 0,State_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,2000,Whole Year,Cashewnut,720.0,165.0


In [8]:
# List the distinct Season labels. Note that the raw values are padded with
# trailing spaces (e.g. 'Kharif     '), as the output below shows.
df['Season'].unique()

array(['Kharif     ', 'Whole Year ', 'Autumn     ', 'Rabi       ',
       'Summer     ', 'Winter     '], dtype=object)

In [9]:
# Disabled variant kept as a bare string literal: nothing inside it is executed,
# the notebook merely echoes the string as this cell's output. The code it holds
# would label-encode State_Name/Season/Crop and pickle the fitted LabelEncoder
# objects to "label_encoders_new.pkl". The next cell performs the same encoding
# but exports the mappings as JSON instead.
'''import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle


# Categorical columns
cat_cols = ["State_Name", "Season", "Crop"]

# Dictionary to store fitted encoders
encoders = {}

# Encode each categorical column
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Save encoders
with open("label_encoders_new.pkl", "wb") as f:
    pickle.dump(encoders, f)

print("Encoders saved successfully!")'''


'import pandas as pd\nfrom sklearn.preprocessing import LabelEncoder\nimport pickle\n\n\n# Categorical columns\ncat_cols = ["State_Name", "Season", "Crop"]\n\n# Dictionary to store fitted encoders\nencoders = {}\n\n# Encode each categorical column\nfor col in cat_cols:\n    le = LabelEncoder()\n    df[col] = le.fit_transform(df[col])\n    encoders[col] = le\n\n# Save encoders\nwith open("label_encoders_new.pkl", "wb") as f:\n    pickle.dump(encoders, f)\n\nprint("Encoders saved successfully!")'

In [10]:
# Request the first 20 rows for a broader look at the still-unencoded frame.
df.head(20)

Unnamed: 0,State_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,2000,Whole Year,Cashewnut,720.0,165.0
5,Andaman and Nicobar Islands,2000,Whole Year,Coconut,18168.0,65100000.0
6,Andaman and Nicobar Islands,2000,Whole Year,Dry ginger,36.0,100.0
7,Andaman and Nicobar Islands,2000,Whole Year,Sugarcane,1.0,2.0
8,Andaman and Nicobar Islands,2000,Whole Year,Sweet potato,5.0,15.0
9,Andaman and Nicobar Islands,2000,Whole Year,Tapioca,40.0,169.0


In [11]:
# List the distinct state labels. Some carry a trailing space
# (e.g. 'Telangana ', 'Jammu and Kashmir '), as the output below shows.
df['State_Name'].unique()

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadra and Nagar Haveli', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir ', 'Jharkhand',
       'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry',
       'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana ',
       'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype=object)

In [12]:
# List the distinct crop labels. Spacing is inconsistent in the raw data
# (e.g. 'Coconut ' has a trailing space, 'Other  Rabi pulses' a double space).
df['Crop'].unique()

array(['Arecanut', 'Other Kharif pulses', 'Rice', 'Banana', 'Cashewnut',
       'Coconut ', 'Dry ginger', 'Sugarcane', 'Sweet potato', 'Tapioca',
       'Black pepper', 'Dry chillies', 'other oilseeds', 'Turmeric',
       'Maize', 'Moong(Green Gram)', 'Urad', 'Arhar/Tur', 'Groundnut',
       'Sunflower', 'Bajra', 'Castor seed', 'Cotton(lint)', 'Horse-gram',
       'Jowar', 'Korra', 'Ragi', 'Tobacco', 'Gram', 'Wheat', 'Masoor',
       'Sesamum', 'Linseed', 'Safflower', 'Onion', 'other misc. pulses',
       'Samai', 'Small millets', 'Coriander', 'Potato',
       'Other  Rabi pulses', 'Soyabean', 'Beans & Mutter(Vegetable)',
       'Bhindi', 'Brinjal', 'Citrus Fruit', 'Cucumber', 'Grapes', 'Mango',
       'Orange', 'other fibres', 'Other Fresh Fruits', 'Other Vegetables',
       'Papaya', 'Pome Fruit', 'Tomato', 'Mesta', 'Cowpea(Lobia)',
       'Lemon', 'Pome Granet', 'Sapota', 'Cabbage', 'Rapeseed &Mustard',
       'Peas  (vegetable)', 'Niger seed', 'Bottle Gourd', 'Varagu',
       'Garl

In [13]:
# Label-encode the categorical feature columns in place and export the
# label -> integer-code mappings to JSON so the same codes can be reproduced
# outside this notebook (e.g. when preparing inputs for the saved model).
cat_cols = ["State_Name", "Season", "Crop"]

# Guard against re-running this cell on an already-encoded frame. Without it,
# a second execution would fit the encoders on the integer codes and then fail
# inside json.dump (numpy integer dict keys are not JSON-serialisable) *after*
# label_encoders.json had already been opened for writing, destroying the
# previously saved mapping. Failing here leaves both `df` and the file untouched.
already_encoded = [col for col in cat_cols if pd.api.types.is_numeric_dtype(df[col])]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already label-encoded; "
        "re-run the notebook from the data-loading cell before executing this cell again."
    )

# NOTE(review): the raw labels contain stray whitespace (e.g. 'Kharif     ',
# 'Telangana ', 'Coconut '). They are encoded verbatim, so any lookup against
# label_encoders.json must use the exact raw strings — consider normalising
# the labels upstream if consumers are expected to pass trimmed values.

# Column name -> {"classes": [labels in code order], "mapping": {label: code}}
encoder_mappings = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    # classes_ is sorted; a label's position in it is its integer code.
    classes = le.classes_.tolist()
    encoder_mappings[col] = {
        "classes": classes,
        "mapping": {label: code for code, label in enumerate(classes)},
    }

# Persist the mappings as human-readable JSON.
with open("label_encoders.json", "w") as f:
    json.dump(encoder_mappings, f, indent=4)

print("Encoders saved to label_encoders.json")
print(df.head())


Encoders saved to label_encoders.json
   State_Name  Crop_Year  Season  Crop    Area  Production
0           0       2000       1     2  1254.0      2000.0
1           0       2000       1    74     2.0         1.0
2           0       2000       1    95   102.0       321.0
3           0       2000       4     7   176.0       641.0
4           0       2000       4    22   720.0       165.0


In [14]:
# Verify that State_Name, Season and Crop now hold integer codes.
df.head()

Unnamed: 0,State_Name,Crop_Year,Season,Crop,Area,Production
0,0,2000,1,2,1254.0,2000.0
1,0,2000,1,74,2.0,1.0
2,0,2000,1,95,102.0,321.0
3,0,2000,4,7,176.0,641.0
4,0,2000,4,22,720.0,165.0


In [15]:
# Separate the regression target from the feature matrix.
target_col = 'Production'
y = df[target_col]
X = df.drop(columns=[target_col])

In [16]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Fit a random-forest regressor that predicts Production from the encoded
# features. random_state is pinned (matching the seed used for the train/test
# split) so that the fitted forest, the reported metrics and the pickled model
# are reproducible on a fresh Restart & Run All; without it every run trains a
# different forest.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [18]:
# Predict Production for the held-out test rows.
preds = model.predict(X_test)

In [19]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the held-out test set.
r = r2_score(y_test,preds)
print("R2score when we predict using Randomn forest is ",r)

R2score when we predict using Randomn forest is  0.9240228984174578


In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the held-out predictions with three standard regression metrics.
mse, mae, r2 = (
    metric(y_test, preds)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report each metric rounded to four decimal places.
for label, value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R-squared:", r2),
):
    print(label, round(value, 4))


Mean Squared Error: 27538208338875.926
Mean Absolute Error: 145404.2153
R-squared: 0.924


In [21]:
import pickle

# Serialise the fitted regressor to disk so it can be loaded without retraining.
with open("rf_reg_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)