# Crop Production Classification
This notebook builds a classification model using the crop production dataset.
Target variable: **Crop**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Read the raw crop production records from the CSV (path is relative to the
# working directory) and preview the first rows to confirm the columns parsed.
DATA_PATH = 'crop_production.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [2]:
# Check dataset info
# df.info() prints per-column non-null counts and dtypes, which is where
# missing values show up.
df.info()
# Summary statistics for every column (include='all' also covers the
# non-numeric columns). As the last expression in the cell, this table is
# what gets displayed.
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  object 
 1   District_Name  246091 non-null  object 
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  object 
 4   Crop           246091 non-null  object 
 5   Area           246091 non-null  float64
 6   Production     242361 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.1+ MB


Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
count,246091,246091,246091.0,246091,246091,246091.0,242361.0
unique,33,646,,6,124,,
top,Uttar Pradesh,BIJAPUR,,Kharif,Rice,,
freq,33306,945,,95951,15104,,
mean,,,2005.643018,,,12002.82,582503.4
std,,,4.952164,,,50523.4,17065810.0
min,,,1997.0,,,0.04,0.0
25%,,,2002.0,,,80.0,88.0
50%,,,2006.0,,,582.0,729.0
75%,,,2010.0,,,4392.0,7023.0


In [3]:
# Remove every ROW that has a missing value in any column (no columns are
# dropped by this step).
# NOTE(review): df.info() reports nulls only in Production, and Production is
# excluded from the features below, so these rows are discarded on account of
# a column the model never sees — confirm this is intended.
df = df.dropna()

# Target is the crop label; features are all remaining columns except the
# label itself and Production.
y = df['Crop']
X = df.drop(columns=['Crop', 'Production'])

# Split the feature columns by dtype so each group can get its own transformer.
categorical_features = list(X.select_dtypes(include=['object']).columns)
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

categorical_features, numeric_features

(['State_Name', 'District_Name', 'Season'], ['Crop_Year', 'Area'])

In [4]:
# Number of feature rows left after rows with missing values were dropped
len(X)

242361

In [5]:
# Display the feature frame (pandas elides the middle rows in the rendered output)
X

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Area
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,1254.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,2.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,102.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,176.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,720.0
...,...,...,...,...,...
246086,West Bengal,PURULIA,2014,Summer,306.0
246087,West Bengal,PURULIA,2014,Summer,627.0
246088,West Bengal,PURULIA,2014,Whole Year,324.0
246089,West Bengal,PURULIA,2014,Winter,279151.0


In [6]:
# Target length; X and y are taken from the same frame, so this should equal len(X)
len(y)

242361

In [7]:
# Display the target labels (crop names); the Series keeps the row index of
# the original frame, so index values are not contiguous after dropna()
y

0                    Arecanut
1         Other Kharif pulses
2                        Rice
3                      Banana
4                   Cashewnut
                 ...         
246086                   Rice
246087                Sesamum
246088              Sugarcane
246089                   Rice
246090                Sesamum
Name: Crop, Length: 242361, dtype: object

In [8]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

# (name, transformer, columns) triples; the output columns follow this order.
column_steps = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)



In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is not stratified on the crop label — confirm rare
# crops are adequately represented in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit the preprocessor only on training data
# (encoder categories and scaler statistics are learned from X_train alone,
# so nothing from the held-out split influences the preprocessing).
X_train_prep = preprocessor.fit_transform(X_train)

# Transform the test data
# with the already-fitted preprocessor; no refitting happens here.
X_test_prep = preprocessor.transform(X_test)

# Both matrices must have the same number of columns for the model to accept them.
print("Shape of preprocessed training data:", X_train_prep.shape)
print("Shape of preprocessed test data:", X_test_prep.shape)


Shape of preprocessed training data: (193888, 687)
Shape of preprocessed test data: (48473, 687)


In [11]:
# Get feature names after transformation
# The preprocessor is already fitted on X_train and both splits are already
# transformed earlier in the notebook, so X_train_prep / X_test_prep are
# reused here rather than refitting.
ohe_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
# Column order follows the ColumnTransformer: one-hot columns first, numeric last.
all_features = list(ohe_features) + numeric_features

# Convert to DataFrame
# Keep the data sparse: the one-hot block is almost entirely zeros, and
# .toarray() would materialise a dense 193888 x 687 float64 array (over 1 GB)
# for the training split alone.
X_train_prep_df = pd.DataFrame.sparse.from_spmatrix(X_train_prep, columns=all_features)
X_test_prep_df = pd.DataFrame.sparse.from_spmatrix(X_test_prep, columns=all_features)

X_train_prep_df.head()

Unnamed: 0,State_Name_Andaman and Nicobar Islands,State_Name_Andhra Pradesh,State_Name_Arunachal Pradesh,State_Name_Assam,State_Name_Bihar,State_Name_Chandigarh,State_Name_Chhattisgarh,State_Name_Dadra and Nagar Haveli,State_Name_Goa,State_Name_Gujarat,...,District_Name_YAVATMAL,District_Name_ZUNHEBOTO,Season_Autumn,Season_Kharif,Season_Rabi,Season_Summer,Season_Whole Year,Season_Winter,Crop_Year,Area
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.478309,0.601632
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.478309,1.500874
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.328728,-0.23162
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.934006,1.40962
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.881828,-0.227118


In [12]:
# The preprocessor is already fitted on X_train and both splits are already
# transformed earlier in the notebook; fitting again here only repeated that
# work. Check that the existing matrices line up with their targets instead.
assert X_train_prep.shape[0] == len(y_train), "train features/labels are misaligned"
assert X_test_prep.shape[0] == len(y_test), "test features/labels are misaligned"
assert X_train_prep.shape[1] == X_test_prep.shape[1], "train/test feature widths differ"


In [13]:
# Inspect the transformed training matrix; its repr reports the sparse format,
# dtype, stored-element count and shape
X_train_prep

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 969440 stored elements and shape (193888, 687)>

In [None]:
# Baseline random forest on the preprocessed features.
# RandomForestClassifier and accuracy_score are already imported in the first
# cell, so they are not re-imported here.
# random_state pins the forest's randomness so the reported accuracy is
# reproducible between runs (same seed as the train/test split); n_jobs=-1
# trains the trees on all available cores and only affects speed.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train_prep, y_train)

# Score on the held-out split, using the matrix produced by the fitted preprocessor.
y_pred = rf.predict(X_test_prep)
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Optional tuned configuration, kept switchable rather than parked inside a
# string literal so it stays syntax-checked and is not echoed as cell output.
# Set the flag to True to replace the baseline forest and its predictions.
# RandomForestClassifier and accuracy_score come from the first cell's imports.
RUN_TUNED_FOREST = False

if RUN_TUNED_FOREST:
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=25,
        random_state=42,
        n_jobs=-1,
    )
    rf.fit(X_train_prep, y_train)
    # Predict on the preprocessed test matrix: the model is fitted on
    # X_train_prep, so the raw X_test frame (which still holds string columns)
    # is not a valid input.
    y_pred = rf.predict(X_test_prep)
    print("Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
# Confusion matrix of held-out predictions, drawn as a heatmap.
# Requires y_test and y_pred from the split and model cells, so those must
# have been run in this kernel first.
import matplotlib.pyplot as plt
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)

# Explicit figure/axes instead of the implicit pyplot state; cell annotations
# stay off because of the large number of classes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=False, cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

NameError: name 'y_test' is not defined