# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#for preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder , LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split , GridSearchCV

#for evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score , accuracy_score , classification_report,silhouette_score , precision_score ,f1_score ,recall_score


#models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from sklearn.svm import SVC


import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")




# 2.Load dataset

In [2]:
crop_df=pd.read_csv(r'C:\Users\Ahmed Ashraf\Desktop\archive (3)\Crop Production data.csv')
crop_df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,ANDAMAN AND NICOBAR,NICOBAR ISLANDS,2000,Kharif,Arecanut,1254.0,2000.0
1,ANDAMAN AND NICOBAR,NICOBAR ISLANDS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,ANDAMAN AND NICOBAR,NICOBAR ISLANDS,2000,Kharif,Rice,102.0,321.0
3,ANDAMAN AND NICOBAR,NICOBAR ISLANDS,2000,Whole Year,Banana,176.0,641.0
4,ANDAMAN AND NICOBAR,NICOBAR ISLANDS,2000,Whole Year,Cashewnut,720.0,165.0


In [3]:
crop_df.isna().sum()

State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [4]:
crop_df['Production'].fillna(crop_df['Production'].mean() , inplace=True)

In [5]:
crop_df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [6]:
crop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  object 
 1   District_Name  246091 non-null  object 
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  object 
 4   Crop           246091 non-null  object 
 5   Area           246091 non-null  float64
 6   Production     246091 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.1+ MB


# 3.Data Preprocessing

In [7]:
crop_df.drop(['District_Name'] , axis=1 , inplace=True)

In [8]:
label_encoder = LabelEncoder()

# Apply label encoding on categorical columns
crop_df['State_Name_Label'] = label_encoder.fit_transform(crop_df['State_Name'])
crop_df['Season_Label'] = label_encoder.fit_transform(crop_df['Season'])
crop_df['Crop_Label'] = label_encoder.fit_transform(crop_df['Crop'])

crop_df.head()

Unnamed: 0,State_Name,Crop_Year,Season,Crop,Area,Production,State_Name_Label,Season_Label,Crop_Label
0,ANDAMAN AND NICOBAR,2000,Kharif,Arecanut,1254.0,2000.0,0,2,2
1,ANDAMAN AND NICOBAR,2000,Kharif,Other Kharif pulses,2.0,1.0,0,2,74
2,ANDAMAN AND NICOBAR,2000,Kharif,Rice,102.0,321.0,0,2,97
3,ANDAMAN AND NICOBAR,2000,Whole Year,Banana,176.0,641.0,0,6,7
4,ANDAMAN AND NICOBAR,2000,Whole Year,Cashewnut,720.0,165.0,0,6,22


In [9]:
crop_df.drop(columns=['State_Name' , 'Season' , 'Crop'] , axis=1 , inplace=True)

In [10]:
crop_df

Unnamed: 0,Crop_Year,Area,Production,State_Name_Label,Season_Label,Crop_Label
0,2000,1254.0,2000.0,0,2,2
1,2000,2.0,1.0,0,2,74
2,2000,102.0,321.0,0,2,97
3,2000,176.0,641.0,0,6,7
4,2000,720.0,165.0,0,6,22
...,...,...,...,...,...,...
246086,2014,306.0,801.0,33,5,97
246087,2014,627.0,463.0,33,5,104
246088,2014,324.0,16250.0,33,6,108
246089,2014,279151.0,597899.0,33,7,97


# 5.Split Data into train & test

In [11]:
x = crop_df.drop("Production", axis=1)
y = crop_df["Production"]
y = y.astype(int)


In [12]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2 , random_state=42)

# 6.Model Building Using SVC

In [14]:
xgb=XGBRegressor()
xgb.fit(x_train , y_train)


In [15]:
y_pred=xgb.predict(x_test)

In [16]:
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("R2-score: ", f"{(r2_score(y_test, y_pred) * 100):0.1f}", "%")


MAE:  155331.72240197365
MSE:  12389053661211.94
R2-score:  92.2 %
