### Import Essential Libraries

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

### Import Dataset

In [13]:
data = pd.read_csv("dataset/Cleaned_Crop_Production_Data.csv")
data.sample(10)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
71848,Himachal Pradesh,SOLAN,2007,Kharif,Sugarcane,156.0,2254.0
68701,Haryana,ROHTAK,1999,Kharif,Sesamum,135.0,0.0
42439,Bihar,SARAN,1998,Whole Year,Sweet potato,68.0,735.0
14663,Assam,CHIRANG,2012,Rabi,Black pepper,17.0,29.0
144326,Odisha,ANUGUL,2002,Whole Year,Onion,2300.0,24610.0
171608,Rajasthan,RAJSAMAND,2006,Kharif,Arhar/Tur,19.0,14.0
185331,Tamil Nadu,TIRUVANNAMALAI,1997,Whole Year,Total foodgrain,183554.0,435800.0
146941,Odisha,CUTTACK,2000,Whole Year,Potato,1069.0,14967.1
74081,Jharkhand,GARHWA,2010,Rabi,Onion,245.0,1365.0
30099,Bihar,BHOJPUR,2005,Kharif,Urad,29.0,25.0


### Check NULL Values

In [14]:
data.isnull().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

### Percentage of Production

In [15]:
sum_maxp = data["Production"].sum()
data["percent_of_production"] = data["Production"].map(lambda x:(x/sum_maxp)*100)

In [16]:
data[:5]

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,percent_of_production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0,1.41667e-06
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0,7.083351e-10
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0,2.273756e-07
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0,4.540428e-07
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0,1.168753e-07


### Drop Unnecessary Columns

In [17]:
data1 = data.drop(["District_Name"],axis=1)

### One Hot Encoding

In [18]:
data_dum = pd.get_dummies(data1, dtype=int)
data_dum[:5]

Unnamed: 0,Crop_Year,Area,Production,percent_of_production,State_Name_Andaman and Nicobar Islands,State_Name_Andhra Pradesh,State_Name_Arunachal Pradesh,State_Name_Assam,State_Name_Bihar,State_Name_Chandigarh,...,Crop_Turmeric,Crop_Turnip,Crop_Urad,Crop_Varagu,Crop_Water Melon,Crop_Wheat,Crop_Yam,Crop_other fibres,Crop_other misc. pulses,Crop_other oilseeds
0,2000,1254.0,2000.0,1.41667e-06,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,2.0,1.0,7.083351e-10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2000,102.0,321.0,2.273756e-07,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2000,176.0,641.0,4.540428e-07,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,720.0,165.0,1.168753e-07,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Split Dataset

In [19]:
x = data_dum.drop("Production",axis=1)
y = data_dum[["Production"]]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33, random_state=42)

print("x_train :",x_train.shape)
print("x_test :",x_test.shape)
print("y_train :",y_train.shape)
print("y_test :",y_test.shape)

x_train : (162381, 166)
x_test : (79980, 166)
y_train : (162381, 1)
y_test : (79980, 1)


In [20]:
x_train[:5]

Unnamed: 0,Crop_Year,Area,percent_of_production,State_Name_Andaman and Nicobar Islands,State_Name_Andhra Pradesh,State_Name_Arunachal Pradesh,State_Name_Assam,State_Name_Bihar,State_Name_Chandigarh,State_Name_Chhattisgarh,...,Crop_Turmeric,Crop_Turnip,Crop_Urad,Crop_Varagu,Crop_Water Melon,Crop_Wheat,Crop_Yam,Crop_other fibres,Crop_other misc. pulses,Crop_other oilseeds
4805,2001,785.0,1.486087e-06,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20552,2008,1250.0,5.829598e-07,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
236555,2013,2.0,1.452087e-07,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20590,2009,60.0,4.646678e-07,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
155164,2004,605.0,6.729184e-08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### XGB Model

In [21]:
xgbr = xgb.XGBRegressor(verbosity=0) 
xgbr.fit(x_train,y_train)

preds = xgbr.predict(x_test)

In [22]:
mean_squared_error(y_test,preds)

np.float64(14240867412939.752)

In [23]:
r2_score(y_test,preds)

0.9555647373199463

### Decision Tree Model

In [24]:
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(x_train,y_train)

preds = regressor.predict(x_test)


In [25]:
mean_squared_error(y_test,preds)

np.float64(378537510326.2657)

In [26]:
r2_score(y_test,preds)

0.9988188624755748

#### **Here we noticed that, Decision Tree model predicted more accurate than others. So, we can select Decosion Tree for prediction.**