**Crop yield Prediction made by Synesis**

In [1]:
# Importing Libraries
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Loading dataset with the help of pandas
data = pd.read_csv('crop_production.csv')

# Add District_Name and Season columns to X dataframe
X = data[['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area']]
Y = data['Production']

# Feature Engineering - Adding new features
X['Days'] = data['Crop_Year'].apply(lambda x: 365 if x % 4 == 0 else 364) # Number of days in the crop year
X['Rainfall'] = data['District_Name'].apply(lambda x: 1200 if x == 'CHAMARAJANAGAR' else 1000) # Average rainfall in the district

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test  = train_test_split(X, Y, test_size=0.2, random_state=0)

# Define preprocessing pipelines for categorical and numerical features
num_transform = Pipeline([('imputer', SimpleImputer(strategy="median")), ('scaler', StandardScaler())])
num_cols = ['Crop_Year', 'Area', 'Days', 'Rainfall']

cat_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])
cat_cols = ['State_Name', 'District_Name', 'Crop', 'Season']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transform, cat_cols),# 3 parameters used one for name , the transformer object itself and list of columns to be transformed
        ('num', num_transform, num_cols)
    ])

from sklearn.impute import SimpleImputer

# Fit an imputer on Y_train
imputer = SimpleImputer(strategy='mean')
Y_train = Y_train.to_numpy().reshape(-1, 1)#reshape to  have a single column so as to ensure correct shape for simpleimputer
imputer.fit(Y_train)

# # Transform Y_train using the fitted imputer
Y_train = imputer.transform(Y_train).ravel()#used ravel for converting from 2D array to 1D


# Fit the imputer on Y_train and transform Y_train
Y_train = imputer.fit_transform(Y_train.reshape(-1, 1)).ravel()


# Fit the preprocessing pipeline on training data
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [2]:
# Train a Random Forest Regressor model
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)
rf.fit(X_train_prep, Y_train)

In [3]:
Y_pred_rf = rf.predict(X_test_prep)
Y_pred_rf

array([ 5803.88206276, 20541.31380738,  5803.88206276, ...,
       15849.84804056,  5961.20970493,  5803.88206276])

In [None]:
# User input for Random Forest Regressor 
# Save the model to a file
pickle.dump(rf, open('model.pkl', 'wb'))

# Load the saved model from a file
model = pickle.load(open('model.pkl', 'rb'))

# User Input
state_name = input("Enter State Name: ")
district_name = input("Enter District Name: ")
crop_year = int(input("Enter Crop Year: "))
season = input("Enter Season: ")
crop = input("Enter Crop: ")
area = float(input("Enter Area: "))

# Create a dictionary with the user input
user_data = {'State_Name': state_name,
             'District_Name': district_name,
             'Crop_Year': crop_year,
             'Season': season,
             'Crop': crop,
             'Area': area}

# Convert the dictionary to a DataFrame and add it to the original data
user_df = pd.DataFrame(user_data, index=[0])
X = pd.concat([X, user_df], ignore_index=True)

# Create input data as a list of lists
input_data = [[state_name, district_name, crop_year, season, crop, area]]

# Create input dataframe from input data
input_df = pd.DataFrame(input_data, columns=['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area'])


# Preprocess the input data and make a prediction
X_user_prep = preprocessor.transform(X.tail(1))
Y_pred = model.predict(X_user_prep)

# Print the predicted production
print(Y_pred[0])   #This code predicts the production for a single preprocessed input data point,

In [4]:
yield_predict_test=rf.predict(X_test_prep)
yield_predict_train=rf.predict(X_train_prep)
pd.DataFrame({'actual unseen data':Y_train,'predicted unseen data':yield_predict_train})

Unnamed: 0,actual unseen data,predicted unseen data
0,452.3,5803.882063
1,57.0,5803.882063
2,84.0,5803.882063
3,554.0,5803.882063
4,262672.0,389012.155222
...,...,...
196867,2370.0,5803.882063
196868,0.0,5803.882063
196869,73.0,5961.209705
196870,7.0,5803.882063


In [5]:
X_train_prep.shape

(196872, 812)

In [6]:
X_test_prep.shape

(49219, 812)

In [7]:
# Fill missing values with mean
Y_test = Y_test.fillna(Y_test.mean())

In [8]:
#Score of Random Forest Regressor
from sklearn.model_selection import cross_val_score, cross_val_predict
scores = cross_val_score(rf, X_test_prep, Y_test, cv=5)
scores

array([0.74054551, 0.91970289, 0.72372018, 0.87999845, 0.94980465])

In [9]:
#Accuracy of Random Forest Regressor
from sklearn import metrics
predictions = cross_val_predict(rf, X_test_prep, Y_test, cv=5)
accuracy = metrics.r2_score(Y_test, Y_pred_rf)
accuracy

0.9770320643443572

In [10]:
print('MAE= ',metrics.mean_absolute_error(Y_test,yield_predict_test))
print('MSE= ',metrics.mean_squared_error(Y_test,yield_predict_test))
print('R2 value= ',rf.score(X_test_prep,Y_test))
print('Adjusted R2 value= ',1 - (1 - (rf.score(X_test_prep,Y_test))) * ((756 - 1)/(756-10-1)))
print('RMSE (train)= ',np.sqrt(mean_squared_error(Y_train, yield_predict_train)))
print('RMSE (test)= ',np.sqrt(mean_squared_error(Y_test,yield_predict_test)))

MAE=  125196.08114270696
MSE=  8327048275297.592
R2 value=  0.9770320643443572
Adjusted R2 value=  0.9767237699060264
RMSE (train)=  2206006.2942277608
RMSE (test)=  2885662.5366278696
