**Crop yield Prediction made by Synesis**

In [1]:
#Importing Libraries
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#Loading dataset with the help of pandas
data = pd.read_csv('crop_production.csv')

# Add District_Name and Season columns to X dataframe
X = data[['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area']]
Y = data['Production']

# Feature Engineering - Adding new features
X['Days'] = data['Crop_Year'].apply(lambda x: 365 if x % 4 == 0 else 364) # Number of days in the crop year
X['Rainfall'] = data['District_Name'].apply(lambda x: 1200 if x == 'CHAMARAJANAGAR' else 1000) # Average rainfall in the district

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test  = train_test_split(X, Y, test_size=0.2, random_state=0)

# Define preprocessing pipelines for categorical and numerical features
num_transform = Pipeline([('imputer', SimpleImputer(strategy="median")), ('scaler', StandardScaler())])
num_cols = ['Crop_Year', 'Area', 'Days', 'Rainfall']

cat_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])
cat_cols = ['State_Name', 'District_Name', 'Crop', 'Season']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transform, cat_cols),
        ('num', num_transform, num_cols)
    ])

from sklearn.impute import SimpleImputer

# Fit an imputer on Y_train
imputer = SimpleImputer(strategy='mean')
Y_train = Y_train.to_numpy().reshape(-1, 1)
imputer.fit(Y_train)

# # Transform Y_train using the fitted imputer
Y_train = imputer.transform(Y_train).ravel()


# Fit the imputer on Y_train and transform Y_train
Y_train = imputer.fit_transform(Y_train.reshape(-1, 1)).ravel()


# Fit the preprocessing pipeline on training data
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [2]:
data.describe()

Unnamed: 0,Crop_Year,Area,Production
count,246091.0,246091.0,242361.0
mean,2005.643018,12002.82,582503.4
std,4.952164,50523.4,17065810.0
min,1997.0,0.04,0.0
25%,2002.0,80.0,88.0
50%,2006.0,582.0,729.0
75%,2010.0,4392.0,7023.0
max,2015.0,8580100.0,1250800000.0


In [3]:
data.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [4]:
data.isnull().sum()

State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [5]:
# Train a linear regression model on preprocessed data
reg = LinearRegression()
reg.fit(X_train_prep, Y_train)

In [6]:
Y_pred_lr = reg.predict(X_test_prep) #This code predicts the production for multiple preprocessed test data points.
Y_pred_lr

array([-102362.48349955,   60753.76992959,  615862.18802123, ...,
        -77376.23616661,  -85600.5692978 ,  -66198.5771995 ])

In [None]:
# User input for Linear Regression
# Save the model to a file
pickle.dump(reg, open('model.pkl', 'wb'))

# Load the saved model from a file
model = pickle.load(open('model.pkl', 'rb'))

# User Input
state_name = input("Enter State Name: ")
district_name = input("Enter District Name: ")
crop_year = int(input("Enter Crop Year: "))
season = input("Enter Season: ")
crop = input("Enter Crop: ")
area = float(input("Enter Area: "))

# Create a dictionary with the user input
user_data = {'State_Name': state_name,
             'District_Name': district_name,
             'Crop_Year': crop_year,
             'Season': season,
             'Crop': crop,
             'Area': area}

# Convert the dictionary to a DataFrame and add it to the original data
user_df = pd.DataFrame(user_data, index=[0])
X = pd.concat([X, user_df], ignore_index=True)

# Create input data as a list of lists
input_data = [[state_name, district_name, crop_year, season, crop, area]]

# Create input dataframe from input data
input_df = pd.DataFrame(input_data, columns=['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area'])


# Preprocess the input data and make a prediction
X_user_prep = preprocessor.transform(X.tail(1))
Y_pred = model.predict(X_user_prep)

# Print the predicted production
print(Y_pred[0])   #This code predicts the production for a single preprocessed input data point,

In [8]:
yield_predict_test=reg.predict(X_test_prep)
yield_predict_train=reg.predict(X_train_prep)
pd.DataFrame({'actual unseen data':Y_train,'predicted unseen data':yield_predict_train})

Unnamed: 0,actual unseen data,predicted unseen data
0,452.3,-3.611419e+05
1,57.0,-2.471965e+06
2,84.0,2.746120e+05
3,554.0,-4.719013e+05
4,262672.0,1.371767e+06
...,...,...
196867,2370.0,-4.970623e+05
196868,0.0,-3.223619e+05
196869,73.0,-8.190529e+05
196870,7.0,4.577194e+05


In [9]:
X_train_prep.shape

(196872, 812)

In [10]:
X_test_prep.shape

(49219, 812)

In [11]:
# Fill missing values with mean
Y_test = Y_test.fillna(Y_test.mean())

In [12]:
#Score of Linear Regression
from sklearn.model_selection import cross_val_score, cross_val_predict
scores = cross_val_score(reg, X_test_prep, Y_test, cv=5)
scores

array([0.08908784, 0.13184428, 0.14647504, 0.1632821 , 0.1797225 ])

In [13]:
#Accuracy of Linear Regression
from sklearn import metrics
predictions = cross_val_predict(reg, X_test_prep, Y_test, cv=5)
accuracy = metrics.r2_score(Y_test, Y_pred_lr)
accuracy

0.16362582003011394

In [14]:
print('MAE= ',metrics.mean_absolute_error(Y_test,yield_predict_test))
print('MSE= ',metrics.mean_squared_error(Y_test,yield_predict_test))
print('R2 value= ',reg.score(X_test_prep,Y_test))
print('Adjusted R2 value= ',1 - (1 - (reg.score(X_test_prep,Y_test))) * ((756 - 1)/(756-10-1)))
print('RMSE (train)= ',np.sqrt(mean_squared_error(Y_train, yield_predict_train)))
print('RMSE (test)= ',np.sqrt(mean_squared_error(Y_test,yield_predict_test)))

MAE=  1958215.8292060671
MSE=  303228303894634.8
R2 value=  0.16362582003011394
Adjusted R2 value=  0.15239932097011544
RMSE (train)=  14977855.715494357
RMSE (test)=  17413451.808720604
