**Crop yield Prediction made by Synesis**

In [22]:
# Importing libraries
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Loading dataset with help of pandas
data = pd.read_csv('crop_production.csv')


# Add District_Name and Season columns to X dataframe
X = data[['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area']]
Y = data['Production']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test  = train_test_split(X, Y, test_size=0.2, random_state=0)

# Define preprocessing pipelines for categorical and numerical features
num_transform = Pipeline([('imputer', SimpleImputer(strategy="median")), ('scaler', StandardScaler())])
num_cols = ['Crop_Year', 'Area']

cat_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])
cat_cols = ['State_Name', 'District_Name', 'Crop', 'Season']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transform, cat_cols),
        ('num', num_transform, num_cols)
    ])

from sklearn.impute import SimpleImputer

# Fit an imputer on Y_train
imputer = SimpleImputer(strategy='mean')
Y_train = Y_train.to_numpy().reshape(-1, 1)
imputer.fit(Y_train)

# # Transform Y_train using the fitted imputer
Y_train = imputer.transform(Y_train).ravel()


# Fit the imputer on Y_train and transform Y_train
Y_train = imputer.fit_transform(Y_train.reshape(-1, 1)).ravel()


# Fit the preprocessing pipeline on training data
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [23]:
from sklearn import ensemble
yield_predict = ensemble.GradientBoostingRegressor(n_estimators = 100, max_depth = 5, min_samples_split = 2)
yield_predict.fit(X_train_prep, Y_train)

In [None]:
# User input for Gradient Boosting Regressor
# Save the model to a file
pickle.dump(yield_predict, open('model.pkl', 'wb'))

# Load the saved model from a file
model = pickle.load(open('model.pkl', 'rb'))

# User Input
state_name = input("Enter State Name: ")
district_name = input("Enter District Name: ")
crop_year = int(input("Enter Crop Year: "))
season = input("Enter Season: ")
crop = input("Enter Crop: ")
area = float(input("Enter Area: "))

# Create a dictionary with the user input
user_data = {'State_Name': state_name,
             'District_Name': district_name,
             'Crop_Year': crop_year,
             'Season': season,
             'Crop': crop,
             'Area': area}

# Convert the dictionary to a DataFrame and add it to the original data
user_df = pd.DataFrame(user_data, index=[0])
X = pd.concat([X, user_df], ignore_index=True)

# Create input data as a list of lists
input_data = [[state_name, district_name, crop_year, season, crop, area]]

# Create input dataframe from input data
input_df = pd.DataFrame(input_data, columns=['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area'])


# Preprocess the input data and make a prediction
X_user_prep = preprocessor.transform(X.tail(1))
Y_pred = model.predict(X_user_prep)

# Print the predicted production
print(Y_pred[0])   #This code predicts the production for a single preprocessed input data point,

In [30]:
yield_predict_test=yield_predict.predict(X_test_prep)
yield_predict_train=yield_predict.predict(X_train_prep)
pd.DataFrame({'actual unseen data':Y_train,'predicted unseen data':yield_predict_train})

Unnamed: 0,actual unseen data,predicted unseen data
0,452.3,7185.863962
1,57.0,7185.863962
2,84.0,3268.332884
3,554.0,7185.863962
4,262672.0,313940.610049
...,...,...
196867,2370.0,8231.220702
196868,0.0,3268.332884
196869,73.0,3268.332884
196870,7.0,7185.863962


In [40]:
X_train_prep.shape

(196872, 810)

In [42]:
X_test_prep.shape

(49219, 810)

In [29]:
# Fill missing values with mean
Y_test = Y_test.fillna(Y_test.mean())

In [31]:
# Score of Gradient Boosting Regressor
from sklearn.model_selection import cross_val_score, cross_val_predict
scores = cross_val_score(yield_predict, X_test_prep, Y_test, cv=5)
scores

array([0.73389105, 0.84503592, 0.69263004, 0.89001007, 0.97746456])

In [32]:
#Accuracy of Gradient Boosting Regressor
from sklearn import metrics
predictions = cross_val_predict(yield_predict, X_test_prep, Y_test, cv=5)
accuracy = metrics.r2_score(Y_test, predictions)
accuracy

0.84594552283178

In [33]:
print('MAE= ',metrics.mean_absolute_error(Y_test,yield_predict_test))
print('MSE= ',metrics.mean_squared_error(Y_test,yield_predict_test))
print('R2 value= ',yield_predict.score(X_test_prep,Y_test))
print('Adjusted R2 value= ',1 - (1 - (yield_predict.score(X_test_prep,Y_test))) * ((756 - 1)/(756-10-1)))
print('RMSE (train)= ',np.sqrt(mean_squared_error(Y_train, yield_predict_train)))
print('RMSE (test)= ',np.sqrt(mean_squared_error(Y_test,yield_predict_test)))

MAE=  123916.16893739709
MSE=  6466506692870.09
R2 value=  0.982163870710439
Adjusted R2 value=  0.9819244595790355
RMSE (train)=  1653589.6350388438
RMSE (test)=  2542932.6953087235
