**Crop yield Prediction made by Synesis**

In [12]:
# Importing Libraries
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Loading dataset with the help of pandas
data = pd.read_csv('crop_production.csv')

# Add District_Name and Season columns to X dataframe
X = data[['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area']]
Y = data['Production']

# Feature Engineering - Adding new features
X['Days'] = data['Crop_Year'].apply(lambda x: 365 if x % 4 == 0 else 364) # Number of days in the crop year
X['Rainfall'] = data['District_Name'].apply(lambda x: 1200 if x == 'CHAMARAJANAGAR' else 1000) # Average rainfall in the district

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test  = train_test_split(X, Y, test_size=0.2, random_state=0)

# Define preprocessing pipelines for categorical and numerical features
num_transform = Pipeline([('imputer', SimpleImputer(strategy="median")), ('scaler', StandardScaler())])
num_cols = ['Crop_Year', 'Area', 'Days', 'Rainfall']

cat_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])
cat_cols = ['State_Name', 'District_Name', 'Crop', 'Season']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transform, cat_cols),
        ('num', num_transform, num_cols)
    ])

from sklearn.impute import SimpleImputer

# Fit an imputer on Y_train
imputer = SimpleImputer(strategy='mean')
Y_train = Y_train.to_numpy().reshape(-1, 1)
imputer.fit(Y_train)

# # Transform Y_train using the fitted imputer
Y_train = imputer.transform(Y_train).ravel()


# Fit the imputer on Y_train and transform Y_train
Y_train = imputer.fit_transform(Y_train.reshape(-1, 1)).ravel()


# Fit the preprocessing pipeline on training data
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [13]:
# Train a Ridge regression model on preprocessed data
rid = Ridge(alpha=1.0)
rid.fit(X_train_prep, Y_train)

In [14]:
Y_pred_rid = rid.predict(X_test_prep)
Y_pred_rid

array([-103282.6860718 ,   60712.58090679,  612596.9281836 , ...,
        -75700.97332499,  -85861.03935436,  -66593.39920874])

In [None]:
# User input for Ridge Regression model
# Save the model to a file
pickle.dump(rid, open('model.pkl', 'wb'))

# Load the saved model from a file
model = pickle.load(open('model.pkl', 'rb'))

# User Input
state_name = input("Enter State Name: ")
district_name = input("Enter District Name: ")
crop_year = int(input("Enter Crop Year: "))
season = input("Enter Season: ")
crop = input("Enter Crop: ")
area = float(input("Enter Area: "))

# Create a dictionary with the user input
user_data = {'State_Name': state_name,
             'District_Name': district_name,
             'Crop_Year': crop_year,
             'Season': season,
             'Crop': crop,
             'Area': area}

# Convert the dictionary to a DataFrame and add it to the original data
user_df = pd.DataFrame(user_data, index=[0])
X = pd.concat([X, user_df], ignore_index=True)

# Create input data as a list of lists
input_data = [[state_name, district_name, crop_year, season, crop, area]]

# Create input dataframe from input data
input_df = pd.DataFrame(input_data, columns=['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area'])


# Preprocess the input data and make a prediction
X_user_prep = preprocessor.transform(X.tail(1))
Y_pred = model.predict(X_user_prep)

# Print the predicted production
print(Y_pred[0])   #This code predicts the production for a single preprocessed input data point,

In [4]:
yield_predict_test=rid.predict(X_test_prep)
yield_predict_train=rid.predict(X_train_prep)
pd.DataFrame({'actual unseen data':Y_train,'predicted unseen data':yield_predict_train})

Unnamed: 0,actual unseen data,predicted unseen data
0,452.3,-3.608809e+05
1,57.0,-2.467798e+06
2,84.0,2.763047e+05
3,554.0,-4.716176e+05
4,262672.0,1.370311e+06
...,...,...
196867,2370.0,-4.945942e+05
196868,0.0,-3.282908e+05
196869,73.0,-8.158972e+05
196870,7.0,4.560384e+05


In [10]:
X_train_prep.shape

(196872, 812)

In [11]:
X_test_prep.shape

(49219, 812)

In [9]:
# Fill missing values with mean
Y_test = Y_test.fillna(Y_test.mean())

In [6]:
#Score of Ridge Regression model
from sklearn.model_selection import cross_val_score, cross_val_predict
scores = cross_val_score(rid, X_test_prep, Y_test, cv=5)
scores

array([0.09073586, 0.13329386, 0.14724946, 0.16452351, 0.17917925])

In [7]:
#Accuracy of Ridge Regression model
from sklearn import metrics
predictions = cross_val_predict(rid, X_test_prep, Y_test, cv=5)
accuracy = metrics.r2_score(Y_test, Y_pred_rid)
accuracy

0.16357450669968243

In [13]:
print('MAE= ',metrics.mean_absolute_error(Y_test,yield_predict_test))
print('MSE= ',metrics.mean_squared_error(Y_test,yield_predict_test))
print('R2 value= ',rid.score(X_test_prep,Y_test))
print('Adjusted R2 value= ',1 - (1 - (rid.score(X_test_prep,Y_test))) * ((756 - 1)/(756-10-1)))
print('RMSE (train)= ',np.sqrt(mean_squared_error(Y_train, yield_predict_train)))
print('RMSE (test)= ',np.sqrt(mean_squared_error(Y_test,yield_predict_test)))

MAE=  1956485.0944905952
MSE=  303246907594422.06
R2 value=  0.16357450669968243
Adjusted R2 value=  0.15234731887014796
RMSE (train)=  14977864.317055078
RMSE (test)=  17413985.976634473
