<a href="https://colab.research.google.com/github/architgupta27/PortfolioOptimization/blob/main/Copy_of_indian_agricultural_productivity_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Crop Yield Prediction

#  Importing Libraries

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the dataset
# The CSV is read from the current working directory, so 'crop_yield.csv' must
# be present there (e.g. uploaded to the Colab session) before this cell runs;
# otherwise pd.read_csv raises FileNotFoundError.
df = pd.read_csv('crop_yield.csv')

# Preview the first rows to confirm the file was parsed as expected.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'crop_yield.csv'

In [None]:
df.tail()

In [None]:
print("Shape of the dataset : ",df.shape)

# Preprocessing of the dataset

In [None]:
df.isnull().sum()

In [None]:


# Count the nulls in every column and show them as a bar chart
missing_values = df.isna().sum()

plt.figure(figsize=(10, 5))
sns.barplot(
    x=missing_values.index,
    y=missing_values.to_numpy(),
    palette="viridis",
)
plt.title("Missing Values per Column")
plt.ylabel("Number of Missing Values")
plt.xticks(rotation=45)
plt.show()

In [None]:
df.info()

### * There are no null values in the dataset: it has 19689 rows and `isnull().sum()` returns 0 for every column.
### * `df.info()` confirms this: the non-null count of every column equals the number of rows in the dataset.

In [None]:
# Show the distinct values held by each column, one starred banner per column
for i in df.columns:
    distinct_values = set(df[i].tolist())
    print("******************************",i,"*********************************", end="\n\n")
    print(distinct_values, end="\n\n")

In [None]:
# Checking for duplicates record
df.duplicated().sum()

In [None]:
# Split the rows into fully duplicated records and everything else
duplicate_count = df.duplicated().sum()
unique_count = df.shape[0] - duplicate_count

# Share of unique vs. duplicate records as a pie chart
pie_sizes = [unique_count, duplicate_count]
pie_labels = ["Unique Records", "Duplicate Records"]
pie_colors = ['lightblue', 'red']

plt.figure(figsize=(6, 6))
plt.pie(pie_sizes, labels=pie_labels, autopct='%1.1f%%', colors=pie_colors)
plt.title("Duplicate Records in Dataset")
plt.show()

###  * No Duplicate records found

In [None]:
df.describe()

# Visualization

In [None]:
# Relationship between annual rainfall and yield, one point per record
sns.scatterplot(x=df['Annual_Rainfall'], y=df['Yield'])
plt.title('Yield vs Annual Rainfall')
# plt.show must be *called*; the bare name `plt.show` only echoes the function
# object as the cell output instead of rendering the figure explicitly.
plt.show()

#### * High Yield is Concentrated at Moderate Rainfall (500mm - 3000mm)
#### * Most of the high-yield points (above 5000) are in this range, suggesting moderate rainfall is optimal for good crop yield.
#### * Low Yield for Very High Rainfall (>4000mm)
#### * Almost no high-yield crops in regions with excessive rainfall.
#### * Many data points are clustered near Yield = 0.
#### * Suggesting Crop failure due to droughts, floods, poor soil conditions.

# Year wise analysis of agricultural production

In [None]:
# Count occurrences of each year
year_counts = df['Crop_Year'].value_counts().sort_index()
print(year_counts)

In [None]:
# Bar chart of how many records exist for each crop year
plt.figure(figsize=(10, 5))
sns.barplot(
    x=year_counts.index,
    y=year_counts.to_numpy(),
    palette="viridis",
)
plt.title("Number of Data Entries Per Year")
plt.xlabel("Year")
plt.ylabel("Number of Records")
plt.xticks(rotation=45)
plt.show()

In [None]:
df_year = df[df['Crop_Year']!=2020]  # As the data of 2020 is incomplete only 37 records

In [None]:
# Sum the numeric measures per crop year. numeric_only=True keeps the text
# columns (Crop, Season, State) out of the aggregation; depending on the pandas
# version they would otherwise be string-concatenated into huge, meaningless
# cells. Only numeric columns of this frame are used by the plots that follow.
year_yield = df_year.groupby('Crop_Year').sum(numeric_only=True)
year_yield

In [None]:
# Summed yield per crop year as a dashed line with highlighted markers
yield_line_style = dict(color='blue', linestyle='dashed', marker='o',
                        markersize=12, markerfacecolor='yellow')

plt.figure(figsize=(12, 5))
plt.plot(year_yield.index, year_yield['Yield'], **yield_line_style)
plt.title('Measure of Yield over the year')
plt.xlabel('Year')
plt.ylabel('Yield')
plt.show()

#### * It can be observed that the yield has increased over the years.
#### * However, after 2014 it shows a declining trend. Possible reasons include climate change and a decrease in soil fertility.

In [None]:
# Summed cultivated area per crop year
area_line_style = dict(color='blue', linestyle='dashed', marker='o',
                       markersize=12, markerfacecolor='red')

plt.figure(figsize=(12, 3))
plt.plot(year_yield.index, year_yield['Area'], **area_line_style)
plt.title('Area under cultivation over the year')
plt.xlabel('Year')
plt.ylabel('Area')
plt.show()

#### * It can be observed that the area under cultivation has increased substantially. Either with the help of fertilizer and more irrigation fallow land is now under cultivation or area under forest is used for agriculture.

In [None]:
# Summed fertilizer usage per crop year
fertilizer_line_style = dict(color='blue', linestyle='dashed', marker='o',
                             markersize=12, markerfacecolor='green')

plt.figure(figsize=(12, 3))
plt.plot(year_yield.index, year_yield['Fertilizer'], **fertilizer_line_style)
plt.title('Use of Fertilizer over the year')
plt.xlabel('Year')
plt.ylabel('Fertilizer')
plt.show()

### * The overall trend for Fertilizer in the fields shows an increasing trend.

In [None]:
# Summed pesticide usage per crop year
pesticide_line_style = dict(color='red', linestyle='dashed', marker='o',
                            markersize=12, markerfacecolor='cyan')

plt.figure(figsize=(12, 3))
plt.plot(year_yield.index, year_yield['Pesticide'], **pesticide_line_style)
plt.title('Use of Pesticide over the Year')
plt.xlabel('Year')
plt.ylabel('Pesticide')
plt.show()

### * Pesticide usage initially decreases from 1997 to around 2007.
### * From 2007 to 2010 there is a sharp decline, reaching the lowest point in 2008.
### * After 2010, pesticide usage rises significantly and keeps increasing until the end of the plotted period (2020 is excluded from this year-wise analysis).
### * The years after 2010 include periods of stabilization and slight drops, but the overall trend is upward.

# State wise analysis of agricultural production

In [None]:
# Sum the numeric measures per state and rank the states by total yield.
# numeric_only=True keeps the text columns (Crop, Season) out of the sum, and
# the chained sort replaces the in-place sort so that re-running the cell
# always rebuilds df_state from df in a single step.
df_state = (
    df.groupby('State')
    .sum(numeric_only=True)
    .sort_values(by='Yield', ascending=False)
)
df_state

In [None]:
#df_state['Region'] = ['States' for i in range(len(df_state))]

#fig = px.bar(df_state, x='Region', y = 'Yield', color=df_state.index, hover_data=['Yield'])
#fig.show()

In [None]:
# Total yield per state, using the ordering already present in df_state
plt.figure(figsize=(10, 6))
sns.barplot(x=df_state.index, y=df_state['Yield'], palette='viridis')

plt.title('Yield Across Different States')
plt.xlabel('States')
plt.ylabel('Yield')
plt.xticks(rotation=45)  # tilt the state names so they stay readable

plt.show()

### * From the above graph it can be observed that the yield of West Bengal is highest.
### * Reason can be more annual rainfall, use of fertilizers

In [None]:
# Summed annual rainfall per state as a bar chart
plt.figure(figsize=(15, 8))
sns.barplot(
    x=df_state.index,
    y=df_state['Annual_Rainfall'],
    palette='gnuplot',
)

# Title and axis labels
plt.title("Annual Rainfall Across Different States", fontsize=16, fontweight='bold')
plt.ylabel("Annual Rainfall (mm)", fontsize=14)
plt.xlabel("States", fontsize=14)

# Right-aligned, tilted state names plus a light horizontal grid
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

### * There is a significant variation in annual rainfall across different states.
### * Some states receive extremely high rainfall, while others receive much less.


In [None]:
# Rainfall per state, with the marker colour encoding the state's yield
plt.figure(figsize=(12, 5))
sns.scatterplot(
    x=df_state.index,
    y=df_state['Annual_Rainfall'],
    hue=df_state['Yield'],
    palette='rainbow',
)
plt.title('Annual Rainfall across the States')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Fertilizer use per state, with the marker colour encoding the state's yield
plt.figure(figsize=(12, 5))
sns.scatterplot(
    x=df_state.index,
    y=df_state['Fertilizer'],
    hue=df_state['Yield'],
    palette='spring',
)
plt.title('Use of Fertilizer in Different States')
plt.xticks(rotation=45)
plt.show()


### * Annual rainfall is highest in Chhattisgarh, but its yield is not the highest.
### * West Bengal has the maximum yield.
### * Uttar Pradesh, Haryana and Maharashtra use a high amount of fertilizer but their yield is not high; a possible reason is low annual rainfall.

# Season wise analysis

In [None]:
# Exclude the catch-all "Whole Year" season before comparing seasons.
# Comparing on the stripped label means the filter no longer depends on the
# exact amount of trailing whitespace in the raw Season values (the original
# literal was 'Whole Year ' with one trailing space). The Season column itself
# is left untouched, so the group labels are unchanged.
df_Seas = df[df['Season'].str.strip() != 'Whole Year']

# numeric_only=True keeps the text columns (Crop, State) out of the sum.
df_season = df_Seas.groupby('Season').sum(numeric_only=True)
df_season

In [None]:
# Total cultivated area per season as a bar chart
plt.figure(figsize=(10, 6))
sns.barplot(
    x=df_season.index,
    y=df_season['Area'],
    palette='viridis',
)

# Title and axis labels
plt.title("Total Area Covered in Different Seasons", fontsize=14)
plt.xlabel("Season", fontsize=12)
plt.ylabel("Total Area", fontsize=12)

# Tilt the season names for better visibility
plt.xticks(rotation=45)

plt.show()

### * Area under cultivation in Kharif season is highest, second is Rabi season
### * Crops in autumn, summer are not grown over large area

In [None]:
# Share of the summed yield contributed by each season
season_colors = plt.cm.Paired.colors

plt.figure(figsize=(8, 8))
plt.pie(
    df_season['Yield'],
    labels=df_season.index,
    autopct='%1.1f%%',
    colors=season_colors,
)
plt.title("Yield Distribution Across Seasons")
plt.show()




### * Yield in India is maximum in Kharif season

# Crop wise Analysis

In [None]:
# Where the Yield is zero
df_yz = df[df['Yield']==0]
df_yz.shape

In [None]:
df_yz.head()

In [None]:
# sns.catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(...) call only leaves behind an extra, empty figure and
# its figsize never applies to the plot. The plot size is controlled through
# catplot's own height/aspect arguments instead.
sns.catplot(y="State", x="Crop", data=df_yz, aspect=3, palette='inferno')
plt.xticks(rotation=45)
plt.title('States and the Crops where yield is zero')
plt.show()

In [None]:
# Keep only the records with a positive yield
df_ynz = df[df['Yield'] > 0]

# Sum the numeric measures per crop. numeric_only=True keeps the text columns
# (Season, State) out of the aggregation instead of concatenating them.
df_crop = df_ynz.groupby('Crop').sum(numeric_only=True)
df_crop

In [None]:
# Summed fertilizer usage for every crop
crop_fertilizer_style = dict(color='red', linestyle='dashed', marker='o',
                             markersize=12, markerfacecolor='cyan')

plt.figure(figsize=(25, 8))
plt.plot(df_crop.index, df_crop['Fertilizer'], **crop_fertilizer_style)
plt.title(' Use of Fertilizer in different Crops')
plt.xlabel('Crops')
plt.ylabel('Fertilizer')
plt.xticks(rotation=30)
plt.show()

### The amount of Fertilizer used is maximum in Rice Crop
### The second crop to use more fertilizer is Wheat crop

In [None]:
# Summed cultivated area for every crop
plt.figure(figsize = (25,8))
plt.plot(df_crop.index, df_crop['Area'],color='indigo', linestyle='dashed', marker='o',
        markersize=12, markerfacecolor='fuchsia')
plt.xlabel('Crops')
plt.ylabel('Area under cultivation')
# Every other plot in this section is titled; this one was missing its title.
plt.title('Area under cultivation for different Crops')
plt.xticks(rotation=30)
plt.show()

#### Area under cultivation is larger for Rice and Wheat crops

# Analysis of Wheat crop

In [None]:
# Wheat records only, renumbered from 0. Chaining reset_index on the filtered
# frame avoids mutating a slice of df in place and gives the same result every
# time the cell is re-run.
df_wheat = df[df['Crop'] == 'Wheat'].reset_index(drop=True)
df_wheat

In [None]:
# Drop 2020, as in the year-wise analysis above, then sum per crop year.
df_wheat1 = df_wheat[df_wheat['Crop_Year'] != 2020]

# numeric_only=True keeps the text columns (Crop, Season, State) out of the sum.
df_wheat_year = df_wheat1.groupby('Crop_Year').sum(numeric_only=True)
df_wheat_year

In [None]:
# Summed wheat yield per crop year
wheat_line_style = dict(color='red', linestyle='dashed', marker='o',
                        markersize=12, markerfacecolor='blue')

plt.figure(figsize=(12, 5))
plt.plot(df_wheat_year.index, df_wheat_year['Yield'], **wheat_line_style)
plt.title('Yield of Wheat Crop over the Years')
plt.xlabel('Year')
plt.ylabel('Yield')
plt.show()

### Checking the correlation in the dataset using a heatmap

In [None]:
# Correlation is computed on the numeric columns only
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Annotated heatmap of the pairwise correlations
heatmap_options = dict(annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)

plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Heatmap of Features")
plt.show()

## * From this heatmap, we can clearly see that Fertilizer and Pesticide have a high correlation of 0.95
## * Which means they are almost providing the same information to the model.
## * Including both can lead to multicollinearity, where the model struggles to decide which one is more   important.
## * To handle this, we need to either drop one of them or a regularization if we want to keep them both.

# Modelling

In [None]:
# Modelling frame: a new DataFrame without the year and the pesticide column
# (df itself is left unchanged because drop returns a new frame)
df1 = df.drop(columns=['Crop_Year', 'Pesticide'])

## Checking Ridge and Lasso Performance

In [None]:
#Checking Ridge and Lasso Performance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

df2 = df.copy()


In [None]:
# One-Hot Encoding for Categorical Variables
# Every object-dtype column of df2 is replaced by 0/1 indicator columns;
# drop_first=True omits the first level of each categorical column.
category_columns = df2.select_dtypes(include=['object']).columns  # Identify categorical columns
df2 = pd.get_dummies(df2, columns=category_columns, drop_first=True)  # Apply encoding

In [None]:
df2.head()

In [None]:
# Target is the Yield column; every other column of df2 is a feature
y2 = df2['Yield']
X2 = df2.drop(columns=['Yield'])

In [None]:
# Split the data into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)


In [None]:
 #Display the shapes of training and testing sets
print(f"X_train shape: {X_train2.shape}, X_test shape: {X_test2.shape}")
print(f"y_train shape: {y_train2.shape}, y_test shape: {y_test2.shape}")

In [None]:
# Apply Power Transformation (Yeo-Johnson method)
# The transformer is fitted on the training features only; the test features
# are then transformed with those already-fitted parameters (transform, not
# fit_transform), so the test split does not influence the preprocessing.
pt = PowerTransformer(method='yeo-johnson')
X_train_transformed2 = pt.fit_transform(X_train2)
X_test_transformed2 = pt.transform(X_test2)

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Apply Ridge Regression
ridge = Ridge(alpha=1.0)  # You can tune alpha to see the effect
ridge.fit(X_train_transformed2, y_train2)

In [None]:
# Predictions using Ridge
ridge_pred = ridge.predict(X_test_transformed2)

In [None]:
# Ridge Evaluation
ridge_mse = mean_squared_error(y_test2, ridge_pred)
ridge_r2 = r2_score(y_test2, ridge_pred)
print(f"Ridge Regression - MSE: {ridge_mse:.4f}, R2 Score: {ridge_r2:.4f}")

In [None]:
# Apply Lasso Regression
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_transformed2, y_train2)

In [None]:
# Predictions using Lasso
lasso_pred = lasso.predict(X_test_transformed2)

In [None]:
# Lasso Evaluation
lasso_mse = mean_squared_error(y_test2, lasso_pred)
lasso_r2 = r2_score(y_test2, lasso_pred)
print(f"Lasso Regression - MSE: {lasso_mse:.4f}, R2 Score: {lasso_r2:.4f}")

In [None]:
# Comparing Coefficients
# y_train2 is a 1-D Series, so ridge.coef_ already holds one value per feature.
# Indexing it with [0] selected a single scalar, which pd.Series then repeated
# for every feature name. np.ravel keeps this correct for a 1-D coef_ and also
# for a (1, n_features) coef_ that a column-vector target would produce.
print("Ridge Coefficients:")
print(pd.Series(np.ravel(ridge.coef_), index=X_train2.columns))

In [None]:
# Refit Ridge for several regularization strengths and report, for each one,
# the test-set MSE / R2 and the fitted coefficient of every feature.
alphas = [0.1, 0.5, 1.0, 2.0, 5.0]
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_transformed2, y_train2)

    # Predictions
    y_pred = ridge.predict(X_test_transformed2)

    # Metrics
    mse = mean_squared_error(y_test2, y_pred)
    r2 = r2_score(y_test2, y_pred)

    print(f"Alpha: {alpha}")
    print(f"Ridge Regression - MSE: {mse:.4f}, R2 Score: {r2:.4f}")
    print("Ridge Coefficients:")
    # coef_ is 1-D for the 1-D target y_train2; the former coef_[0] picked one
    # scalar and broadcast it to every feature. np.ravel yields one coefficient
    # per feature for both 1-D and (1, n_features) shapes.
    print(pd.Series(np.ravel(ridge.coef_), index=X_train2.columns))
    print("\n")

In [None]:
print("\nLasso Coefficients:")
print(pd.Series(lasso.coef_, index=X_train2.columns))

In [None]:
# To check the distribution of dataset
# sns.distplot is deprecated; sns.histplot with kde=True on a density scale is
# its replacement (histogram plus KDE curve). One panel per numeric feature,
# keeping the original bin counts and colours.
distribution_specs = [
    ('Area', 20, 'red'),
    ('Production', 10, 'green'),
    ('Annual_Rainfall', 10, 'blue'),
    ('Fertilizer', 10, 'black'),
]

plt.figure(figsize=(15,20))
for panel_no, (feature, n_bins, colour) in enumerate(distribution_specs, start=1):
    plt.subplot(4, 2, panel_no)
    sns.histplot(df1[feature], bins=n_bins, color=colour, kde=True, stat='density')
plt.show()

In [None]:
# Q-Q plot of the dataset
import scipy.stats as stats

# One normal Q-Q panel per numeric feature, laid out on a 4x2 grid
qq_columns = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer']

plt.figure(figsize=(15,20))
for qq_panel, qq_column in enumerate(qq_columns, start=1):
    plt.subplot(4, 2, qq_panel)
    stats.probplot(df1[qq_column], dist='norm', plot=plt)
plt.show()

### Data distribution have right skewness - to remove skewness using transformation approach
The algorithm is more likely to be biased when the data distribution is skewed

# One-Hot Encoding

In [None]:
category_columns = df1.select_dtypes(include = ['object']).columns
category_columns

In [None]:
df1 = pd.get_dummies(df1, columns = category_columns, drop_first=True)

In [None]:
df1.shape

In [None]:
df1.head()

### Split the data into dependent and independent variable

In [None]:
x = df1.drop(['Yield'], axis = 1)
y = df1[['Yield']]

In [None]:
print(x.shape)
y.shape

In [None]:
x.head()

In [None]:
y.head()

### Splitting  the data set into train and test set

In [None]:
#split the data into training and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
x_train.shape, x_test.shape, y_train.shape,y_test.shape

# Power Transformation using the method 'Yeo-Johnson'

In [None]:
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson')

# Fit the transformer on the training features only ...
x_train_transform1 = pt.fit_transform(x_train)
# ... and reuse the fitted parameters for the test features. Calling
# fit_transform here re-estimated the transformation from the test set, so
# train and test were put on different scales and pt.lambdas_ ended up
# describing the test split instead of the training split.
x_test_transform1 = pt.transform(x_test)

In [None]:
print(pt.lambdas_)

In [None]:
df_trans = pd.DataFrame(x_train_transform1, columns=x_train.columns)
df_trans.head()

## After Transformation, there is no need for Standardization of the data

In [None]:
# Distribution of the transformed training features.
# sns.distplot is deprecated; sns.histplot with kde=True on a density scale is
# its replacement (histogram plus KDE curve). Bin counts and colours are kept.
transformed_specs = [
    ('Area', 20, 'red'),
    ('Production', 10, 'green'),
    ('Annual_Rainfall', 10, 'fuchsia'),
    ('Fertilizer', 10, 'indigo'),
]

plt.figure(figsize=(15,20))
for panel_no, (feature, n_bins, colour) in enumerate(transformed_specs, start=1):
    plt.subplot(4, 2, panel_no)
    sns.histplot(df_trans[feature], bins=n_bins, color=colour, kde=True, stat='density')

plt.show()

## Viewing the Q-Q Plot after the Transformation

In [None]:
# One normal Q-Q panel per transformed feature, laid out on a 4x2 grid
transformed_qq_columns = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer']

plt.figure(figsize=(15,20))
for qq_panel, qq_column in enumerate(transformed_qq_columns, start=1):
    plt.subplot(4, 2, qq_panel)
    stats.probplot(df_trans[qq_column], dist='norm', plot=plt)

plt.show()

# Linear Regression with skewed data

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline: ordinary least squares on the untransformed (skewed) features
lr = LinearRegression()
lr.fit(x_train, y_train)

# Predict both splits first, then report their R² scores
y_pred_train = lr.predict(x_train)
y_pred_test = lr.predict(x_test)

print("Training Accuracy : ", r2_score(y_train, y_pred_train))
print("Test Accuracy : ", r2_score(y_test, y_pred_test))

In [None]:
# to store performance values value
train_accu = []  # Stores training results as (R², MSE)
test_accu = []   # Stores testing results as (R², MSE)

##  Linear Regression with Transformation Approach

In [None]:
# Refit the same estimator on the Yeo-Johnson transformed features
lr.fit(x_train_transform1, y_train)

y_pred_train_ = lr.predict(x_train_transform1)
y_pred_test_ = lr.predict(x_test_transform1)

# Score both splits once and reuse the numbers for printing and bookkeeping
r2_train_lr = r2_score(y_train, y_pred_train_)
r2_test_lr = r2_score(y_test, y_pred_test_)
mse_train_lr = mean_squared_error(y_train, y_pred_train_)
mse_test_lr = mean_squared_error(y_test, y_pred_test_)

print("Training Accuracy : ", r2_train_lr)
print()
print("Test Accuracy : ", r2_test_lr)
print("Training MSE: ", mse_train_lr)
print("Test MSE: ", mse_test_lr)

# Record the (R², MSE) pair of each split for the comparison table
train_accu.append((r2_train_lr, mse_train_lr))
test_accu.append((r2_test_lr, mse_test_lr))

## Test Accuracy has improved after 'Yeo-Johnson' Transformation

### Here it is showing no case of overfitting or underfitting

## Variance Inflation Factor

In [None]:
x1 = df_trans.copy()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One VIF per column of the transformed training features
variable = x1
vif_scores = [
    variance_inflation_factor(variable, column_position)
    for column_position in range(variable.shape[1])
]

vif = pd.DataFrame({
    'Variance Inflation Factor': vif_scores,
    'Features': x1.columns,
})

In [None]:
vif

The VIF of every independent column should be below 5; columns with a higher VIF are dropped to remove multicollinearity.

In [None]:
x2 = x1.copy()

In [None]:
# Rebuild x2 from x1 instead of dropping in place: an in-place drop raises a
# KeyError when the cell is run a second time because 'Area' is already gone.
x2 = x1.drop(columns=['Area'])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Recompute one VIF per column still present in x2
variable = x2
vif_scores = [
    variance_inflation_factor(variable, column_position)
    for column_position in range(variable.shape[1])
]

vif = pd.DataFrame({
    'Variance Inflation Factor': vif_scores,
    'Features': x2.columns,
})

In [None]:
vif

In [None]:
# Rebuild x2 from x1 without both high-VIF columns. Unlike the in-place drop,
# this gives the same result however often the cell is run (a second in-place
# drop of 'Production' raised a KeyError).
x2 = x1.drop(columns=['Area', 'Production'])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Recompute one VIF per column still present in x2
variable = x2
vif_scores = [
    variance_inflation_factor(variable, column_position)
    for column_position in range(variable.shape[1])
]

vif = pd.DataFrame({
    'Variance Inflation Factor': vif_scores,
    'Features': x2.columns,
})

In [None]:
vif

In [None]:
x2.head()

In [None]:
# Transformed test features as a labelled frame, minus the two columns that
# were removed from the training features during the VIF step
x_test1 = (
    pd.DataFrame(x_test_transform1, columns=x_test.columns)
    .drop(columns=['Area', 'Production'])
)

In [None]:
# After applying vif: refit on the reduced feature set
lr.fit(x2, y_train)

y_pred_train_ = lr.predict(x2)
y_pred_test_ = lr.predict(x_test1)

# Score both splits once and reuse the numbers for printing and bookkeeping
r2_train_lr = r2_score(y_train, y_pred_train_)
r2_test_lr = r2_score(y_test, y_pred_test_)
mse_train_lr = mean_squared_error(y_train, y_pred_train_)
mse_test_lr = mean_squared_error(y_test, y_pred_test_)

print("Training Accuracy : ", r2_train_lr)
print()
print("Test Accuracy : ", r2_test_lr)
print("Training MSE: ", mse_train_lr)
print("Test MSE: ", mse_test_lr)

# Record the (R², MSE) pair of each split for the comparison table
train_accu.append((r2_train_lr, mse_train_lr))
test_accu.append((r2_test_lr, mse_test_lr))

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A fixed seed makes the forest, and therefore every score below, reproducible
# across runs (42 matches the seed used for the train/test split).
regr = RandomForestRegressor(random_state=42)

# y_train is a single-column DataFrame; the forest expects a 1-D target, so
# flatten it explicitly instead of relying on an implicit (warned) conversion.
regr.fit(x_train_transform1, y_train.values.ravel())

y_pred_train_regr= regr.predict(x_train_transform1)
y_pred_test_regr = regr.predict(x_test_transform1)

# Calculating R² and MSE for training and test sets
r2_train_regr = r2_score(y_train, y_pred_train_regr)
r2_test_regr = r2_score(y_test, y_pred_test_regr)
mse_train_regr = mean_squared_error(y_train, y_pred_train_regr)
mse_test_regr = mean_squared_error(y_test, y_pred_test_regr)

print("Training Accuracy : ", r2_train_regr)
print("Test Accuracy : ", r2_test_regr)
print("Training MSE: ", mse_train_regr)
print("Test MSE: ", mse_test_regr)

# Storing results as (R², MSE) tuples
train_accu.append((r2_train_regr, mse_train_regr))
test_accu.append((r2_test_regr, mse_test_regr))

In [None]:
feature_importancesx = pd.Series(regr.feature_importances_, index=x_train.columns)
print(feature_importancesx)

In [None]:
# After applying vif
from sklearn.ensemble import RandomForestRegressor

# A fixed seed makes the forest, and therefore every score below, reproducible
# across runs (42 matches the seed used for the train/test split).
regr = RandomForestRegressor(random_state=42)

# y_train is a single-column DataFrame; the forest expects a 1-D target, so
# flatten it explicitly instead of relying on an implicit (warned) conversion.
regr.fit(x2, y_train.values.ravel())

y_pred_train_regr= regr.predict(x2)
y_pred_test_regr = regr.predict(x_test1)

# Calculating R² and MSE for training and test sets
r2_train_regr = r2_score(y_train, y_pred_train_regr)
r2_test_regr = r2_score(y_test, y_pred_test_regr)
mse_train_regr = mean_squared_error(y_train, y_pred_train_regr)
mse_test_regr = mean_squared_error(y_test, y_pred_test_regr)

print("Training Accuracy : ", r2_train_regr)
print("Test Accuracy : ", r2_test_regr)
print("Training MSE: ", mse_train_regr)
print("Test MSE: ", mse_test_regr)

# Storing results as (R², MSE) tuples
train_accu.append((r2_train_regr, mse_train_regr))
test_accu.append((r2_test_regr, mse_test_regr))

In [None]:
feature_importancesx = pd.Series(regr.feature_importances_, index=x2.columns)
print("\nFeature Importances:\n")
print(feature_importancesx)

# Support Vector Regressor

In [None]:
from sklearn.svm import SVR

# Support vector regression with default settings on the transformed features
svr = SVR()
svr.fit(x_train_transform1, y_train)

y_pred_train_svr = svr.predict(x_train_transform1)
y_pred_test_svr = svr.predict(x_test_transform1)

# Score both splits once and reuse the numbers for printing and bookkeeping
r2_train_svr = r2_score(y_train, y_pred_train_svr)
r2_test_svr = r2_score(y_test, y_pred_test_svr)
mse_train = mean_squared_error(y_train, y_pred_train_svr)
mse_test = mean_squared_error(y_test, y_pred_test_svr)

print("Training Accuracy : ", r2_train_svr)
print("Test Accuracy : ", r2_test_svr)
print("Training MSE: ", mse_train)
print("Test MSE: ", mse_test)

# Record the (R², MSE) pair of each split for the comparison table
train_accu.append((r2_train_svr, mse_train))
test_accu.append((r2_test_svr, mse_test))

In [None]:
# After applying vif: same model on the reduced feature set
from sklearn.svm import SVR

svr = SVR()
svr.fit(x2, y_train)

y_pred_train_svr = svr.predict(x2)
y_pred_test_svr = svr.predict(x_test1)

# Score both splits once and reuse the numbers for printing and bookkeeping
r2_train_svr = r2_score(y_train, y_pred_train_svr)
r2_test_svr = r2_score(y_test, y_pred_test_svr)
mse_train = mean_squared_error(y_train, y_pred_train_svr)
mse_test = mean_squared_error(y_test, y_pred_test_svr)

print("Training Accuracy : ", r2_train_svr)
print("Test Accuracy : ", r2_test_svr)
print("Training MSE: ", mse_train)
print("Test MSE: ", mse_test)

# Record the (R², MSE) pair of each split for the comparison table
train_accu.append((r2_train_svr, mse_train))
test_accu.append((r2_test_svr, mse_test))

# CatBoostRegressor

In [None]:
from catboost import CatBoostRegressor

# Gradient-boosted trees on the transformed features
cat = CatBoostRegressor(learning_rate=0.15)
cat.fit(x_train_transform1, y_train)

y_pred_train_cat = cat.predict(x_train_transform1)
y_pred_test_cat = cat.predict(x_test_transform1)

# Score both splits once and reuse the numbers for printing and bookkeeping
r2_train_cat = r2_score(y_train, y_pred_train_cat)
r2_test_cat = r2_score(y_test, y_pred_test_cat)
mse_train_cat = mean_squared_error(y_train, y_pred_train_cat)
mse_test_cat = mean_squared_error(y_test, y_pred_test_cat)

print("Training Accuracy : ", r2_train_cat)
print()
print("Test Accuracy : ", r2_test_cat)
print("Training MSE: ", mse_train_cat)
print("Test MSE: ", mse_test_cat)

# Record the (R², MSE) pair of each split for the comparison table
train_accu.append((r2_train_cat, mse_train_cat))
test_accu.append((r2_test_cat, mse_test_cat))

In [None]:
feature_importance = cat.get_feature_importance()
print(feature_importance)

In [None]:
# After applying vif: same model on the reduced feature set
from catboost import CatBoostRegressor

cat = CatBoostRegressor(learning_rate=0.15)
cat.fit(x2, y_train)

y_pred_train_cat = cat.predict(x2)
y_pred_test_cat = cat.predict(x_test1)

# Score both splits once and reuse the numbers for printing and bookkeeping
r2_train_cat = r2_score(y_train, y_pred_train_cat)
r2_test_cat = r2_score(y_test, y_pred_test_cat)
mse_train_cat = mean_squared_error(y_train, y_pred_train_cat)
mse_test_cat = mean_squared_error(y_test, y_pred_test_cat)

print("Training Accuracy : ", r2_train_cat)
print()
print("Test Accuracy : ", r2_test_cat)
print("Training MSE: ", mse_train_cat)
print("Test MSE: ", mse_test_cat)

# Record the (R², MSE) pair of each split for the comparison table
train_accu.append((r2_train_cat, mse_train_cat))
test_accu.append((r2_test_cat, mse_test_cat))

In [None]:

# Importing BaggingRegressor

from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score

# Bagging ensemble of 10 estimators that uses the CatBoost model `cat` as the
# base estimator.
# NOTE(review): `cat` was already fitted above (on x2 in the preceding cell),
# but scikit-learn ensembles are expected to clone the base estimator and fit
# fresh copies, so only its hyper-parameters, not the earlier fit, should be
# reused here - confirm against the BaggingRegressor documentation.
bagging_model = BaggingRegressor(estimator=cat, n_estimators=10, random_state=42)
bagging_model.fit(x_train_transform1, y_train)

y_pred_train_bag = bagging_model.predict(x_train_transform1)
y_pred_test_bag = bagging_model.predict(x_test_transform1)

# Calculating MSE for training and test sets
mse_train_bag = mean_squared_error(y_train, y_pred_train_bag)
mse_test_bag = mean_squared_error(y_test, y_pred_test_bag)
# Printing accuracy results for Bagging Regressor
print("\nBagging Regressor Training Accuracy : ", r2_score(y_train, y_pred_train_bag))
print("Bagging Regressor Test Accuracy : ", r2_score(y_test, y_pred_test_bag))
print("Bagging Regressor Training MSE: ", mse_train_bag)
print("Bagging Regressor Test MSE: ", mse_test_bag)


# Storing results in train_accu and test_accu lists as (R², MSE) tuples
train_accu.append((r2_score(y_train, y_pred_train_bag), mse_train_bag))
test_accu.append((r2_score(y_test, y_pred_test_bag), mse_test_bag))



In [None]:
# Importing BaggingRegressor
# After applying VIF: same ensemble on the reduced feature set (x2 / x_test1)
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score

# Bagging ensemble of 10 estimators that uses the CatBoost model `cat` as the
# base estimator.
# NOTE(review): `cat` was already fitted above, but scikit-learn ensembles are
# expected to clone the base estimator and fit fresh copies, so only its
# hyper-parameters, not the earlier fit, should be reused here - confirm
# against the BaggingRegressor documentation.
bagging_model = BaggingRegressor(estimator=cat, n_estimators=10, random_state=42)
bagging_model.fit(x2, y_train)

y_pred_train_bag = bagging_model.predict(x2)
y_pred_test_bag = bagging_model.predict(x_test1)

# Calculating MSE for training and test sets
mse_train = mean_squared_error(y_train, y_pred_train_bag)
mse_test = mean_squared_error(y_test, y_pred_test_bag)

# Printing accuracy results for Bagging Regressor
print("\nBagging Regressor Training Accuracy : ", r2_score(y_train, y_pred_train_bag))
print("Bagging Regressor Test Accuracy : ", r2_score(y_test, y_pred_test_bag))
print("Bagging Regressor Training MSE: ", mse_train)
print("Bagging Regressor Test MSE: ", mse_test)

# Storing results in train_accu and test_accu lists as (R², MSE) tuples
train_accu.append((r2_score(y_train, y_pred_train_bag), mse_train))
test_accu.append((r2_score(y_test, y_pred_test_bag), mse_test))





# Comparison of the models

In [None]:
# Row labels, one per stored result, in the order the results were appended
algorithm = [
    'LinearRegression', 'LRvif',
    'RandomForestRegressor', 'RFRvif',
    'SupportVectorRegressor', 'SVRvif',
    'CatBoostRegressor', 'CBRvif',
    'BaggingRegressor', 'BRvif',
]

# Unpack the stored (R², MSE) tuples into one column per split and metric
accu_data = {}
for split_name, split_results in (('Training', train_accu), ('Test', test_accu)):
    accu_data[f'{split_name} Accuracy (R²)'] = [pair[0] for pair in split_results]
    accu_data[f'{split_name} MSE'] = [pair[1] for pair in split_results]

# Comparison table, indexed by model name
model = pd.DataFrame(accu_data, index=algorithm)
model

# Conclusion

* Machine learning algorithms can be used to predict crop yield in different states.
* The main challenge is obtaining an authentic dataset.