In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

1. Data Exploration and Cleaning

In [None]:
# Load the crop-yield dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='crop_yield.csv')
df.head(5)


In [None]:
# Summary statistics (count, mean, min, max, ...) for the dataset.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fraction of missing values in each column (mean of the boolean null mask).
df.isna().mean()

In [None]:
# Mean of every numeric column.
# numeric_only=True is required because the frame also holds text columns
# (e.g. State); without it recent pandas versions raise a TypeError.
df.mean(numeric_only=True)

In [None]:
# Median of every numeric column.
# numeric_only=True is required because the frame also holds text columns
# (e.g. State); without it recent pandas versions raise a TypeError.
df.median(numeric_only=True)

In [None]:
# Most frequent value(s) of the Crop column.
df.loc[:, 'Crop'].mode()

In [None]:
# Standard deviation of the Area column.
df.loc[:, 'Area'].std()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated(keep='first').sum()

2. Data Visualization and EDA

In [None]:
# Scatter plot: crop yield on the x-axis against annual rainfall on the y-axis.
plt.scatter(
    x=df['Yield'],
    y=df['Annual_Rainfall'],
    color='Green',
    marker='o',
)
plt.title('Crop_yield vs Annual_Rainfall graph')
plt.xlabel('Crop_Yield')
plt.ylabel('Annual_Rainfall')
plt.show()

In [None]:
# Scatter plot: crop year on the x-axis against annual rainfall on the y-axis.
plt.scatter(df['Crop_Year'], df['Annual_Rainfall'], color='Red', marker='o')
plt.ylabel('Annual_Rainfall')
plt.xlabel('Crop_Year')
# The title now names the variable actually plotted on the x-axis (Crop_Year, not yield).
plt.title('Crop_Year vs Annual_Rainfall graph')
plt.show()

In [None]:
# Bar plot of Production for each Season.
sns.barplot(data=df, x='Season', y='Production', palette='viridis')
plt.xlabel('Season')
plt.ylabel('Production')
plt.title('season vs production Graph')
plt.show()

In [None]:
# Bar plot of Yield for each Season.
sns.barplot(data=df, x='Season', y='Yield', palette='mako')
plt.xlabel('Season')
plt.ylabel('Yield')
plt.title('season vs Yield Graph')
plt.show()

In [None]:
# Horizontal bar plot of Annual_Rainfall for each State.
sns.barplot(x=df['Annual_Rainfall'], y=df['State'], data=df, palette='Oranges')
# The title now matches the plotted variables (previously it said "season vs Yield").
plt.title('Annual_Rainfall vs State Graph')
plt.xlabel('Annual_Rainfall')
plt.ylabel('State')
plt.show()

In [None]:
# Pairwise correlation between the numeric columns.
# Text columns (e.g. State) are filtered out first, because calling corr() on
# them raises an error in recent pandas versions.
df.select_dtypes(include='number').corr()

In [None]:
# Correlation heatmap for the selected columns.
# Non-numeric columns in the selection (State holds state names; Season is
# presumably text as well) cannot be correlated directly, so they are dropped
# before calling corr() instead of letting recent pandas versions raise.
corr = (
    df[['Yield', 'Annual_Rainfall', 'State', 'Season']]
    .select_dtypes(include='number')
    .corr()
)
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

3. Hypothesis Testing and ANOVA

In [None]:
# Perform a two-sample t-test comparing crop yield between two states (Assam and Nagaland)

In [None]:
# Yield observations for the two states being compared.
state_a_yield = df.loc[df['State'] == 'Assam', 'Yield']
state_b_yield = df.loc[df['State'] == 'Nagaland', 'Yield']

In [None]:
from scipy.stats import ttest_ind

In [None]:
# Independent two-sample t-test on the two yield samples.
ttest_result = ttest_ind(state_a_yield, state_b_yield)
t_stat, p_value = ttest_result.statistic, ttest_result.pvalue

In [None]:
# Report the t-test statistic and p-value, one per line.
print(
    f"t_statistic: {t_stat:.2f}",
    f"p_value: {p_value: .4f}",
    sep="\n",
)

In [None]:
# Decide at the 5% significance level and print the matching conclusion.
print(
    "Reject the null hypothesis: There is a significant difference in average crop yields between State_A and State_B."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: No significant difference in average crop yields between State_A and State_B."
)

ANOVA TEST

In [None]:
from scipy.stats import f_oneway

In [None]:
# Distinct state names, in order of first appearance.
States = pd.unique(df['State'])

In [None]:
# One Yield series per state, in the same order as `States`.
# The filter must compare against the loop variable; comparing against the
# literal string 'State' matched no rows and left every group empty.
yield_data_by_state = [df.loc[df['State'] == state, 'Yield'] for state in States]

In [None]:
# One-way ANOVA across the per-state yield samples.
anova_result = f_oneway(*yield_data_by_state)
f_stat, p_value = anova_result.statistic, anova_result.pvalue

In [None]:
# Report the ANOVA F-statistic and p-value, one per line.
print(
    f"F-statistic: {f_stat:.2f}",
    f"P-value: {p_value:.4f}",
    sep="\n",
)


In [None]:
# Decide at the 5% significance level and print the matching conclusion.
print(
    "Reject the null hypothesis: There is a significant difference in crop yields across different states."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: No significant difference in crop yields across different states."
)

Correlation Matrix

In [None]:
# Pearson correlation matrix of yield, rainfall, area and production.
corr_matrix = df.loc[
    :, ['Yield', 'Annual_Rainfall', 'Area', 'Production']
].corr(method='pearson')

In [None]:
# Show the correlation matrix as plain text.
print(corr_matrix)

Simple linear regression and Multiple linear regression analysis

In [None]:
import statsmodels.api as sm


In [None]:
# Simple regression inputs: predictor is annual rainfall, response is yield.
x, y = df['Annual_Rainfall'], df['Yield']

In [None]:
# Design matrix: the predictor with a leading constant column for the intercept.
X = sm.add_constant(x, prepend=True)

In [None]:
# Fit ordinary least squares of Yield on Annual_Rainfall.
# sm.OLS takes the response first (endog) and the design matrix second (exog);
# X is the predictor with the intercept column added, so it is used rather
# than the raw x.
model = sm.OLS(y, X).fit()

In [None]:
# Print the fitted OLS model's summary table.
print(model.summary())

In [None]:
# Multiple linear regression inputs: three predictors, response is yield.
x = df.loc[:, ['Annual_Rainfall', 'Area', 'Production']]
y = df.loc[:, 'Yield']

In [None]:
# Design matrix: the predictors with a leading constant column for the intercept.
X = sm.add_constant(x, prepend=True)

In [None]:
# Fit ordinary least squares of Yield on rainfall, area and production.
# The constant-augmented X is used as the design matrix so the model has an
# intercept; fitting on the raw x forced the regression through the origin.
model = sm.OLS(y, X).fit()

In [None]:
# Print the fitted OLS model's summary table.
print(model.summary())

Model training and Prediction for regression models

In [None]:
# Feature matrix and target for the machine-learning regressors.
x = df.loc[:, ['Area', 'Annual_Rainfall', 'Production']]
y = df.loc[:, 'Yield']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking in any other order silently swaps the train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,accuracy_score

In [None]:
# Instantiate the three regressors; the ensembles use a fixed seed for reproducibility.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
gbr_model = GradientBoostingRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
)

In [None]:
# Train every regressor on the training split.
for regressor in (lr_model, rf_model):
    regressor.fit(x_train, y_train)
# Kept as the final expression so the cell still displays the fitted estimator.
gbr_model.fit(x_train, y_train)

In [None]:
# Predict on the test split with each model, keeping every result under its
# own name so none is discarded by a later assignment.
y_pred_lr = lr_model.predict(x_test)
y_pred_rf = rf_model.predict(x_test)
y_pred_gbr = gbr_model.predict(x_test)
# `y_pred` still refers to the gradient-boosting predictions, as it did before,
# so code that reads `y_pred` keeps working.
y_pred = y_pred_gbr


In [None]:
# Evaluate the predictions in y_pred: squared error, absolute error and R^2.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [None]:
# Report the evaluation metrics. Each label now names the metric whose value
# it prints (the MSE and MAE labels were previously swapped).
print(f"mean_squared_error(MSE): {mse:.2f}")
print(f"mean_absolute_error(MAE):{mae:.2f}")
print(f"r2_score(R2): {r2:.2f}")