In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam


In [97]:
# Load dataset
from google.colab import files
uploaded = files.upload()



In [98]:
climate_data = pd.read_csv("Burdwan_Crop.csv")

In [99]:
# Step 1: Drop redundant columns
climate_data = climate_data.drop(columns=["State_Name", "District_Name"])


In [100]:
# Step 2: Data Cleaning
# Check for missing values and duplicates
print("Missing Values:\n", climate_data.isnull().sum())
print("Duplicates:", climate_data.duplicated().sum())

Missing Values:
 Crop_Year     0
Season        0
Crop          0
Area          0
Production    2
Temparetue    0
Rainfall      0
Humidity      0
Sun hours     0
dtype: int64
Duplicates: 0


In [101]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to 'Season' and 'Crop' columns
climate_data['Season'] = label_encoder.fit_transform(climate_data['Season'])
climate_data['Crop'] = label_encoder.fit_transform(climate_data['Crop'])


In [102]:
climate_data.head()

Unnamed: 0,Crop_Year,Season,Crop,Area,Production,Temparetue,Rainfall,Humidity,Sun hours
0,2009,1,23,17119,52128.0,22.1,341,81.0,8.1
1,2009,2,6,128,226.0,26.9,1189,80.0,8.2
2,2009,2,11,12459,222721.0,26.9,1189,80.0,8.2
3,2009,2,14,147,254.0,26.9,1189,80.0,8.2
4,2009,2,16,88,46.0,26.9,1189,80.0,8.2


In [103]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
climate_data = pd.DataFrame(imputer.fit_transform(climate_data), columns=climate_data.columns)

In [104]:
from scipy.stats import zscore
# Outlier Removal (Z-Score Method)
def remove_outliers_zscore(df, threshold=3):
    z_scores = np.abs(zscore(df))
    return df[(z_scores < threshold).all(axis=1)]

# Apply the Z-score method for outlier removal on the 'Temparetue' and 'Rainfall' columns
climate_data_cleaned_zscore = remove_outliers_zscore(climate_data[['Temparetue', 'Rainfall', 'Humidity', 'Sun hours']])

In [105]:
# Step 3: Normalize numerical features
scaler = StandardScaler()
numerical_columns = ["Area", "Temparetue", "Rainfall", "Humidity", "Sun hours"]
climate_data[numerical_columns] = scaler.fit_transform(climate_data[numerical_columns])


In [106]:
print(climate_data.dtypes)


Crop_Year     float64
Season        float64
Crop          float64
Area          float64
Production    float64
Temparetue    float64
Rainfall      float64
Humidity      float64
Sun hours     float64
dtype: object


In [107]:
# Step 4: Feature engineering (Production Density and Climatic Index)
climate_data["Production_density"] = climate_data["Production"] / climate_data["Area"].replace(0, np.nan)
climate_data["Climatic_Index"] = (
    climate_data["Temparetue"] * climate_data["Humidity"] / climate_data["Rainfall"].replace(0, np.nan)
)

In [108]:
# Step 6: Separate features and target variable
X = climate_data.drop(columns=["Production",])
y = climate_data["Production"]

In [109]:
# Step 7: Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Train and evaluate Linear Regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the Linear Regression model
linear_regressor = LinearRegression()

# Train the model
linear_regressor.fit(X_train, y_train)

# Predict on test set
y_pred_linear = linear_regressor.predict(X_test)

# Evaluate the Linear Regression model
print("Linear Regression Evaluation:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_linear))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_linear))
print("R-squared:", r2_score(y_test, y_pred_linear))

Linear Regression Evaluation:
Mean Absolute Error: 202217.44188138738
Mean Squared Error: 248616560346.29703
R-squared: 0.9813004668785457


In [110]:
# Step 6: Separate features and target variable
X = climate_data.drop(columns=["Production", "Production_density"])
y = climate_data["Production"]

In [111]:
# Step 9: Train and evaluate Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf_regressor.predict(X_test)

# Evaluate the Random Forest model
print("\nRandom Forest Regressor Evaluation:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_rf))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_rf))
print("R-squared:", r2_score(y_test, y_pred_rf))


Random Forest Regressor Evaluation:
Mean Absolute Error: 301436.67215962446
Mean Squared Error: 5733536083914.033
R-squared: 0.5687558071157219


In [112]:
# Step 10: Train and evaluate Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the Gradient Boosting model
gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Train the model
gb_regressor.fit(X_train, y_train)

# Predict on test set
y_pred_gb = gb_regressor.predict(X_test)

# Evaluate the Gradient Boosting model
print("\nGradient Boosting Regressor Evaluation:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_gb))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_gb))
print("R-squared:", r2_score(y_test, y_pred_gb))


Gradient Boosting Regressor Evaluation:
Mean Absolute Error: 240589.7313207759
Mean Squared Error: 3648328366547.827
R-squared: 0.7255933513311567


In [113]:
# Step 9: Train and evaluate Ridge Regression
from sklearn.linear_model import Ridge

# Initialize Ridge model
ridge_regressor = Ridge(alpha=1.0)

# Train the model
ridge_regressor.fit(X_train, y_train)

# Predict on test set
y_pred_ridge = ridge_regressor.predict(X_test)

# Evaluate the Ridge model
print("\nRidge Regression Evaluation:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_ridge))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_ridge))
print("R-squared:", r2_score(y_test, y_pred_ridge))



Ridge Regression Evaluation:
Mean Absolute Error: 201756.03848632184
Mean Squared Error: 249564261026.5072
R-squared: 0.9812291861873715


In [114]:
# Step 10: Train and evaluate Lasso Regression
from sklearn.linear_model import Lasso

# Initialize Lasso model
lasso_regressor = Lasso(alpha=0.1)

# Train the model
lasso_regressor.fit(X_train, y_train)

# Predict on test set
y_pred_lasso = lasso_regressor.predict(X_test)

# Evaluate the Lasso model
print("\nLasso Regression Evaluation:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_lasso))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_lasso))
print("R-squared:", r2_score(y_test, y_pred_lasso))


Lasso Regression Evaluation:
Mean Absolute Error: 202217.27375667365
Mean Squared Error: 248616983369.59082
R-squared: 0.9813004350611233
