In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Crop-yield pipeline: merge crop and climate tables, clean them, train a
# random-forest regressor, and persist the fitted scaler, encoder, and model.

Z_SCORE_THRESHOLD = 3.0  # rows with any |z| at or above this are dropped
TEST_SIZE = 0.2          # fraction of rows held out for evaluation
RANDOM_STATE = 42        # seed shared by the split and the forest


def remove_zscore_outliers(frame, columns, threshold=Z_SCORE_THRESHOLD):
    """Return a copy of ``frame`` without rows that are outliers in ``columns``.

    A row is kept when, for every listed column, its absolute z-score
    (sample standard deviation) is strictly below ``threshold``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input table; it is not modified.
    columns : list of str
        Numeric columns to screen.
    threshold : float, optional
        Exclusive upper bound on the absolute z-score.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the surviving rows, original index preserved.

    Notes
    -----
    Columns with zero or undefined spread cannot contain outliers and are
    skipped; dividing by their standard deviation would give NaN for every
    row and discard the whole table. Rows with a missing value in any
    screened column are still removed.
    """
    values = frame[columns]
    spread = values.std()
    varying = spread[spread > 0].index
    z_scores = ((values[varying] - values[varying].mean()) / spread[varying]).abs()
    keep = (z_scores < threshold).all(axis=1) & values.notna().all(axis=1)
    # .copy() so later column assignments never write into a view of `frame`.
    return frame[keep].copy()


# A Jupyter kernel runs cells with __name__ == "__main__", so the pipeline
# still executes in the notebook and its variables stay global; the guard only
# keeps the helper above importable without the CSV inputs.
if __name__ == "__main__":
    # PART 1: Load and Process Crop Data
    # Selecting the four needed columns already discards everything else, so
    # no separate drop() (which fails when a column is absent) is required.
    crop_data = (
        pd.read_csv("yield_df.csv")
        .rename(columns={"Area": "Country", "Item": "Crop", "hg/ha_yield": "Yield"})
        .loc[:, ["Country", "Crop", "Year", "Yield"]]
        .dropna()
    )

    # PART 2: Load and Process Climate Data
    climate_data = pd.read_csv("climate_data_processed.csv")

    # PART 3: Merge Crop and Climate Data
    # Exact duplicate rows are dropped so an identical row cannot end up in
    # both the training and the test split.
    # NOTE(review): a perfect score (MAE 0, R-squared 1) from this cell would
    # point to leakage. If climate_data holds several rows per (Country, Year),
    # the merge replicates each yield row and a random split places replicas
    # on both sides - confirm key uniqueness in climate_data_processed.csv.
    merged_data = pd.merge(
        crop_data, climate_data, on=["Country", "Year"], how="inner"
    ).drop_duplicates()

    # PART 4: Preprocess Data
    # Remove outliers using Z-Score
    numeric_columns = ['Yield', 'Max_Temperature', 'Min_Temperature', 'Avg_Humidity']
    merged_data = remove_zscore_outliers(merged_data, numeric_columns)
    if merged_data.empty:
        raise ValueError(
            "No rows left after merging and outlier removal; "
            "check the Country/Year keys in both CSV files."
        )

    # Encode categorical features
    merged_data = pd.get_dummies(merged_data, columns=['Crop'], drop_first=True)
    # NOTE(review): LabelEncoder is documented for targets; OrdinalEncoder is
    # the feature-side equivalent. Kept because label_encoder.pkl is written
    # below - presumably read by inference code; verify before switching.
    label_encoder = LabelEncoder()
    merged_data['Country'] = label_encoder.fit_transform(merged_data['Country'])

    # PART 5: Train Model
    X = merged_data.drop(columns=['Yield'])
    y = merged_data['Yield']

    # Ensure target variable is not scaled
    assert np.all(y >= 0), "Target variable (Yield) contains negative values before model training."

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Standardize features (excluding target). The scaler is fitted on the
    # training rows only so test-set statistics never reach preprocessing.
    scaler = StandardScaler()
    scaled_columns = ['Max_Temperature', 'Min_Temperature', 'Avg_Humidity']
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
    X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

    # Save the scaler and encoder for later use
    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(label_encoder, 'label_encoder.pkl')

    model = RandomForestRegressor(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    # Evaluate Model; predictions are clipped at zero before scoring.
    y_pred = np.maximum(model.predict(X_test), 0)

    print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))

    # Save the trained model
    joblib.dump(model, "crop_yield_model.pkl")
    print("Model saved as crop_yield_model.pkl")

Mean Absolute Error: 0.0
R-squared: 1.0
Model saved as crop_yield_model.pkl


In [7]:
pip install pandas


Collecting pandas
  Using cached pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl (11.3 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2024.2 tzdata-2024.2
Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
