# 🌽 Maize Yield Prediction (Zambia Dataset)
Welcome to this AI/ML notebook, built on a synthetic dataset that simulates Zambian agricultural data. It demonstrates how to load the dataset, perform exploratory data analysis (EDA), encode the categorical features, train and save a Random Forest model, and evaluate its performance.

**Created by: Abel Bihinda**

In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Read the maize-yield CSV (path is relative to this notebook's folder)
# into a DataFrame, then preview the first rows.
dataset_path = '../datasets/maize_yield_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()


In [None]:
# Check dataset info: DataFrame.info() prints a summary of the frame
# (column names, non-null counts, dtypes), which is a quick way to spot
# missing values and see which columns are non-numeric before encoding.
df.info()


In [None]:
# Visualize numerical features: DataFrame.hist draws one histogram per
# numeric column, here on a 12x8-inch figure.
# NOTE(review): this cell runs before the label-encoding step, so the
# categorical columns are presumably still non-numeric and are left out of
# the plot -- confirm against the df.info() output.
df.hist(figsize=(12, 8))
# Tidy the subplot spacing so titles and tick labels do not overlap.
plt.tight_layout()
plt.show()


In [None]:
# Encode categorical variables as integer codes.
# Each column gets its OWN LabelEncoder, kept in `encoders`, so the fitted
# category -> integer mapping of every column stays available afterwards
# (e.g. encoders[col].transform(...) on new data before calling the model, or
# encoders[col].inverse_transform(...) to recover the original labels).
# Re-fitting a single shared encoder would discard all but the last mapping.
# NOTE(review): the integer codes follow the sorted order of the labels, which
# carries no real meaning; tree-based models usually tolerate this, but confirm
# it is acceptable if the model type changes.
categorical_cols = ['Province', 'Soil_Type', 'Seed_Variety', 'Pesticide_Usage']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
df.head()


In [None]:
# Separate the prediction target from the feature columns.
target_col = 'Yield_tons_per_hectare'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Train Random Forest model on the training split. random_state is fixed so
# the fitted forest is reproducible across runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model. joblib.dump does not create missing parent directories, so
# make sure the output folder exists first; otherwise a fresh checkout without
# a models/ folder would fail here after the training work is already done.
model_path = '../models/maize_yield_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)


In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line, rounded to two decimals.
print(
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared Score: {r2:.2f}",
    sep="\n",
)
