In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the crop-yield CSV into `df` and preview its leading rows.
DATA_PATH = "../data/crop_yield.csv"
df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
# Quick structural overview of the loaded frame. `df.info()` prints its own
# report, but a notebook cell only renders its final bare expression
# automatically. The summary statistics are therefore shown with an explicit
# `display(...)` (provided by default in IPython/Jupyter kernels); without it
# the `describe()` result is computed and silently discarded.
df.info()
display(df.describe())

# Count of missing values per column (rendered as the cell's output).
df.isnull().sum()


In [None]:
# Discard rows containing missing values, then one-hot encode the categorical
# columns (drop_first=True leaves out the first level of each encoded column).
df = pd.get_dummies(df.dropna(), drop_first=True)

# 'Yield' is the prediction target; every other column is used as a feature.
target_col = 'Yield'
y = df[target_col]
X = df.drop(columns=target_col)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Fit a 100-tree random forest on the training split and predict the test split.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Report R² and mean absolute error of the predictions against y_test.
for label, metric in (("R²:", r2_score), ("MAE:", mean_absolute_error)):
    print(label, metric(y_test, y_pred))


In [None]:
from xgboost import XGBRegressor

# Train an XGBoost regressor on the training split and predict the test split.
xgb_params = dict(n_estimators=100, learning_rate=0.1, random_state=42)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# r2_score / mean_absolute_error come from the sklearn.metrics import in the
# random-forest cell, so that cell must have been run first.
xgb_r2 = r2_score(y_test, y_pred_xgb)
print("R²:", xgb_r2)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
print("MAE:", xgb_mae)


In [None]:
# Bar chart comparing the test-set R² of the two fitted models.
models = ['Random Forest', 'XGBoost']
r2_scores = [r2_score(y_test, preds) for preds in (y_pred, y_pred_xgb)]

# Draw on an explicit Axes object rather than the implicit pyplot state.
fig, ax = plt.subplots()
ax.bar(models, r2_scores)
ax.set_title('Model Comparison')
ax.set_ylabel('R² Score')
plt.show()


In [None]:
import joblib

# Persist the model that actually scored best. The output file is named
# "best_model.pkl", so hard-coding `xgb` would store the wrong estimator
# whenever the random forest achieves the higher R² on a re-run or new data.
# XGBoost is listed first so that an exact tie keeps the previous behaviour
# (max() returns the first maximal entry).
# NOTE(review): the choice is made on the same test split used for reporting,
# mirroring the comparison chart; use a separate validation split if an
# unbiased estimate of the saved model's performance is needed.
candidates = {
    'XGBoost': (xgb, r2_score(y_test, y_pred_xgb)),
    'Random Forest': (rf, r2_score(y_test, y_pred)),
}
best_name = max(candidates, key=lambda name: candidates[name][1])
best_model = candidates[best_name][0]
print("Saving best model:", best_name)

# joblib.dump stays the last expression so the written path list is still
# shown as the cell's output.
joblib.dump(best_model, "../src/best_model.pkl")
