In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the crop-yield CSV (path is relative to the notebook's working
# directory) and preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/crop_yield.csv")
df.head(n=5)


In [None]:
# Structural overview; info() prints its report directly.
df.info()
# describe() is not the last expression of the cell, so its table would be
# silently discarded unless it is displayed explicitly.
display(df.describe())
# Missing-value count per column; rendered as the cell's output.
df.isnull().sum()


In [None]:
# Discard rows containing any missing value, then one-hot encode the
# categorical columns (drop_first=True omits the first level of each one).
df = pd.get_dummies(df.dropna(), drop_first=True)

# 'Yield' is the regression target; every other column is a feature.
y = df['Yield']
X = df.drop(columns='Yield')


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Fit a 100-tree random forest (seed 42) on the training split, then predict
# on the held-out split.
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Report held-out R² and mean absolute error.
for label, metric in (("R²:", r2_score), ("MAE:", mean_absolute_error)):
    print(label, metric(y_test, y_pred))


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted regressor: 100 estimators, learning rate 0.1, seed 42.
xgb = XGBRegressor(random_state=42, learning_rate=0.1, n_estimators=100)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Score the held-out predictions before printing them.
xgb_r2 = r2_score(y_test, y_pred_xgb)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
print("R²:", xgb_r2)
print("MAE:", xgb_mae)


In [None]:
# Side-by-side held-out R² for the two fitted models.
models = ['Random Forest', 'XGBoost']
r2_scores = [r2_score(y_test, preds) for preds in (y_pred, y_pred_xgb)]

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.bar(models, r2_scores)
ax.set_title('Model Comparison')
ax.set_ylabel('R² Score')
plt.show()


In [None]:
import joblib
# Persist whichever fitted model has the higher held-out R² so the artifact
# name "best_model" is accurate. XGBoost is kept when the scores tie.
best_model = rf if r2_score(y_test, y_pred) > r2_score(y_test, y_pred_xgb) else xgb
joblib.dump(best_model, "../src/best_model.pkl")


In [None]:
import pandas as pd

# Reload the crop-yield CSV (the same file read at the top of the notebook)
# so the cells below start again from the raw, un-encoded table.
df = pd.read_csv("../data/crop_yield.csv")
df.head()


In [None]:
# Structural overview; info() prints its report directly.
df.info()
# describe() is not the last expression of the cell, so its table would be
# silently discarded unless it is displayed explicitly.
display(df.describe())
# Missing-value count per column; rendered as the cell's output.
df.isnull().sum()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example plots (customize based on your columns).
# Histogram with a KDE overlay of the 'Yield' column on its own axes.
fig_hist, ax_hist = plt.subplots()
sns.histplot(df['Yield'], kde=True, ax=ax_hist)
ax_hist.set_title("Distribution of Crop Yield")

# Box plot of 'Yield' grouped by the 'Crop' column on a wider 10x5 figure.
fig_box, ax_box = plt.subplots(figsize=(10, 5))
sns.boxplot(x='Crop', y='Yield', data=df, ax=ax_box)
ax_box.set_title("Yield by Crop Type")
plt.show()


In [None]:
# Keep only fully populated rows, then expand categorical columns into
# indicator columns (the first level of each is dropped).
df = df.dropna(axis=0).pipe(pd.get_dummies, drop_first=True)

# Peel the 'Yield' target off from the feature matrix.
X, y = df.drop(columns=['Yield']), df['Yield']

# Quick sanity check on the resulting dimensions.
print(X.shape, y.shape)


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test partition; the fixed random_state makes it repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Random-forest baseline: 100 trees, seed 42, trained on the training split.
rf_params = dict(n_estimators=100, random_state=42)
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = rf.predict(X_test)
rf_r2 = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)
print("R²:", rf_r2)
print("MAE:", rf_mae)


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted regressor: 200 estimators, learning rate 0.05, seed 42.
xgb_params = {"n_estimators": 200, "learning_rate": 0.05, "random_state": 42}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Report held-out R² and mean absolute error.
y_pred_xgb = xgb.predict(X_test)
for label, metric in (("R²:", r2_score), ("MAE:", mean_absolute_error)):
    print(label, metric(y_test, y_pred_xgb))


In [None]:
# Map each display name to its held-out predictions, then score them in order.
predictions_by_model = {'Random Forest': y_pred, 'XGBoost': y_pred_xgb}
models = list(predictions_by_model)
r2_scores = [r2_score(y_test, predictions_by_model[name]) for name in models]

# Bar chart comparing the two R² values.
plt.bar(x=models, height=r2_scores)
plt.title(label='Model Comparison (R² Scores)')
plt.ylabel(ylabel='R² Score')
plt.show()


In [None]:
import joblib

# Persist whichever fitted model has the higher held-out R² rather than
# relying on a manual edit of this line. XGBoost is kept when the scores tie.
best_model = rf if r2_score(y_test, y_pred) > r2_score(y_test, y_pred_xgb) else xgb
joblib.dump(best_model, "../src/best_model.pkl")


In [None]:
# Round-trip check: reload the saved model from disk and predict on the
# first held-out row (kept as a one-row frame, not a Series).
model = joblib.load("../src/best_model.pkl")
sample = X_test.head(1)
prediction = model.predict(sample)
print("Predicted yield:", prediction)
