In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
import pickle


df = pd.read_csv("_crop+yield+prediction_data_crop_yield.csv")


df['Crop'] = df['Crop'].astype('category').cat.codes


df.dropna(inplace=True)


X = df[['Crop', 'Precipitation (mm day-1)', 'Specific Humidity at 2 Meters (g/kg)', 'Relative Humidity at 2 Meters (%)', 'Temperature at 2 Meters (C)']]
y = df['Yield']


z_scores = np.abs(stats.zscore(X))
filtered_entries = (z_scores < 3).all(axis=1)
X = X[filtered_entries]
y = y[filtered_entries]


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=100)


rf = RandomForestRegressor(random_state=100)


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)


best_rf = grid_search.best_estimator_


cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5)


best_rf.fit(X_train, y_train)


y_pred = best_rf.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Cross-validation scores: {cv_scores}")
print(f"Average Cross-validation score: {cv_scores.mean()}")


with open('crop_yield_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf, f)

print(y_pred)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Mean Squared Error: 168175942.13806486
R-squared: 0.9901946815692412
Cross-validation scores: [0.99509153 0.988455   0.97715516 0.99639425 0.99256464]
Average Cross-validation score: 0.9899321149298117
[168389.11945676  86511.65264008 418030.29905017  84992.52957958
 418825.20928607 289928.79246972 419720.78880655  93311.58168143
 281145.87323511 171545.54410349  91489.93692155 280651.87205782
  90470.95661616  86153.98762082 419402.14274095 168782.5759577
 168911.42767375 168181.89490069 179330.85191614 281086.78959875
 171029.89646617  85869.28744114 290946.0109289  422739.48369141
 179276.70498432 420559.27031959 420212.42686026  83833.7144607
  91189.28596071 290738.61562608 418811.10095274 280445.04110683
 420578.53541997 421504.811946    85991.37412532  86816.3314403
 173030.31247473 419431.81472549 420296.61436711 421920.95312657
 422665.31747929 418839.14023845 280890.33657081 281873.70928553
 167681.3430796  28014