In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import joblib

# ---------------------------------------------------------------------------
# Crop-yield training cell
#
# Reads crop_yield.csv, one-hot encodes the categorical columns, assembles the
# feature matrix, fits a random-forest regressor through GridSearchCV on a
# 60/40 train/test split, and persists both the fitted model and the fitted
# encoder with joblib.
# ---------------------------------------------------------------------------
RANDOM_STATE = 42  # shared by the train/test split and the forest

# --- Load -------------------------------------------------------------------
df = pd.read_csv("crop_yield.csv")

# --- Categorical columns -> one-hot columns ---------------------------------
categorical = ['Crop', 'Season', 'State']

# The encoder is fitted on every row of df, i.e. before the train/test split.
# NOTE(review): handle_unknown is left at its default, so transforming a
# category that is absent from this CSV is expected to raise at inference
# time -- confirm that is the behaviour the consumers of the saved encoder want.
encoder = OneHotEncoder(sparse_output=False).fit(df[categorical])

categorical_encoded = encoder.transform(df[categorical])
categorical_df = pd.DataFrame(
    categorical_encoded,
    columns=encoder.get_feature_names_out(),
)

# --- Rainfall ---------------------------------------------------------------
# Annual_Rainfall is multiplied by a constant factor before entering X.
# NOTE(review): a constant rescaling of one feature presumably does not change
# the splits a tree ensemble finds -- verify whether this weight is needed.
rainfall_weight = 2
weighted_rainfall = df['Annual_Rainfall'] * rainfall_weight

# --- Numeric features -------------------------------------------------------
# NOTE(review): 'Production' and 'Area' are both used as predictors of 'Yield';
# if Yield is derived from them in this dataset, that is target leakage --
# TODO confirm against the data source.
features = ['Crop_Year', 'Area', 'Production', 'Fertilizer', 'Pesticide']

# --- Assemble X and y -------------------------------------------------------
# Column order: one-hot columns, numeric features, weighted rainfall.
# The rainfall column is given its final name ('Annual_Rainfall_Weighted')
# as it is placed into X; weighted_rainfall itself keeps its original name.
column_blocks = [
    categorical_df.reset_index(drop=True),
    df[features].reset_index(drop=True),
    weighted_rainfall.rename('Annual_Rainfall_Weighted').reset_index(drop=True),
]
X = pd.concat(column_blocks, axis=1)

y = df['Yield']

# --- Train / test split -----------------------------------------------------
# 40% of the rows are held out; X_test / y_test are not used again in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=RANDOM_STATE,
)

# --- Model fitting ----------------------------------------------------------
# Each hyper-parameter lists a single value, so the grid holds exactly one
# combination: the search cross-validates it (cv=3) and then refits it on the
# whole training split.
param_grid = dict(
    n_estimators=[100],
    max_depth=[20],
    min_samples_split=[2],
    min_samples_leaf=[1],
)

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=RANDOM_STATE),
    param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# --- Persist artifacts ------------------------------------------------------
# The encoder is saved alongside the model so the same one-hot layout can be
# rebuilt when the model is reused.
for artifact, target_path in (
    (best_model, "crop_yield_model.joblib"),
    (encoder, "onehot_encoder.joblib"),
):
    joblib.dump(artifact, target_path)

print("Training done and model saved.")


Training done and model saved.
