In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, mean_absolute_error


In [2]:
df = pd.read_csv("../data/raw/crop_yield.csv") 
df.head()


Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [5]:
X = df.drop("Yield", axis=1)
y = df["Yield"]


In [6]:
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

print("Numerical:", numerical_features)
print("Categorical:", categorical_features)


Numerical: Index(['Crop_Year', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer',
       'Pesticide'],
      dtype='object')
Categorical: Index(['Crop', 'Season', 'State'], dtype='object')


In [7]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)


In [9]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42)
}


In [10]:
results = []

for name, model in models.items():
    
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    end_time = time.time()
    
    train_preds = pipeline.predict(X_train)
    val_preds = pipeline.predict(X_val)
    
    train_r2 = r2_score(y_train, train_preds)
    val_r2 = r2_score(y_val, val_preds)
    
    train_mae = mean_absolute_error(y_train, train_preds)
    val_mae = mean_absolute_error(y_val, val_preds)
    
    results.append({
        "Model": name,
        "Train R2": train_r2,
        "Validation R2": val_r2,
        "Train MAE": train_mae,
        "Validation MAE": val_mae,
        "Overfitting Gap (R2)": train_r2 - val_r2,
        "Training Time (s)": end_time - start_time
    })

results_df = pd.DataFrame(results)
results_df.sort_values(by="Validation R2", ascending=False)


Unnamed: 0,Model,Train R2,Validation R2,Train MAE,Validation MAE,Overfitting Gap (R2),Training Time (s)
2,Random Forest,0.995369,0.98783,2.95464,7.350449,0.007539,102.903311
1,Decision Tree,1.0,0.976048,1.0572879999999999e-20,8.434741,0.023952,1.254663
0,Linear Regression,0.85291,0.802229,56.35657,62.980433,0.05068,0.068072


We define Preprocessing Pipeline, define Regression Models, then we train and compare the models, to find the most optimal model.

Linear Regression

Validation RÂ² = 0.802
MAE â‰ˆ 63
Moderate gap

This shows:
Linear model cannot capture complex interactions.
Yield prediction depends on non-linear relationships.
Underfitting relative to tree-based models.

ðŸ”¹ Decision Tree

Train RÂ² = 1.000
Validation RÂ² = 0.976
MAE â‰ˆ 8.43

Classic behavior:
Perfect fit on training
Slight drop on validation
Some overfitting
Still strong performance.

ðŸ”¹ Random Forest

Train RÂ² = 0.995
Validation RÂ² = 0.98
MAE â‰ˆ 7.35
Gap = 0.0075 (extremely small)

This is excellent generalization.

This tells us:

Ensemble reduced variance.
Very low overfitting.
Strong predictive stability.