In [1]:
# IMPORT LIBRARIES

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
)

In [2]:
# DATA LOADING

df = pd.read_csv("commodities_dataset.csv")   # change name if needed

print("Shape:", df.shape)
df.head()


Shape: (733823, 10)


Unnamed: 0,state,district,market,commodity,variety,grade,min_price,max_price,modal_price,Unnamed: 9
0,Tamil Nadu,Dharmapuri,Ajattihalli(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,2800.0,3000,3000.0,
1,Tamil Nadu,Pudukkottai,Alangudi(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,1500.0,2000,2000.0,
2,Kerala,Ernakulam,Aluva,Amaranthus,Amaranthus,FAQ,4000.0,6000,5000.0,
3,Tamil Nadu,Thirunelveli,Ambasamudram(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,1200.0,1500,1500.0,
4,Tamil Nadu,Salem,Ammapet(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,2400.0,2600,2600.0,


In [3]:
# DATA INSPECTION

print("\n=== Info ===")
df.info()

print("\n=== Missing Values ===")
print(df.isnull().sum())

print("\n=== Numerical Describe ===")
print(df.describe())



=== Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733823 entries, 0 to 733822
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   state        733823 non-null  object 
 1   district     733823 non-null  object 
 2   market       733823 non-null  object 
 3   commodity    733823 non-null  object 
 4   variety      733823 non-null  object 
 5   grade        733823 non-null  object 
 6   min_price    733823 non-null  float64
 7   max_price    733823 non-null  int64  
 8   modal_price  733823 non-null  float64
 9   Unnamed: 9   0 non-null       float64
dtypes: float64(3), int64(1), object(6)
memory usage: 56.0+ MB

=== Missing Values ===
state               0
district            0
market              0
commodity           0
variety             0
grade               0
min_price           0
max_price           0
modal_price         0
Unnamed: 9     733823
dtype: int64

=== Numerical Describe ===
           

In [4]:
# DATA PROCESSING

if 'Unnamed: 9' in df.columns:
    df = df.drop(columns=['Unnamed: 9'])

price_cols = ['min_price', 'max_price', 'modal_price']

for col in price_cols:
    q1 = df[col].quantile(0.01)
    q99 = df[col].quantile(0.99)
    df[col] = df[col].clip(q1, q99)

num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns

for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

df['price_range'] = df['max_price'] - df['min_price']
df['avg_price'] = (df['max_price'] + df['min_price']) / 2

df.head()


Unnamed: 0,state,district,market,commodity,variety,grade,min_price,max_price,modal_price,price_range,avg_price
0,Tamil Nadu,Dharmapuri,Ajattihalli(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,2800.0,3000,3000.0,200.0,2900.0
1,Tamil Nadu,Pudukkottai,Alangudi(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,1500.0,2000,2000.0,500.0,1750.0
2,Kerala,Ernakulam,Aluva,Amaranthus,Amaranthus,FAQ,4000.0,6000,5000.0,2000.0,5000.0
3,Tamil Nadu,Thirunelveli,Ambasamudram(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,1200.0,1500,1500.0,300.0,1350.0
4,Tamil Nadu,Salem,Ammapet(Uzhavar Sandhai ),Amaranthus,Amaranthus,LOCAL,2400.0,2600,2600.0,200.0,2500.0


In [5]:
# PREPROCESSING PIPELINE (Scaling + Encoding)

numeric_features = ['min_price', 'max_price', 'price_range', 'avg_price']
categorical_features = ['state', 'district', 'market', 'commodity', 'variety', 'grade']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [6]:
# EDA

print("\nTop 10 commodities:")
print(df['commodity'].value_counts().head(10))

print("\nAverage modal price by commodity:")
print(df.groupby('commodity')['modal_price'].mean().sort_values(ascending=False).head(10))

print("\nAverage modal price by state:")
print(df.groupby('state')['modal_price'].mean().sort_values(ascending=False).head(10))



Top 10 commodities:
commodity
Potato                   323161
Onion                    294788
Wheat                     76067
Tomato                    25067
Rice                       7760
Green Chilli                238
Bhindi(Ladies Finger)       233
Bottle Gourd                220
Cabbage                     194
Banana - Green              190
Name: count, dtype: int64

Average modal price by commodity:
commodity
Green Gram Dal (Moong Dal)       7500.0
Kabuli Chana(Chickpeas-White)    7500.0
Jute                             7500.0
Arhar Dal(Tur Dal)               7500.0
Rat Tail Radish (Mogari)         7500.0
Tamarind Fruit                   7500.0
Surat Beans (Papadi)             7500.0
Peas Cod                         7500.0
Grapes                           7500.0
Green Peas                       7500.0
Name: modal_price, dtype: float64

Average modal price by state:
state
Tamil Nadu    4855.732379
Tamilnadu     4697.501381
Tripura       4608.035714
Kerala        4039.582693
Man

In [7]:
# DEFINE X AND y

target_col = "modal_price"
X = df.drop(columns=[target_col])
y = df[target_col]

print("Features:", X.columns.tolist())
print("Target:", target_col)

Features: ['state', 'district', 'market', 'commodity', 'variety', 'grade', 'min_price', 'max_price', 'price_range', 'avg_price']
Target: modal_price


In [8]:
# TRAIN-TEST SPLIT (X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)


Shapes:
X_train: (587058, 10)
X_test: (146765, 10)
y_train: (587058,)
y_test: (146765,)


In [None]:
#RANDOM FOREST REGRESSOR

rf_model = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(n_estimators=120, max_depth=15, random_state=42))
])

rf_model.fit(X_train, y_train)



In [None]:
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)


In [None]:
#MAE, MSE, RMSE, R^2

mae_train = mean_absolute_error(y_train, y_train_pred_rf)
mse_train = mean_squared_error(y_train, y_train_pred_rf)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred_rf)

mae_test = mean_absolute_error(y_test, y_test_pred_rf)
mse_test = mean_squared_error(y_test, y_test_pred_rf)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred_rf)

print("\n================ METRICS: Random Forest ================")
print("---- TRAIN ----")
print("MAE:", mae_train)
print("RMSE:", rmse_train)
print("R²:", r2_train)

print("\n---- TEST ----")
print("MAE:", mae_test)
print("RMSE:", rmse_test)
print("R²:", r2_test)
