In [21]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [22]:
# Read the raw Bajra price records and expose the source column 't' as 'date'.
df = pd.read_csv("../data/foods_grains/Bajra.csv").rename(columns={'t': 'date'})

In [23]:
# Parse the month/day/year strings into timestamps, then put the rows in
# chronological order; later steps rely on this ordering.
parsed_dates = pd.to_datetime(df['date'], format='%m/%d/%Y')
df = df.assign(date=parsed_dates).sort_values('date')

print(f"Data period: {df['date'].min()} to {df['date'].max()}")
print(f"Total records: {len(df)}")

Data period: 2020-01-01 00:00:00 to 2025-09-20 00:00:00
Total records: 45647


In [24]:
# Derive calendar features from the parsed timestamp column in one step.
date_parts = df['date'].dt
df = df.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
    DayOfWeek=date_parts.dayofweek,
    WeekOfYear=date_parts.isocalendar().week,  # ISO-8601 week number
)

In [25]:
# Summarise column dtypes and non-null counts to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45647 entries, 9624 to 43203
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           45647 non-null  datetime64[ns]
 1   cmdty          45647 non-null  object        
 2   market_id      45647 non-null  int64         
 3   market_name    45647 non-null  object        
 4   state_id       45647 non-null  int64         
 5   state_name     45647 non-null  object        
 6   district_id    45647 non-null  int64         
 7   district_name  45647 non-null  object        
 8   variety        45647 non-null  object        
 9   p_min          45641 non-null  float64       
 10  p_max          45647 non-null  int64         
 11  p_modal        45645 non-null  float64       
 12  Year           45647 non-null  int32         
 13  Month          45647 non-null  int32         
 14  Day            45647 non-null  int32         
 15  DayOfWeek      45647 

In [26]:

# Integer-encode the district and market names. Each column gets its own
# encoder so the two can be persisted and reused independently.
le_district, le_market = LabelEncoder(), LabelEncoder()

district_codes = le_district.fit_transform(df['district_name'])
market_codes = le_market.fit_transform(df['market_name'])

df['district_encoded'] = district_codes
df['market_encoded'] = market_codes

In [7]:
# Save the fitted encoders for the API so it can reproduce the same
# name -> integer mapping. The context managers guarantee each file is
# flushed and closed, even if pickling raises.
with open('../models/Bajradistrict_encoder.pkl', 'wb') as district_file:
    pickle.dump(le_district, district_file)
with open('../models/Bajramarket_encoder.pkl', 'wb') as market_file:
    pickle.dump(le_market, market_file)

In [8]:
# Drop rows whose prices fall outside plausible bounds: modal price strictly
# between 500 and 10000, min and max price strictly between 0 and 10000.
# Rows with a missing value in any of these columns are dropped as well,
# because NaN fails every comparison.
print(f"Before cleaning: {len(df)} records")
modal_ok = (df['p_modal'] > 500) & (df['p_modal'] < 10000)
min_ok = (df['p_min'] > 0) & (df['p_min'] < 10000)
max_ok = (df['p_max'] > 0) & (df['p_max'] < 10000)
df = df[modal_ok & min_ok & max_ok]
print(f"After cleaning: {len(df)} records")

Before cleaning: 45647 records
After cleaning: 45625 records


In [9]:
# Impute missing price bounds from the modal price:
# p_min ~ 90% of modal, p_max ~ 110% of modal.
# NOTE(review): the range filters in the cleaning cell compare p_min, p_max
# and p_modal with > and <, and NaN fails every comparison, so rows with a
# missing value are already gone when this runs. Confirm whether imputation
# was meant to happen before that filtering.
modal_price = df['p_modal']
for bound_col, ratio in (('p_min', 0.9), ('p_max', 1.1)):
    df[bound_col] = df[bound_col].fillna(modal_price * ratio)

In [10]:
# Model inputs, assembled group by group. The resulting column order is the
# order the preprocessor and model are fit on.
# (Original author's note: these are the 9 features the serving API expects.)
location_cols = ['market_id', 'state_id', 'district_id']  # location IDs
price_cols = ['p_min', 'p_max']                            # price range
date_cols = ['Year', 'Month', 'Day']                       # date components
features = location_cols + price_cols + date_cols + ['district_encoded']

target = 'p_modal'

X, y = df[features], df[target]

print(f"Features used: {features}")
print(f"Feature matrix shape: {X.shape}")

Features used: ['market_id', 'state_id', 'district_id', 'p_min', 'p_max', 'Year', 'Month', 'Day', 'district_encoded']
Feature matrix shape: (45625, 9)


In [11]:
# Chronological hold-out: rows dated before the 0.8-quantile date train the
# model; rows on or after that date are held out for testing.
row_dates = df['date']
split_date = row_dates.quantile(0.8)
train_mask = row_dates < split_date
test_mask = row_dates >= split_date

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

print(f"Train period: {df[train_mask]['date'].min()} to {df[train_mask]['date'].max()}")
print(f"Test period: {df[test_mask]['date'].min()} to {df[test_mask]['date'].max()}")
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train period: 2020-01-01 00:00:00 to 2023-10-02 00:00:00
Test period: 2023-10-03 00:00:00 to 2025-09-20 00:00:00
Train size: 36482, Test size: 9143


In [12]:
# Preprocessing pipeline: median-impute missing values, then standardise
# every feature.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
]
my_pipeline = Pipeline(steps=preprocessing_steps)

In [13]:
# Fit pipeline on training data only: the imputation medians and scaling
# statistics are learned from X_train, then the already-fitted pipeline is
# applied unchanged to X_test so no test-set statistics leak into training.
X_train_prepared = my_pipeline.fit_transform(X_train)
X_test_prepared = my_pipeline.transform(X_test)

# Sanity check: both matrices should keep the same number of feature columns.
print(f"Prepared train shape: {X_train_prepared.shape}")
print(f"Prepared test shape: {X_test_prepared.shape}")

Prepared train shape: (36482, 9)
Prepared test shape: (9143, 9)


In [14]:
# Random forest regressor with capped tree depth and a minimum split size;
# the fixed random_state makes the fit repeatable.
rf_params = {
    'n_estimators': 100,
    'random_state': 42,
    'max_depth': 10,
    'min_samples_split': 5,
}
model = RandomForestRegressor(**rf_params)

# Left as the cell's final expression so the fitted estimator is displayed.
model.fit(X_train_prepared, y_train)

In [15]:
# Score the fitted model on both splits so the train/test gap is visible.
y_pred_train = model.predict(X_train_prepared)
y_pred_test = model.predict(X_test_prepared)

# (actual, predicted) pairs: train first, then test.
scored_pairs = ((y_train, y_pred_train), (y_test, y_pred_test))

train_rmse, test_rmse = [
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in scored_pairs
]
train_mae, test_mae = [
    mean_absolute_error(actual, predicted) for actual, predicted in scored_pairs
]
train_r2, test_r2 = [
    r2_score(actual, predicted) for actual, predicted in scored_pairs
]

print("\nüìä Model Performance:")
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")
print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Train R¬≤: {train_r2:.4f}")
print(f"Test R¬≤: {test_r2:.4f}")


üìä Model Performance:
Train RMSE: 90.89
Test RMSE: 111.30
Train MAE: 51.53
Test MAE: 64.77
Train R¬≤: 0.9723
Test R¬≤: 0.9064


In [16]:
# Pair each input feature with the forest's importance score and rank them,
# largest first.
importance_table = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nüîç Feature Importance:")
print(feature_importance)


üîç Feature Importance:
            feature  importance
4             p_max    0.765811
3             p_min    0.220464
0         market_id    0.004766
5              Year    0.002667
6             Month    0.002127
2       district_id    0.001635
7               Day    0.001266
8  district_encoded    0.001264
1          state_id    0.000000


In [17]:
# Time-series cross-validation on the training split. TimeSeriesSplit cuts the
# rows by position, and the frame was sorted by date, so every fold validates
# on rows that come after the rows it trains on.
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(
    model,
    X_train_prepared,
    y_train,
    scoring='neg_mean_squared_error',
    cv=tscv,
)
# The scorer reports negated MSE (higher is better); flip the sign before
# taking the root to obtain per-fold RMSE.
cv_rmse_scores = np.sqrt(-cv_scores)
print(f"\nüìà Time Series CV RMSE: {cv_rmse_scores.mean():.2f} (¬±{cv_rmse_scores.std():.2f})")


üìà Time Series CV RMSE: 125.45 (¬±21.93)


In [18]:
# Save the fitted model (in both joblib and pickle formats) and the fitted
# preprocessing pipeline. The pickle files are written through context
# managers so each handle is flushed and closed, even if pickling raises.
dump(model, '../models/bajra_model.joblib')
with open('../models/bajra_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('../models/bajra_preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(my_pipeline, preprocessor_file)

print("\nüíæ Model and pipeline saved successfully!")


üíæ Model and pipeline saved successfully!


In [19]:
# Smoke test: push the first three held-out rows through the fitted
# preprocessor and model, then show predictions next to the actual targets.
sample_data = X_test.head(3)
print(f"\nüß™ Sample test data shape: {sample_data.shape}")
prepared_sample = my_pipeline.transform(sample_data)
predictions = model.predict(prepared_sample)

actual_values = y_test.head(3).values
print("Sample predictions:", predictions)
print("Actual values:", actual_values)


üß™ Sample test data shape: (3, 9)
Sample predictions: [1889.20203703 2146.05667606 2300.10272298]
Actual values: [1890. 2112. 2299.]


In [20]:
# Lookup tables for the API.
# district_mapping: district name -> its position in the encoder's classes_.
# market_mapping: market name -> the first market_id seen for that name.
district_mapping = {name: code for code, name in enumerate(le_district.classes_)}
first_market_ids = df.groupby('market_name')['market_id'].first()
market_mapping = first_market_ids.to_dict()

print(f"\nüåç District mappings: {list(district_mapping.keys())[:5]}...")
print(f"üè™ Market mappings: {list(market_mapping.keys())[:5]}...")


üåç District mappings: ['Ahmadnagar', 'Akola', 'Amravati', 'Aurangabad', 'Bid']...
üè™ Market mappings: ['Achalpur', 'Ahmednagar', 'Ahmedpur', 'Akkalkot', 'Akluj']...
