In [1]:
import pandas as pd

df = pd.read_csv("../data/processed/onion_maharashtra_cleaned.csv")
print("Loaded shape:", df.shape)

df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date'])
df = df.sort_values('Date')

df[['Date', 'Market', 'Modal_Price']].head()

Loaded shape: (10877, 11)


Unnamed: 0,Date,Market,Modal_Price
0,2023-06-06,Pathardi,650.0
26,2023-06-06,Pimpalgaon,1051.0
27,2023-06-06,Pimpalgaon Baswant(Saykheda),750.0
28,2023-06-06,Vashi New Mumbai,1000.0
29,2023-06-06,Manchar,1050.0


In [2]:
daily = (
    df.groupby("Date", as_index=False)
      .agg(
          Avg_Modal_Price = ('Modal_Price', 'mean'),
          Min_Price       = ('Min_Price', 'mean'),
          Max_Price       = ('Max_Price', 'mean'),
          Num_Markets     = ('Market', 'nunique')
      )
)

daily = daily.sort_values("Date")
print("Daily shape:", daily.shape)
daily.head()

Daily shape: (255, 5)


Unnamed: 0,Date,Avg_Modal_Price,Min_Price,Max_Price,Num_Markets
0,2023-06-06,845.574468,403.744681,1227.255319,47
1,2023-06-07,844.636364,361.5,1204.795455,43
2,2023-06-08,847.285714,424.0,1202.979592,48
3,2023-06-09,868.477273,415.772727,1273.090909,44
4,2023-06-10,910.363636,451.204545,1246.75,41


In [3]:
data = daily.copy()

# Time features
data['day_of_week'] = data['Date'].dt.weekday
data['month'] = data['Date'].dt.month
data['weekofyear'] = data['Date'].dt.isocalendar().week.astype(int)

# Lag features
data['lag_1'] = data['Avg_Modal_Price'].shift(1)
data['lag_3'] = data['Avg_Modal_Price'].shift(3)
data['lag_7'] = data['Avg_Modal_Price'].shift(7)

# Rolling windows
data['roll_mean_7'] = data['Avg_Modal_Price'].rolling(7).mean()
data['roll_std_7'] = data['Avg_Modal_Price'].rolling(7).std()
data['roll_mean_14'] = data['Avg_Modal_Price'].rolling(14).mean()

In [4]:
feature_cols = [
    'day_of_week', 'month', 'weekofyear',
    'lag_1', 'lag_3', 'lag_7',
    'roll_mean_7', 'roll_std_7', 'roll_mean_14'
]

target_col = "Avg_Modal_Price"

data_ml = data.dropna(subset=feature_cols + [target_col]).copy()
data_ml = data_ml.sort_values("Date")

print("ML dataset shape:", data_ml.shape)
data_ml.head()

ML dataset shape: (242, 14)


Unnamed: 0,Date,Avg_Modal_Price,Min_Price,Max_Price,Num_Markets,day_of_week,month,weekofyear,lag_1,lag_3,lag_7,roll_mean_7,roll_std_7,roll_mean_14
13,2023-07-07,1223.711765,507.686275,1749.117647,50,4,7,27,1323.141071,1248.027308,861.78,1239.829119,49.209977,1058.830687
14,2023-07-08,1225.930233,555.27907,1729.395349,41,5,7,27,1223.711765,1263.369074,1207.226905,1242.501023,47.62708,1085.998956
15,2023-07-09,1276.416667,702.083333,1750.833333,24,6,7,27,1225.930233,1323.141071,1248.166667,1246.536737,49.352803,1116.840406
16,2023-07-10,1131.128824,433.392157,1659.980392,51,0,7,28,1276.416667,1223.711765,1165.161042,1241.674992,59.365705,1137.114914
17,2023-07-11,1224.783929,574.089286,1674.5,55,1,7,28,1131.128824,1225.930233,1248.027308,1238.354509,59.600753,1162.565389


In [5]:
n = len(data_ml)
split_idx = int(n * 0.8)

train = data_ml.iloc[:split_idx].copy()
valid = data_ml.iloc[split_idx:].copy()

print("Train shape:", train.shape)
print("Valid shape:", valid.shape)

print("Train dates:", train['Date'].min(), "→", train['Date'].max())
print("Valid dates:", valid['Date'].min(), "→", valid['Date'].max())

Train shape: (193, 14)
Valid shape: (49, 14)
Train dates: 2023-07-07 00:00:00 → 2025-02-10 00:00:00
Valid dates: 2025-02-11 00:00:00 → 2025-06-11 00:00:00


In [6]:
data_ml.to_csv("../data/processed/onion_maharashtra_features.csv", index=False)
train.to_csv("../data/processed/onion_maharashtra_train.csv", index=False)
valid.to_csv("../data/processed/onion_maharashtra_valid.csv", index=False)

print("Saved ML datasets.")

Saved ML datasets.
