In [1]:
import pandas as pd
from scipy.stats import f_oneway

In [2]:
def assign_season(date):
    """Return the season label for a date, decided by its calendar month.

    June-September maps to 'Southwest Monsoon', October-November to
    'Post Monsoon', March-May to 'Summer'; any other month value maps to
    'Winter'.

    Parameters
    ----------
    date : object with a ``month`` attribute
        For example a ``datetime.date`` or a ``pandas.Timestamp``.

    Returns
    -------
    str
        One of the four season labels listed above.
    """
    season_by_month = {
        3: 'Summer', 4: 'Summer', 5: 'Summer',
        6: 'Southwest Monsoon', 7: 'Southwest Monsoon',
        8: 'Southwest Monsoon', 9: 'Southwest Monsoon',
        10: 'Post Monsoon', 11: 'Post Monsoon',
    }
    # Months without an entry (December, January, February) fall through to Winter.
    return season_by_month.get(date.month, 'Winter')

In [4]:
def wrangle(df, min_count=10):
    """Clean the raw price records and derive the analysis columns.

    Works on a copy, so the frame passed in by the caller is left untouched.

    Steps:
      * parse ``Arrival_Date`` from ``dd/mm/YYYY`` text into datetimes;
      * cast ``Max_Price`` and ``Modal_Price`` to float;
      * add boolean ``Is_VFPCK``: ``Market`` contains 'VFPCK' (case-insensitive);
      * add ``Season`` by applying ``assign_season`` to ``Arrival_Date``;
      * keep only commodities with more than ``min_count`` rows;
      * sort by ``Commodity`` and renumber the index from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Arrival_Date``, ``Max_Price``,
        ``Modal_Price``, ``Market`` and ``Commodity``.
    min_count : int, default 10
        A commodity is kept only when its row count is strictly greater
        than this value.

    Returns
    -------
    pandas.DataFrame
        A new, filtered and sorted frame with the added columns.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` / ``astype`` when a value cannot
        be parsed with the given format or converted to float.
    """
    # Copy first: the column assignments below would otherwise write into
    # the caller's DataFrame as a side effect.
    df = df.copy()

    df['Arrival_Date'] = pd.to_datetime(df['Arrival_Date'], format='%d/%m/%Y')
    # NOTE(review): Min_Price is not cast here and keeps whatever dtype it was read with.
    df[['Max_Price', 'Modal_Price']] = df[['Max_Price', 'Modal_Price']].astype(float)

    # na=False keeps the flag strictly boolean: a missing Market counts as "not VFPCK"
    # instead of producing NaN, which a later groupby on this column would drop.
    df['Is_VFPCK'] = df['Market'].str.contains('VFPCK', case=False, na=False)
    df['Season'] = df['Arrival_Date'].apply(assign_season)

    commodity_counts = df['Commodity'].value_counts()
    valid_commodities = commodity_counts[commodity_counts > min_count].index

    return (
        df[df['Commodity'].isin(valid_commodities)]
        .sort_values(by='Commodity')
        .reset_index(drop=True)
    )

# Read the raw price records and pass them straight through wrangle().
df = wrangle(pd.read_csv('data/commodity_prices.csv'))

# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code,Is_VFPCK,Season
count,97727,97727,97727,97727,97727,97727,97727,97727.0,97727.0,97727.0,97727.0,97727,97727
unique,1,1,28,54,61,5,,,,,,2,4
top,Kerala,Ernakulam,North Paravur,Banana,Other,FAQ,,,,,,False,Southwest Monsoon
freq,97727,97727,13064,11950,27280,79414,,,,,,80616,41570
mean,,,,,,,2025-01-06 20:07:13.943536640,4776.513899,6022.617813,5362.873842,106.712229,,
min,,,,,,,2024-06-02 00:00:00,0.0,0.0,300.0,18.0,,
25%,,,,,,,2024-09-23 00:00:00,2500.0,3500.0,3000.0,34.0,,
50%,,,,,,,2025-01-15 00:00:00,4000.0,5000.0,4500.0,86.0,,
75%,,,,,,,2025-04-24 00:00:00,5500.0,6500.0,6000.0,156.0,,
max,,,,,,,2025-08-04 00:00:00,75000.0,110000.0,90000.0,360.0,,


In [7]:
# Collect one array of modal prices per season
groups = [prices.values for _, prices in df.groupby('Season')['Modal_Price']]

# One-way ANOVA across the per-season price arrays
anova_result = f_oneway(*groups)
print(anova_result)

F_onewayResult(statistic=np.float64(77.43304865927817), pvalue=np.float64(5.047198443973974e-50))


In [8]:
# One-way ANOVA of modal price across seasons, run separately per commodity
anova_results = {}

for commodity, group in df.groupby('Commodity'):
    seasonal_groups = [
        prices.values
        for _, prices in group.groupby('Season')['Modal_Price']
    ]

    # A comparison needs at least two seasons for this commodity
    if len(seasonal_groups) < 2:
        continue

    stat, pval = f_oneway(*seasonal_groups)
    anova_results[commodity] = {'F-statistic': stat, 'p-value': pval}

# One row per commodity, smallest p-value first; after reset_index the
# commodity name sits in the 'index' column
anova_df = (
    pd.DataFrame(anova_results)
    .T
    .sort_values('p-value')
    .reset_index(drop=False)
)
anova_df

Unnamed: 0,index,F-statistic,p-value
0,Banana - Green,935.457634,0.0
1,Ashgourd,516.949012,6.215327999999999e-281
2,Drumstick,560.607389,1.7498859999999998e-276
3,Ginger(Green),522.649012,1.104253e-256
4,Onion,398.285085,1.021208e-211
5,Carrot,370.747544,3.239337e-199
6,Green Chilli,312.134245,1.778406e-171
7,Tomato,305.914572,4.951535e-171
8,Amphophalus,322.065506,7.023668000000001e-156
9,Potato,249.670665,2.1415940000000001e-141


In [9]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(97727, 13)

In [42]:
# All records whose Commodity is exactly 'Amaranthus'
df.loc[df['Commodity'].eq('Amaranthus')]

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code,Is_VFPCK,Season
328,Kerala,Ernakulam,Broadway market,Amaranthus,Amaranthus,FAQ,2025-06-23,3000.0,4000.0,3200.0,86,False,Southwest Monsoon
329,Kerala,Ernakulam,Perumbavoor,Amaranthus,Amaranthus,FAQ,2024-10-11,3000.0,4000.0,3500.0,86,False,Post Monsoon
330,Kerala,Ernakulam,Perumbavoor,Amaranthus,Amaranthus,FAQ,2024-12-06,3000.0,4000.0,3500.0,86,False,Winter
331,Kerala,Ernakulam,Ernakulam,Amaranthus,Amaranthus,FAQ,2024-11-15,2700.0,3000.0,2900.0,86,False,Post Monsoon
332,Kerala,Ernakulam,Perumbavoor,Amaranthus,Amaranthus,FAQ,2024-12-02,2800.0,3800.0,3500.0,86,False,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2820,Kerala,Ernakulam,Perumbavoor,Amaranthus,Amaranthus,FAQ,2025-04-02,2200.0,3200.0,2500.0,86,False,Summer
2821,Kerala,Ernakulam,Broadway market,Amaranthus,Amaranthus,FAQ,2024-10-26,2000.0,3000.0,2200.0,86,False,Post Monsoon
2822,Kerala,Ernakulam,Piravam,Amaranthus,Other,FAQ,2025-03-20,4000.0,5000.0,4500.0,86,False,Summer
2823,Kerala,Ernakulam,Kothamangalam,Amaranthus,Other,FAQ,2024-10-30,3500.0,4500.0,4000.0,86,False,Post Monsoon


In [36]:
# Mean modal price per season, highest first
(
    df.groupby('Season')['Modal_Price']
    .mean()
    .sort_values(ascending=False)
)

Season
Post Monsoon         5549.938006
Southwest Monsoon    5537.070099
Winter               5512.336498
Summer               4785.036447
Name: Modal_Price, dtype: float64

In [37]:
# Mean modal price per commodity, with one column per Is_VFPCK value
avg_prices = (
    df.groupby(['Commodity', 'Is_VFPCK'])['Modal_Price']
    .mean()
    .unstack('Is_VFPCK')
)
avg_prices

Is_VFPCK,False,True
Commodity,Unnamed: 1_level_1,Unnamed: 2_level_1
Alsandikai,4366.463415,
Amaranthus,3529.409401,4650.0
Amla(Nelli Kai),5839.634146,
Amphophalus,6573.954984,4752.272727
Arecanut(Betelnut/Supari),28798.511905,
Ashgourd,2426.073514,1744.169697
Banana,4117.953807,3787.923412
Banana - Green,5459.029126,5223.676681
Beetroot,5042.390139,
Bhindi(Ladies Finger),4070.262172,4344.444444


In [38]:
# Standard deviation of modal price per commodity, with one column per Is_VFPCK value
price_variation = (
    df.groupby(['Commodity', 'Is_VFPCK'])['Modal_Price']
    .std()
    .unstack('Is_VFPCK')
)

price_variation

Is_VFPCK,False,True
Commodity,Unnamed: 1_level_1,Unnamed: 2_level_1
Alsandikai,855.797101,
Amaranthus,928.156219,826.351707
Amla(Nelli Kai),716.952342,
Amphophalus,1003.916763,570.547936
Arecanut(Betelnut/Supari),3089.437594,
Ashgourd,876.377859,970.39431
Banana,1626.099106,1620.792724
Banana - Green,1123.96159,1102.358536
Beetroot,1695.491094,
Bhindi(Ladies Finger),1039.861005,531.768538


In [None]:
# Mean modal price of Tomato records, split by Grade
(
    df.loc[df['Commodity'].eq('Tomato')]
    .groupby('Grade')['Modal_Price']
    .mean()
)

Grade
FAQ        4111.714507
Non-FAQ    3696.296296
Name: Modal_Price, dtype: float64

In [18]:
# Mean modal price of Tomato records, split by Season
(
    df.loc[df['Commodity'].eq('Tomato')]
    .groupby('Season')['Modal_Price']
    .mean()
)

Season
Post Monsoon         4643.686869
Southwest Monsoon    4766.466036
Summer               2918.093700
Winter               3570.841121
Name: Modal_Price, dtype: float64

In [19]:
# Mean modal price of Tomato records, split by Market
(
    df.loc[df['Commodity'].eq('Tomato')]
    .groupby('Market')['Modal_Price']
    .mean()
)

Market
Aluva              3761.688312
Angamaly           4649.834983
Broadway market    4447.528517
Ernakulam          3716.428571
Kothamangalam      3616.613419
North Paravur      3636.085627
Perumbavoor        3694.886364
Piravam            4668.924303
Thrippunithura     4834.493671
Name: Modal_Price, dtype: float64

In [21]:
# Mean modal price for every (Commodity, Market) pair, as a flat frame
grouped = (
    df.groupby(['Commodity', 'Market'])['Modal_Price']
    .mean()
    .reset_index()
)

# For each commodity keep the row of the market with the lowest mean price,
# ordered by commodity and renumbered from 0
cheapest_markets = (
    grouped.loc[grouped.groupby('Commodity')['Modal_Price'].idxmin()]
    .sort_values(by='Commodity')
    .reset_index(drop=True)
)

cheapest_markets

Unnamed: 0,Commodity,Market,Modal_Price
0,Alsandikai,North Paravur,4366.463415
1,Amaranthus,North Paravur,2478.963415
2,Amla(Nelli Kai),North Paravur,5839.634146
3,Amphophalus,Nedungapra VFPCK,4116.666667
4,Arecanut(Betelnut/Supari),Perumbavoor,28785.970149
5,Ashgourd,Amalapuram VFPCK,962.5
6,Banana,Aluva,2694.378698
7,Banana - Green,Keezhampara VFPCK,4927.920228
8,Beetroot,Perumbavoor,3186.968839
9,Bhindi(Ladies Finger),Perumbavoor,3268.926554


In [23]:
# Number of commodities for which each market has the lowest mean price
cheapest_markets.loc[:, 'Market'].value_counts()

Market
North Paravur            11
Perumbavoor              11
Moovattupuzha             6
Kothamangalam             5
Piravam                   5
Aluva                     3
Edackattuvayal  VFPCK     3
Keezhampara VFPCK         2
Angamaly                  2
Thiruvaniyoor  VFPCK      2
Koovapadi VFPCK           2
Nedungapra  VFPCK         1
Amalapuram  VFPCK         1
Mazhuvannur VFPCK         1
KARUMALOOR VFPCK          1
Thrippunithura            1
Name: count, dtype: int64