In [1]:
import pandas as pd
import numpy as np

from utils.load_data_for_eda import load_data
from utils.effect_sizes import compute_effect_sizes_by_group, compute_effect_sizes_by_commodity
df = load_data('data/commodity_prices.csv')
df = compute_effect_sizes_by_group(df)
df = compute_effect_sizes_by_commodity(df)

In [2]:
df.head(10)

Unnamed: 0,Product_Type,Commodity,Variety_Type,Arrival_Date,Market,Is_VFPCK,Season,Year,Modal_Price,Max_Price,...,eta2_Season,omega2_Season,eta2_Market,omega2_Market,eta2_Year,omega2_Year,eta2_Commodity,omega2_Commodity,eta2_Variety_Type,omega2_Variety_Type
0,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-13,North Paravur,False,Winter,2023,5200.0,6000.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
1,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-14,North Paravur,False,Winter,2023,6200.0,6500.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
2,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-16,North Paravur,False,Winter,2023,4800.0,5600.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
3,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-18,North Paravur,False,Winter,2023,3500.0,4500.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
4,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-19,North Paravur,False,Winter,2023,5500.0,6000.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
5,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-20,North Paravur,False,Winter,2023,5500.0,6000.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
6,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-21,North Paravur,False,Winter,2023,4500.0,5600.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
7,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-22,North Paravur,False,Winter,2023,4500.0,5600.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
8,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-23,North Paravur,False,Winter,2023,5500.0,6000.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0
9,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-26,North Paravur,False,Winter,2023,3400.0,5600.0,...,0.386076,0.38163,0.0,0.0,0.016996,0.012769,0.0,0.0,0.0,0.0


In [3]:

def compute_effect_sizes_by_commodity(df, group_cols=['Commodity', 'Variety_Type']):
    for group_col in group_cols:
        for commodity, group in df.groupby(group_col):
            group['Log_Modal_Price'] = group['Modal_Price'].apply(lambda x: np.log(x) if x > 0 else np.nan)
            prices = [grp['Log_Modal_Price'].values for _, grp in group.groupby('Product_Type')]
            mean_prices = [np.mean(p) for p in prices]
            grand_mean = np.mean(np.concatenate(prices))
            size_prices = [len(p) for p in prices]
            variance_prices = [np.var(p, ddof=1) if len(p) > 1 else 0 for p in prices]
            SS_between = sum([(mean_prices[i] - grand_mean)**2 * size_prices[i] for i in range(len(prices))])
            SS_within = sum([(size_prices[i] - 1) * variance_prices[i] for i in range(len(prices))])
            SS_total = SS_between + SS_within
            eta2 = SS_between / SS_total if SS_total > 0 else 0
            df_between = len(prices) - 1
            df_within = sum(size_prices) - len(prices)
            MS_within = SS_within / df_within if df_within > 0 else 0
            omega2 = (SS_between - df_between * MS_within) / (SS_total + MS_within) if (SS_total + MS_within) > 0 else 0
            df.loc[group.index, f'eta2_{group_col}'] = eta2
            df.loc[group.index, f'omega2_{group_col}'] = omega2
    return df
        

In [4]:
df = compute_effect_sizes_by_commodity(df)

In [3]:
df[df['eta2_Commodity'] >= 0.1][['Product_Type', 'eta2_Commodity', 'omega2_Commodity']].drop_duplicates()

Unnamed: 0,Product_Type,eta2_Commodity,omega2_Commodity
471,Amaranthus|Amaranthus|FAQ,0.149755,0.14953
3648,Amaranthus|Other|FAQ,0.149755,0.14953
20696,Banana|Nendra Bale|Large,0.64585,0.645579
25696,Banana|Nendra Bale|Medium,0.64585,0.645579
25837,Banana|Nendra Bale|Small,0.64585,0.645579
25870,Banana|Other|Large,0.64585,0.645579
25945,Banana|Other|Medium,0.64585,0.645579
26028,Banana|Palayamthodan|Large,0.64585,0.645579
29555,Banana|Palayamthodan|Medium,0.64585,0.645579
32172,Banana|Palayamthodan|Small,0.64585,0.645579


In [4]:
df[df['eta2_Variety_Type'] >= 0.1][['Product_Type', 'eta2_Variety_Type', 'omega2_Variety_Type']].drop_duplicates()

Unnamed: 0,Product_Type,eta2_Variety_Type,omega2_Variety_Type
26028,Banana|Palayamthodan|Large,0.12356,0.12326
29555,Banana|Palayamthodan|Medium,0.12356,0.12326
32172,Banana|Palayamthodan|Small,0.12356,0.12326
126416,Onion|1st Sort|FAQ,0.199267,0.198114
127142,Onion|1st Sort|Non-FAQ,0.199267,0.198114
132863,Pineapple|Other|Large,0.284891,0.284471
134808,Pineapple|Other|Medium,0.284891,0.284471
136209,Pineapple|Other|Small,0.284891,0.284471


“While market, season, and year effects dominate overall, certain commodities (notably Banana, Pineapple, Onion) show strong within-commodity variety-level effects. This justifies treating Commodity and Variety as important features in price modeling.”

In [14]:
for commodity, group in df.groupby('Variety_Type'):
    group['Log_Modal_Price'] = group['Modal_Price'].apply(lambda x: np.log(x) if x > 0 else np.nan)
    prices = [grp['Log_Modal_Price'].values for _, grp in group.groupby('Product_Type')]
    mean_prices = [np.mean(p) for p in prices]
    grand_mean = np.mean(np.concatenate(prices))
    size_prices = [len(p) for p in prices]
    variance_prices = [np.var(p, ddof=1) if len(p) > 1 else 0 for p in prices]
    SS_between = sum([(mean_prices[i] - grand_mean)**2 * size_prices[i] for i in range(len(prices))])
    SS_within = sum([(size_prices[i] - 1) * variance_prices[i] for i in range(len(prices))])
    SS_total = SS_between + SS_within
    eta2 = SS_between / SS_total if SS_total > 0 else 0
    df_between = len(prices) - 1
    df_within = sum(size_prices) - len(prices)
    MS_within = SS_within / df_within if df_within > 0 else 0
    omega2 = (SS_between - df_between * MS_within) / (SS_total + MS_within) if (SS_total + MS_within) > 0 else 0
    df.loc[group.index, 'eta2_variety'] = eta2
    df.loc[group.index,'omega2_variety'] = omega2

    #print(f"Variety_Type: {commodity}, Eta²: {eta2:.4f}, Omega²: {omega2:.4f}")

In [13]:
df_simple = df[(df['eta2_Market'] == 0.0) & (df['eta2_Year'] == 0.0)]

In [30]:
df[df['Product_Type'].str.contains('Coriander')]['Product_Type'].value_counts()

Product_Type
Coriander(Leaves)|Coriander|FAQ    449
Coriander(Leaves)|Other|FAQ        151
Name: count, dtype: int64

In [33]:
df[df['Product_Type'] == 'Coriander(Leaves)|Coriander|FAQ']['Season'].value_counts()

Season
Summer               142
Southwest Monsoon    129
Winter               108
Post Monsoon          70
Name: count, dtype: int64

In [32]:
df[df['Product_Type'] == 'Coriander(Leaves)|Other|FAQ']['Market'].value_counts()

Market
Piravam    151
Name: count, dtype: int64

In [16]:
df_simple['Product_Type'].value_counts()

Product_Type
Coriander(Leaves)|Other|FAQ      151
Orange|Other|Large               149
Sweet Potato|Other|FAQ           140
Tomato|Deshi|Non-FAQ             122
Water Melon|Water Melon|Large    119
Onion|1st Sort|Non-FAQ           108
Potato|(Red Nanital)|Non-FAQ     108
Long Melon(Kakri)|Other|FAQ       98
Egg|Egg|FAQ                       26
Name: count, dtype: int64

In [26]:
df_simple[['Product_Type', 'eta2_Season', 'omega2_Season']].drop_duplicates()

Unnamed: 0,Product_Type,eta2_Season,omega2_Season
83938,Coriander(Leaves)|Other|FAQ,0.175735,0.163685
97951,Egg|Egg|FAQ,0.5,0.446809
122165,Long Melon(Kakri)|Other|FAQ,0.234177,0.216311
127142,Onion|1st Sort|Non-FAQ,0.035741,0.017216
131281,Orange|Other|Large,0.534722,0.52667
141142,Potato|(Red Nanital)|Non-FAQ,0.047642,0.029236
156151,Sweet Potato|Other|FAQ,0.359891,0.348916
163816,Tomato|Deshi|Non-FAQ,0.604214,0.595581
167831,Water Melon|Water Melon|Large,0.100681,0.084521


In [23]:
df[df['Product_Type'] == 'Orange|Other|Large']['Market'].value_counts()

Market
Piravam    149
Name: count, dtype: int64

In [3]:
df['Product_Type'].nunique()

124

In [5]:
df[df['eta2_Season'] == 0.0]

Unnamed: 0,Product_Type,Arrival_Date,Market,Is_VFPCK,Season,Year,Modal_Price,Max_Price,Min_Price,eta2_Season,omega2_Season,eta2_Market,omega2_Market,eta2_Year,omega2_Year


In [4]:
df['eta2_Market'].nunique(), df['omega2_Market'].nunique(), df['eta2_Season'].nunique(), df['omega2_Season'].nunique(), df['eta2_Year'].nunique(), df['omega2_Year'].nunique(), df['eta2_Commodity'].nunique(), df['omega2_Commodity'].nunique()

(98, 98, 124, 124, 112, 112, 1, 1)

In [86]:
df['Product_Type'].nunique()

124

In [None]:

for product, group in df.groupby('Product_Type'):
    group['log_Modal_Price'] = group["Modal_Price"].apply(lambda X: np.log(X) if X > 0 else np.nan)
    prices = [grp['log_Modal_Price'].values for _, grp in group.groupby('Season')]
    mean_prices = [np.mean(p) for p in prices]
    grand_mean = np.mean(np.concatenate(prices))
    size_prices = [len(p) for p in prices]
    variance_prices = [np.var(p, ddof=1) if len(p) > 1 else 0 for p in prices]
    SS_between = sum([(mean_prices[i] - grand_mean)**2 * size_prices[i] for i in range(len(prices))])
    SS_within = sum([(size_prices[i] - 1) * variance_prices[i] for i in range(len(prices))])
    SS_total = SS_between + SS_within
    eta2_season = SS_between / SS_total if SS_total > 0 else 0
    df_between = len(prices) - 1
    df_within = sum(size_prices) - len(prices)
    MS_within = SS_within / df_within if df_within > 0 else 0
    omega2_season = (SS_between - df_between * MS_within) / (SS_total + MS_within) if (SS_total + MS_within) > 0 else 0
    df['eta2_season'] = eta2_season
    df['omega2_season'] = omega2_season
# print(prices)
# # print(prices)
# # print(grand_mean)
# # print(mean_prices)
# # print(variance_prices)
# print(eta2_season, omega2_season)

In [6]:

for product, group in df.groupby('Product_Type'):
    group['log_Modal_Price'] = group["Modal_Price"].apply(lambda X: np.log(X) if X > 0 else np.nan)
    prices = [grp['log_Modal_Price'].values for _, grp in group.groupby('Commodity')]
    mean_prices = [np.mean(p) for p in prices]
    grand_mean = np.mean(np.concatenate(prices))
    size_prices = [len(p) for p in prices]
    variance_prices = [np.var(p, ddof=1) if len(p) > 1 else 0 for p in prices]
    SS_between = sum([(mean_prices[i] - grand_mean)**2 * size_prices[i] for i in range(len(prices))])
    SS_within = sum([(size_prices[i] - 1) * variance_prices[i] for i in range(len(prices))])
    SS_total = SS_between + SS_within
    eta2_market = SS_between / SS_total if SS_total > 0 else 0
    df_between = len(prices) - 1
    df_within = sum(size_prices) - len(prices)
    MS_within = SS_within / df_within if df_within > 0 else 0
    omega2_market = (SS_between - df_between * MS_within) / (SS_total + MS_within) if (SS_total + MS_within) > 0 else 0
    df['eta2_season'] = eta2_market
    df['omega2_season'] = omega2_market
print(prices)
# # print(prices)
print(grand_mean)
print(mean_prices)
print(variance_prices)
print(SS_between, SS_within, SS_total)
print(df_between, df_within, MS_within)
print(eta2_market, omega2_market)
print()

[array([8.00636757, 7.82404601, 7.82404601, 7.82404601, 7.82404601,
       7.82404601, 8.51719319, 7.82404601, 8.51719319, 8.51719319,
       8.51719319, 7.82404601, 7.82404601, 8.00636757, 8.00636757,
       8.00636757, 8.00636757, 7.82404601, 7.82404601, 7.82404601,
       7.82404601, 7.82404601, 7.82404601, 8.00636757, 7.82404601,
       7.82404601, 8.00636757, 7.82404601, 7.82404601, 7.82404601,
       7.82404601, 7.82404601, 7.82404601, 8.16051825, 7.82404601,
       8.16051825, 8.16051825, 7.82404601, 7.82404601, 8.16051825,
       8.16051825, 8.00636757, 8.16051825, 8.16051825, 7.82404601,
       7.82404601, 8.16051825, 7.82404601, 8.16051825, 8.00636757,
       7.82404601, 8.16051825, 7.60090246, 7.60090246, 7.60090246,
       7.60090246, 8.00636757, 7.60090246, 7.60090246, 7.60090246,
       7.82404601, 7.60090246, 7.37775891, 8.16051825, 7.60090246,
       7.60090246, 7.24422752, 7.60090246, 7.60090246, 7.60090246,
       7.60090246, 7.69621264, 7.60090246, 7.60090246, 7.6009

In [56]:
df.head(10)

Unnamed: 0,Product_Type,Arrival_Date,Market,Is_VFPCK,Season,Year,Modal_Price,Max_Price,Min_Price,eta2_season,omega2_season
0,Alsandikai|Alsandikai|FAQ,2023-12-13,North Paravur,False,Winter,2023,5200.0,6000.0,5000.0,0.0,0.0
1,Alsandikai|Alsandikai|FAQ,2023-12-14,North Paravur,False,Winter,2023,6200.0,6500.0,6000.0,0.0,0.0
2,Alsandikai|Alsandikai|FAQ,2023-12-16,North Paravur,False,Winter,2023,4800.0,5600.0,4600.0,0.0,0.0
3,Alsandikai|Alsandikai|FAQ,2023-12-18,North Paravur,False,Winter,2023,3500.0,4500.0,3000.0,0.0,0.0
4,Alsandikai|Alsandikai|FAQ,2023-12-19,North Paravur,False,Winter,2023,5500.0,6000.0,5500.0,0.0,0.0
5,Alsandikai|Alsandikai|FAQ,2023-12-20,North Paravur,False,Winter,2023,5500.0,6000.0,5000.0,0.0,0.0
6,Alsandikai|Alsandikai|FAQ,2023-12-21,North Paravur,False,Winter,2023,4500.0,5600.0,4000.0,0.0,0.0
7,Alsandikai|Alsandikai|FAQ,2023-12-22,North Paravur,False,Winter,2023,4500.0,5600.0,4000.0,0.0,0.0
8,Alsandikai|Alsandikai|FAQ,2023-12-23,North Paravur,False,Winter,2023,5500.0,6000.0,5000.0,0.0,0.0
9,Alsandikai|Alsandikai|FAQ,2023-12-26,North Paravur,False,Winter,2023,3400.0,5600.0,3000.0,0.0,0.0


In [43]:
group.shape

(119, 12)

In [46]:
df[df['Product_Type'] == 'Water Melon|Water Melon|Large']['Season'].shape

(119,)

In [41]:
group.head(10)

Unnamed: 0,Product_Type,Arrival_Date,Market,Is_VFPCK,Season,Year,Modal_Price,Max_Price,Min_Price,log_Modal_Price,eta2_season,omega2_season
167831,Water Melon|Water Melon|Large,2025-02-07,Piravam,False,Winter,2025,3000.0,4000.0,2000.0,8.006368,0.100681,0.084521
167832,Water Melon|Water Melon|Large,2025-02-09,Piravam,False,Winter,2025,2500.0,3000.0,2000.0,7.824046,0.100681,0.084521
167833,Water Melon|Water Melon|Large,2025-02-10,Piravam,False,Winter,2025,2500.0,3000.0,2000.0,7.824046,0.100681,0.084521
167834,Water Melon|Water Melon|Large,2025-02-12,Piravam,False,Winter,2025,2500.0,3000.0,2000.0,7.824046,0.100681,0.084521
167835,Water Melon|Water Melon|Large,2025-02-13,Piravam,False,Winter,2025,2500.0,3000.0,2000.0,7.824046,0.100681,0.084521
167836,Water Melon|Water Melon|Large,2025-02-14,Piravam,False,Winter,2025,2500.0,3000.0,2000.0,7.824046,0.100681,0.084521
167837,Water Melon|Water Melon|Large,2025-02-16,Piravam,False,Winter,2025,5000.0,6000.0,4000.0,8.517193,0.100681,0.084521
167838,Water Melon|Water Melon|Large,2025-02-17,Piravam,False,Winter,2025,2500.0,3000.0,2000.0,7.824046,0.100681,0.084521
167839,Water Melon|Water Melon|Large,2025-02-21,Piravam,False,Winter,2025,5000.0,6000.0,4000.0,8.517193,0.100681,0.084521
167840,Water Melon|Water Melon|Large,2025-02-23,Piravam,False,Winter,2025,5000.0,6000.0,4000.0,8.517193,0.100681,0.084521


In [48]:
df.head(10)

Unnamed: 0,Product_Type,Arrival_Date,Market,Is_VFPCK,Season,Year,Modal_Price,Max_Price,Min_Price,log_Modal_Price,eta2_season,omega2_season
0,Alsandikai|Alsandikai|FAQ,2023-12-13,North Paravur,False,Winter,2023,5200.0,6000.0,5000.0,8.556414,0.100681,0.084521
1,Alsandikai|Alsandikai|FAQ,2023-12-14,North Paravur,False,Winter,2023,6200.0,6500.0,6000.0,8.732305,0.100681,0.084521
2,Alsandikai|Alsandikai|FAQ,2023-12-16,North Paravur,False,Winter,2023,4800.0,5600.0,4600.0,8.476371,0.100681,0.084521
3,Alsandikai|Alsandikai|FAQ,2023-12-18,North Paravur,False,Winter,2023,3500.0,4500.0,3000.0,8.160518,0.100681,0.084521
4,Alsandikai|Alsandikai|FAQ,2023-12-19,North Paravur,False,Winter,2023,5500.0,6000.0,5500.0,8.612503,0.100681,0.084521
5,Alsandikai|Alsandikai|FAQ,2023-12-20,North Paravur,False,Winter,2023,5500.0,6000.0,5000.0,8.612503,0.100681,0.084521
6,Alsandikai|Alsandikai|FAQ,2023-12-21,North Paravur,False,Winter,2023,4500.0,5600.0,4000.0,8.411833,0.100681,0.084521
7,Alsandikai|Alsandikai|FAQ,2023-12-22,North Paravur,False,Winter,2023,4500.0,5600.0,4000.0,8.411833,0.100681,0.084521
8,Alsandikai|Alsandikai|FAQ,2023-12-23,North Paravur,False,Winter,2023,5500.0,6000.0,5000.0,8.612503,0.100681,0.084521
9,Alsandikai|Alsandikai|FAQ,2023-12-26,North Paravur,False,Winter,2023,3400.0,5600.0,3000.0,8.131531,0.100681,0.084521
