In [21]:
import pandas as pd

In [29]:
# The newer export is read with an explicit column list; the older file
# supplies its own header row.
new_columns = ["Commodity", "Date", "Unit", "Minimum", "Maximum", "Average"]

df_old = pd.read_csv("data/kalimati_old.csv")
df_new = pd.read_csv("data/kalimati_new.csv", names=new_columns)

In [30]:
# Remove the 'SN' column, which is not among the column names used for the
# newer file, so that both frames share the same six columns.
# errors='ignore' keeps this cell re-runnable: once 'SN' has been dropped, a
# second execution is a no-op instead of raising KeyError.
df_old = df_old.drop(columns='SN', errors='ignore')
print(df_old.shape)
df_old.head()

(197161, 6)


Unnamed: 0,Commodity,Date,Unit,Minimum,Maximum,Average
0,Tomato Big(Nepali),2013-06-16,Kg,35.0,40.0,37.5
1,Tomato Small(Local),2013-06-16,Kg,26.0,32.0,29.0
2,Potato Red,2013-06-16,Kg,20.0,21.0,20.5
3,Potato White,2013-06-16,Kg,15.0,16.0,15.5
4,Onion Dry (Indian),2013-06-16,Kg,28.0,30.0,29.0


In [31]:
# Row/column count and first rows of the newer file, for comparison with df_old.
print(df_new.shape)
df_new.head()

(96480, 6)


Unnamed: 0,Commodity,Date,Unit,Minimum,Maximum,Average
0,Tomato Big(Nepali),1/5/2021,Kg,50,60,55.0
1,Tomato Big(Indian),1/5/2021,Kg,50,60,55.0
2,Tomato Small(Local),1/5/2021,Kg,30,35,32.5
3,Tomato Small(Tunnel),1/5/2021,Kg,30,35,32.5
4,Tomato Small(Indian),1/5/2021,KG,40,45,42.5


In [32]:
# Stack the two periods into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each input contributes its own
# 0-based labels and the combined index contains duplicates, which makes
# label-based selection and index alignment ambiguous.
# NOTE(review): the previews show 'Date' as 2013-06-16 in the old file and
# 1/5/2021 in the new one. The column is not normalised in the cells visible
# here -- confirm the day/month order of the new file before parsing it.
df_final = pd.concat([df_old, df_new], axis=0, ignore_index=True)
print(df_final.shape)
df_final.head()

(293641, 6)


Unnamed: 0,Commodity,Date,Unit,Minimum,Maximum,Average
0,Tomato Big(Nepali),2013-06-16,Kg,35.0,40.0,37.5
1,Tomato Small(Local),2013-06-16,Kg,26.0,32.0,29.0
2,Potato Red,2013-06-16,Kg,20.0,21.0,20.5
3,Potato White,2013-06-16,Kg,15.0,16.0,15.5
4,Onion Dry (Indian),2013-06-16,Kg,28.0,30.0,29.0


In [37]:
# Distinct commodity names across both files; the count is printed first and
# the array itself is shown as the cell result.
unique_items = df_final['Commodity'].unique()
print(unique_items.size)
unique_items

136


array(['Tomato Big(Nepali)', 'Tomato Small(Local)', 'Potato Red',
       'Potato White', 'Onion Dry (Indian)', 'Carrot(Local)',
       'Cabbage(Local)', 'Cauli Local', 'Raddish Red',
       'Raddish White(Local)', 'Brinjal Long', 'Brinjal Round',
       'Cow pea(Long)', 'Green Peas', 'French Bean(Local)',
       'Soyabean Green', 'Bitter Gourd', 'Bottle Gourd',
       'Pointed Gourd(Local)', 'Snake Gourd', 'Smooth Gourd',
       'Sponge Gourd', 'Pumpkin', 'Squash(Long)', 'Turnip', 'Okara',
       'Christophine', 'Brd Leaf Mustard', 'Spinach Leaf', 'Cress Leaf',
       'Mustard Leaf', 'Fenugreek Leaf', 'Onion Green', 'Mushroom(Kanya)',
       'Asparagus', 'Neuro', 'Brocauli', 'Sugarbeet', 'Drumstick',
       'Red Cabbbage', 'Lettuce', 'Celery', 'Parseley', 'Fennel Leaf',
       'Mint', 'Turnip A', 'Tamarind', 'Bamboo Shoot', 'Tofu', 'Gundruk',
       'Apple(Jholey)', 'Banana', 'Lime', 'Pomegranate', 'Mango(Maldah)',
       'Grapes(Green)', 'Water Melon(Green)', 'Sweet Orange', 'Pineappl

In [40]:
# Distinct unit spellings present in the combined data, inspected before
# normalising them.
unique_units = df_final['Unit'].unique()
print(unique_units.size)
unique_units

6


array(['Kg', 'Doz', '1 Pc', 'KG', 'Per Dozen', 'Per Piece'], dtype=object)

In [42]:
# Canonical unit labels: every spelling seen in the data collapses to one of
# 'KG', 'DOZ' or 'PC'.
unit_mapping = {
    'Kg': 'KG',
    'KG': 'KG',
    'Doz': 'DOZ',
    'Per Dozen': 'DOZ',
    '1 Pc': 'PC',
    'Per Piece': 'PC'
}

# Look each value up with a fallback to itself instead of Series.map(dict).
# map(dict) yields NaN for any key that is not in the dict, so running this
# cell a second time would wipe the already-normalised 'DOZ' and 'PC' labels,
# and any unit spelling not listed above would be blanked silently.
df_final['Unit'] = df_final['Unit'].map(lambda unit: unit_mapping.get(unit, unit))

In [43]:
# Sanity check: list the distinct unit labels left after normalisation.
df_final['Unit'].unique()

array(['KG', 'DOZ', 'PC'], dtype=object)

In [57]:
def classify_commodities(df, debug=True):
    """
    Tag every row with the produce group and broad category of its commodity.

    The input frame is left untouched; a copy is returned with two extra
    columns looked up from the 'Commodity' column through a fixed table of
    exact commodity names. A commodity that is not in the table gets a
    missing value in both new columns.

    Parameters:
    df (pandas.DataFrame): Frame that has a 'Commodity' column
    debug (bool): When True, print each distinct commodity that has no group

    Returns:
    pandas.DataFrame: Copy of df with 'Group' and 'Category' columns added
    """
    # Category -> group -> commodity names, spelled exactly as in the data
    # (including source misspellings such as 'Red Cabbbage' and 'Parseley').
    taxonomy = {
        'Vegetable': {
            'Tomato': [
                'Tomato Big(Nepali)', 'Tomato Small(Local)', 'Tomato Small(Tunnel)',
                'Tomato Big(Indian)', 'Tomato Small(Indian)', 'Tomato Small(Terai)',
                'Tree Tomato',
            ],
            'Potato': [
                'Potato Red', 'Potato White', 'Potato Red(Indian)', 'Potato Red(Mude)',
                'Sweet Potato',
            ],
            'Onion': ['Onion Dry (Indian)', 'Onion Green', 'Onion Dry (Chinese)'],
            'Root Vegetables': [
                'Carrot(Local)', 'Carrot(Terai)', 'Turnip', 'Turnip A', 'Raddish Red',
                'Raddish White(Local)', 'Raddish White(Hybrid)', 'Sugarbeet', 'Bakula',
            ],
            'Cabbage': [
                'Cabbage(Local)', 'Cabbage(Terai)', 'Cabbage', 'Red Cabbbage',
                'Cauli Local', 'Cauli Terai', 'Cauli Local(Jyapu)', 'Brocauli',
            ],
            'Eggplant': ['Brinjal Long', 'Brinjal Round'],
            'Beans & Peas': [
                'Cow pea(Long)', 'Cowpea(Short)', 'Green Peas', 'French Bean(Local)',
                'French Bean(Hybrid)', 'French Bean(Rajma)', 'Sword Bean', 'Soyabean Green',
            ],
            'Gourd': [
                'Bitter Gourd', 'Bottle Gourd', 'Pointed Gourd(Local)', 'Pointed Gourd(Terai)',
                'Snake Gourd', 'Smooth Gourd', 'Sponge Gourd', 'Pumpkin',
            ],
            'Leafy Greens': [
                'Brd Leaf Mustard', 'Spinach Leaf', 'Cress Leaf', 'Mustard Leaf',
                'Fenugreek Leaf', 'Lettuce', 'Celery', 'Parseley', 'Fennel Leaf', 'Mint',
            ],
            'Chili & Pepper': [
                'Chilli Dry', 'Chilli Green', 'Chilli Green(Bullet)',
                'Chilli Green(Machhe)', 'Chilli Green(Akbare)', 'Capsicum',
            ],
            'Garlic': ['Garlic Green', 'Garlic Dry Chinese', 'Garlic Dry Nepali'],
            'Other Vegetables': [
                'Squash(Long)', 'Squash(Round)', 'Okara', 'Christophine', 'Asparagus',
                'Neuro', 'Drumstick', 'Bamboo Shoot', 'Tofu', 'Gundruk', 'Arum', 'Yam',
                'Knolkhol', 'Bauhania flower', 'Barela',
            ],
            'Mushroom': ['Mushroom(Kanya)', 'Mushroom(Button)'],
        },
        'Fruit': {
            'Citrus': [
                'Lime', 'Sweet Lime', 'Lemon', 'Orange(Nepali)', 'Orange(Indian)',
                'Sweet Orange', 'Mandarin', 'Kinnow',
            ],
            'Mango': ['Mango(Maldah)', 'Mango(Dushari)', 'Mango(Calcutte)', 'Mango(Chousa)'],
            'Other Fruits': [
                'Apple(Jholey)', 'Apple(Fuji)', 'Banana', 'Pomegranate', 'Grapes(Green)',
                'Grapes(Black)', 'Water Melon(Green)', 'Water Melon(Dotted)', 'Pineapple',
                'Cucumber(Local)', 'Cucumber(Hybrid)', 'Jack Fruit', 'Papaya(Nepali)',
                'Papaya(Indian)', 'Guava', 'Mombin', 'Pear(Local)', 'Pear(Chinese)',
                'Litchi(Local)', 'Litchi(Indian)', 'Musk Melon', 'Kiwi', 'Sarifa',
                'Avocado', 'Amla',
            ],
            'Berries': ['Strawberry'],
        },
        'Non-veg': {
            'Fish': [
                'Fish Fresh', 'Fish Fresh(Rahu)', 'Fish Fresh(Bachuwa)',
                'Fish Fresh(Chhadi)', 'Fish Fresh(Mungari)',
            ],
        },
        'Herbs & Spices': {
            'Herbs & Spices': [
                'Ginger', 'Coriander Green', 'Clive Dry', 'Clive Green', 'Tamarind',
            ],
        },
        'Grain': {
            'Grains': ['Maize', 'Sugarcane'],
        },
    }

    # Flatten the hierarchy into the two lookups used for the column mapping.
    group_of = {}
    category_of = {}
    for category_name, groups in taxonomy.items():
        for group_name, members in groups.items():
            category_of[group_name] = category_name
            for member in members:
                group_of[member] = group_name

    # Work on a copy so the caller's frame is not modified.
    result = df.copy()
    result['Group'] = result['Commodity'].map(group_of)
    result['Category'] = result['Group'].map(category_of)

    if not debug:
        return result

    # Report every distinct commodity that found no group.
    missing = result.loc[result['Group'].isna(), 'Commodity'].unique()
    if len(missing):
        print("\nUnmatched commodities:")
        for name in missing:
            print(f"- '{name}'")

    return result

In [58]:
# Add the 'Group' and 'Category' columns. debug is left at its default (True),
# so any commodity without a group is printed before the preview.
df_categorized = classify_commodities(df_final)
df_categorized.head()

Unnamed: 0,Commodity,Date,Unit,Minimum,Maximum,Average,Group,Category
0,Tomato Big(Nepali),2013-06-16,KG,35.0,40.0,37.5,Tomato,Vegetable
1,Tomato Small(Local),2013-06-16,KG,26.0,32.0,29.0,Tomato,Vegetable
2,Potato Red,2013-06-16,KG,20.0,21.0,20.5,Potato,Vegetable
3,Potato White,2013-06-16,KG,15.0,16.0,15.5,Potato,Vegetable
4,Onion Dry (Indian),2013-06-16,KG,28.0,30.0,29.0,Onion,Vegetable


In [59]:
# Missing values per column; non-zero counts for 'Group' or 'Category' would
# mean some commodity was not covered by the classification table.
df_categorized.isnull().sum()

Commodity    0
Date         0
Unit         0
Minimum      0
Maximum      0
Average      0
Group        0
Category     0
dtype: int64

In [64]:
# Inspect column dtypes before converting the price columns to numbers.
df_categorized.dtypes

Commodity    object
Date         object
Unit         object
Minimum      object
Maximum      object
Average      object
Group        object
Category     object
dtype: object

In [66]:
price_columns = ['Minimum', 'Maximum', 'Average']

# Convert each price column to a numeric dtype. Values are routed through
# their string form so that commas and the 'Rs' marker can be stripped before
# parsing.
for col in price_columns:
    raw = df_categorized[col]
    cleaned = (
        raw.astype(str)
        .str.replace(',', '')
        .str.replace('Rs', '')
        .str.strip()
    )
    numeric = pd.to_numeric(cleaned, errors='coerce')

    # errors='coerce' turns anything unparseable into NaN. Count how many
    # values were lost that way (NaNs that were not already missing) and say
    # so, instead of letting them slip silently into the exported file.
    n_coerced = int(numeric.isna().sum()) - int(raw.isna().sum())
    if n_coerced > 0:
        print(f"Warning: {n_coerced} value(s) in '{col}' could not be parsed and were set to NaN")

    df_categorized[col] = numeric

print(df_categorized.dtypes)
df_categorized.head()

Commodity     object
Date          object
Unit          object
Minimum      float64
Maximum      float64
Average      float64
Group         object
Category      object
dtype: object


Unnamed: 0,Commodity,Date,Unit,Minimum,Maximum,Average,Group,Category
0,Tomato Big(Nepali),2013-06-16,KG,35.0,40.0,37.5,Tomato,Vegetable
1,Tomato Small(Local),2013-06-16,KG,26.0,32.0,29.0,Tomato,Vegetable
2,Potato Red,2013-06-16,KG,20.0,21.0,20.5,Potato,Vegetable
3,Potato White,2013-06-16,KG,15.0,16.0,15.5,Potato,Vegetable
4,Onion Dry (Indian),2013-06-16,KG,28.0,30.0,29.0,Onion,Vegetable


In [67]:
# Write the cleaned, classified data to disk; index=False leaves the row
# index out of the file.
df_categorized.to_csv("data/kalimati_final.csv", index=False)