In [186]:
import re
import pandas as pd

In [187]:
df=pd.read_csv('grocery_products_canonicalized_categories.csv')
df.shape

(3017, 8)

extract all possible units for measurment of the quantity

In [188]:
# Extract all words that might be units
def extract_potential_units(text_series):
    units = set()
    for text in text_series:
        if pd.notna(text):
            # Find patterns like "number + word"
            matches = re.findall(r'\d+\.?\d*\s*([a-zA-Z]+)', str(text).lower())
            units.update(matches)
    return units

all_units = extract_potential_units(df['name'])
print(all_units)



{'growing', 'classic', 'p', 'round', 'g', 'sachet', 'up', 'bottle', 'batata', 'year', 'coffee', 'bags', 'happy', 'kg', 'selling', 'litre', 'envelope', 'vanilla', 'months', 'milk', 'infant', 'mini', 'plus', 'k', 'fruit', 'toddler', 'bag', 'black', 'pac', 'n', 'tin', 'in', 'slice', 'pop', 'mugs', 'gms', 'month', 'lt', 'fresh', 'water', 'packs', 'instant', 'optipro', 'lahori', 'whitening', 'fruits', 's', 'off', 'soft', 'ltr', 'grm', 'cups', 'gm', 'cl', 'slices', 'x', 'to', 'savoury', 'ml', 'sachets', 'singles', 'l', 'followup', 'tea', 'the', 'sp', 'ultimate', 'grain', 'grams', 'mlx', 'follow', 'e', 'pcs', 'portions', 'caramel', 'snack', 'cheese', 'rs', 'box', 'supreme', 'pc'}


from seen above wrote all the units and their standardization

In [189]:
# Manually curated list of common units
unit_standardization = {
    # Weight
    "kg": "kg", "kgx": "kg", "k": "kg",
    "g": "g", "gm": "g", "gms": "g", "grm": "g", "gmx": "g",
    "oz": "oz",
    
    # Volume
    "l": "L", "ltr": "L", "lt": "L", "liter": "L", 
    "litre": "L", "litre": "L", "litter": "L",
    "ml": "ml", "mlx": "ml","m": "ml",  # Ambiguous - could be meter
    "cl": "cl",
    
    # Count
    "pcs": "pieces", "pc": "pieces", "p": "pieces",
    "pack": "pack", "packs": "pack",
    "portions": "portions",
    "slices": "slices",
    
    # Container
    #"box": "box",'MEIJI KIDS PLUS VANILLA 4 BOX 600GM'
    
    # Unclear
    #"x": "units", "n": "units", "s": "units"
}


creating a pattern with all those units present in the column

In [190]:
# Create regex pattern - IMPORTANT: longer units first to avoid partial matches!
units_list = sorted(unit_standardization.keys(), key=len, reverse=True)
units_pattern = '|'.join(re.escape(unit) for unit in units_list)



# Complete pattern: number + optional space + unit
pattern = rf'\d+\.?\d*\s*({units_pattern})\b'

print("Pattern:", pattern)
print("\n")

Pattern: \d+\.?\d*\s*(portions|litter|slices|liter|litre|packs|pack|kgx|gms|grm|gmx|ltr|mlx|pcs|kg|gm|oz|lt|ml|cl|pc|k|g|l|m|p)\b




extracting qauntites and units from the name column

In [191]:
def extract_and_standardize_unit(text):
    """Extract unit from text and standardize it"""
    if pd.isna(text):
        return None
    
    match = re.search(pattern, str(text).lower(), re.IGNORECASE)
    if match:
        original_unit = match.group(1).lower()
        return unit_standardization.get(original_unit, original_unit)
    return None

def extract_quantity(text):
    """Extract numeric quantity from text"""
    if pd.isna(text):
        return None
    
    match = re.search(pattern, str(text).lower(), re.IGNORECASE)
    if match:
        # Extract the number part
        number_match = re.search(r'\d+\.?\d*', match.group(0))
        return float(number_match.group(0)) if number_match else None
    return None

def remove_unit_pattern(text):
    """Remove quantity + unit pattern from text and clean up"""
    if pd.isna(text):
        return text
    
    # Remove the pattern
    cleaned = re.sub(pattern, '', str(text).lower(), flags=re.IGNORECASE)
    
    # Clean up extra spaces and strip
    cleaned = re.sub(r'[\s()]+', ' ', cleaned).strip()
    
    return cleaned

# Apply to your dataframe
df['standardized_unit'] = df['name'].apply(extract_and_standardize_unit)
df['quantity'] = df['name'].apply(extract_quantity)
df["cleaned_name"] = df["name"].apply(remove_unit_pattern)

# Display the results
display(df[['name', 'quantity', 'standardized_unit', 'cleaned_name']].sample(20))

Unnamed: 0,name,quantity,standardized_unit,cleaned_name
872,NOMS SOUR CREAM & HERB CHIPS 32 GM,32.0,g,noms sour cream & herb chips
2371,Shan Tez Lal Mirch 200G(200 G),200.0,g,shan tez lal mirch
977,LAHORI MIXTURE DAAL MOTH 250 GM,250.0,g,lahori mixture daal moth
1344,REGAL SIPRUS JUICE MANGO NECTAR BOTTLE 2 LTR,2.0,L,regal siprus juice mango nectar bottle
2048,Milo Powder Sachet 15G(15 G),15.0,g,milo powder sachet
842,NOMS NACHOS THAI SWEET CHILLI CHIPS 32 GM,32.0,g,noms nachos thai sweet chilli chips
1447,RANIA DRINK DIET PEACH FRUIT TIN 250 ML,250.0,ml,rania drink diet peach fruit tin
1089,AMERICAN KUISINE CHOCO CHIP BROWNIE 30GM,30.0,g,american kuisine choco chip brownie
1711,FRESH ST SHREDDED CHEDDAR CHEESE 200 GM,200.0,g,fresh st shredded cheddar cheese
2104,Aqua Coconut Water(1 L),1.0,L,aqua coconut water


while extracting some products of al-fateh had no quantity because it was inside and i then hardcode them to proper quantities

In [192]:


indexes=df[(df["store"]=="al-fateh") & (df["category"]=="dry_fruits_&_dates---fateh") & (df["standardized_unit"].isna())].index.tolist()

print(indexes)

# Manually set standardized unit and quantity for these specific cases

#why it is not working

# Get column position first
col_unit = df.columns.get_loc("standardized_unit")
col_qty = df.columns.get_loc("quantity")

print(col_unit, col_qty)

for idx in indexes:
    df.iloc[idx, col_unit] = "kg"
    df.iloc[idx, col_qty] = 1

[]
8 9


renaming the column name

In [193]:
df.rename(columns={"common_category": "cleaned_category"}, inplace=True)

dropping null quantities rows

In [194]:
print(df.shape)
df.dropna(subset=["standardized_unit"], inplace=True)
print(df.shape)

(3017, 11)
(2885, 11)


In [195]:
print(df["standardized_unit"].isna().sum())
print(df["quantity"].isna().sum())

0
0


In [196]:
print(df["standardized_unit"].value_counts())

standardized_unit
g           1722
ml           664
kg           251
L            195
pieces        38
slices         6
portions       4
cl             3
pack           2
Name: count, dtype: int64


used to standardize the quantites into a quantity e.g kg to g, l to ml etc.

In [197]:
import pandas as pd
import numpy as np

class UnitStandardizer:
    """
    Standardizes various units to base units for machine learning
    """
    
    def __init__(self):
        # Define conversion factors to base units
        self.weight_conversions = {
            'kg': 1000,      # to grams
            'g': 1,
            'mg': 0.001,
            'oz': 28.3495,   # ounces to grams
            'lb': 453.592,   # pounds to grams
        }
        
        self.volume_conversions = {
            'L': 1000,       # to ml
            'l': 1000,       # to ml
            'ml': 1,
            'cl': 10,        # centiliters to ml
            'dl': 100,       # deciliters to ml
            'fl oz': 29.5735 # fluid ounces to ml
        }
        
        # Units that are countable/discrete
        self.count_units = ['pieces', 'slices', 'portions', 'pack', 'items']
        
    def standardize_unit(self, quantity, unit):
        """
        Convert quantity and unit to standardized form
        Returns: (standardized_quantity, standardized_unit, unit_category)
        """
        if pd.isna(unit) or pd.isna(quantity):
            return None, None, None
            
        unit_lower = str(unit).lower().strip()
        
        # Weight conversions (to grams)
        if unit_lower in self.weight_conversions:
            std_quantity = float(quantity) * self.weight_conversions[unit_lower]
            return std_quantity, 'g' #, 'weight'
        
        # Volume conversions (to ml)
        elif unit_lower in self.volume_conversions:
            std_quantity = float(quantity) * self.volume_conversions[unit_lower]
            return std_quantity, 'ml' #, 'volume'
        
        # Count units (keep as is)
        elif unit_lower in self.count_units:
            return float(quantity), 'pieces' #, 'count'
        
        # Unknown unit
        else:
            return float(quantity), unit_lower #, 'other'
    
    def process_dataframe(self, df, quantity_col='quantity', unit_col='standardized_unit'):
        """
        Process a dataframe with quantity and unit columns
        Returns: DataFrame with standardized columns
        """
        df = df.copy()
        
        # Apply standardization
        standardized = df.apply(
            lambda row: self.standardize_unit(row[quantity_col], row[unit_col]), 
            axis=1
        )
        
        # Unpack results
        df['cleaned_quantity'] = standardized.apply(lambda x: x[0])
        df['cleaned_unit'] = standardized.apply(lambda x: x[1])
        #df['unit_category'] = standardized.apply(lambda x: x[2])
        
        return df
    
    def create_ml_features(self, df, quantity_col='cleaned_quantity', unit_col='cleaned_unit'):
        """
        Create features suitable for ML model training
        Returns: DataFrame with ML-ready features
        """
        df_processed = df.copy()
        
        # Create one-hot encoding for unit categories
        df_encoded = pd.get_dummies(df_processed, columns=['cleaned_unit',"store","cleaned_category"], prefix=['quantity','store',"categpory"])
        
        # Optional: Create log-transformed quantity (useful for many ML models)
        df_encoded['log_quantity'] = np.log1p(df_encoded['cleaned_quantity'])
        
        return df_encoded

# Initialize standardizer
standardizer = UnitStandardizer()

# Process data
df_standardized = standardizer.process_dataframe(df)

In [198]:
display(df_standardized.sample(10))

Unnamed: 0.1,Unnamed: 0,store,category,name,price,product-link,image_url,cleaned_category,standardized_unit,quantity,cleaned_name,cleaned_quantity,cleaned_unit
2609,,Metro,spices_and_herbs---metro,Metro Chef Pink Salt 800GM,Rs. 60,https://www.metro-online.pk/detail/cooking-ess...,https://www.metro-online.pk/_next/image?url=%2...,Spices & Seasonings,g,800.0,metro chef pink salt,800.0,g
1768,4131.0,al-fateh,cheese---fateh,ALMARAI FETA CHEESE LOW FAT 200 GM,"Rs.1,295",https://alfatah.pk/products/almarai-feta-chees...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Dairy,g,200.0,almarai feta cheese low fat,200.0,g
2649,,Metro,tea---metro,Tapal Danedar Black Tea 430GM,Rs. 900,https://www.metro-online.pk/detail/grocery/tea...,https://www.metro-online.pk/_next/image?url=%2...,Beverages,g,430.0,tapal danedar black tea,430.0,g
2802,,Metro,juices---metro,Nestle Nesfruta Juice Mango 1 Ltr,Rs. 245,https://www.metro-online.pk/detail/beverages/j...,https://www.metro-online.pk/_next/image?url=%2...,Beverages,L,1.0,nestle nesfruta juice mango,1000.0,ml
1763,4126.0,al-fateh,cheese---fateh,FF PIZZA CHEESE BLOCK 130GM,Rs.445,https://alfatah.pk/products/ff-pizza-cheese-bl...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Dairy,g,130.0,ff pizza cheese block,130.0,g
2660,,Metro,coffee_and_whiteners---metro,Nestle Everyday Tea Whitener 1.8KG,"Rs. 3,389",https://www.metro-online.pk/detail/grocery/tea...,https://www.metro-online.pk/_next/image?url=%2...,Beverages,kg,1.8,nestle everyday tea whitener,1800.0,g
2375,1145.0,Jalal Sons,tea_and_coffee---jalalsons,Nescafe Classic Jar 100Gm,"Rs 2,094",https://jalalsons.com.pk/product/nescafe-class...,https://static.tossdown.com/images/24e391fe-d1...,Beverages,g,100.0,nescafe classic jar,100.0,g
1596,3815.0,al-fateh,popcorn---fateh,KERNEL POPCORN ENGLISH BUTTER 90 GM,Rs.160,https://alfatah.pk/products/kernel-popcorn-eng...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Snacks,g,90.0,kernel popcorn english butter,90.0,g
205,376.0,al-fateh,local_drinks---fateh,SPRITE 350 ML PET,Rs.65,https://alfatah.pk/products/sprite-350-ml-pet,https://cdn.shopify.com/s/files/1/0777/0954/16...,Beverages,ml,350.0,sprite pet,350.0,ml
1784,4147.0,al-fateh,cheese---fateh,MONTE CHRISTO CHEDDAR CHEESE WHITE 200 GM,"Rs.2,495",https://alfatah.pk/products/monte-christo-ched...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Dairy,g,200.0,monte christo cheddar cheese white,200.0,g


In [199]:
df_standardized.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2885 entries, 0 to 3016
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2555 non-null   float64
 1   store              2885 non-null   object 
 2   category           2885 non-null   object 
 3   name               2885 non-null   object 
 4   price              2885 non-null   object 
 5   product-link       2885 non-null   object 
 6   image_url          2885 non-null   object 
 7   cleaned_category   2885 non-null   object 
 8   standardized_unit  2885 non-null   object 
 9   quantity           2885 non-null   float64
 10  cleaned_name       2885 non-null   object 
 11  cleaned_quantity   2885 non-null   float64
 12  cleaned_unit       2885 non-null   object 
dtypes: float64(3), object(10)
memory usage: 315.5+ KB


now we gonna remove incosistencies from the price column

In [200]:
df_standardized["price"].isna().sum()

np.int64(0)

first we extracting and understanding the pattern

In [201]:
import re
import numpy as np
def find_pattern(p):
    pattern=r"\D+"
    temp=re.findall(pattern,p)
    return temp

arr=[]
#destrcutre array int ot 
arr=df_standardized["price"].apply(find_pattern)
arr=[item for sublist in arr for item in sublist]
arr=np.array(arr)
print(arr)

print(np.unique(arr))

['Rs.' 'Rs.' 'Rs.' ... '.' 'Rs. ' '.']
[' - Rs.' ',' '.' 'Rs ' 'Rs.' 'Rs. ']


now we cleaning the pattern

In [202]:
print(len(df_standardized))

2885


In [203]:
import re
import numpy as np
def ectrcact_price(p):
    pattern=r"Rs\.?\s*"
    temp=re.sub(pattern,"",p)
    return temp

def make_price_consistent(p):
     temp=re.match(r"[0-9]+,?.?[0-9]+",str(p)).group(0)
     return temp
 
arr=df_standardized["price"].apply(ectrcact_price)
print(arr.shape)
arr=arr.apply(make_price_consistent)
print(arr.shape)

(2885,)
(2885,)


In [204]:
print(len(arr))
print(df_standardized.shape)

2885
(2885, 13)


In [205]:
arr=pd.Series(arr)
arr=arr.str.replace(",","")
print(arr.shape)
print(arr.isna().sum())
print(df_standardized.shape)

(2885,)
0
(2885, 13)


In [206]:
arr=pd.to_numeric(arr, errors='coerce')


In [207]:
df_standardized["cleaned_price"]=arr

In [209]:
df_standardized=df_standardized.drop(columns=["Unnamed: 0","price","quantity","standardized_unit",])

In [210]:
df_standardized.to_csv('feature_enginered_products.csv', index=False)