In [1]:
import re
import pandas as pd

In [2]:
# Load the product table from CSV and show its (rows, columns) shape.
df = pd.read_csv("grocery_products_canonicalized_categories.csv")
df.shape

(3596, 8)

Extract all possible units of measurement for the quantity.

In [3]:
# Extract all words that might be units
def extract_potential_units(text_series):
    """Collect every word that directly follows a number in the given texts.

    Each non-missing entry is converted to a lower-case string and scanned
    for "number, optional whitespace, letters"; the letter runs are returned
    as a set of candidate unit names.
    """
    number_then_word = re.compile(r'\d+\.?\d*\s*([a-zA-Z]+)')
    return {
        word
        for entry in text_series
        if pd.notna(entry)
        for word in number_then_word.findall(str(entry).lower())
    }

# Candidate units found in the product names; printed for manual review.
all_units = extract_potential_units(df["name"])
print(all_units)



{'tea', 'growing', 'pcs', 'lt', 'half', 'milk', 'round', 'year', 'snack', 'steam', 'portions', 'pac', 'litre', 'caramel', 'mini', 'badam', 'cheese', 'mugs', 'kg', 'singles', 'happy', 'plus', 'grain', 'extra', 'g', 'months', 'ultimate', 'teabags', 'e', 'vanilla', 'whitening', 'month', 'rs', 'selling', 'supreme', 'sachets', 'envelope', 'grams', 'gm', 'mlx', 'l', 'choco', 'off', 'infant', 'coffee', 'fruit', 'bottle', 'the', 'followup', 'box', 'p', 'pop', 'to', 'toddler', 'ltr', 'slice', 'fruits', 'lahori', 'cl', 'sachet', 'classic', 'x', 'cups', 'up', 'bag', 'water', 'ml', 'in', 'slices', 'steamed', 'batata', 'pc', 'with', 'follow', 'instant', 'packs', 'tin', 'k', 'black', 'bags', 'soft', 'fresh', 'gms', 'optipro', 'n', 'sella', 'pack', 'savoury', 'pieces', 'sp', 's', 'grm', 'lx'}


Based on the candidates printed above, write down the real units and map each one to its standardized form.

In [4]:
# Manually curated mapping: unit spelling as it appears in (lower-cased)
# product names -> standardized unit label.
# Key order is kept as written because the regex built from these keys uses a
# stable sort by length, so equal-length keys stay in this order.
unit_standardization = {
    # Weight
    "kg": "kg", "kgx": "kg", "k": "kg",
    "g": "g", "gm": "g", "gms": "g", "grm": "g", "gmx": "g",
    "oz": "oz",

    # Volume
    "l": "L", "ltr": "L", "lt": "L", "liter": "L",
    "litre": "L", "litter": "L",  # "litre" was listed twice; duplicate key removed
    "ml": "ml", "mlx": "ml",
    "m": "ml",  # Ambiguous - a bare "m" could also be meter
    "cl": "cl",

    # Count
    "pcs": "pieces", "pc": "pieces", "p": "pieces",
    "pack": "pack", "packs": "pack",
    "portions": "portions",
    "slices": "slices",

    # Container (left out on purpose; example name containing "box":
    # 'MEIJI KIDS PLUS VANILLA 4 BOX 600GM')
    # "box": "box",

    # Unclear (left out on purpose)
    # "x": "units", "n": "units", "s": "units"
}


Create a regex pattern that matches any of those units in the `name` column.

In [5]:
# Build the alternation with the longest unit spellings first, so that e.g.
# "kg" is tried before "k" and a short unit never wins over a longer one.
units_list = sorted(unit_standardization, key=len, reverse=True)
units_pattern = '|'.join(map(re.escape, units_list))

# Full pattern: a number (optional decimals), optional whitespace, then one
# of the units (captured as group 1) ending on a word boundary.
pattern = r'\d+\.?\d*\s*(' + units_pattern + r')\b'

print("Pattern:", pattern)
print("\n")

Pattern: \d+\.?\d*\s*(portions|litter|slices|liter|litre|packs|pack|kgx|gms|grm|gmx|ltr|mlx|pcs|kg|gm|oz|lt|ml|cl|pc|k|g|l|m|p)\b




Extract quantities and units from the `name` column.

In [6]:
def extract_and_standardize_unit(text):
    """Return the standardized unit of the first quantity+unit found in ``text``.

    The text is lower-cased and searched with the module-level ``pattern``;
    the captured unit is looked up in ``unit_standardization`` (falling back
    to the captured spelling). Missing input or no match gives ``None``.
    """
    if pd.isna(text):
        return None

    found = re.search(pattern, str(text).lower(), re.IGNORECASE)
    if found is None:
        return None

    raw_unit = found.group(1).lower()
    return unit_standardization.get(raw_unit, raw_unit)

def extract_quantity(text):
    """Return the number of the first quantity+unit found in ``text`` as a float.

    The text is lower-cased and searched with the module-level ``pattern``.
    Missing input or no match gives ``None``.
    """
    if pd.isna(text):
        return None

    found = re.search(pattern, str(text).lower(), re.IGNORECASE)
    if found is None:
        return None

    # The matched span is "<number><optional space><unit>"; pull the number out.
    digits = re.search(r'\d+\.?\d*', found.group(0))
    return None if digits is None else float(digits.group(0))

def remove_unit_pattern(text):
    """Return ``text`` lower-cased with every quantity+unit occurrence removed.

    Runs of whitespace and parentheses are then collapsed to a single space
    and the result is stripped. Missing input is returned unchanged.
    """
    if pd.isna(text):
        return text

    lowered = str(text).lower()
    without_quantity = re.sub(pattern, '', lowered, flags=re.IGNORECASE)

    # Parentheses left behind by e.g. "1Kg(1 Kg)" are treated like whitespace.
    return re.sub(r'[\s()]+', ' ', without_quantity).strip()

# Derive the three new columns from the product name, in this order.
for column, extractor in (
    ("standardized_unit", extract_and_standardize_unit),
    ("quantity", extract_quantity),
    ("cleaned_name", remove_unit_pattern),
):
    df[column] = df["name"].apply(extractor)

# Show a random sample of the results for a visual check
display(df[['name', 'quantity', 'standardized_unit', 'cleaned_name']].sample(20))

Unnamed: 0,name,quantity,standardized_unit,cleaned_name
908,PAK FOOD KORNO BAKED CORN STICKS SALT & PEPPER...,30.0,g,pak food korno baked corn sticks salt & pepper
2631,White Sugar per 500GM,500.0,g,white sugar per
3103,Dairy Omung 1L x 12,1.0,L,dairy omung x 12
573,ENSURE NUTRI SUPPLEMENT POWDER VANILLA NUTRIVI...,400.0,g,ensure nutri supplement powder vanilla nutrivi...
3443,Murree Brewery's Beer Can Lemon Malt 250ml,250.0,ml,murree brewery's beer can lemon malt
2753,National Pulao Masala 140GM,140.0,g,national pulao masala
2227,Falak Premium 1Kg(1 Kg),1.0,kg,falak premium
2500,Prema Yogurt Plain 375Gm,375.0,g,prema yogurt plain
1905,ADAMS YOGURT 200 GM,200.0,g,adams yogurt
1700,KIRI CHEESE FRESH CREAM AND MILK 100 GM,100.0,g,kiri cheese fresh cream and milk


While extracting, some al-fateh products ended up with no quantity because the quantity was not in the product name (it was inside the product listing), so I hard-code the proper quantities for them.

In [7]:


# Rows to override: al-fateh "dry fruits & dates" products for which no unit
# could be extracted from the name.
missing_mask = (
    (df["store"] == "al-fateh")
    & (df["category"] == "dry_fruits_&_dates---fateh")
    & df["standardized_unit"].isna()
)
indexes = df.index[missing_mask].tolist()

# NOTE(review): the recorded run printed [] here, i.e. no row matched and the
# override below was a no-op. Confirm the store/category strings against the data.
print(indexes)

# Column positions, printed only as a sanity check that both columns exist.
col_unit = df.columns.get_loc("standardized_unit")
col_qty = df.columns.get_loc("quantity")

print(col_unit, col_qty)

# Assign by label with .loc. The previous version passed index *labels* to the
# positional .iloc, which is only correct while the index is the default 0..n-1.
df.loc[missing_mask, "standardized_unit"] = "kg"
df.loc[missing_mask, "quantity"] = 1

[]
8 9


Rename the `common_category` column to `cleaned_category`.

In [8]:
# Rename in place so later cells can refer to "cleaned_category".
column_renames = {"common_category": "cleaned_category"}
df.rename(columns=column_renames, inplace=True)

Drop the rows for which no quantity/unit could be extracted.

In [9]:
# Shape before and after removing rows that have no extracted unit.
print(df.shape)
df.dropna(axis=0, subset=["standardized_unit"], inplace=True)
print(df.shape)

(3596, 11)
(3400, 11)


In [10]:
# Remaining missing values in the unit and quantity columns, one count per line.
print(*df[["standardized_unit", "quantity"]].isna().sum(), sep="\n")

0
0


In [11]:
# Number of products per standardized unit.
print(df["standardized_unit"].value_counts())

standardized_unit
g           2098
ml           730
kg           302
L            205
pieces        49
slices         6
portions       4
cl             3
pack           3
Name: count, dtype: int64


Standardize the quantities to a common base unit, e.g. kg to g, L to ml, etc.

In [12]:
import pandas as pd
import numpy as np

class UnitStandardizer:
    """
    Converts (quantity, unit) pairs to a small set of base units.

    Weights are expressed in grams and volumes in millilitres. Countable
    units keep their quantity but are all relabelled 'pieces'. Any other
    unit is passed through lower-cased with its quantity unchanged.
    """

    def __init__(self):
        # Multipliers that turn a weight into grams
        self.weight_conversions = {
            'kg': 1000,      # to grams
            'g': 1,
            'mg': 0.001,
            'oz': 28.3495,   # ounces to grams
            'lb': 453.592,   # pounds to grams
        }

        # Multipliers that turn a volume into millilitres.
        # Lookups are done on the lower-cased unit, so 'L' is matched via 'l'.
        self.volume_conversions = {
            'L': 1000,       # to ml
            'l': 1000,       # to ml
            'ml': 1,
            'cl': 10,        # centiliters to ml
            'dl': 100,       # deciliters to ml
            'fl oz': 29.5735 # fluid ounces to ml
        }

        # Units that are countable/discrete
        self.count_units = ['pieces', 'slices', 'portions', 'pack', 'items']

    def standardize_unit(self, quantity, unit):
        """
        Convert one quantity and unit to standardized form.

        Args:
            quantity: numeric quantity (anything ``float()`` accepts).
            unit: unit label; matched case-insensitively after stripping.

        Returns:
            (standardized_quantity, standardized_unit). Both are ``None``
            when either input is missing. The tuple always has two items,
            so callers can unpack it uniformly.
        """
        if pd.isna(unit) or pd.isna(quantity):
            return None, None

        unit_lower = str(unit).lower().strip()
        amount = float(quantity)

        # Weight conversions (to grams)
        if unit_lower in self.weight_conversions:
            return amount * self.weight_conversions[unit_lower], 'g'

        # Volume conversions (to ml)
        if unit_lower in self.volume_conversions:
            return amount * self.volume_conversions[unit_lower], 'ml'

        # Count units: quantity kept as is, label unified to 'pieces'
        if unit_lower in self.count_units:
            return amount, 'pieces'

        # Unknown unit: pass through
        return amount, unit_lower

    def process_dataframe(self, df, quantity_col='quantity', unit_col='standardized_unit'):
        """
        Add 'cleaned_quantity' and 'cleaned_unit' columns to a copy of ``df``.

        Args:
            df: input frame; it is not modified.
            quantity_col: name of the column holding the quantities.
            unit_col: name of the column holding the unit labels.

        Returns:
            A copy of ``df`` with the two standardized columns added.
        """
        df = df.copy()

        # Iterate over the two columns directly instead of df.apply(axis=1):
        # this also behaves on an empty frame, where apply does not yield tuples.
        converted = [
            self.standardize_unit(quantity, unit)
            for quantity, unit in zip(df[quantity_col], df[unit_col])
        ]

        df['cleaned_quantity'] = [quantity for quantity, _ in converted]
        df['cleaned_unit'] = [unit for _, unit in converted]

        return df

    def create_ml_features(self, df, quantity_col='cleaned_quantity', unit_col='cleaned_unit'):
        """
        One-hot encode the unit, store and category columns and add a log quantity.

        Args:
            df: frame containing ``quantity_col``, ``unit_col``, 'store' and
                'cleaned_category'; it is not modified.
            quantity_col: column used for the 'log_quantity' feature.
            unit_col: unit column to one-hot encode (prefix 'quantity').

        Returns:
            A new frame with dummy columns and a 'log_quantity' column.
        """
        # quantity_col / unit_col are honoured here; previously the default
        # column names were hard-coded and the parameters were ignored.
        # NOTE: the "categpory" prefix is misspelled but kept, because it
        # determines the names of the generated columns.
        df_encoded = pd.get_dummies(
            df,
            columns=[unit_col, "store", "cleaned_category"],
            prefix=['quantity', 'store', "categpory"],
        )

        # log1p keeps zero quantities finite
        df_encoded['log_quantity'] = np.log1p(df_encoded[quantity_col])

        return df_encoded

# Build the converter, then derive the cleaned quantity/unit columns from df.
standardizer = UnitStandardizer()
df_standardized = standardizer.process_dataframe(
    df, quantity_col='quantity', unit_col='standardized_unit'
)

In [13]:
# Random sample of rows for a visual check of the standardized columns.
display(df_standardized.sample(n=10))

Unnamed: 0.1,Unnamed: 0,store,category,name,price,product-link,image_url,cleaned_category,standardized_unit,quantity,cleaned_name,cleaned_quantity,cleaned_unit
643,1225,al-fateh,flour---fateh,SUNRIDGE SUPER WHITE ATTA 5 KG,Rs.795,https://alfatah.pk/products/sunridge-super-fin...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Staples,kg,5.0,sunridge super white atta,5000.0,g
985,1567,al-fateh,chips_&_nimko---fateh,POP AND CO CLASSIC SALTED POPCORN 45 GM,Rs.105,https://alfatah.pk/products/pop-and-co-classic...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Snacks,g,45.0,pop and co classic salted popcorn,45.0,g
1299,2994,al-fateh,coffee---fateh,MOVENPICK COFFEE DER HIMMLISCHE LUNGO 53 GM,"Rs.1,695",https://alfatah.pk/products/movenpick-coffee-d...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Beverages,g,53.0,movenpick coffee der himmlische lungo,53.0,g
1623,3842,al-fateh,popcorn---fateh,POPNOSH BUTTER POPCORN SALT 450GM,Rs.290,https://alfatah.pk/products/popnosh-butter-pop...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Snacks,g,450.0,popnosh butter popcorn salt,450.0,g
1524,3743,al-fateh,squashes---fateh,MARHABA SHARBAT IMLI AALO BUKHARA 800 ML,Rs.515,https://alfatah.pk/products/marhaba-sharbat-im...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Beverages,ml,800.0,marhaba sharbat imli aalo bukhara,800.0,ml
2967,1080,Metro,crisps_and_popcorn---metro,Lays Salt 21gm,Rs. 29,https://www.metro-online.pk/detail/grocery/sna...,https://www.metro-online.pk/_next/image?url=ht...,Snacks,g,21.0,lays salt,21.0,g
1597,3816,al-fateh,popcorn---fateh,KERNEL POPCORN SALT & PEPPER 90 GM,Rs.160,https://alfatah.pk/products/kernel-popcorn-sal...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Snacks,g,90.0,kernel popcorn salt & pepper,90.0,g
1860,4223,al-fateh,butter---fateh,LURPAK BUTTER UNSALTED 200 GM,"Rs.1,245",https://alfatah.pk/products/lurpak-butter-unsa...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Dairy,g,200.0,lurpak butter unsalted,200.0,g
1538,3757,al-fateh,liquid_tin_milk---fateh,PREMIUM CHOCIE EVAPORATED MILK 368 ML,Rs.395,https://alfatah.pk/products/premium-chocie-eva...,https://cdn.shopify.com/s/files/1/0777/0954/16...,Dairy,ml,368.0,premium chocie evaporated milk,368.0,ml
3557,150,Imtiaz,"salt,_spices_&_herbs---imtiaz",Ponam White Pepper Powder 50g,Rs. 189.00,https://shop.imtiaz.com.pk/product/product-ite...,https://shop.imtiaz.com.pk/_next/image?url=htt...,Spices & Seasonings,g,50.0,ponam white pepper powder,50.0,g


In [14]:
# Column dtypes and non-null counts after standardization.
df_standardized.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3400 entries, 0 to 3595
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         3400 non-null   int64  
 1   store              3400 non-null   object 
 2   category           3400 non-null   object 
 3   name               3400 non-null   object 
 4   price              3400 non-null   object 
 5   product-link       3400 non-null   object 
 6   image_url          3400 non-null   object 
 7   cleaned_category   3400 non-null   object 
 8   standardized_unit  3400 non-null   object 
 9   quantity           3400 non-null   float64
 10  cleaned_name       3400 non-null   object 
 11  cleaned_quantity   3400 non-null   float64
 12  cleaned_unit       3400 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 371.9+ KB


Now we remove inconsistencies from the price column.

In [15]:
# Number of missing prices.
df_standardized["price"].isnull().sum()

np.int64(0)

First, extract the non-numeric fragments of the prices to understand the patterns they follow.

In [16]:
import re
import numpy as np
def find_pattern(p):
    """Return every maximal run of non-digit characters in ``p``, in order."""
    return [run.group(0) for run in re.finditer(r"\D+", p)]

# Flatten the per-price fragment lists into a single array of fragments.
arr = np.array([
    fragment
    for fragments in df_standardized["price"].apply(find_pattern)
    for fragment in fragments
])
print(arr)

# Distinct non-digit fragments that occur in the price strings.
print(np.unique(arr))

['Rs.' 'Rs.' 'Rs.' ... '.' 'Rs. ' '.']
[' - Rs.' ',' '.' 'Rs ' 'Rs.' 'Rs. ']


Now clean those patterns out of the prices.

In [17]:
# Row count before cleaning the prices.
print(df_standardized.shape[0])

3400


In [18]:
import re
import numpy as np
def ectrcact_price(p):
    """Strip every "Rs" marker (optional dot, trailing whitespace) from ``p``."""
    currency_marker = re.compile(r"Rs\.?\s*")
    return currency_marker.sub("", p)

def make_price_consistent(p):
    """Return the first number in ``p`` as text, or ``None`` if there is none.

    Accepts digits with any number of comma-separated groups and an optional
    decimal part, e.g. "1,695", "1,234.50", "189.00" or "5". For a range such
    as "100 - 200" only the first number is returned. Commas are left in
    place for the caller to strip.

    The previous regex used an unescaped "." (any character), needed at least
    two digits (a one-digit price raised AttributeError on ``None.group``)
    and dropped decimals that followed a thousands group.
    """
    found = re.search(r"\d+(?:,\d+)*(?:\.\d+)?", str(p))
    return found.group(0) if found else None
 
# Step 1: drop the "Rs" markers. Step 2: keep only the leading number.
# The shape is printed after each step to confirm no rows were lost.
arr = df_standardized["price"].apply(ectrcact_price)
print(arr.shape)
arr = arr.apply(make_price_consistent)
print(arr.shape)

(3400,)
(3400,)


In [19]:
# The cleaned prices must have as many entries as the frame has rows.
print(arr.shape[0])
print(df_standardized.shape)

3400
(3400, 13)


In [20]:
# Remove thousands separators, then re-check size and missing values.
arr = pd.Series(arr).str.replace(",", "")
print(arr.shape)
print(arr.isna().sum())
print(df_standardized.shape)

(3400,)
0
(3400, 13)


In [21]:
# Convert the price strings to numbers; errors='coerce' turns unparseable values into NaN.
arr=pd.to_numeric(arr, errors='coerce')


In [22]:
# Store the numeric prices; arr was derived from df_standardized["price"], so
# the assignment lines up on the same index labels.
df_standardized["cleaned_price"]=arr

In [23]:
# Drop the raw columns that now have cleaned counterparts, plus the stray
# "Unnamed: 0" column.
raw_columns = ["Unnamed: 0", "price", "quantity", "standardized_unit"]
df_standardized = df_standardized.drop(columns=raw_columns)

In [24]:
# Write the feature-engineered table to CSV without the index column.
df_standardized.to_csv('feature_enginered_products.csv', index=False)