<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import" data-toc-modified-id="Import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import</a></span></li><li><span><a href="#Brands" data-toc-modified-id="Brands-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Brands</a></span></li><li><span><a href="#Product-groups-and-categories" data-toc-modified-id="Product-groups-and-categories-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Product groups and categories</a></span></li><li><span><a href="#Functions-to-categorise,-quantity" data-toc-modified-id="Functions-to-categorise,-quantity-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Functions to categorise, quantity</a></span></li><li><span><a href="#Example" data-toc-modified-id="Example-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Example</a></span></li></ul></div>

# Import

In [2]:
import json
import re

import numpy as np
import pandas as pd
from rapidfuzz import process

# Brands

In [3]:
# Load the two retailer catalogues. Forward slashes are used because the original
# "Category\Woolworths.csv" / "Category\Coles.csv" literals relied on the invalid escape
# sequences "\W" and "\C" and only resolved on Windows.
woolies = pd.read_csv("Category/Woolworths.csv")
coles = pd.read_csv("Category/Coles.csv")

In [4]:
# Brand names are later matched against lower-cased product text (get_info lower-cases its
# input), so every brand is normalised to lower case here regardless of which retailer it
# came from. str() guards against non-string cells in the "Brand" column.

# get woolies brands (only rows whose "WOW Category" is "Pantry")
woolies_pantry = woolies[woolies["WOW Category"] == "Pantry"]
brands = [str(name).lower() for name in woolies_pantry["Brand"].dropna().unique()]

# get coles brands (only rows from the categories listed below)
coles_selected = coles[coles["COL Category"].isin(['Fruit Vegetables', 'International Foods',
       'Pantry', 'Frozen', 'Dairy Eggs Meals', 'Household', 'Drinks', 'From Deli',
       'Bakery', 'Kids Lunch Box', 'Convenience Meals',
       'Meat Seafood Deli'])]
# coles_brands keeps the original spelling; the lower-cased copies go into `brands`
coles_brands = list(coles_selected["Brand"].dropna().unique())
brands.extend(str(name).lower() for name in coles_brands)

# remove duplicates
brands = list(set(brands))

In [5]:
# Longest brands first, so the regex alternation built from this list tries e.g. "arnotts"
# before a shorter brand such as "a+". Ties are broken alphabetically: `brands` was built
# from a set, so without a secondary key the order of equal-length brands (and therefore
# the saved pattern) could differ from one kernel session to the next.
brands = sorted(brands, key=lambda brand: (-len(brand), brand))

In [6]:
# One "^"-anchored alternative per brand, so a brand is only matched at the start of the text.
# (Joining with '|^' left the first brand unanchored.) re.escape makes characters such as
# "+", "." or "(" in a brand name match literally instead of acting as regex operators.
pattern = "|".join("^" + re.escape(brand) for brand in brands)

In [7]:
# Persist the brand pattern as plain text so it can be read back later
# instead of rebuilding it from the CSV files every time.
with open("brand_names.txt", "w") as brand_file:
    brand_file.write(pattern)

# Product groups and categories

In [31]:
# Candidate product-group labels. Item descriptions are fuzzy-matched against this list
# (process.extractOne in get_info) and the winning label is then mapped to a category.
# NOTE(review): several labels appear more than once (e.g. "ground coffee", "instant coffee",
# "lollies", "sliced cheese", "pasta", "blue cheese", "milk").
# NOTE(review): the recorded example output further down mentions a group
# "free range chicken" that is not in this list, so those outputs were presumably produced
# from a different version of the list -- re-run to confirm.
product_groups = [
    "accessories coffee", "iced tea", "apples", "asparagus", "fennel", "artichokes", "avocados", "bananas", 
    "eggs", "coffee beans", "beef", "veal", "beef bones", "beef roasts", "beef steaks", 
    "beetroot", "berries", "cherries", "biscuits", "cookies", "block cheese", "blue cheese", 
    "bok choy", "asian greens", "boxed chocolate", "breakfast cereal", "muesli", "oats", 
    "breast fillets", "brie cheese", "soft cheese", "broccoli", "cauliflower", "bubble tea", 
    "cabbage", "kale", "brussels sprouts", "capsicum", "chillies", "carrots", "parsnips", "celery", 
    "cheddar cheese", "cheese snacks", "chicken", "turkey mince", "chicken wings", 
    "chocolate bags", "chocolate bars", "chocolate blocks", "chocolate multipacks", "chocolate spreads", 
    "coconuts", "coffee capsules", "ground coffee", "instant coffee", "coffee mixes", 
    "corn", "corn chips", "crackers", "crispbreads", "cream cheese", "crumbed beef", "crumbed chicken", 
    "cucumber", "cupcakes", "deli crab", "deli lobster", "deli fish", "deli marinara mix", "deli oysters", "deli prawns", 
    "lamb", "lamb chops", "lamb cutlets", "lamb mince", "lamb roasts", "lamb shanks", "lamb steak", 
    "digestive health bread", "drumsticks", "maryland chicken", "eggplant", "energy drink", 
    "family favourites", "feta cheese", "firm cheese",
    "free range pork", "fruit cakes", "full cream milk", "gluten free cakes", "gluten free bread", 
    "gluten free cereal", "grapefruit", "grapes", "grated cheese", "ground coffee", 
    "gum", "mints", "haloumi", "bocconcini", "healthier start", "herbal tea", "honey", 
    "instant coffee", "jams", "kangaroo mince", "wallaby mince", "kebabs", "kiwi fruit", "kombucha", 
    "lamb offal", "lemons", "limes", "lettuce", "loaf cakes", "lollies", 
    "loose leaf tea", "mandarins", "mangoes", "marinades", "marinated chicken", 
    "mashed vegetables", "cut vegetables", "meat free beef", "medicated lozenges", 
    "melons", "meringues", "pavlovas", "milk", "flavoured milk", "muffins", "mushrooms", 
    "mustards", "nut spreads", "nuts", "trail mix", "oil", "onion", "leeks", 
    "oranges", "organic vegetables", "packaged donuts", "party packs", "passionfruit", 
    "pastry shells", "pates", "platters", "peaches", "nectarines", "pears", "peas", 
    "beans", "okra", "pineapples", "pizza", "pasta", "popcorn", "pork mince", 
    "pork chops", "pork marinade", "pork ribs", "pork roasts", "pork steaks", 
    "potato chips", "potatoes", "poultry deli", "prepacked seafood", "pretzels", 
    "probiotic milk", "pumpkin", "recipe bases", "meal bases", "rhubarbs", 
    "rice snacks", "corn snacks", "rye bread", "sausage mince", "savoury spreads", 
    "seasonal chocolates", "lollies", "single cans", "skim milk", "low fat milk", 
    "sliced cheese", "slices", "bites", "slow cook beef", "casserole beef", 
    "smoked fish", "cured fish", "snack crackers", "soft drink bottles", "soft drink cans", 
    "soy milk", "almond milk", "cow milk", "goat milk", "sliced cheese", "bread", "bagel", 
    "bialy", "croissant", "baguette", "toast", "burrito", "cake", "cheesecake", "chocolate cake", 
    "carrot cake", "strawberry cake", "ice-cream cake", "vanilla cake", "red velvet cake", 
    "cupcake", "fudge cake", "pancake", "poundcake", "chopped liver", "cheese", "mozzarella", 
    "brie", "feta", "blue cheese", "parmesan", "cheese stick", "cheesestrings", "congee", 
    "donuts", "jam", "sprinkles", "donut holes", "krispy kreme", "dumplings", "arepa", 
    "fun guo", "har gow", "momo", "pierogi", "wonton", "fruit", "cantaloupe", "durian", 
    "apricot", "blueberry", "raspberry", "blackberry", "french fries", "poutine", "gravy", 
    "cereal", "rice", "ice cream", "cookies and cream", "mint chocolate", "rocky road", 
    "biscuit tortoni", "blue moon", "queso", "hokey pokey", "moose tracks", "tiger tail", 
    "strawberry", "superman", "spumoni", "pistachio", "moon mist", "neapolitan", 
    "mashed potatoes", "wagyu", "steak", "bacon", "ham", "buffalo wing", "chicken balls", 
    "chicken nuggets", "chicken steak", "chicken feet", "roast chicken", "ribs", "fish", 
    "salmon", "shrimp", "prawn", "shark", "onion rings", "pasta", "lasagna", "linguini", "ravioli", 
    "carbonara", "bolognese", "spaghetti", "spaghetti and meatballs", "pancit canton", 
    "fettuccine", "pudding", "pupusa", "pie", "shepherds pie", "apple pie", "cream pie", 
    "pumpkin pie", "key lime pie", "peach pie", "pepperoni", "hawaiian", "margherita", 
    "rolls", "croquette", "egg roll", "spring roll", "lumpia", 
    "vegetable sandwich", "grilled cheese", "panini", "cheeseburgers", "bacon cheeseburger", 
    "hamburgers", "chicken burger", "hot dogs", "peanut butter and jam sandwich", 
    "chowder", "clam chowder", "corn chowder", "sinigang", "california roll", "stew", 
    "taco", "tamale", "turnover", "jamaican patty", "waffle", "roti canai", "chicken mince", "cream","milk","sandwich"
]

In [32]:
# Store the product-group labels on disk so later cells can reload them.
np.save(file="product_groups.npy", arr=product_groups)

In [49]:
# Report the size of this notebook's product-group list, then how many labels it shares
# with `product_groups_recipe`.
# NOTE(review): `product_groups_recipe` is not defined in any visible cell; it presumably
# comes from another notebook or a deleted cell -- confirm, otherwise this fails on a
# fresh kernel.
print("len of product groups self", len(product_groups))
print("intersection", len(set(product_groups_recipe) & set(product_groups)))

len of product groups self 335
intersection 71


In [1]:
# Mapping of category name -> product-group labels belonging to it.
# get_category returns the FIRST category whose list contains a label, so for labels listed
# under more than one key (e.g. "popcorn", "pancake", "mozzarella", "marinated chicken")
# the earlier key wins.
# NOTE(review): the last key is lower-case "other", while get_category's fallback for
# unlisted labels is "Other" -- downstream code sees these as two different categories.
Categories = {
    # A missing comma after "cream" used to merge it with the next literal into
    # "creambrie cheese"; repeated milk entries were also dropped from this list.
    "Dairy": ["block cheese", "blue cheese", "cheddar cheese", "cream cheese", "feta cheese", "cream",
              "brie cheese", "soft cheese", "firm cheese", "haloumi", "bocconcini", "probiotic milk",
              "full cream milk", "flavoured milk", "almond milk", "soy milk", "cow milk", "goat milk",
              "sliced cheese", "mozzarella", "parmesan", "cheese stick", "cheesestrings", "milk", "skim milk",
              "low fat milk"],

    "Meat & Poultry": ["beef", "veal", "beef bones", "beef roasts", "beef steaks", "lamb",
                       "lamb chops", "lamb cutlets", "lamb mince", "lamb roasts", "lamb shanks",
                       "lamb steak", "chicken mince", "turkey mince", "chicken wings", "drumsticks",
                       "maryland chicken", "crumbed beef", "crumbed chicken", "marinated chicken",
                       "slow cook beef", "casserole beef", "sausage mince", "chicken",
                       "eggs", "free range pork"],

    "Seafood": ["deli crab", "deli lobster", "deli fish", "deli marinara mix", "deli oysters",
                "deli prawns", "prepacked seafood", "fish", "salmon", "shrimp", "prawn", "shark"],

    "Bread & Bakery": ["biscuits", "cookies", "digestive health bread", "gluten free bread",
                       "wholemeal bread", "white bread", "loaf cakes", "sponge cakes", "muffins",
                       "cupcakes", "packaged donuts", "toast", "bagel", "bialy", "croissant",
                       "baguette", "cake", "cheesecake", "chocolate cake", "carrot cake",
                       "strawberry cake", "ice-cream cake", "vanilla cake", "red velvet cake",
                       "fudge cake", "pancake", "poundcake", "gluten free cakes", "rye bread", "sandwich"],

    "Beverages": ["accessories coffee", "iced tea", "bubble tea", "energy drink", "kombucha",
               "sports drink", "soft drink bottles", "soft drink cans", "coffee beans",
               "coffee capsules", "ground coffee", "instant coffee", "coffee mixes",
               "tea black", "tea chai", "tea green", "tea herbal", "tea loose leaf",
               "tea organic", "tea white"],

    "Snacks": ["corn chips", "crackers", "crispbreads", "nuts", "pretzels", "rice snacks",
               "corn snacks", "lollies", "chocolate bags", "chocolate bars", "chocolate blocks",
               "gum", "mints", "potato chips", "popcorn", "snack crackers", "sesame snap",
               "ginger snap", "hardtack", "abernethy", "acıbadem kurabiyesi", "afghan biscuits",
               "alfajor", "almond biscuit", "lebkuchen", "aachener printen", "cornish fairing",
               "speculaas", "springerle", "kruidnoten"],

    "Condiments": ["honey", "jams", "mustards", "oil", "vinegar", "marinades", "savoury spreads",
                   "bbq sauce", "tomato sauce", "gravy"],

    "Prepared foods": ["pizza", "pasta", "popcorn", "meat free beef", "kebabs", "recipe bases",
                       "meal bases", "congee", "dumplings", "arepa", "fun guo", "har gow",
                       "momo", "pierogi", "wonton", "pancit canton", "fettuccine", "pudding",
                       "pupusa", "pie", "shepherds pie", "apple pie", "cream pie", "pumpkin pie",
                       "key lime pie", "peach pie", "croquette", "egg roll", "spring roll",
                       "lumpia", "burrito", "taco", "tamale", "turnover", "jamaican patty",
                       "waffle", "roti canai"],

    "Breakfast": ["breakfast cereal", "muesli", "oats", "fruit cakes", "pancake"],

    "Fruits": ["apples", "bananas", "berries", "cherries", "cantaloupe", "durian", "oranges",
               "grapefruit", "kiwi fruit", "mandarins", "mangoes", "passionfruit", "peaches",
               "nectarines", "pears", "pineapples", "blueberry", "raspberry", "blackberry"],

    "Vegetables": ["asparagus", "fennel", "artichokes", "avocados", "beetroot", "bok choy",
                   "asian greens", "broccoli", "cauliflower", "cabbage", "kale", "brussels sprouts",
                   "capsicum", "chillies", "carrots", "parsnips", "celery", "cucumber", "eggplant",
                   "lemons", "limes", "lettuce", "okra", "pumpkin", "rhubarbs"],

    "other": ["chopped liver", "cheese", "mozzarella", "brie", "feta", "blue cheese", "parmesan",
              "cheese stick", "cheesestrings", "snack crackers", "single cans", "medicated lozenges",
              "party packs", "marinated chicken", "healthier start", "savoury spreads", "nut spreads",
              "trail mix", "pastry shells", "pates", "platters", "meat free beef", "gluten free cereal"]
}

In [5]:
# Display the category names (the keys of the Categories mapping).
Categories.keys()

dict_keys(['Dairy', 'Meat & Poultry', 'Seafood', 'Bread & Bakery', 'Beverages', 'Snacks', 'Condiments', 'Prepared foods', 'Breakfast', 'Fruits', 'Vegetables', 'other'])

In [34]:
# Serialise the category mapping to Categories.json so it can be reloaded later.
with open("Categories.json", "w") as outfile:
    outfile.write(json.dumps(Categories))

# Functions to categorise, quantity

In [23]:
# Required files: the category mapping, the product-group labels and the brand regex.
with open('Categories.json', 'r') as file:
    Categories = json.load(file)

# product_groups.npy is written by np.save from a list of plain strings, which NumPy stores
# as a string array, so pickle support is not needed to read it back; leaving allow_pickle
# at its default (False) avoids unpickling arbitrary objects from a file on disk.
# .tolist() yields built-in str items rather than numpy string scalars.
product_groups = np.load("product_groups.npy").tolist()

with open("brand_names.txt", "r") as file:
    pattern = file.read()
    

In [24]:
def get_category(Categories, group):
    """Return the name of the first category that contains ``group``.

    Args:
        Categories: mapping of category name -> collection of product groups.
        group: product-group label to look up.

    Returns:
        The key of the first entry (in the mapping's iteration order) whose value
        contains ``group``, or the string "Other" when no entry does.
    """
    matching_names = (name for name, members in Categories.items() if group in members)
    return next(matching_names, "Other")

In [25]:
#get quantity final
#the quantity information is always at the end of the text
def get_quantity(input_text):
    """Extract the quantity from the end of a product description.

    Only the last whitespace-separated token is inspected (for the "pack" form, the
    token before it as well).

    Args:
        input_text: product description, e.g. "hemp oil cold pressed 250mL".

    Returns:
        A ``(value, units)`` tuple of strings:
        * last token starts with a digit -> the token with letters and "/" removed, and
          the token with digits and "." removed, lower-cased (e.g. ``("250", "ml")``);
        * last token is "pack" -> ``(previous token, "pack")``;
        * anything else, including empty text or a lone "pack" -> ``("", "")``.
    """
    # split() without arguments ignores leading/trailing/repeated whitespace, so
    # "milk 2l " still yields "2l" as the last token.
    tokens = input_text.split()
    if not tokens:
        return "", ""

    last_token = tokens[-1]
    if last_token[0] in "0123456789":
        # the last token is a quantity: separate the numeric value from the units
        value = re.sub(r"[a-zA-Z/]*", "", last_token)
        units = re.sub(r"[0-9.]*", "", last_token).lower()
        return value, units

    if last_token == "pack":
        # "pack" on its own has no count in front of it (this used to raise IndexError)
        if len(tokens) < 2:
            return "", ""
        return tokens[-2], last_token

    # the last token is not a quantity
    return "", ""

In [29]:
# Remove any brand name from the text
def get_info(pattern, Categories, product_groups, input_text):
    """Split a product description into item name, category and quantity.

    Args:
        pattern: regular expression matching brand names; every match is removed from
            the lower-cased text.
        Categories: mapping of category name -> list of product groups, passed on to
            get_category.
        product_groups: candidate product-group strings for the fuzzy match.
        input_text: raw product description; the quantity, if present, is expected to
            be the last part of the text.

    Returns:
        dict with keys "item" (cleaned text, title-cased), "category" and "quantity"
        (value and units concatenated, "" when none was found).
    """
    input_text = input_text.lower()

    # Remove the quantity. Only the trailing occurrence is stripped, and the value/units
    # are escaped so characters such as "." or "+" are matched literally. Previously the
    # quantity was used as a raw regex and removed everywhere in the text, so a trailing
    # "8" also deleted the 8 in a name such as "v8 juice 8".
    value, units = get_quantity(input_text)
    quantity = value + units
    if quantity:
        # optional whitespace between value and units also covers the "12 pack" form
        trailing_quantity = r"(?<!\S)" + re.escape(value) + r"\s*" + re.escape(units) + r"\s*$"
        input_text = re.sub(trailing_quantity, "", input_text)

    # Remove the brand (pattern is applied as a regular expression).
    cleaned_text = re.sub(pattern, '', input_text).strip()

    # Fuzzy-match the remaining text to a product group, then map it to a category.
    best_match = process.extractOne(cleaned_text, product_groups)
    # extractOne returns None when there is nothing to match against
    product_group = best_match[0] if best_match is not None else None

    return {
        "item": cleaned_text.title(),
        "category": get_category(Categories, product_group),
        "quantity": quantity,
    }

In [30]:
# Try the four-argument get_info on one sample product description.
get_info(pattern, Categories, product_groups, "Em Wholefoods Hemp Oil Cold Pressed 250mL")

{'item': 'Hemp Oil Cold Pressed',
 'Category': 'Condiments',
 'quantity': '250ml'}

# Example

In [51]:
# Remove any brand name from the text
def get_info(input_text):
    """Verbose demo variant: print quantity, item and category for one description.

    Reads ``pattern``, ``product_groups`` and ``Categories`` from the notebook's globals.
    NOTE: this reuses the name ``get_info`` and therefore replaces the four-argument
    version defined earlier in the notebook once this cell has run.

    Args:
        input_text: raw product description; the quantity, if present, is expected to
            be the last part of the text.

    Returns:
        A sentence naming the best-matching product group and its match score.
    """
    input_text = input_text.lower()

    # Remove the quantity: only the trailing occurrence, with value/units escaped so they
    # are matched literally rather than interpreted as a regex.
    value, units = get_quantity(input_text)
    quantity = value + units
    if quantity:
        trailing_quantity = r"(?<!\S)" + re.escape(value) + r"\s*" + re.escape(units) + r"\s*$"
        input_text = re.sub(trailing_quantity, "", input_text)
    print("quantity:", quantity)

    # Remove the brand (pattern is applied as a regular expression).
    cleaned_text = re.sub(pattern, '', input_text).strip()
    print("item:", cleaned_text)

    # Fuzzy-match to a product group and look up its category.
    best_match = process.extractOne(cleaned_text, product_groups)
    # get_category takes the mapping as its first argument; calling it with the group
    # alone raised TypeError against the definition in this notebook.
    print("Category:", get_category(Categories, best_match[0]))

    return f"Best match product group: {best_match[0]} with a confidence of {best_match[1]}"

In [56]:
# Sample product descriptions; each result is followed by a blank-line separator.
sample_descriptions = [
    "Sunny queen eggs F/range X1 12pk 700gms",
    "Em Wholefoods Hemp Oil Cold Pressed 250mL",
    "Woolworths White Toast Soft 65Cy",
    "Spc Spaghetti Rich Tomato",
    "Patak's Butter Chicken Simmer Sauce Mild 450g",
    "%C 1/2 fam ht chicken 1each",
]
for description in sample_descriptions:
    print(get_info(description))
    print("\n")

quantity: 700gms
item: eggs f/range x1 12pk
Category: Meat & Poultry
Best match product group: eggs with a confidence of 90.0


quantity: 250ml
item: hemp oil cold pressed
Category: Condiments
Best match product group: oil with a confidence of 90.0


quantity: 65cy
item: white toast soft
Category: Bread & Bakery
Best match product group: toast with a confidence of 90.0


quantity: 
item: spaghetti rich tomato
Category: Other
Best match product group: spaghetti with a confidence of 90.0


quantity: 450g
item: butter chicken simmer sauce mild
Category: Other
Best match product group: free range chicken with a confidence of 85.5


quantity: 1each
item: %c 1/2 fam ht chicken
Category: Meat & Poultry
Best match product group: chicken wings with a confidence of 85.5


