# OpenFoodFacts food categorization 

Leverage an LLM to classify food products found in OpenFoodFacts.

<img src="TopicModelingWithSyntheticData2.png"/>

In [5]:
from pymongo import MongoClient
from pprint import pprint
from collections import Counter
from tqdm.notebook import trange, tqdm

from langchain.chat_models import ChatOpenAI

import pandas as pd
import re
import os
import sqlite3 
import requests


In [6]:
# Connect to Mongo and read the collection where the data was imported.
# MongoClient() is called without arguments, so the driver's default connection settings apply.
mongo = MongoClient()
db = mongo['openfoodfacts']
products = db['products']

In [7]:
# Fetch a single document; the empty filter matches any document in the collection.
one_record = products.find_one({})

## Randomly select 5,000 products 

In [8]:
# Draw 5,000 documents with the $sample aggregation stage and materialise the cursor into a list.
# No seed is involved, so a re-run will generally return a different set of products.
random_records = list(products.aggregate([{'$sample': { 'size': 5000 }}]))
len(random_records)

5000

In [9]:
# Flatten the documents into one table; nested keys become dot-separated column names
# (e.g. `languages_codes.ab`), which makes the resulting frame very wide.
df_random_records = pd.json_normalize(random_records)
df_random_records.head()

Unnamed: 0,_id,nutrition_grades,allergens_from_ingredients,ingredients_text_fr_debug_tags,id,emb_codes_debug_tags,amino_acids_prev_tags,categories_tags,categories_lc,brands_debug_tags,...,ingredients_text_en_ocr_1656609873_result,ecoscore_extended_data.impact.likeliest_recipe.en:wheat_malt,ecoscore_extended_data.impact.likeliest_recipe.fr:matiere_grasse_de_lait_anhydre,ecoscore_extended_data.impact.likeliest_recipe.en:palm_kernel_fat,ecoscore_extended_data.impact.likeliest_recipe.en:glycerol_monostearate,product_name_ab,ingredients_text_with_allergens_ab,ingredients_text_ab,languages.en:abkhaz,languages_codes.ab
0,3270160117444,c,,[],3270160117444,[],[],"[en:meals, en:gratins, en:potato-gratin, en:da...",fr,[],...,,,,,,,,,,
1,5000171031587,unknown,,,5000171031587,,,,,,...,,,,,,,,,,
2,206288016409,unknown,,,206288016409,,[],,,,...,,,,,,,,,,
3,2000000057613,unknown,"en:eggs, œufs, lait",[],2000000057613,[],[],"[en:meals, fr:omelettes, fr:omelette-avec-garn...",fr,[],...,,,,,,,,,,
4,659422216430,e,"en:gluten, wheat, malted barley flour, soy lec...",,659422216430,,[],"[en:snacks, en:sweet-snacks, en:biscuits-and-c...",en,[],...,,,,,,,,,,


## Extract information from records

We will use each record's key:value information to build a text that is sent to an LLM,
which generates a representative description that we can use for classification. Loop through all records.

In [15]:
%%time

# Values containing any of these tokens (bookkeeping words, digits, ':', '-', '_', '.')
# are treated as noise and left out of the per-product text.
pattern = r"\b(test|%|test|hi|form|web|full|kcal|known|true|false|moderate)\b|:|\d+|\-|_|\."
# Compile once instead of resolving the pattern again for every field of every row
noise_regex = re.compile(pattern)
# Keep track of frequent terms for post processing; each value is counted at most
# once per product because duplicates within a product are skipped below.
frequent_terms = Counter()
# All data: one dict of {field: text} per product, in row order
product_data_all = []

for _, row in df_random_records.iterrows():
    # A set (not a list) keeps the duplicate check O(1); the normalized rows are very wide.
    seen_text = set()
    product_data = {}
    # Check for text fields and keep only those
    for field, data in row.items():
        # Transform a non-empty list into a comma-joined str, but only if all items are str
        if isinstance(data, list) and data and all(isinstance(item, str) for item in data):
            data = ",".join(data)

        # Keep only non-empty strings that were not already seen for this product
        if not isinstance(data, str) or not data or data in seen_text:
            continue
        frequent_terms[data] += 1
        seen_text.add(data)
        if not noise_regex.search(data):
            product_data[field] = data
    product_data_all.append(product_data)

CPU times: total: 2min 41s
Wall time: 2min 54s


In [16]:
len(product_data_all)

5000

In [17]:
product_data_all[0:3]

[{'nutrition_grades': 'c',
  'categories_lc': 'fr',
  'nutrition_data': 'on',
  'product_name': 'GRATIN DAUPHINOIS',
  'nova_group_debug': 'no nova group when the product does not have ingredients',
  'creator': 'kiliweb',
  'brands': 'Picard',
  'ecoscore_grade': 'b',
  'brands_tags': 'picard',
  'traces_from_user': '(fr) ',
  'categories': 'Plats préparés, Gratins, Gratins de pomme de terre, Gratins dauphinois',
  'pnns_groups_1': 'Composite foods',
  'nova_groups_tags': 'unknown',
  'countries': 'France',
  'category_properties.ciqual_food_name:fr': 'Gratin dauphinois',
  'nutriments.sodium_unit': 'g',
  'nutrient_levels.sugars': 'low'},
 {'nutrition_grades': 'unknown',
  'lang': 'fr',
  'labels_lc': 'en',
  'product_name': 'John west',
  'nova_group_debug': 'no nova group when the product does not have ingredients',
  'creator': 'kiliweb',
  'brands': 'John West',
  'traces_from_user': '(fr) ',
  'nutrition_score_debug': 'no score when the product does not have a category',
  'last

In [18]:
# Values counted more than 3 times in frequent_terms are treated as common - to filter
common_words = [term for term, count in frequent_terms.most_common() if count > 3]
len(common_words)

5025

### Clean common values 

In [19]:
%%time
# Clean out common words
# Build a set once: `common_words` is a list with thousands of entries, so testing
# membership against it directly would be a linear scan for every single field value.
common_word_set = set(common_words)
product_data_all_final = [
    {field: value for field, value in product.items() if value not in common_word_set}
    for product in product_data_all
]

CPU times: total: 2 s
Wall time: 2.26 s


In [20]:
product_data_all_final[0:5]

[{'product_name': 'GRATIN DAUPHINOIS',
  'categories': 'Plats préparés, Gratins, Gratins de pomme de terre, Gratins dauphinois',
  'category_properties.ciqual_food_name:fr': 'Gratin dauphinois'},
 {'product_name': 'John west', 'brands': 'John West'},
 {'product_name': 'Croissant beurre', '_keywords': 'beurre,croissant'},
 {'correctors_tags': 'teolemon,beniben,packbot,greensky',
  'product_name': 'Omelette Nature Taux de Sel Réduit',
  'debug_param_sorted_langs': 'fr,lc',
  'informers_tags': 'teolemon,beniben,greensky',
  'labels': 'Peu ou pas de sel, Peu de sel, Sans gluten, Allégé en sel, Taux de sel réduit',
  'editors_tags': 'beniben,packbot,greensky,teolemon',
  'last_modified_by': 'greensky',
  'ecoscore_data.agribalyse.name_fr': 'Omelette au fromage',
  'ecoscore_data.agribalyse.name_en': 'Omelette, with cheese'},
 {'product_name': 'Too good gourmet, chocolate chip cookies',
  'brands': 'Too Good Gourmet',
  '_keywords': 'sweet,too,biscuit,good,snack,gourmet,cookie,cake,and,chip,

In [21]:
def generate_text(product_dict):
    """Render a product dictionary as one string of comma-separated ``key:value`` pairs.

    Args:
        product_dict: Mapping of field name to value for a single product.

    Returns:
        The pairs in the dictionary's iteration order, joined by ", ";
        an empty string when the dictionary is empty.
    """
    pairs = []
    for key, value in product_dict.items():
        pairs.append(f"{key}:{value}")
    return ", ".join(pairs)

In [23]:
generate_text(product_data_all_final[2001])

'product_name:Hamburger slices, _keywords:snack,salted,slice,hamburger'

## Generate Synthetic Text 

Call LLM with product data to generate well defined texts that can be used for categorization. 
The idea is to leverage the power of an LLM to enhance our data.

In [12]:
# Definition Generator class
from langchain.schema import HumanMessage
from langchain.schema import SystemMessage
from langchain.chat_models import ChatOpenAI

# System prompt that guides the LLM to write a text based on our key:value data.
# Each line ends with a backslash continuation, so lines are joined with no newline
# between them; every join therefore needs an explicit space (the first line used to
# end in "about\" and produced "abouta food product").
FOOD_PROMPT = """You are a helpful text creator that takes incomplete information about \
a food product and create well written explanation of what you are given. You may be given\
 a combination of key value pairs of information such as product_name: Rillettes de cabillaud \
 which may be comma delimited with other information. If you can't create a well written text, just write \
 NO INFORMATION FOUND."""

# Thin wrapper around the chat model used to turn product data into prose
class OpenAIDefinitionGenerator():
    """Generate a written description of a food product with a ChatOpenAI model."""

    def __init__(self, verbose=False):
        """Create the underlying ChatOpenAI client with temperature 0.

        Args:
            verbose: Accepted but not used by this constructor.
        """
        self.agent = ChatOpenAI(temperature=0)

    def generate_definition(self, text):
        """Ask the model to describe a product.

        FOOD_PROMPT is sent as the system message and `text` as the human message.

        Args:
            text: Product information, e.g. the string built by generate_text().

        Returns:
            The `content` of the model's reply, or None when the reply object is falsy.
        """
        reply = self.agent([
            SystemMessage(content=FOOD_PROMPT),
            HumanMessage(content=text),
        ])
        if not reply:
            return None
        return reply.content

In [13]:
# Instantiate the generator used for the LLM calls below.
# Note: __init__ accepts `verbose` but does not use it, so verbose=True has no effect here.
chatGPTdefinitionGenerator = OpenAIDefinitionGenerator(verbose=True)

## Test Text Generation

In [31]:
product_data_all_final[62]

{'product_name': 'Petit Beurre Pepite De Chocolat Pocket',
 'brands': 'Bisson',
 '_keywords': 'biscuit,sucre,snack,de,pepite,chocolat,bio,beurre,gateaux,sable,et,petit,pocket,bisson',
 'brands_tags': 'bisson',
 'categories': 'Snacks, Snacks sucrés, Biscuits et gâteaux, Biscuits, Biscuits sablés'}

In [33]:
# Test
chatGPTdefinitionGenerator.generate_definition(generate_text(product_data_all_final[62]))

"Petit Beurre Pepite De Chocolat Pocket by Bisson is a delightful snack that combines the classic taste of Petit Beurre biscuits with the indulgence of chocolate chips. Made with organic ingredients, these biscuits are a perfect treat for those who enjoy a sweet and satisfying snack.\n\nThe Bisson brand is known for its high-quality products, and their Petit Beurre Pepite De Chocolat Pocket is no exception. These biscuits are carefully crafted to ensure a perfect balance of flavors and textures. The buttery and crumbly texture of the biscuits pairs perfectly with the rich and creamy chocolate chips, creating a truly irresistible combination.\n\nWhether you're looking for a quick snack on the go or a sweet treat to enjoy with a cup of tea, these biscuits are a great choice. They are conveniently packaged in a pocket-sized format, making them easy to carry and enjoy wherever you are.\n\nIn terms of categories, Petit Beurre Pepite De Chocolat Pocket falls under the Snacks, Snacks sucrés, 

## Prepare text for feeding to LLM

In [34]:
# One prompt-ready string per product, in the same order as product_data_all_final
final_texts = [generate_text(product) for product in product_data_all_final]

In [35]:
len(final_texts)

5000

In [43]:
product_data_all_final[512]

{'product_name': 'Arno Shortbread',
 'brands': 'Arnotts',
 '_keywords': 'arno,arnott,shortbread',
 'brands_tags': 'arnotts'}

## Save into local DB 

Create a SQLite cache of the generated data, so that texts which were already generated are not sent to the LLM again.

In [124]:
conn = sqlite3.connect('synthetic_data.db')
c = conn.cursor()
# Create the cache table on first use. IF NOT EXISTS makes this a no-op when the
# database file already contains the table, so the cell is safe to re-run and the
# notebook also works against a fresh database file.
c.execute('''CREATE TABLE IF NOT EXISTS generated_texts
             (id INTEGER PRIMARY KEY, text TEXT, generated_text TEXT)''')
conn.commit()


## Call LLM to Generate Synthetic Data

In [125]:
%%time
# Generate a description for every text that is not in the cache yet.
# try/finally guarantees the connection is closed even if an API call raises part-way
# through this multi-hour loop. Each row is committed immediately, so a re-run
# (after re-opening the connection) resumes where the previous run stopped.
try:
    for text in final_texts:
        # Only existence matters here, so avoid fetching the stored text itself
        c.execute("SELECT 1 FROM generated_texts WHERE text = ? LIMIT 1", (text,))
        if c.fetchone() is not None:
            continue
        # Text does not exist, generate synthetic text
        generated_text = chatGPTdefinitionGenerator.generate_definition(text)
        c.execute("INSERT INTO generated_texts (text, generated_text) VALUES (?, ?)", (text, generated_text))
        conn.commit()
finally:
    conn.close()

CPU times: total: 1min 3s
Wall time: 5h 54min 12s


At this point we have leveraged our LLM to create synthetic data for our classification task.
This can be seen in the next notebook.