# FOOD VECTOR SEARCH

In [None]:
!pip install PyPDF2
!pip install chromadb

# 1 - Recommendation based on user query

### Importation des modules

In [None]:
# Retrieve the API keys (huggingface_api and gemini_api) through google.colab's userdata
from google.colab import userdata
huggingface_api = userdata.get('huggingface')
gemini_api = userdata.get('GOOGLE_API_KEY')

In [None]:
import chromadb
import huggingface_hub
from huggingface_hub import InferenceClient
import PyPDF2
import os
import re
import json

### Importation des données

In [None]:
from foodDataSet import foodItems

### Création des clients et de la BD

In [None]:
# Chroma client built with the default constructor arguments
chroma_client = chromadb.Client()
# Hugging Face inference client bound to the MiniLM sentence-embedding model,
# authenticated with the token loaded above
hf_client_emb = InferenceClient(
    "sentence-transformers/all-MiniLM-L6-v2",
    token=huggingface_api,
)

In [None]:
# Name of the Chroma collection used by section 1 (passed to main below)
collectionName = "food_collection"

In [None]:
# Embedding generation helper
def generateEmbeddings(texts):
  """Return the embeddings produced by the Hugging Face client for ``texts``.

  ``texts`` is forwarded unchanged to ``hf_client_emb.feature_extraction``;
  callers in this notebook pass either a single string or a list of strings.
  """
  return hf_client_emb.feature_extraction(texts)

### Développer la fonction du critère de recherche

#### Avec Gemini

In [None]:
import google.generativeai as genai

# Configure the Gemini SDK with the API key loaded earlier
genai.configure(api_key=gemini_api)
# Shared model handle used by the *_gemini helper functions of this notebook
# NOTE(review): confirm that 'gemini-pro' is still an available model name
model = genai.GenerativeModel('gemini-pro')


In [None]:
# Criterion lookup using the Gemini API
def extractFilterCriteria_gemini(query):
  """Classify ``query`` into one of the known diet/cuisine labels with Gemini.

  Args:
    query: Free-text user request.

  Returns:
    A string. When the model's answer, once cleaned, equals one of the labels
    (case-insensitively), that label is returned exactly as spelled in
    ``labels``. Otherwise the cleaned answer itself is returned so the caller
    can still inspect it.
  """

  labels = ["vegan", "non-vegan", "vegetarian", "non-vegetarian", "pescatarian", "omnivore", "paleo", "ketogenic", "chinese", "indian", "japanese", "autre"]


  prompt = f"""
    You are a powerful AI trained to classify text into one of the following categories.
    Here are the categories: {labels}.

    Your task is to determine which category best fits the following text.

    Text: "{query}"

    Please respond with one of the categories listed above that best describes the text.

    """

  response = model.generate_content(prompt)

  # The answer is free text: trim surrounding whitespace, quotes, backticks,
  # asterisks and a trailing period so that it can be compared to the labels.
  answer = response.text.strip().strip("\"'`*.").strip()
  normalized = answer.lower()

  if normalized in labels:
    return normalized

  return answer


# Quick manual check of the Gemini-based classifier
extractFilterCriteria_gemini("I want to eat Chocolate")

#### Avec Facebook Bart (open-source)

In [None]:
from transformers import pipeline

# Create a pipeline for text classification (zero-shot, facebook/bart-large-mnli)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
# Text classification helper
def classifyText(text, labels):
    """Run the zero-shot ``classifier`` on ``text`` with ``labels`` as candidates.

    The pipeline's result is returned unmodified.
    """
    return classifier(text, candidate_labels=labels)

# Filter-criteria extraction
def extractFilterCriteria(query, threshold=0.8):
  """Extract diet and cuisine filter criteria from a free-text query.

  The query is classified against the diet labels and, independently, against
  the cuisine labels. Evaluating both groups lets a query such as
  "vegan chinese food" yield a diet AND a cuisine; previously the cuisine was
  only looked at when no diet had been retained.

  Args:
    query: Free-text user request.
    threshold: Score the first (best) candidate must exceed to be retained.
      Defaults to 0.8, the value previously hard-coded.

  Returns:
    A dict ``{"diet": [...], "cuisine": [...]}`` where each list holds at most
    one retained label.
  """

  labelGroups = {
      "diet": ["vegan", "non-vegan", "vegetarian", "non-vegetarian", "pescatarian", "omnivore", "paleo", "ketogenic"],
      "cuisine": ["chinese", "indian", "japanese"],
  }
  criteria = {
      "diet":[], "cuisine":[]
  }

  for criterion, candidateLabels in labelGroups.items():
    result = classifyText(query, candidateLabels)
    print(result)

    # Index 0 is treated as the best candidate, as the original code did.
    bestLabel = result["labels"][0]
    bestScore = result['scores'][0]

    if bestScore > threshold:
      criteria[criterion].append(bestLabel)

  print('Extracted Filter Criteria:', criteria)

  return criteria

# Quick manual check of the zero-shot classifier with a French query
extractFilterCriteria("j'aime les repas de l'asie de l'Est")

### Recherche de similarité (interrogation)

In [None]:
def performSimilaritySearch(collection, queryTerm, n_results=5):
  """Query ``collection`` for the items closest to ``queryTerm``.

  Args:
    collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
    queryTerm: Text to embed and search for.
    n_results: Maximum number of hits requested (default 5, the value that
      was previously hard-coded).

  Returns:
    A list with one dict per hit. Every dict carries the same three lists,
    ``'ids'``, ``'distances'`` and ``'food_names'``, covering all hits; this
    shape is kept because callers read ``result[0]["food_names"]``.
    An empty list is returned when nothing matched or when an error occurred
    (the error is printed).
  """

  try:

    query_embedding = generateEmbeddings([queryTerm])

    results = collection.query(
        query_embeddings= query_embedding,
        n_results= n_results,
    )

    # `results` is indexed by the keys 'ids', 'distances' and 'metadatas', so
    # its len() counts keys and is never 0. Emptiness has to be checked on the
    # hits of the first (and only) query instead.
    hitIds = list(results['ids'][0])

    if not hitIds:
      print("Aucun document trouvé")
      return []

    summary = {
        'ids': hitIds,
        'distances': list(results['distances'][0]),
        'food_names': [metadata['food_name'] for metadata in results['metadatas'][0]],
    }

    # One independent copy of the summary per hit, to keep the historical shape.
    return [
        {key: list(values) for key, values in summary.items()}
        for _ in hitIds
    ]

  except Exception as e:
    print("Exception:", e)
    # Keep the return type consistent instead of implicitly returning None.
    return []


### Fonction principale

In [None]:
def main(collection_name, query):
  """Index ``foodItems`` in a Chroma collection and print recommendations.

  Steps: get or create the collection, embed a text built from each food's
  name, description and ingredients, add everything to the collection, then
  run a similarity search for ``query`` and print the recommended food names.

  Args:
    collection_name: Name of the Chroma collection to get or create.
    query: Free-text user request.

  Returns:
    None. Any exception is caught and printed.
  """

  try:

    collection = chroma_client.get_or_create_collection(name=collection_name)
    print("Collection created or retrieved successfully!")

    uniqueIds = [f"{food['food_id']}_{i}" for i, food in enumerate(foodItems)]
    foodTexts = [
      f"{food['food_name']}. {food['food_description']}. Ingredients: {', '.join(food['food_ingredients'])}"
      for food in foodItems
    ]
    embeddingsData = generateEmbeddings(foodTexts)
    metadata = [
      {
        'food_name': food['food_name']
      } for food in foodItems
    ]

    # Add the ids, documents, metadata and embeddings to the collection
    collection.add(
        ids = uniqueIds,
        documents = foodTexts,
        metadatas = metadata,
        embeddings= embeddingsData
    )

    initialResults = performSimilaritySearch(collection, query)

    # The search helper may hand back None (after an error), a message string
    # or an empty list; indexing any of those would raise an opaque error.
    if not initialResults or isinstance(initialResults, str):
      print("Aucun résultat trouvé.")
      return

    for index, item in enumerate(initialResults[0]["food_names"], start=1):
      print(f"Top {index} Recommended Food Name: {item}")

  except Exception as e:

    print("Exception:", e)

In [None]:
# Run the main function of section 1
query = "egg, butter and rice"
main(collectionName, query)

In [None]:
# Delete the collection
chroma_client.delete_collection(collectionName)

# 2 -  Food recommendations from a PDF recipe

### Uploader les données

In [None]:
# Download the chocolate torte recipe PDF into the working directory
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/HoMe0o66TlJJ-WrIcR_8HQ/Chocolate-torte-Recipe.pdf

In [None]:
# Download the crumble pie recipe PDF into the working directory
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/GvUxpXUD-oy1h5z-qKoVFg/crumble-pie.pdf

In [None]:
# Rebinds collectionName: from here on it designates the recipe collection of section 2
collectionName = "recipe_food"

In [None]:
from PyPDF2 import PdfReader

def extractTextFromPDF(filepath):
  """Extract the text of every page of a PDF as one single-line string.

  Args:
    filepath: Path of the PDF file, handed to ``PdfReader``.

  Returns:
    The page texts joined by a single space, with every run of whitespace
    (new lines included) collapsed to one space and both ends trimmed.
  """

  # Open the PDF file
  reader = PdfReader(filepath)

  # `or ""` guards against a page for which no text is returned. Joining the
  # pages with a space keeps the last word of a page from fusing with the
  # first word of the next one.
  pageTexts = [(page.extract_text() or "") for page in reader.pages]

  # split()/join collapses whitespace runs of any length; a single
  # replace("  ", " ") pass would leave longer runs only partly reduced.
  return " ".join(" ".join(pageTexts).split())
#text = extractTextFromPDF("crumble-pie.pdf")

In [None]:
def promptUserInput(query):
  """Ask the user for a file path on standard input and return it as a string.

  ``query`` is accepted but is not used.
  """
  return str(input("Entrer le chemin du fichier : "))

In [None]:
import ast

def extractIngredients_gemini(text):
  """Ask Gemini for the ingredients found in a recipe text.

  Args:
    text: Raw recipe text.

  Returns:
    A list of the distinct ingredients, in the order in which the model
    listed them.

  Raises:
    ValueError / SyntaxError: When the model's answer cannot be parsed as a
      Python literal, or (ValueError) when the literal is not a list-like
      value.
  """


  prompt = f"""

    Act as an expert in text processing and structured data extraction. Your task is to extract
    only the ingredients from the following recipe text while maintaining without their quantities
    and units.  Remove any quantities !!!,  Return the list in a structured format

    Here is the text:{text}

    Return the result as a python list where each ingredient is a separate item !
    The result must be only a list like : ['first ingredient', 'second ingredient', ...] !!!

    """

  response = model.generate_content(prompt)

  # Keep only the outermost [...] span, in case the answer wraps the list in a
  # markdown code fence or surrounds it with prose; literal_eval would
  # otherwise reject the whole answer.
  rawAnswer = response.text.strip()
  start = rawAnswer.find("[")
  end = rawAnswer.rfind("]")
  if start != -1 and end > start:
    rawAnswer = rawAnswer[start:end + 1]

  liste_ingredients = ast.literal_eval(rawAnswer)

  # A bare string would otherwise be silently split into characters below.
  if not isinstance(liste_ingredients, (list, tuple, set)):
    raise ValueError("Gemini response is not a list of ingredients")

  # dict.fromkeys removes duplicates while keeping the first-seen order
  # (list(set(...)) returned the items in an arbitrary order).
  ingredients = list(dict.fromkeys(liste_ingredients))

  return ingredients

#extractFilterCriteria_gemini(text)

In [None]:
def storeEmbeddingsInChromaDB(foodItems, collection_name=None):
  """Embed the ingredients of each food item and store them in Chroma.

  Args:
    foodItems: Iterable of dicts read through the keys ``'food_id'``,
      ``'food_name'`` and ``'food_ingredients'``.
    collection_name: Name of the collection to get or create. Defaults to
      the module-level ``collectionName``.

  Returns:
    The collection, or ``None`` when it could not be obtained. If only the
    ``add`` call fails, the collection is still returned. Storage errors are
    printed, not raised.
  """

  if collection_name is None:
    collection_name = collectionName

  metadatas = [{'food_name': food['food_name']} for food in foodItems]

  # One embedding per item, computed from its space-joined ingredients
  foodEmbeddings = [
      generateEmbeddings(" ".join(item["food_ingredients"]))
      for item in foodItems
  ]

  ids = [f"{food['food_id']}_{i}" for i, food in enumerate(foodItems)]

  foodTexts = [
      f"Ingredients: {', '.join(food['food_ingredients'])}"
      for food in foodItems
    ]

  # Bound before the try block so that the final `return` cannot hit an
  # unbound local when get_or_create_collection itself raises.
  collection = None

  try:

    collection = chroma_client.get_or_create_collection(name=collection_name)
    print("ok")

    collection.add(
        ids = ids,
        documents = foodTexts,
        metadatas = metadatas,
        embeddings= foodEmbeddings
    )
    print("Embeddings stocker dans chromaDB ")

  except Exception as e:
    print("Exception store :", e)

  return collection

In [None]:
def main(filepath='Chocolate-torte-Recipe.pdf'):
  """Recommend foods whose ingredients are closest to those of a PDF recipe.

  NOTE: this definition rebinds the name ``main`` and therefore replaces the
  two-argument ``main(collection_name, query)`` defined earlier in the
  notebook.

  Steps: store the embeddings of ``foodItems``, extract the recipe text from
  the PDF, ask Gemini for its ingredients, then search the collection with
  the space-joined ingredients and print the recommended food names.

  Args:
    filepath: Path of the recipe PDF. Defaults to the file that was
      previously hard-coded.

  Returns:
    None. Any exception is caught and printed.
  """

  try:

    collection = storeEmbeddingsInChromaDB(foodItems)
    text = extractTextFromPDF(filepath)
    ingredients = extractIngredients_gemini(text)

    if not ingredients:
      print("Aucun ingrédient trouvé")
      return

    print("Ingredients extraient : ", ingredients)

    recipeQuery = " ".join(str(ingredient) for ingredient in ingredients)

    # Reuse the shared search helper instead of duplicating the query and
    # result-shaping code here.
    top_foodItems = performSimilaritySearch(collection, recipeQuery)

    # The helper may hand back None, a message string or an empty list.
    if top_foodItems and not isinstance(top_foodItems, str):
      for index, item in enumerate(top_foodItems[0]["food_names"], start=1):
        print(f"Top {index} Recommended Food Name: {item}")
    else:
      print("Aucun résultat trouvé.")

  except Exception as e:

    print("Exception main ", e)

In [None]:
# Run the main function of section 2
main()

In [None]:
# Delete the collection
chroma_client.delete_collection(collectionName)