In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m153.6/163.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import csv
import json
import pandas as pd
import requests
from transformers import pipeline
from tqdm import tqdm

In [None]:
ENTITY_LINKING_API = ''

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [None]:
# %cd '/content/gdrive/MyDrive/similarity'

Fine-tuning of the model on `data/extra_dataset`; this will result in a model that is saved inside `model/sentence_transformers`

In [None]:
!python continue_training_models.py -model sentence-transformers/roberta-base-nli-mean-tokens -model_type sentence_bert -extra_dataset data/extra_dataset

Transforming each "sentence"  in a corpus (`data/instructions_to_embed`) into an embedding representation

Load in test data

In [6]:
test_data = []

with open('data/extra_dataset/test.csv', 'r') as csvfile:
  csvdictreader = csv.DictReader(csvfile)
  for row in csvdictreader:
    test_sample = {
        'inp_instruction': row['anchor'],
        'inp_ingred_full_name': row['anchor_full_ingred_name'],
        'inp_ingred_matched_name': row['anchor_matched_ingred_name'],
        'ref_ingred_matched_name': row['pos_matched_ingred_name']
    }
    test_data.append(test_sample)

test_data[:10]

[{'inp_instruction': 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the chicken and marinade to the pan.',
  'inp_ingred_full_name': 'chicken breasts',
  'inp_ingred_matched_name': 'Chicken:Breast',
  'ref_ingred_matched_name': 'Tofu:Firm'},
 {'inp_instruction': 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.',
  'inp_ingred_full_name': 'butter',
  'inp_ingred_matched_name': 'Butter',
  'ref_ingred_matched_name': 'Olive oil'},
 {'inp_instruction': 'Blended Vegetable Soup: In a large pot over high heat, add extra-virgin oil, garlic, ginger and red onions.',
  'inp_ingred_full_name': 'extra virgin olive oil',
  'inp_ingred_matched_name': 'Olive oil',
  'ref_ingred_matched_name': 'Sunflower oil'},
 {'inp_instruction': 'Broccoli pesto pasta: Drain the pasta and return it to the pan.',
  'inp_ingred_full_name': 'pasta',
  'inp_ingred_matched_name': 'Pasta',
  'ref_ingred_matched_name': 'White rice'},
 {'inp_instruction': 'Vegetarian t

Load NER model

In [7]:
ner_model_path = 'food_ner'
ner = pipeline('ner', model=ner_model_path + '/bert/bert', ignore_labels=[])

#load labels json
labels_filename = ner_model_path + '/labels.json'
json_file = open(labels_filename, 'r')
json_object = json.loads(json_file.read())
json_file.close()

original_labels_dict = json_object['labels']

# load config file
config_filename = ner_model_path + '/bert/bert/config.json'
json_file = open(config_filename, 'r')
json_object = json.loads(json_file.read())
json_file.close()

label_dict = json_object['label2id']

def get_nes(ner_results, input_text):
  ne_strings = []
  named_entities = []
  current_ne = None
  ne_end = 0
  for result in ner_results:
    entity_label = result['entity']
    token = result['word']
    score = result['score']
    label_str = str(label_dict[entity_label])
    label = original_labels_dict[label_str]
    start_char = result['start']
    end_char = result['end']
    #print(token , label, score, start_char, end_char)
    if 'B-FOOD' in label:
      if current_ne == None:
        current_ne = (start_char, end_char, score)
      elif '##' in token:
        #named_entities.append(current_ne)
        current_ne = (current_ne[0], end_char, score)
    elif label == 'O':
      if '##' in token and current_ne!=None:
        current_ne = (current_ne[0], end_char, score)
      elif current_ne != None:
        named_entities.append(current_ne)
        current_ne = None

    elif 'I-FOOD' in label:
      if current_ne != None:
        current_ne = (current_ne[0], end_char, score)
      else:
        current_ne = (start_char, end_char, score)
  if current_ne != None:
    named_entities.append(current_ne)

  for ne in named_entities:
    ne_strings.append(input_text[ne[0]:ne[1]])

  return ne_strings

In [8]:
def get_normalised_food(url, ingredient):
    url = f'{url}{ingredient}'
    try:
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # If successful, return the response content and decode from bytes to string
            return response.content.decode('UTF-8')
        else:
            # If not successful, print an error message
            print(f"Failed to fetch data. Status code: {response.status_code}")
            return None
    except Exception as e:
        # Print any exception that occurred during the request
        print(f"An error occurred: {str(e)}")
        return None

In [9]:
def place_all_ingreds_in_instruction(instruction):
    # Read all ingredients
    ingreds = set()
    with open('data/extra_dataset/KB.json', 'r', encoding='utf-8') as file:
    # Load JSON data from the file
        data = json.load(file)
        for ingred in data:
            ingreds.add(ingred['ingredient'])

    # Save all instruction versions to .tsv
    with open('data/instructions_to_embed/input.tsv', 'w', newline='', encoding='utf-8') as tsv_file:
        writer = csv.writer(tsv_file, delimiter='\t')
        for ingred in ingreds:
            writer.writerow([instruction.replace('{0}', ingred.lower())])

Given a query, find the top 5 "sentences" in the corpus that are most similar to the query, based on their embeddings

In [None]:
for sample in tqdm(test_data):

  # Extract instruction and identify all food items
  instruction_without_recipe_name = sample['inp_instruction'][sample['inp_instruction'].find(':') + 1:]
  ner_results = ner(instruction_without_recipe_name)
  nes = get_nes(ner_results, instruction_without_recipe_name)

  # Identify food item closest to targeted item to be replced
  raw_string_to_replace = ''

  if(len(nes) == 1):
    raw_string_to_replace = nes[0]
  if(len(nes) > 1):
    for raw_entity in nes:
      normalised = get_normalised_food(ENTITY_LINKING_API, raw_entity)
      if(normalised == sample['inp_ingred_matched_name'] or
         normalised in sample['inp_ingred_matched_name'] or
         sample['inp_ingred_matched_name'] in normalised):
        raw_string_to_replace = raw_entity
        break

  # Replace ingredient with {0}
  if(raw_string_to_replace):
    instruction_text = instruction_without_recipe_name.replace(raw_string_to_replace, '{0}')
    query = sample['inp_instruction'][:sample['inp_instruction'].find(':') + 1] + instruction_text
  else:
    print('nope')
  print('-----------------')

  # Replace {0} with knowledge base foods to make all sentence suggestions
  place_all_ingreds_in_instruction(query)

  # Compute embeddings
  !python process_sentence_corpus.py -model model_iterations/2/roberta-base-nli-mean-tokens_continue_training_2024_03_03_13_27_15 -model_type sentence_bert -sentences data/instructions_to_embed -output data/output/

  # Filter and rank best suggestions
  query = r"'%s'"%(sample['inp_instruction'])
  query_ingredient =  r"'%s'"%(raw_string_to_replace)
  ref_ingredient =  r"'%s'"%(sample["ref_ingred_matched_name"])

  !python text_search.py -model model_iterations/2/roberta-base-nli-mean-tokens_continue_training_2024_03_03_13_27_15 -model_type sentence_bert -embeddings data/output/ -query {query} -query_ingredient {query_ingredient} -ref_ingredient {ref_ingredient}


  0%|          | 0/587 [00:00<?, ?it/s]

-----------------
['Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the buckwheat flour and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the shrimp and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the brawn and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the blackcurrant and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the pomfret and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the kippers and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the advocaat and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the sausage and marinade to the pan.', 'Easy chicken fajitas: Heat a griddle pan until smoking hot and\xa0add the rabbit and mari

  0%|          | 1/587 [00:48<7:56:11, 48.76s/it]

-----------------
['Moroccan Pastilla: Take a sheet of filo pastry and brush it with buckwheat flour.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with shrimp.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with brawn.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with blackcurrant.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with pomfret.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with kippers.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with advocaat.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with sausage.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with rabbit.', 'Moroccan Pastilla: Take a sheet of filo pastry and brush it with mirin.']
2024-04-01 14:40:56.003786: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been reg

  0%|          | 2/587 [01:35<7:41:51, 47.37s/it]

nope
-----------------
["'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'", "'Moroccan Pastilla: Take a sheet of filo pastry and brush it with melted butter.'"]
2024-04-01 14:41:42.659352: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: 

  1%|          | 3/587 [02:12<6:55:23, 42.68s/it]

-----------------
['Broccoli pesto pasta: Drain the buckwheat flour and return it to the pan.', 'Broccoli pesto pasta: Drain the shrimp and return it to the pan.', 'Broccoli pesto pasta: Drain the brawn and return it to the pan.', 'Broccoli pesto pasta: Drain the blackcurrant and return it to the pan.', 'Broccoli pesto pasta: Drain the pomfret and return it to the pan.', 'Broccoli pesto pasta: Drain the kippers and return it to the pan.', 'Broccoli pesto pasta: Drain the advocaat and return it to the pan.', 'Broccoli pesto pasta: Drain the sausage and return it to the pan.', 'Broccoli pesto pasta: Drain the rabbit and return it to the pan.', 'Broccoli pesto pasta: Drain the mirin and return it to the pan.']
2024-04-01 14:42:16.577435: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 14:42:16.577501: E external/local_xla/xla/stream_executor/cud

  1%|          | 4/587 [02:57<7:05:14, 43.76s/it]

-----------------
['Vegetarian tacos: For the buckwheat flour, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the shrimp, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the brawn, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the blackcurrant, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the pomfret, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the kippers, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the advocaat, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the sausage, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the rabbit, heat a frying pan over a medium heat.', 'Vegetarian tacos: For the mirin, heat a frying pan over a medium heat.']
2024-04-01 14:43:03.847182: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
20

  1%|          | 5/587 [03:45<7:17:17, 45.08s/it]

nope
-----------------
Traceback (most recent call last):
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/inference/_common.py", line 51, in <module>
    from ._text_generation import TextGenerationStreamResponse, _parse_text_generation_error
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/inference/_text_generation.py", line 35, in <module>
    if is_pydantic_available():
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_runtime.py", line 181, in is_pydantic_available
    from pydantic import validator  # noqa: F401
  File "<frozen importlib._bootstrap>", line 1075, in _handle_fromlist
  File "/usr/local/lib/python3.10/dist-packages/pydantic/__init__.py", line 383, in __getattr__
    module = import_module(module_na

  1%|          | 6/587 [03:50<5:04:59, 31.50s/it]

^C
-----------------


In [None]:
# !python text_search.py -model model_iterations/2/roberta-base-nli-mean-tokens_continue_training_2024_03_03_13_27_15 -model_type sentence_bert -embeddings data/output/ -query {query}