In [182]:
import pandas as pd
import json
import re
from fractions import Fraction
import typing
import os
import ast
from googletrans import Translator, constants
from datasets import load_dataset
## Loads the "recipe_dataset" train split; the dataset is cached, so calls
## after the first one run faster.
dataset = load_dataset("recipe_dataset", split="train")
dataset

Found cached dataset csv (/home/adil/.cache/huggingface/datasets/csv/recipe_dataset-64f57f729428df26/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Dataset({
    features: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER'],
    num_rows: 2231142
})

In [183]:
# Materialise only the first 1000 dataset rows as a DataFrame; the cells below
# operate on this sample rather than on the full dataset.
recipe_df: pd.DataFrame = pd.DataFrame(dataset[:1000])

In [184]:

# Maps an American unit abbreviation to the replacement text that
# `replace_units` substitutes for it. Keys are matched against lower-cased text.
unitConversions: dict = {
    'c.': '236.588 ml',
    'tsp.': '4.92892 ml',
    'tbsp.': '14.7868 ml',
    'oz.': '29.5735 ml',
    'pt.': '473.176 ml',
    'qt.':' 946.353 ml',
    'lb.': '453.592 grams',
    'gal.': '3785.41 ml',
    ' oz': ' 29.5735 ml',
}

def replace_units(text: str) -> str:
    """Lower-case ``text`` and replace American unit abbreviations with metric text.

    Every key of ``unitConversions`` is substituted by its value, in the
    dictionary's insertion order. Keys that begin with a letter (``'c.'``,
    ``'pt.'``, ...) are only replaced when they are not preceded by another
    letter, so ordinary words that merely end in the same characters
    (``"etc."``, ``"garlic."``) are left alone.

    Args:
        text: Ingredient text; may contain several ingredients.

    Returns:
        The lower-cased text with the unit abbreviations replaced.
    """
    text = text.lower()
    for key, value in unitConversions.items():
        pattern = re.escape(key)
        if key[:1].isalpha():
            # `[^\W\d_]` matches any letter; the negative lookbehind therefore
            # rejects matches that start in the middle of a word while still
            # allowing a digit, quote, bracket or space in front of the unit.
            pattern = r'(?<![^\W\d_])' + pattern
        # A callable replacement inserts `value` verbatim (no escape processing).
        text = re.sub(pattern, lambda _match, metric=value: metric, text)
    return text
def replace_fractions(text:str)-> str:
    """Lower-case ``text`` and rewrite fractions as decimal numbers.

    Matches a fraction such as ``3/4``, optionally preceded by a whole number
    and a dash or whitespace character (``1 1/2``, ``2-1/4``), and hands each
    match to ``frac2string`` to produce the replacement text.

    Args:
        text: Ingredient text that may contain fractions.

    Returns:
        The lower-cased text with every matched fraction replaced.
    """
    # Group 1: optional whole-number part; group 2: the "numerator/denominator" text.
    fraction_pattern = re.compile(r'(?:(\d+)[-\s])?(\d+/\d+)')
    return fraction_pattern.sub(frac2string, text.lower())
def frac2string(s):
    """Turn a fraction regex match into its decimal string (helper for replace_fractions).

    Args:
        s: Match object with two groups: an optional whole-number part and a
           ``"numerator/denominator"`` string. A missing whole part counts as 0.

    Returns:
        ``str(whole + fraction)`` as a float string, e.g. ``"1.5"`` for
        ``"1 1/2"``. If the denominator is zero the matched text is returned
        unchanged, so one malformed quantity cannot abort a whole conversion run.
    """
    whole, fraction_text = s.groups(0)  # default 0 fills in an absent whole part
    try:
        fraction = Fraction(fraction_text)
    except ZeroDivisionError:
        # "n/0" is not a usable quantity; leave the original text in place.
        return s.group(0)
    return str(int(whole) + float(fraction))

# Function to simplify ingredients by only keeping the last word
def keep_last_word(item):
    """Reduce every ingredient name in a stringified list to its last word.

    Args:
        item: String containing a Python/JSON-style list literal of ingredient
              names, e.g. ``'["brown sugar", "milk"]'``.

    Returns:
        ``str()`` of the list of last words, e.g. ``"['sugar', 'milk']"``.
        Blank or whitespace-only names are skipped.

    Raises:
        ValueError, SyntaxError: If ``item`` is not a valid literal.
    """
    # literal_eval only accepts literals, so text coming from the dataset can
    # never execute code the way eval() could.
    names = ast.literal_eval(item)
    # A blank entry has no last word; skip it instead of raising IndexError.
    words = [name.split()[-1] for name in names if name.strip()]
    return str(words)

def convert_and_multiply_units(text: str) -> list:
    """Collapse runs of consecutive numbers in each ingredient into their product.

    ``text`` is parsed as a JSON list of ingredient strings. Each string is split
    on whitespace and parentheses; every token that parses as a float is
    converted, and a number directly following another number is multiplied
    into it. For example ``"1 0.5 0.24 ml"`` becomes ``"0.12 ml"``.

    Args:
        text: JSON-encoded list of ingredient strings.

    Returns:
        A list with one rewritten string per ingredient; tokens are re-joined
        with single spaces and numbers are rendered as floats.
    """
    rewritten = []
    for raw_ingredient in json.loads(text):
        # Splitting on "(" / ")" also yields empty strings; drop them.
        tokens = [tok for tok in re.split(r'\s+|\)|\(', raw_ingredient) if tok]
        merged: list = []
        for tok in tokens:
            try:
                number = float(tok)
            except ValueError:
                # Not numeric: keep the word as-is.
                merged.append(tok)
                continue
            if merged and isinstance(merged[-1], float):
                # Previous token was a number too: fold both into one product.
                merged[-1] = merged[-1] * number
            else:
                merged.append(number)
        rewritten.append(" ".join(str(part) for part in merged))
    return rewritten

# Normalise the raw ingredient text in three passes:
# unit abbreviations -> decimal fractions -> multiplied quantities.
recipe_df['ingredients'] = (
    recipe_df['ingredients']
    .apply(replace_units)
    .apply(replace_fractions)
    .apply(convert_and_multiply_units)
)
# Keep only the last word of every NER entry.
recipe_df['NER_simple'] = recipe_df['NER'].apply(keep_last_word)

recipe_df['NER_simple'].head()

0    ['sugar', 'milk', 'vanilla', 'nuts', 'butter',...
1                 ['beef', 'breasts', 'soup', 'cream']
2    ['corn', 'cheese', 'butter', 'powder', 'salt',...
3               ['chicken', 'gravy', 'soup', 'cheese']
4     ['butter', 'crumbs', 'butter', 'sugar', 'chips']
Name: NER_simple, dtype: object

In [185]:
# helper functions
def translator(output_path='supermarket_data/ah_products_en.csv', write=True,
               input_path='supermarket_data/supermarket.json') -> pd.DataFrame:
  """Return the supermarket products with an English name column ``l_en``.

  If ``output_path`` already exists it is read and returned as-is. Otherwise
  the products are taken from the ``'d'`` field of the first row of the JSON
  file at ``input_path``, each name in column ``'n'`` is translated from Dutch
  to English with googletrans, and the result is stored in ``'l_en'``.

  Args:
    output_path: CSV file used as a cache for the translated products.
    write: When True, save freshly translated products to ``output_path``.
    input_path: JSON file with the raw supermarket data.

  Returns:
    DataFrame of products including the ``l_en`` column.
  """
  if os.path.isfile(output_path): # if file exists skip
    print(f'File {output_path} already exists, skipping translation')
    return pd.read_csv(output_path)

  supermarket_df = pd.read_json(input_path, encoding='UTF-8')
  ah_products = pd.DataFrame(supermarket_df.iloc[0]['d'])
  # Distinct name so the client object does not shadow this function.
  nl_to_en = Translator()
  ah_products['l_en'] = [
    nl_to_en.translate(product, src="nl", dest="en").text
    for product in ah_products['n']
  ]

  if write:
    # index=False keeps the row index out of the file; otherwise it comes back
    # as an extra 'Unnamed: 0' column whenever the cached file is loaded, so the
    # cached and freshly translated frames would have different columns.
    ah_products.to_csv(output_path, index=False)
    print(f'Wrote translated products to {output_path}')
  else:
    print("Done translating inplace")
  return ah_products

def product_price_quantity_returner_per_ingredient(product: str, supermarket_df: pd.DataFrame) -> tuple:
  """Find the product with the shortest English name that contains ``product``.

  Args:
    product: Ingredient text to look for. It is matched literally and
      case-sensitively as a substring of the ``l_en`` column.
    supermarket_df: Products with at least the columns ``l_en``, ``p`` and ``s``.

  Returns:
    A ``(l_en, p, s)`` tuple taken from the matching row with the shortest
    ``l_en``. When nothing matches (or ``product`` is empty) a message is
    printed and ``('NaN', 'NaN', 'NaN')`` is returned.
  """
  needle = str(product)
  if needle:
    # regex=False: ingredient text is plain text, so characters such as '+' or
    # '(' must not be read as a pattern. na=False: rows without a name count as
    # "no match" instead of producing an NA mask that cannot be used to index.
    matches = supermarket_df['l_en'].str.contains(needle, regex=False, na=False)
    candidate_products = supermarket_df[matches]
  else:
    # An empty string is contained in every name; treat it as "nothing found".
    candidate_products = supermarket_df.iloc[0:0]
  if len(candidate_products) == 0:
    print(f'No products found for {product}')
    return 'NaN', 'NaN', 'NaN'
  # Use the index of the shortest matching name to pick the row.
  min_length_idx = candidate_products['l_en'].str.len().idxmin()
  return (
    candidate_products.loc[min_length_idx, 'l_en'],
    candidate_products.loc[min_length_idx, 'p'],
    candidate_products.loc[min_length_idx, 's'],
  )

In [186]:

def mass_recipe_converter(text: list, products_df: pd.DataFrame) -> list:
    """Look up a supermarket product for every ingredient of one recipe.

    Args:
        text: String holding a Python list literal of ingredient names (despite
            the ``list`` annotation, the value is used as a string here).
        products_df: Product table handed on to
            ``product_price_quantity_returner_per_ingredient``.

    Returns:
        A ``(products, prices, quantities)`` tuple of three lists, each with one
        entry per ingredient, in ingredient order.
    """
    # Every '+' is replaced by 'NaN' before parsing (behaviour kept as-is).
    # NOTE(review): ast.literal_eval itself accepts '+' inside string literals,
    # so this substitution is presumably there for the product lookup — confirm
    # before removing it.
    sanitized = text.replace('+', 'NaN')
    matches = [
        product_price_quantity_returner_per_ingredient(name, products_df)
        for name in ast.literal_eval(sanitized)
    ]
    # Split the per-ingredient (product, price, quantity) triples into three lists.
    products = [match[0] for match in matches]
    prices = [match[1] for match in matches]
    quantities = [match[2] for match in matches]
    return products, prices, quantities
    
# Load (or create) the translated product table, then match every recipe's
# simplified ingredient list against it.
translated_products = translator()
triplet_NER = recipe_df['NER_simple'].apply(
    lambda text: mass_recipe_converter(text, translated_products)
)
# Each row holds a (products, prices, quantities) triple; transpose the rows
# into three per-recipe columns.
ner_products, ner_prices, ner_quantities = zip(*triplet_NER)
recipe_df['NER_product'] = ner_products
recipe_df['NER_price'] = ner_prices
recipe_df['NER_quantity'] = ner_quantities


File supermarket_data/ah_products_en.csv already exists, skipping translation
No products found for breasts
No products found for paraffin
No products found for cornstarch
No products found for jello
No products found for shortening
No products found for shortening
No products found for Frango
No products found for pimentos
No products found for halves
No products found for yolks
No products found for breasts
No products found for chilies
No products found for flounder
No products found for cream-style
No products found for meats
No products found for flavoring
No products found for shortening
No products found for shortening
No products found for cornflakes
No products found for whites
No products found for Bisquick
No products found for consomme
No products found for catsup
No products found for cornstarch
No products found for allspice
No products found for consomme
No products found for Velveeta
No products found for pimento
No products found for jello
No products found for catsup


In [187]:
recipe_df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,NER_simple,NER_product,NER_price,NER_quantity
0,0,No-Bake Nut Cookies,"[236.588 ml firmly packed brown sugar, 118.294...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","['sugar', 'milk', 'vanilla', 'nuts', 'butter',...","[AH Cane sugar, AH Buttermilk, AH Muffin vanil...","[1.89, 0.85, 1.66, 2.09, 1.25, 2.79]","[500 g, 0,5 l, 300 g, 50 g, 100 g, 166 g]"
1,1,Jewell Ball'S Chicken,"[1.0 small jar chipped beef, cut up, 4.0 boned...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom...","['beef', 'breasts', 'soup', 'cream']","[AH roast beef, NaN, AH Pea soup, AH Ice creams]","[2.99, NaN, 1.19, 2.99]","[100 g, NaN, 300 ml, 8 stuks]"
2,2,Creamy Corn,"[946.352 ml pkg. frozen corn, 236.588 ml pkg. ...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","['corn', 'cheese', 'butter', 'powder', 'salt',...","[AH Popcorn Salt, AH Liver cheese, AH Herb but...","[1.05, 1.15, 1.25, 0.59, 1.69, 0.44]","[100 g, 150 g, 100 g, 90 g, 680 g, per stuk]"
3,3,Chicken Funny,"[1.0 large whole chicken, 621.0435 ml cans chi...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo...","['chicken', 'gravy', 'soup', 'cheese']","[AH Bapao chicken, Knorr Mix beef gravy, AH Pe...","[0.89, 0.6900000000000001, 1.19, 1.15]","[2 stuks, 18 g, 300 ml, 150 g]"
4,4,Reeses Cups(Candy),"[236.588 ml peanut butter, 177.441 ml graham c...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu...","['butter', 'crumbs', 'butter', 'sugar', 'chips']","[AH Herb butter, AH Seasoning breadcrumbs, AH ...","[1.25, 0.6900000000000001, 1.25, 1.89, 1.87]","[100 g, 150 g, 100 g, 500 g, 750 g]"


In [None]:

##### DOES NOTHING RIGHT NOW #####
# Builds a country <-> topic graph with NetworkX and renders it with pyvis.
# NOTE(review): this cell reads `dictTopic` and `data`, neither of which is
# defined in the cells visible above, so it presumably raises NameError on a
# fresh kernel — confirm, then either supply them or drop the cell.

import networkx as nx
from pyvis.network import Network
from IPython.core.display import HTML

# Enable the pyvis interactive display in the Jupyter notebook
# (`net` is the pyvis.network module here; it is rebound to a Network instance further down.)
from pyvis import network as net
net.template = "notebook"
net.NOTEBOOK = True


# Create a NetworkX graph
G = nx.Graph()
node_list = []
edge_list = []
# One (country, topic) edge per dictTopic entry whose record has a 'country' child.
# Presumably `data` is a sequence of parsed XML/HTML elements indexed by int(key) — TODO confirm.
for key, value in dictTopic.items():
    if data[int(key)].find('country') != None:
        country = data[int(key)].find('country').text
        # Strip a leftover closing tag from the country text.
        country = country.replace(f'</country>', "")

        # Strip the surrounding <topic> markup from the topic string.
        topic = dictTopic[key].replace(f'<topic type="str">', "")
        topic = topic.replace(f'</topic>', "")
        node_list.append(country)
        node_list.append(topic)
        edge_list.append((country, topic))
    else:
        #print("we didn't find a country for this specific article: ", data[int(key)].attrib['id'])
        pass
    
G.add_nodes_from(node_list)
G.add_edges_from(edge_list)

# Create a pyvis network object
# (from here on `net` is the Network instance, no longer the pyvis.network module)
net = Network(height="500px", width="100%", bgcolor="#222222", font_color="white", notebook=True)

# Add nodes and edges to the pyvis network object, and set tooltips for each node
for node in G.nodes():
    tooltip = "{}".format(node)
    net.add_node(node, title=tooltip)
for edge in G.edges():
    net.add_edge(*edge, title="hasTopic")
print("Please zoom-in using your scrollwheel and hover over the nodes and edges")
# Display the interactive network graph with tooltips in the Jupyter notebook

net.show("example.html")