In [149]:
import json 
from typing import Dict, List
import string
import pandas as pd
import numpy as np 
import glob
import re

In [150]:
# load files 
generic_path = "/home/victor/Documents/Expectation_data_generation/new_*.json"

In [151]:
list_file = glob.glob(generic_path)

In [152]:
list_file
# load one to test 
data_to_process = None
with open(list_file[-1], "r") as fp:
    data_to_process = json.load(fp)

In [153]:
def extract_text(list_files: List[str]):
    answer_dict = {}
    for f in list_files:
        print(f"Processing file: {f}")
        data = None 
        with open(f, "r", encoding="utf-8") as fp:
            data = json.load(fp)
        answer_dict[f] = data['choices']
    return answer_dict

In [154]:
raw_text_dict = extract_text(list_file)

Processing file: /home/victor/Documents/Expectation_data_generation/new_center_america_recipes_1.json
Processing file: /home/victor/Documents/Expectation_data_generation/new_china_recipes_1.json
Processing file: /home/victor/Documents/Expectation_data_generation/new_south_american_recipes_3.json
Processing file: /home/victor/Documents/Expectation_data_generation/new_south_american_recipes.json


In [201]:
def preprocess_text(dict_raw_text: Dict[str, List[str]]): 
    recipes = []
    pattern = r'\d+\.\s*(.*?)\d+\.'
    for k in dict_raw_text.keys():
        choices = dict_raw_text[k]
        print(f"Number of choices: {len(choices)}")
        # process each choice 
        for choice in choices:
            text = choice['message']['content']
            # matches = re.findall(pattern, text, re.DOTALL)
            # for match in matches:
            #     recipes.append(match.strip())
            recipes.append(text)
    return recipes

In [202]:
raw_recipes = preprocess_text(raw_text_dict)

Number of choices: 10
Number of choices: 10
Number of choices: 10
Number of choices: 1


In [556]:
def split_the_recipes(raw_list: List):
    splitted_recipes = []
    second_revision = []
    for idx, batch in enumerate(raw_list):
        print(f"Processing batch: {idx}")
        recipe_parts = re.split(r'\n\n(?=\d+\. |Recipe )', batch)
        print(f"Splitted recipes: {len(recipe_parts)}")
        if len(recipe_parts) > 1:
            splitted_recipes.extend(recipe_parts)
        else:
            second_revision.extend(recipe_parts)
        print("--------------------------------")
    return splitted_recipes, second_revision

In [606]:
splitted_recipes, second_revision = split_the_recipes(raw_recipes)

Processing batch: 0
Splitted recipes: 6
--------------------------------
Processing batch: 1
Splitted recipes: 16
--------------------------------
Processing batch: 2
Splitted recipes: 16
--------------------------------
Processing batch: 3
Splitted recipes: 1
--------------------------------
Processing batch: 4
Splitted recipes: 11
--------------------------------
Processing batch: 5
Splitted recipes: 16
--------------------------------
Processing batch: 6
Splitted recipes: 6
--------------------------------
Processing batch: 7
Splitted recipes: 6
--------------------------------
Processing batch: 8
Splitted recipes: 6
--------------------------------
Processing batch: 9
Splitted recipes: 6
--------------------------------
Processing batch: 10
Splitted recipes: 16
--------------------------------
Processing batch: 11
Splitted recipes: 16
--------------------------------
Processing batch: 12
Splitted recipes: 1
--------------------------------
Processing batch: 13
Splitted recipes: 16


In [607]:
len(second_revision)

3

In [608]:
splitted_recipes.extend(second_revision[0].split("### "))

In [609]:
splitted_recipes.extend(second_revision[1].split("**"))

In [610]:
splitted_recipes.extend(re.split(r'\n\d+\. ', second_revision[2]))

In [648]:
# second out non recipes 
def filter_out_non_recipes(list_pre: List[str]):
    suspect_non_recipes = []
    for text in list_pre:
        if "provide recipes" in text.lower() \
        or "while" in text.lower() \
        or "recipes from" in text.lower() \
        or "creating" in text.lower() \
        or "unfortunately" in text.lower() \
        or "i will give " in text.lower() \
        or "here are" in text.lower():
            suspect_non_recipes.append(text)
    return suspect_non_recipes

In [664]:
non_recipes = filter_out_non_recipes(splitted_recipes)

In [665]:
len(non_recipes)

5

In [651]:
len(splitted_recipes)

387

In [652]:
def remove_non_recipes_from_file(splitted_recipes: List[str], 
                                 non_recipes: List[str]):
    for n_recipe in non_recipes:
        splitted_recipes.remove(n_recipe)

In [622]:
remove_non_recipes_from_file(splitted_recipes, non_recipes)

In [653]:
len(splitted_recipes)

387

In [802]:
# get recipes title 
def get_titles(list_pre_recipes: List[str]):
    processing_dict = {}
    problems_index = []
    titles = []
    for idx, recipe in enumerate(list_pre_recipes):
        # split the recipe 
        splits = re.split(r"(?=ingredients|ingredient)", recipe.lower()) 
        if len(splits) < 2: 
            problems_index.append(idx)
        elif len(splits) == 2: 
            processing_dict[idx] = {"raw_text": recipe,
                                    "title": splits[0],
                                    "splitted_text": splits}
        else: 
            processing_dict[idx] = {"raw_text": recipe,
                                    "title": splits[0],
                                    "splitted_text": [splits[0]]+["\n\n".join(splits[1:])]}
            titles.append(splits[0])
    return titles, processing_dict, problems_index

In [803]:
# save preprocessed data 
with open("preprocessed_new_batch_recipes.json", "w") as fp:
    json.dump(splitted_recipes, fp)

In [804]:
titles, dict_process, problematic_index = get_titles(splitted_recipes)
problematic_index

[]

In [805]:
# splitted_recipes[361] = splitted_recipes[361] + "\n\n" + splitted_recipes[362]
# del splitted_recipes[362]

In [806]:
def check_expected_keys(dict_check: Dict, expected_key_size: int): 
    incomplete_index = []
    for idx in dict_check.keys():
        if len(dict_check[idx].keys()) != expected_key_size:
            incomplete_index.append(idx)
    return incomplete_index
    

In [807]:
incomplete_index = check_expected_keys(dict_process, 3)
print(len(incomplete_index))

0


In [808]:
def get_ingredients(dict_processing: Dict[int, Dict]):
    problematic_idx = []
    for idx, resto in dict_processing.items():
        ingredients = ""
        previous_split = resto["splitted_text"][-1]
        # split between two string 
        second_split = re.split(r'(?=preparation:|preparation steps:|instructions:|Preparation Steps:)', 
                                previous_split, 
                                re.IGNORECASE)
        if len(second_split) < 2:
            # try to get ingredients 
            if len(second_split) == 1:
                if "ingredients" in second_split[0].lower():
                    dict_processing[idx]["ingredients"] = re.findall(r'ingredients:(.*)(?=preparation steps|preparation|\n\n)',
                                                                     second_split[0].lower())
                    dict_processing[idx]["splitted_text"] = second_split
            problematic_index.append(idx)
        elif len(second_split) == 2:
            ingredients = second_split[0]
            dict_processing[idx]["ingredients"] = ingredients
            dict_processing[idx]["splitted_text"] = second_split
        else: 
            ingredients = second_split[0]
            dict_processing[idx]["ingredients"] = ingredients
            dict_processing[idx]["splitted_text"] = [second_split[0]] + ["\n\n".join(second_split[1:])]
    return problematic_idx

In [809]:
problem_idx = get_ingredients(dict_process)
problem_idx

[]

In [810]:
incomplete_index = check_expected_keys(dict_process, 4)
print(len(incomplete_index))

0


In [875]:
def get_preparation_steps_and_nutritional_information(
    dict_processing: Dict[int, Dict]):
        problematic_idx = []
        for idx, resto in dict_processing.items():
            previous_split = resto["splitted_text"][-1]
            # split between two string 
            second_split = re.split(r'(?=approximate nutrition per 100g:|estimated nutritional profile (approximation):|estimated nutritional profile \(approximation\):|nutritional|nutritional information:|nutritional)', 
                                    previous_split.lower(),
                                    re.IGNORECASE)
            nutritional_info = ""
            preparation = ""
            if len(second_split) < 2:
                # try to get the preparation steps
                if len(second_split) == 1:
                    preparation = re.findall(r'(?=Preparation steps|Preparation)(.*)(?=\n\n|)',
                                             resto['raw_text'].lower(),
                                             re.IGNORECASE)
                    nutritional_info = re.findall(
                        r'(?=nutritional|nutrition)(.*?)(?=\n\n|)',
                        resto['raw_text'].lower(),
                        re.IGNORECASE
                    )
                    if len(nutritional_info) == 0:
                        nutritional_info = ""
                    elif len(nutritional_info) == 1:
                        nutritional_info = nutritional_info[0]
                    else: 
                        nutritional_info = "; ".join(nutritional_info)
                else: 
                    problematic_idx.append(idx)
            else:
                preparation = second_split[0]
                if len(second_split) == 2:
                    nutritional_info = second_split[1]
                else:
                    print(f"len: {len(second_split)}")
                    #print(f"len: {second_split}")
                    nutritional_info = second_split[-1]
            dict_processing[idx]["preparation_steps"] = preparation
            dict_processing[idx]["nutritional_info"] = nutritional_info
            dict_processing[idx]["splitted_text"] = second_split
        return problematic_idx

In [876]:
problem_idx = get_preparation_steps_and_nutritional_information(dict_process)
problem_idx

len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 5
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3
len: 3

[]

In [877]:
len(problem_idx)

0

In [878]:
dict_process[0].keys()

dict_keys(['raw_text', 'title', 'splitted_text', 'ingredients', 'preparation_steps', 'nutritional_info'])

In [879]:
problematic_index = check_expected_keys(dict_process, 6)

In [880]:
problematic_index = check_expected_keys(dict_process, 6)

In [881]:
len(problematic_index)

0

In [882]:
def get_meal_type(text):
    result = []
    if "breakfast" in text:
        result.append("breakfast")
    elif "lunch" in text:
        result.append("lunch")
    elif "diner" in text:
        result.append("diner")
    else:
        pass
    if len(result) == 0:
        result = ["breakfast", "lunch", "diner"]
    return ";".join(result)

In [883]:
import re

def extract_nutritional_info(text):
    # Define a regular expression pattern to match numeric values
    pattern = r'(\d+(?:\.\d+)?)'

    # Define a dictionary to store the extracted values
    nutritional_info = {
        "Calories": None,
        "Protein": None,
        "Fat": None,
        "Fiber": None,
        "Carbohydrates": None
    }

    # Find all matches in the text using the regular expression pattern
    matches = re.findall(pattern, text)

    # Check if we found enough matches (at least 5) to populate all fields
    if len(matches) >= 5:
        # Assign the matched values to the corresponding keys in the dictionary
        nutritional_info["Carbohydrates"] = matches[0]
        nutritional_info["Protein"] = matches[1]
        nutritional_info["Fat"] = matches[2]
        nutritional_info["Fiber"] = matches[3]
        nutritional_info["Calories"] = matches[4]

    return nutritional_info

# Sample text containing nutritional information
sample_text = 'estimated  nutritional profile (approximation):\n   - carbohydrates: 28g\n   - protein: 8.5g\n   - fat: 6g\n   - fiber: 3g\n   - calories: 200'

# Extract the nutritional information using the function
result = extract_nutritional_info(sample_text)

# Print the extracted values
print(result)


{'Calories': '200', 'Protein': '8.5', 'Fat': '6', 'Fiber': '3', 'Carbohydrates': '28'}


In [884]:
import traceback
def generate_df(dict_preprocess: Dict[int, Dict]):
    df = pd.DataFrame(columns=[
            'title', 
            'raw_text', 
            'meal_type', 
            'cultural_restriction', 
            'calories',
            'allergies', 
            'recipeId', 
            'ingredients', 
            'preparation', 
            'Carbohydrates',
            'Protein', 
            'Fat', 
            'Fiber'])
    index_with_problem = []
    for idx in dict_preprocess.keys():
        print(f"Processing: {idx}")
        try:
            resto = dict_preprocess[idx]
            # extracting info 
            meal_type = get_meal_type(resto["raw_text"].lower())
            if isinstance(resto["nutritional_info"], str):
                print(f"string found: {resto['nutritional_info']}")
                nutritional_info = extract_nutritional_info(resto["nutritional_info"])
            elif isinstance(resto["nutritional_info"], list):
                if len(resto["nutritional_info"]) == 0:
                    nutritional_info = extract_nutritional_info("")
                else:
                    nutritional_info = extract_nutritional_info(resto["nutritional_info"][0])
            else:
                nutritional_info = extract_nutritional_info("")
            df.loc[idx] = {'title': resto["title"], 
            'raw_text': resto["raw_text"], 
            'meal_type': meal_type, 
            'cultural_restriction': "None", 
            'calories': nutritional_info["Calories"],
            'allergies': None, 
            'recipeId': None, 
            'ingredients': resto["ingredients"], 
            'preparation': resto["preparation_steps"], 
            'Carbohydrates': nutritional_info["Carbohydrates"],
            'Protein': nutritional_info["Protein"], 
            'Fat': nutritional_info["Fat"], 
            'Fiber': nutritional_info["Fiber"]}
        except Exception as e:
            print(f"Error {e}, index: {idx}")
            print(f"{traceback.print_exc()}")
            index_with_problem.append(idx)
            continue
    return df, index_with_problem

In [885]:
dict_process[0]['raw_text']

'1. **Gallo Pinto (Costa Rica/Nicaragua) - Breakfast**\n   Ingredients:\n   - 2 cups cooked black beans\n   - 3 cups of cooked rice\n   - 1 medium onion, chopped\n   - 1 red bell pepper, chopped\n   - 2 cloves garlic, minced\n   - Fresh cilantro, chopped\n   - 2 tablespoons vegetable oil\n   - Salt to taste\n\n   Preparation Steps:\n   1. In a large skillet, heat the oil over medium heat. Add onions, bell pepper, and garlic, and sauté until the onion is translucent.\n   2. Stir in the cooked beans and add a little bit of the bean broth to get a saucy consistency.\n   3. Add the cooked rice to the skillet and mix well with the beans.\n   4. Let everything cook together for a few minutes, stirring occasionally. Season with salt and mix in fresh cilantro before serving.\n\n   Estimated Nutritional Profile (Approximation):\n   - Carbohydrates: 25g\n   - Protein: 6g\n   - Fat: 5g\n   - Fiber: 4g\n   - Calories: 180'

In [886]:
dict_process[0]['nutritional_info']

'nutritional profile (approximation):\n   - carbohydrates: 25g\n   - protein: 6g\n   - fat: 5g\n   - fiber: 4g\n   - calories: 180'

In [887]:
new_df, index_problem = generate_df(dict_process)

Processing: 0
string found: nutritional profile (approximation):
   - carbohydrates: 25g
   - protein: 6g
   - fat: 5g
   - fiber: 4g
   - calories: 180
Processing: 1
string found: nutritional profile (approximation):
   - carbohydrates: 28g
   - protein: 8g
   - fat: 6g
   - fiber: 3g
   - calories: 200
Processing: 2
string found: nutritional profile (approximation):
   - carbohydrates: 35g
   - protein: 1g
   - fat: 14g
   - fiber: 2g
   - calories: 270
Processing: 3
string found: nutritional profile (approximation):
   - carbohydrates: 30g
   - protein: 10g
   - fat: 20g
   - fiber: 4g
   - calories: 330
Processing: 4
string found: nutritional content based on the exact 

ingredients and amounts you use.
Processing: 5
string found: nutritional information per 100g:
   - calories: 130
   - carbohydrates: 23g
   - protein: 4g
   - fat: 2.5g
   - fiber: 2g
Processing: 6
string found: nutritional information per 100g:
   - calories: 280
   - carbohydrates: 35g
   - protein: 9g
   - fat:

In [892]:
to_fix = new_df.loc[new_df["Carbohydrates"].isna(), :]

In [898]:
def extract_info_from_raw_text(raw_text: str):
    raw_text_lower = raw_text.lower()
    nutritional_info = {
        "Calories": None,
        "Protein": None,
        "Fat": None,
        "Fiber": None,
        "Carbohydrates": None
    }
    for k in nutritional_info.keys():
        # start extraction process
        value = k.lower()
        pattern = rf'({value}):\s+(\d+(?:\.\d+)?)'
        result = re.findall(pattern, raw_text_lower, re.IGNORECASE)
        if len(result) == 1:
            key, numeric_value = result[0]
        else:
            numeric_value = 0.0
        nutritional_info[k] = numeric_value
    return nutritional_info
        

In [899]:
extract_info_from_raw_text(dict_process[4]['raw_text'])

{'Calories': '210',
 'Protein': '11',
 'Fat': '14',
 'Fiber': '1.5',
 'Carbohydrates': '9'}

In [897]:
value = "Calories".lower()
re.findall(rf'({value}):\s+(\d+(?:\.\d+)?)', dict_process[4]['raw_text'].lower())


[('calories', '210')]

In [896]:
dict_process[4]['raw_text']

'5. **Huevos Rancheros (Honduras) - Breakfast**\n   Ingredients:\n   - 2 corn tortillas\n   - 4 large eggs\n   - 1 cup tomato sauce or salsa\n   - 1 onion, diced\n   - 1 green bell pepper, diced\n   - 2 cloves garlic, minced\n   - Chopped cilantro\n   - Vegetable oil for frying\n   - Salt and pepper to taste\n\n   Preparation Steps:\n   1. In a skillet, heat some oil and fry the tortillas one by one until crispy. Set aside on paper towels.\n   2. In the same skillet, add a little more oil if necessary and sauté onion and bell pepper until soft. Add garlic and fry for another minute.\n   3. Add tomato sauce or salsa to the skillet and bring to a simmer.\n   4. Crack the eggs into the skillet with the sauce, cover, and cook until the eggs reach your desired doneness.\n   5. Place each fried tortilla on a plate, top with two eggs along with some of the sauce, and sprinkle with cilantro.\n\n   Estimated Nutritional Profile (Approximation):\n   - Carbohydrates: 9g\n   - Protein: 11g\n   - F

In [894]:
to_fix.loc[4, "raw_text"]

'5. **Huevos Rancheros (Honduras) - Breakfast**\n   Ingredients:\n   - 2 corn tortillas\n   - 4 large eggs\n   - 1 cup tomato sauce or salsa\n   - 1 onion, diced\n   - 1 green bell pepper, diced\n   - 2 cloves garlic, minced\n   - Chopped cilantro\n   - Vegetable oil for frying\n   - Salt and pepper to taste\n\n   Preparation Steps:\n   1. In a skillet, heat some oil and fry the tortillas one by one until crispy. Set aside on paper towels.\n   2. In the same skillet, add a little more oil if necessary and sauté onion and bell pepper until soft. Add garlic and fry for another minute.\n   3. Add tomato sauce or salsa to the skillet and bring to a simmer.\n   4. Crack the eggs into the skillet with the sauce, cover, and cook until the eggs reach your desired doneness.\n   5. Place each fried tortilla on a plate, top with two eggs along with some of the sauce, and sprinkle with cilantro.\n\n   Estimated Nutritional Profile (Approximation):\n   - Carbohydrates: 9g\n   - Protein: 11g\n   - F

In [832]:
dict_process[35]

{'raw_text': '1. Gallo Pinto (Costa Rica/Nicaragua) - Breakfast\nIngredients: Rice, black beans, bell pepper, onion, garlic, Worcestershire sauce, cilantro.\nPreparation Steps:\n- Cook rice and set aside.\n- Sauté diced onions, bell pepper, and minced garlic until soft.\n- Add cooked beans and a little bean broth, Worcestershire sauce, and allow to simmer.\n- Mix in the rice, adjust the seasoning, and garnish with chopped cilantro.',
 'title': '1. gallo pinto (costa rica/nicaragua) - breakfast\n',
 'splitted_text': ['preparation steps:\n- cook rice and set aside.\n- sauté diced onions, bell pepper, and minced garlic until soft.\n- add cooked beans and a little bean broth, worcestershire sauce, and allow to simmer.\n- mix in the rice, adjust the seasoning, and garnish with chopped cilantro.'],
 'ingredients': 'ingredients: rice, black beans, bell pepper, onion, garlic, worcestershire sauce, cilantro.\n',
 'preparation_steps': ['preparation steps:'],
 'nutritional_info': []}

In [825]:
splitted_recipes[36]

'2. Pupusas (El Salvador) - Lunch/Dinner\nIngredients: Masa harina, water, filling (cheese, beans, chicharrón).\nPreparation Steps:\n- Mix masa harina with water to form a dough.\n- Make a ball from the dough, create a well, add the filling, and seal it.\n- Flatten the ball into a disc and griddle on both sides until golden brown.\n- Serve with curtido (a type of pickled cabbage slaw).'

In [826]:
dict_process[36]

{'raw_text': '2. Pupusas (El Salvador) - Lunch/Dinner\nIngredients: Masa harina, water, filling (cheese, beans, chicharrón).\nPreparation Steps:\n- Mix masa harina with water to form a dough.\n- Make a ball from the dough, create a well, add the filling, and seal it.\n- Flatten the ball into a disc and griddle on both sides until golden brown.\n- Serve with curtido (a type of pickled cabbage slaw).',
 'title': '2. pupusas (el salvador) - lunch/dinner\n',
 'splitted_text': ['preparation steps:\n- mix masa harina with water to form a dough.\n- make a ball from the dough, create a well, add the filling, and seal it.\n- flatten the ball into a disc and griddle on both sides until golden brown.\n- serve with curtido (a type of pickled cabbage slaw).'],
 'ingredients': 'ingredients: masa harina, water, filling (cheese, beans, chicharrón).\n',
 'preparation_steps': ['preparation steps:'],
 'nutritional_info': []}

In [468]:
index_problem

[35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 66,
 72,
 73,
 74,
 75,
 77,
 78,
 79,
 80,
 81,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 243,
 269,
 270,
 271,
 272,
 273,
 274,
 275,
 276,
 277,
 278,
 279,
 280,
 281,
 282,
 283,
 284,
 285,
 286,
 287,
 288,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 296,
 297,
 298,
 299

In [845]:
new_df.columns

Index(['title', 'raw_text', 'meal_type', 'cultural_restriction', 'calories',
       'allergies', 'recipeId', 'ingredients', 'preparation', 'Carbohydrates',
       'Protein', 'Fat', 'Fiber'],
      dtype='object')

In [850]:
sum(new_df['calories'].isna())

385

In [851]:
new_df

Unnamed: 0,title,raw_text,meal_type,cultural_restriction,calories,allergies,recipeId,ingredients,preparation,Carbohydrates,Protein,Fat,Fiber
0,1. **gallo pinto (costa rica/nicaragua) - brea...,1. **Gallo Pinto (Costa Rica/Nicaragua) - Brea...,breakfast,,,,,ingredients:\n - 2 cups cooked black beans\n...,"preparation steps:\n 1. in a large skillet, ...",,,,
1,2. **pupusas (el salvador) - lunch/dinner**\n,2. **Pupusas (El Salvador) - Lunch/Dinner**\n ...,lunch,,,,,ingredients (for the dough):\n - 2 cups masa...,"preparation steps:\n 1. in a bowl, mix masa ...",,,,
2,3. **plátanos fritos (central america) - break...,3. **Plátanos Fritos (Central America) - Break...,breakfast,,,,,ingredients:\n - 2 ripe plantains \n - veg...,preparation steps:\n 1. peel the plantains a...,,,,
3,4. **tamales (guatemala) - lunch/dinner**\n,4. **Tamales (Guatemala) - Lunch/Dinner**\n ...,lunch,,,,,ingredients (for the dough):\n - 4 cups masa...,"preparation steps:\n 1. in a large bowl, com...",,,,
4,5. **huevos rancheros (honduras) - breakfast**...,5. **Huevos Rancheros (Honduras) - Breakfast**...,breakfast,,,,,ingredients:\n - 2 corn tortillas\n - 4 la...,"preparation steps:\n 1. in a skillet, heat s...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,chilean completo - suitable for lunch or dinne...,Chilean Completo - Suitable for lunch or dinne...,lunch,,,,,ingredients:\n - 4 hot dog buns\n - 4 be...,preparation:\n 1. grill or boil the hot dog...,,,,
381,uruguayan torta frita - suitable for breakfast...,Uruguayan Torta Frita - Suitable for breakfast...,breakfast,,,,,ingredients:\n - 1kg flour\n - 1 tbsp sa...,"preparation:\n 1. in a bowl, mix flour, sal...",,,,
382,brazilian pão de queijo (cheese bread) - suita...,Brazilian Pão de Queijo (Cheese Bread) - Suita...,breakfast,,,,,ingredients:\n - 2 cups tapioca starch\n ...,preparation:\n 1. preheat oven to 400°f (20...,,,,
383,chilean cazuela - suitable for lunch or dinner...,Chilean Cazuela - Suitable for lunch or dinner...,lunch,,,,,"ingredients:\n - 500g beef or chicken, cube...","preparation:\n 1. in a large pot, sauté oni...",,,,


In [334]:
splitted_recipes[83]

'2. Steamed Bao Buns\n- Breakfast/Lunch\nIngredients: flour, sugar, yeast, milk, baking powder, pork filling.\nPreparation Steps: Knead dough, let it rise, prepare pork filling, fill dough, steam buns.\nAllergens: Gluten, dairy.\nEst. Nutritional per 100g: 250 kcal, 8g protein, 40g carbohydrates, 6g fat, 2g fiber.'

In [220]:
splited_test = re.split(r'(?=## |\n\n)(?=\d+\. |Recipe )', raw_recipes[3])

In [222]:
len(splited_test)

1

In [227]:
raw_recipes[28]

'Sure! Here are 25 different recipes from South America with ingredients, preparation steps, and nutritional information per 100g portion:\n\n1. Arepas (Venezuela/Colombia)\n   - Ingredients: \n     - 1 cup pre-cooked white cornmeal\n     - 1 cup warm water\n     - Salt to taste\n   - Preparation:\n     1. In a bowl, mix the cornmeal, water, and salt until you have a soft dough.\n     2. Shape the dough into small discs.\n     3. Cook the arepas on a griddle until golden brown on both sides.\n   - Nutrition per 100g: Carbs: 43g, Protein: 7g, Fat: 1g, Fiber: 3g, Calories: 200\n   - Appropriate for: Breakfast, Lunch, or Dinner\n\n2. Feijoada (Brazil)\n   - Ingredients:\n     - 500g black beans\n     - 500g assorted pork cuts (such as bacon, sausages, and pork ribs)\n     - 1 onion, chopped\n     - 3 garlic cloves, minced\n     - 2 bay leaves\n     - Salt and pepper to taste\n   - Preparation:\n     1. In a large pot, cook the black beans with water until tender.\n     2. In a separate pa

In [192]:
import re

def extract_recipe_info(text):
    # Extract recipe names
    #recipe_names = re.findall(r'\d+\.\s+\*\*?(.*?) (?:-|\n)', text)
    recipe_names = re.findall(r'\d+\. (.*?) (?:-|\n)', text)
    print(f"Recipes len: {len(recipe_names)}")

    # Extract ingredients
    #ingredient_sections = re.findall(r'\d+\.\s+(\*\*)*(.*?)Ingredients[:]?(.*?)\n\n', text, re.DOTALL)
    ingredient_sections = re.findall(r'Ingredients[:]?(.*?)\n\n', text, re.DOTALL)
    print(f"Ingredients len: {len(ingredient_sections)}")
    #ingredients_list = [(recipe_name, ingredients.strip()) for recipe_name, ingredients in ingredient_sections]
    ingredients_list = ingredient_sections

    # Extract preparation steps
    #preparation_sections = \
    #re.findall(r'\d+\.\s+(\*\*)*(.*?)(?:Preparation Steps:|Preparation:)(.*?)\n\n', 
    #           text, 
    #           re.DOTALL)
    preparation_sections = \
    re.findall(r'(?:Preparation Steps:|Preparation:)(.*?)\n\n', 
               text, 
               re.DOTALL)
    print(f"Len preparations: {len(preparation_sections)}")
    #preparation_steps_list = [(recipe_name, steps.strip()) for recipe_name, steps in preparation_sections]
    preparation_steps_list = preparation_sections
    
    # Extract nutritional information
    nutritional_sections = re.findall(r'\d+\.\s+\*\*(.*?)Estimated Nutritional Profile \(Approximation\):(.*?)\n\n', text, re.DOTALL)
    nutritional_info_list = [(recipe_name, info.strip()) for recipe_name, info in nutritional_sections]

    return recipe_names, ingredients_list, preparation_steps_list, nutritional_info_list

# Example usage
text = raw_recipes[1]
recipe_names, ingredients, preparation_steps, nutritional_info = extract_recipe_info(text)

# Print the results
print("Recipe Names:")
for name in recipe_names:
    print(name)
    
for i in ingredients:
    print(f"ingredients: {i}")
    
for i in preparation_steps:
    print(f"preparation: {i}")
    
for i in nutritional_info:
    print(f"preparation: {i}")

# print("\nIngredients:")
# for recipe_name, ingredient_text in ingredients:
#     print(f"Recipe: {recipe_name}")
#     print(ingredient_text)

# print("\nPreparation Steps:")
# for recipe_name, steps_text in preparation_steps:
#     print(f"Recipe: {recipe_name}")
#     print(steps_text)

# print("\nNutritional Information:")
# for recipe_name, nutritional_info_text in nutritional_info:
#     print(f"Recipe: {recipe_name}")
#     print(nutritional_info_text)

Recipes len: 15
Ingredients len: 16
Len preparations: 15
Recipe Names:
**Gallo Pinto (Costa Rica/Nicaragua)
**Pupusas (El Salvador)
**Plátanos Fritos (Honduras)
**Sopa de Res (Guatemala)
**Pollo en Crema (Honduras)
**Baleadas (Honduras)
**Hilachas (Guatemala)
**Enchiladas (Honduras)
**Arroz con Pollo (Panama)
**Salpicón (Guatemala)
**Tamalitos de Elote (Guatemala)
**Desayuno Chapín (Guatemala)
**Riguas (El Salvador)
**Tres Leches Cake (Nicaragua)
**Yuca Con Chicharrón (El Salvador)
ingredients: 
   - 1 cup cooked black beans
   - 2 cups cooked white rice
   - 1 small onion, diced
   - 1 bell pepper, diced
   - 2 cloves garlic, minced
   - Fresh cilantro, chopped
   - 2 tablespoons vegetable oil
   - Salt to taste
ingredients:  for the dough:
   - 2 cups masa harina
   - 1 1/2 cups water
   - 1/2 teaspoon salt
ingredients:  for the filling:
   - 1 cup refried beans
   - 1 cup shredded cheese (quesillo, mozzarella, or a similar cheese)
   - Oil for frying
ingredients: 
   - 2 ripe planta

In [199]:
recipe_parts = re.split(r'\n\n(?=\d+\. |Recipe )', raw_recipes[29])

In [200]:
recipe_parts

['1. Brazilian Feijoada\nIngredients:\n- 500g black beans\n- 300g pork ribs\n- 200g smoked sausage\n- 200g beef chunks\n- 1 onion, diced\n- 3 cloves of garlic, minced\n- 2 bay leaves\n- Salt and pepper to taste\n- 50g rice\n- 100g collard greens\n\nPreparation:\n1. Soak the black beans overnight, then drain and rinse.\n2. In a large pot, add the beans, pork ribs, smoked sausage, beef chunks, onion, garlic, bay leaves, salt, and pepper. Cover with water.\n3. Bring to a boil, then reduce heat and simmer for about 2 hours until the beans and meats are tender.\n4. Cook rice according to package instructions.\n5. Remove bay leaves and serve feijoada with rice and collard greens.\n\nNutritional Information per 100g:\nCarbohydrates: 9g\nProtein: 6g\nFat: 4g\nFiber: 1g\nCalories: 98 calories\nAppropriate for: Lunch or Dinner',
 '2. Argentinian Asado\nIngredients:\n- 1.5kg beef ribs\n- Salt to taste\n- Chimichurri sauce (parsley, garlic, vinegar, oil, oregano, red pepper flakes)\n\nPreparation:

In [184]:
def extract_second_way(text: str):
    # Define regular expression patterns for each section
    recipe_pattern = r'\d+\.\s+Recipe:\s+(.*?)\n'
    ingredient_pattern = r'Ingredients:\s+(.*?)(?=\nPreparation:|\nNutritional Information|\n\d+\.\s+Recipe|\Z)'
    preparation_pattern = r'Preparation:\s+(.*?)(?=\nNutritional Information|\n\d+\.\s+Recipe|\Z)'
    nutritional_pattern = r'Nutritional Information \(per 100g\):\s+(.*?)\n'

    # Extract recipe information
    recipe_names = re.findall(recipe_pattern, text)
    ingredients = re.findall(ingredient_pattern, text, re.DOTALL)
    preparation_steps = re.findall(preparation_pattern, text, re.DOTALL)
    nutritional_info = re.findall(nutritional_pattern, text)
    return recipe_names, ingredients, preparation_steps, nutritional_info

In [185]:
def generate_intermediate_df(list_text: List[str]):
    dict_text = {}
    for i, ltext in enumerate(list_text):
        print(f"Batch: {i}")
        text = ltext
        recipe_names, ingredients, preparation_steps, nutritional_info =\
            extract_recipe_info(text)
        if len(recipe_names) == 0:
            # use second way 
            recipe_names, ingredients, preparation_steps, nutritional_info =\
            extract_second_way(text)
        for idx, name in enumerate(recipe_names):
            print(f"index: {idx}, name: {name}")
            try:
                dict_text[name] = {"ingredients": ingredients[idx],
                                "preparation_steps": preparation_steps[idx],
                                "nutritional_info": nutritional_info[idx]}
            except Exception as e:
                print(f"Error: {e} recipe: {name}, index: {idx}")
                continue
    return dict_text

In [186]:
recipes_preprocessed = generate_intermediate_df(raw_recipes)

Batch: 0
Recipe len: 5
Len preparations: 5
index: 0, name: ('**', 'Gallo Pinto (Costa Rica/Nicaragua)')
index: 1, name: ('**', 'Pupusas (El Salvador)')
index: 2, name: ('**', 'Plátanos Fritos (Central America)')
index: 3, name: ('**', 'Tamales (Guatemala)')
index: 4, name: ('**', 'Huevos Rancheros (Honduras)')
Batch: 1
Recipe len: 15
Len preparations: 15
index: 0, name: ('**', 'Gallo Pinto (Costa Rica/Nicaragua)')
Error: list index out of range recipe: ('**', 'Gallo Pinto (Costa Rica/Nicaragua)'), index: 0
index: 1, name: ('**', 'Pupusas (El Salvador)')
Error: list index out of range recipe: ('**', 'Pupusas (El Salvador)'), index: 1
index: 2, name: ('**', 'Plátanos Fritos (Honduras)')
Error: list index out of range recipe: ('**', 'Plátanos Fritos (Honduras)'), index: 2
index: 3, name: ('**', 'Sopa de Res (Guatemala)')
Error: list index out of range recipe: ('**', 'Sopa de Res (Guatemala)'), index: 3
index: 4, name: ('**', 'Pollo en Crema (Honduras)')
Error: list index out of range reci

In [168]:
re.findall(r'\d+\.\s+\*\*(.*?)\n', raw_recipes[2])

[]

In [169]:
import re


# Split the text into individual recipes
recipes = re.split(r'\d+\.\s', text)[1:]

recipe_data = []

# Define regular expressions to extract the desired information
name_pattern = r'(.+?)\nIngredients:'
ingredients_pattern = r'Ingredients:(.*?)\n\nPreparation:'
preparation_pattern = r'Preparation:(.*?)\n\nNutritional Information'
nutritional_pattern = r'Nutritional Information per 100g:(.*?)\nAppropriate for:'

for recipe in recipes:
    name = re.search(name_pattern, recipe, re.DOTALL).group(1).strip()
    ingredients = [i.strip() for i in re.findall(r'[\-\d]\s(.*?)\n', re.search(ingredients_pattern, recipe, re.DOTALL).group(1))]
    preparation = [p.strip() for p in re.findall(r'\d+\.\s(.*?)\n', re.search(preparation_pattern, recipe, re.DOTALL).group(1))]
    nutritional_info = re.search(nutritional_pattern, recipe, re.DOTALL).group(1).strip()

    recipe_data.append({
        "Name": name,
        "Ingredients": ingredients,
        "Preparation": preparation,
        "Nutritional Info": nutritional_info
    })

# Print the extracted data for each recipe
for i, recipe in enumerate(recipe_data, 1):
    print(f"Recipe {i}:")
    print(f"Name: {recipe['Name']}")
    print(f"Ingredients: {', '.join(recipe['Ingredients'])}")
    print(f"Preparation: {'\n'.join(recipe['Preparation'])}")
    print(f"Nutritional Info: {recipe['Nutritional Info']}")
    print()

SyntaxError: f-string expression part cannot include a backslash (1288931886.py, line 32)

In [176]:
recipes = re.split(r'\d+\.\s', raw_recipes[29])[1:]

In [177]:
recipes[0]

'Brazilian Feijoada\nIngredients:\n- 500g black beans\n- 300g pork ribs\n- 200g smoked sausage\n- 200g beef chunks\n- 1 onion, diced\n- 3 cloves of garlic, minced\n- 2 bay leaves\n- Salt and pepper to taste\n- 50g rice\n- 100g collard greens\n\nPreparation:\n'

In [162]:
raw_recipes[29]

'1. Brazilian Feijoada\nIngredients:\n- 500g black beans\n- 300g pork ribs\n- 200g smoked sausage\n- 200g beef chunks\n- 1 onion, diced\n- 3 cloves of garlic, minced\n- 2 bay leaves\n- Salt and pepper to taste\n- 50g rice\n- 100g collard greens\n\nPreparation:\n1. Soak the black beans overnight, then drain and rinse.\n2. In a large pot, add the beans, pork ribs, smoked sausage, beef chunks, onion, garlic, bay leaves, salt, and pepper. Cover with water.\n3. Bring to a boil, then reduce heat and simmer for about 2 hours until the beans and meats are tender.\n4. Cook rice according to package instructions.\n5. Remove bay leaves and serve feijoada with rice and collard greens.\n\nNutritional Information per 100g:\nCarbohydrates: 9g\nProtein: 6g\nFat: 4g\nFiber: 1g\nCalories: 98 calories\nAppropriate for: Lunch or Dinner\n\n2. Argentinian Asado\nIngredients:\n- 1.5kg beef ribs\n- Salt to taste\n- Chimichurri sauce (parsley, garlic, vinegar, oil, oregano, red pepper flakes)\n\nPreparation:\n

In [148]:
re.findall(r'\d+\.\s+Recipe:\s+(.*?)\n', raw_recipes[30])

['Brazilian Feijoada',
 'Argentinean Empanadas',
 'Peruvian Ceviche',
 'Colombian Arepas',
 'Ecuadorian Locro de Papa']

In [147]:
raw_recipes[30]

"Creating 400 different recipes would take a considerable amount of time and space. However, I can provide you with a few sample recipes from South America that you can use as a reference. Please note that the nutritional information provided may vary depending on the specific ingredients used and the serving sizes. Additionally, it's important to consult with a healthcare professional or a registered dietitian for personalized dietary advice. Here are five recipes:\n\n1. Recipe: Brazilian Feijoada\nIngredients:\n- 500g black beans\n- 500g pork ribs\n- 300g smoked sausage (linguica)\n- 200g beef sirloin\n- 1 onion, chopped\n- 4 cloves of garlic, minced\n- 2 bay leaves\n- 1 tablespoon olive oil\n- Salt and pepper to taste\n\nPreparation:\n1. Soak the black beans in water overnight, then drain and rinse.\n2. In a large pot, add the drained beans, pork ribs, sausage, beef, onion, garlic, bay leaves, olive oil, and enough water to cover the ingredients.\n3. Bring the mixture to a boil, the

In [35]:
def extract_recipes(text: str):
    # split text 
    list_subtext = text.split("\n\n")
    print(f"len: {len(list_subtext)}")
    # identify recipes 
    digit_index = []
    for idx, t in enumerate(list_subtext):
        print(f"{t[0]}")
        if t[0].isdigit():
            digit_index.append(idx)
    return digit_index

In [None]:
def extract_info(text_list:List[str], indexes:List[int]):
    df = pd.DataFrame(columns=['title', 
                               'raw_text', 
                               'meal_type', 
                               'cultural_restriction', 
                               'calories',
                               'allergies', 
                               'recipeId', 
                               'ingredients', 
                               'preparation'])
    
    # extracting info 
    row_info = ["", "", "", "", "", "", "", "", ""]
    # extract info 
    for i in indexes:
        title = re.findall(r"^\d+\.\s*recipe:\s*(.*?)\n", raw_text.split('\n\n')[1].lower(), re.IGNORECASE)
        if len(title) == 1:
            title = title[0]
        elif len(title) 
        if i != indexes[-1]:
            raw_text = "\n\n".join(indexes[i:indexes[i+1]])
        else:
            pass
    

In [58]:
re.findall(r"^\d+\.\s*recipe:\s*(.*?)\n", raw_text.split('\n\n')[1].lower(), re.IGNORECASE)

['brazilian feijoada']

In [47]:
raw_text.split('\n\n')[1].lower()

'1. recipe: brazilian feijoada\ningredients:\n- 500g black beans\n- 500g pork ribs\n- 300g smoked sausage (linguica)\n- 200g beef sirloin\n- 1 onion, chopped\n- 4 cloves of garlic, minced\n- 2 bay leaves\n- 1 tablespoon olive oil\n- salt and pepper to taste'

In [36]:
indixes = extract_recipes(raw_text)

len: 18
C
1
P
N
2
F
P
N
3
P
N
4
P
N
5
P
N
T


In [37]:
indixes

[1, 4, 8, 11, 14]