In [1]:
import pandas as pd 
import numpy as np 
import json 
import os 
import glob
from typing import List
import re

In [2]:
# json file pattern 
json_file_pattern = "{type_food}_{cultural_restriction}.json"
food_restrictions = ["vegan", "vegetarian", "halal", "kosher", "None"]
type_food_max_calories = {"breakfast":900,
                           "morning snacks": 300,
                           "afternoon snacks": 300,
                           "lunch": 1200,
                           "dinner": 600}

In [4]:
def load_json_file(path):
    json_data = {}
    with open(path, "r") as fp:
        json_data = json.load(fp)
    return json_data

In [5]:
def extract_text_from_json(json_data):
    list_choices = json_data["choices"]
    extracted_text = {}
    for choice, answer in enumerate(list_choices):
        text = answer["message"]["content"]
        extracted_text[f"{choice}"] = text
    return extracted_text

## Text processing 

In [15]:
text = "Here are 100 different food recipes for afternoon snacks, categorized by their calorie range. All recipes are halal. Please note that the origin country and allergic warnings may vary:\n\n20-100 kcal per portion:\n\n1. Fresh Fruit Salad: 50 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: N/A.\n2. Rice Cakes with Hummus: 70 kcal per portion. Total portions: 1. Origin: Middle Eastern. Allergic warnings: None.\n3. Greek Yogurt with Honey: 90 kcal per portion. Total portions: 1. Origin: Greece. Allergic warnings: Dairy.\n4. Hard-boiled Egg: 70 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n5. Cucumber Slices with Tzatziki: 30 kcal per portion. Total portions: 2-3. Origin: Greece. Allergic warnings: Dairy.\n\n100-200 kcal per portion:\n\n6. Banana Smoothie: 150 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Dairy, Banana.\n7. Apple Slices with Peanut Butter: 170 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Peanuts.\n8. Guacamole with Tortilla Chips: 180 kcal per portion. Total portions: 1. Origin: Mexico. Allergic warnings: None.\n9. Grilled Vegetables Skewers: 150 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: None.\n10. Tomato Bruschetta: 120 kcal per portion. Total portions: 2-3. Origin: Italy. Allergic warnings: None.\n\n200-300 kcal per portion:\n\n11. Mini Spinach and Feta Quiches: 250 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: Dairy.\n12. Chicken Lettuce Wraps: 270 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n13. Cheese and Crackers: 280 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Dairy, Gluten.\n14. Baked Sweet Potato Fries: 230 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: None.\n15. Caprese Skewers: 220 kcal per portion. Total portions: 2-3. Origin: Italy. Allergic warnings: Dairy.\n\n300-400 kcal per portion:\n\n16. Chicken Satay with Peanut Sauce: 380 kcal per portion. Total portions: 2-3. Origin: Southeast Asia. Allergic warnings: Peanuts.\n17. Mini Falafel with Tahini Sauce: 320 kcal per portion. Total portions: 2-3. Origin: Middle Eastern. Allergic warnings: Sesame.\n18. Tuna Salad Lettuce Wraps: 350 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Fish.\n19. Vegetable Spring Rolls: 360 kcal per portion. Total portions: 2-3. Origin: East Asia. Allergic warnings: None.\n20. Mini Quesadillas: 380 kcal per portion. Total portions: 2-3. Origin: Mexico. Allergic warnings: Dairy, Gluten.\n\n400-500 kcal per portion:\n\n21. Avocado Toast with Poached Egg: 420 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n22. Fresh Tomato Soup: 480 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n23. Roasted Chickpeas: 450 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n24. Greek Style Pita Pizzas: 490 kcal per portion. Total portions: 1. Origin: Greece. Allergic warnings: Dairy, Gluten.\n25. Chicken Caesar Salad: 430 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Dairy, Fish.\n\n500-600 kcal per portion:\n\n26. Beef Sliders with Caramelized Onions: 570 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n27. Vegetable Stir-fry with Tofu: 550 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: Soy.\n28. Chicken Fajitas: 510 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n29. Spicy Chickpea Salad: 590 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n30. Mini Chicken Shawarma Wraps: 510 kcal per portion. Total portions: 1. Origin: Middle Eastern. Allergic warnings: None.\n\n600-700 kcal per portion:\n\n31. Baked Salmon Sushi Rolls: 650 kcal per portion. Total portions: 2-3. Origin: Japan. Allergic warnings: Fish.\n32. Vegetable Quesadillas: 610 kcal per portion. Total portions: 2-3. Origin: Mexico. Allergic warnings: Dairy, Gluten.\n33. Loaded Nachos: 670 kcal per portion. Total portions: 1. Origin: Mexico. Allergic warnings: Dairy, Gluten.\n34. Teriyaki Chicken Skewers: 640 kcal per portion. Total portions: 2-3. Origin: Japan. Allergic warnings: None.\n35. Sweet and Sour Meatballs: 670 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: None.\n\n700-800 kcal per portion:\n\n36. Chicken and Vegetable Kabobs: 720 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: None.\n37. Beef Empanadas: 780 kcal per portion. Total portions: 2-3. Origin: Latin America. Allergic warnings: None.\n38. Baked Mac and Cheese: 750 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Dairy, Gluten.\n39. Chicken Shawarma Plate: 710 kcal per portion. Total portions: 1. Origin: Middle Eastern. Allergic warnings: None.\n40. Stuffed Bell Peppers: 760 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n\n800-900 kcal per portion:\n\n41. Beef Stir-fry with Noodles: 820 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: None.\n42. Chicken Korma with Rice: 880 kcal per portion. Total portions: 1. Origin: South Asia. Allergic warnings: None.\n43. Cheeseburger Sliders: 850 kcal per portion. Total portions: 2-3. Origin: N/A. Allergic warnings: Dairy, Gluten.\n44. BBQ Chicken Pizza: 810 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Dairy, Gluten.\n45. Shrimp Tacos: 870 kcal per portion. Total portions: 2-3. Origin: Mexico. Allergic warnings: None.\n\n900-1000 kcal per portion:\n\n46. Chicken Alfredo Pasta: 940 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: Dairy, Gluten.\n47. Beef Kofta Kebabs: 980 kcal per portion. Total portions: 2-3. Origin: Middle Eastern. Allergic warnings: None.\n48. Salmon with Roasted Potatoes and Vegetables: 930 kcal per portion. Total portions: 1. Origin: N/A. Allergic warnings: None.\n49. Chicken Parmesan: 960 kcal per portion. Total portions: 1. Origin: Italy. Allergic warnings: Dairy, Gluten.\n50. Lamb Biryani: 950 kcal per portion. Total portions: 1. Origin: South Asia. Allergic warnings: None."

In [16]:
a = re.findall(r"[0-9]+\.", text)

In [18]:
type(a)

list

In [6]:
def text_processing(text: str):
    # numeric match
    matches = re.findall(r"[0-9]+\.", text)
    splitted_text = []
    total_matches = len(matches)
    for i in range(0, total_matches):
        current_match = text.find(matches[i])
        if i != len(symbols)-1:
            next_match = text.find(matches[i+1])
        else:
            next_match = None
        splitted_text.append(text[current_match:next_match])
    return splitted_text

In [7]:
def text_analysis(text_raw: List[str]):
    title = list(map(lambda x: x.split(":")[0] if not "Recipe:" in x else x.split("\n")[0].split(":")[1], 
                     map(lambda x: x.split(".")[1], text_raw)))
    return title

In [11]:
def generate_raw_dataframe(root_directory: str):
    json_file_pattern = "{type_food}_{cultural_restriction}.json"
    food_restrictions = ["vegan", "vegetarian", "halal", "kosher", "None"]
    type_food = ["breakfast", "morning snacks", "afternoon snacks", "lunch", "dinner"]
    raw_df = pd.DataFrame(data=[], columns=["title", "raw_text", "meal_type", "cultural_restriction"])
    raw_json = None
    for fr in food_restrictions:
        for tf in type_food:
            json_file = os.path.join(root_directory, 
                                     json_file_pattern.format(type_food = tf, 
                                                              cultural_restriction = fr
                                                              )
                                     ) 
            if os.path.isfile(json_file):
                with open(json_file, "r") as fp:
                    raw_json = json.load(fp)
                print(f"File found and read: {json_file}")
            else:
                print(f"file: {json_file} not found")
                raw_json = None
                continue
            if raw_json is not None:
                text_choices = extract_text_from_json(raw_json)
                for choice, txt in text_choices.items():
                    try:
                        print(f"processing choice: {choice}, food type: {fr} meal_type: {tf}")
                        print("--------------------------------------------------------------------")
                        temp_df = pd.DataFrame(data=[], columns=["title", "raw_text", "meal_type", "cultural_restriction"])
                        recipes_raw = text_processing(txt)
                        titles = text_analysis(recipes_raw)
                        dict_text = {"title": titles, "raw_text": recipes_raw}
                        temp_df = pd.DataFrame.from_dict(dict_text)
                        temp_df["meal_type"] = tf
                        temp_df["cultural_restriction"] = fr
                        print(temp_df.head(3))
                        if choice ==  3:
                            break
                        raw_df = pd.concat([raw_df, temp_df])
                    except Exception as e:
                        print(f"Error: {e} processing choice: {choice}")
    raw_df.reset_index(inplace=True)
    return raw_df

In [9]:
working_dir = os.getcwd()

In [13]:
target_dir = os.path.join(working_dir, 'raw_data')
print(target_dir)

/home/victor/Documents/Expectation_data_generation/src/meals_collection/raw_data


In [14]:
raw_df = generate_raw_dataframe(target_dir)

File found and read: /home/victor/Documents/Expectation_data_generation/src/meals_collection/raw_data/breakfast_vegan.json
processing choice: 0, food type: vegan meal_type: breakfast
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 0
processing choice: 1, food type: vegan meal_type: breakfast
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 1
processing choice: 2, food type: vegan meal_type: breakfast
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 2
processing choice: 3, food type: vegan meal_type: breakfast
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 3
processing choice: 4, food type: vegan meal_type: breakfast
--------------------------------------------------------------------


File found and read: /home/victor/Documents/Expectation_data_generation/src/meals_collection/raw_data/afternoon snacks_kosher.json
processing choice: 0, food type: kosher meal_type: afternoon snacks
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 0
processing choice: 1, food type: kosher meal_type: afternoon snacks
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 1
processing choice: 2, food type: kosher meal_type: afternoon snacks
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 2
processing choice: 3, food type: kosher meal_type: afternoon snacks
--------------------------------------------------------------------
Error: name 'symbols' is not defined processing choice: 3
processing choice: 4, food type: kosher meal_type: afternoon snacks
---------------------

In [200]:
raw_df.head()

Unnamed: 0,title,raw_text,meal_type,cultural_restriction
0,Fruit Salad,"1. Fruit Salad: 70 calories per portion, 4 por...",breakfast,vegan
1,Vegan Pancakes,"2. Vegan Pancakes: 150 calories per portion, 4...",breakfast,vegan
2,Overnight Chia Pudding,3. Overnight Chia Pudding: 200 calories per po...,breakfast,vegan
3,Avocado Toast,"4. Avocado Toast: 250 calories per portion, 2 ...",breakfast,vegan
4,Vegan Omelette,"5. Vegan Omelette: 300 calories per portion, 1...",breakfast,vegan


In [201]:
raw_df.tail()

Unnamed: 0,title,raw_text,meal_type,cultural_restriction
95,Mexican stuffed bell peppers with ground beef...,96. Mexican stuffed bell peppers with ground b...,dinner,
96,Caprese quinoa salad with grilled chicken - 3...,97. Caprese quinoa salad with grilled chicken ...,dinner,
97,Baked coconut shrimp with mango salsa and jas...,98. Baked coconut shrimp with mango salsa and ...,dinner,
98,One-pot creamy chicken and mushroom pasta - 4...,99. One-pot creamy chicken and mushroom pasta ...,dinner,
99,"Stuffed bell peppers with couscous, chickpeas...","100. Stuffed bell peppers with couscous, chick...",dinner,


In [207]:
raw_df.shape

(10200, 4)

In [225]:
obj = re.search(r"\d+\s*(calories|kcals)", '100. Stuffed bell peppers with couscous, chickpeas, and feta - 350 kcals per portion, 4 portions, vegetarian, contains lactose\n\n(Continued below...)')

In [227]:
obj.group()

'350 kcals'

In [236]:
def extract_calories(recipe_str: str):
    try:
        matches = re.search(r"\d+\s*(calories|kcals)", recipe_str)
        #print(f"matches: {matches}")
        matches_string = matches.group()
        numbers = re.findall(r'\d+', matches_string)
        return float(numbers[0])
    except Exception as e:
        print(f"Error extracting calories {e}")
        return -1

In [238]:
raw_df["calories"] = raw_df["raw_text"].apply(lambda x: extract_calories(x))

Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneTyp

In [251]:
def extract_restrictions(recipe_string:str):
    text = ""
    try:
        transformed_string = recipe_string.replace("/n", ".")
        transformed_string = transformed_string.replace("-", ",")
        matches_obj = re.search(r"(contains\s+([^.,\n]+)|Allergen Warnings:\s+([^.,\n]+))", transformed_string)
        text = matches_obj.group()
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return text

In [255]:
raw_df["allergies"] =raw_df["raw_text"].apply(lambda x: extract_restrictions(x))

Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extr

In [257]:
raw_df.to_csv("processed_recipes_dataset.csv", index=False, sep="|")

In [202]:
raw_df.to_csv("raw_recipes.csv", sep="|")

In [123]:
raw_df.shape

(7456, 4)

In [None]:
def process_raw_text(traw_text:str):
    pass 

In [145]:
raw_df.iloc[-1, 1].split("\n")

['12. Recipe: Creamy Garlic Parmesan Chicken',
 '    Total Calories per Portion: 450 kcals',
 '    Total Portions: 4',
 '    Origin: Italy',
 '    ',
 '13. Recipe: Lentil Soup',
 '    Total Calories per Portion: 200 kcals',
 '    Total Portions: 4',
 '    Origin: Various countries',
 '    ',
 '14. Recipe: Spinach and Feta Stuffed Chicken Breast',
 '    Total Calories per Portion: 350 kcals',
 '    Total Portions: 2',
 '    Origin: Greece',
 '    ',
 '15. Recipe: Honey Glazed Salmon with Roasted Vegetables',
 '    Total Calories per Portion: 400 kcals',
 '    Total Portions: 2',
 '    Origin: Various countries',
 '    ',
 '16. Recipe: Chicken Curry with Basmati Rice',
 '    Total Calories per Portion: 600 kcals',
 '    Total Portions: 3',
 '    Origin: India',
 '    ',
 '17. Recipe: Caprese Salad',
 '    Total Calories per Portion: 150 kcals',
 '    Total Portions: 2',
 '    Origin: Italy',
 '    ',
 '18. Recipe: Cilantro Lime Shrimp Tacos',
 '    Total Calories per Portion: 350 kcals',

In [148]:
test = open("./morning snacks_halal.json", "r")
test_data = json.load(test)

In [195]:
print(test_data["choices"][4]["message"]["content"])

6. Peanut butter on rice cakes: 180 calories per portion, 1 portion, contains peanuts, origin: United States
10. Apple slices with almond butter: 150 calories per portion, 1 portion, contains almonds, origin: Various
13. Greek yogurt with honey: 120 calories per portion, 1 portion, contains dairy, origin: Greece
18. Egg muffins with spinach and feta: 180 calories per portion, 2 portions, contains dairy, origin: Various
20. Almond milk chia pudding: 200 calories per portion, 1 portion, contains almonds, origin: Various
21. Broccoli and cheese mini quiches: 150 calories per portion, 2 portions, contains cheese, origin: Various
25. Bell pepper and cream cheese roll-ups: 120 calories per portion, 2 portions, contains dairy, origin: Various
27. Greek-style yogurt with pistachios and honey: 220 calories per portion, 1 portion, contains dairy and pistachios, origin: Greece
29. Smoked salmon on whole wheat crackers: 180 calories per portion, 2 portions, contains fish and gluten, origin: Variou

In [150]:
text = test_data["choices"][0]["message"]["content"]

In [172]:
answer = re.search(r"[0-9]+\.", text)

In [179]:
matches = re.finditer(r"[0-9]+\.", text)

In [180]:
symbols = []
for x in matches:
    symbols.append(x.group())
    print(x.group())

1.
2.
3.
4.
5.
6.
7.
8.
9.
10.
11.
12.
13.
14.
15.
16.
17.
18.
19.
20.
21.
22.
23.
24.
25.
26.
27.
28.
29.
30.
31.
32.
33.
34.
35.
36.
37.
38.
39.
40.
41.
42.
43.
44.
45.
46.
47.
48.
49.
50.
51.
52.
53.
54.
55.
56.
57.
58.
59.
60.
61.
62.
63.
64.
65.
66.
67.
68.
69.
70.
71.
72.
73.
74.
75.
76.
77.
78.
79.
80.
81.
82.
83.
84.
85.
86.
87.
88.
89.
90.
91.
92.
93.
94.
95.
96.
97.
98.
99.
100.


In [190]:
explited = []
for i in range(len(symbols)):
    c = text.find(symbols[i])
    if i != len(symbols)-1:
        n = text.find(symbols[i+1])
    else:
        n = None
    explited.append(text[c:n])

In [191]:
explited



In [184]:
text[291:403]



In [183]:
text.find(symbols[1])

403

In [169]:
answer.span()

(291, 293)

In [156]:
text[291:294]

'1. '

In [153]:
answer.start()

291

In [None]:
with open(f"{type_food}_{cultural_restriction}.json", "w", encoding='utf-8') as fp:
                json.dump(response, fp, ensure_ascii=False, indent=4)

In [8]:
# list files in the directory 
files = glob.glob("./*.json")

In [9]:
files

['./dinner_None.json',
 './lunch_vegan.json',
 './with_None_1500.json',
 './lunch_halal.json',
 './with_kosher_2000.json',
 './with_vegetarian_3000.json',
 './morning snacks_None.json',
 './morning snacks_kosher.json',
 './morning snacks_halal.json',
 './with_vegan_3000.json',
 './morning snacks_vegan.json',
 './without_restrictions.json',
 './dinner_halal.json',
 './breakfast_vegetarian.json',
 './dinner_vegetarian.json',
 './afternoon snacks_vegan.json',
 './with_halal_1500.json',
 './lunch_kosher.json',
 './with_halal_2500.json',
 './dinner_vegan.json',
 './with_None_1000.json',
 './afternoon snacks_vegetarian.json',
 './with_halal_2000.json',
 './afternoon snacks_None.json',
 './dinner_kosher.json',
 './with_kosher_1500.json',
 './with_None_2500.json',
 './afternoon snacks_halal.json',
 './breakfast_halal.json',
 './with_vegetarian_2500.json',
 './with_vegetarian_2000.json',
 './with_kosher_3000.json',
 './morning snacks_vegetarian.json',
 './afternoon snacks_kosher.json',
 './with

In [28]:
import os

In [29]:
os.getcwd()

'/home/victor/Documents/Expectation_data_generation/src/meals_collection'

In [40]:
processed_data = pd.read_csv('/home/victor/Documents/Expectation_data_generation/src/recipes/processed_recipes_dataset.csv', 
                       sep='|')

In [41]:
processed_data["recipeId"] = [f"food_{i}" for i in range(len(processed_data))]

In [42]:
processed_data.head(4)

Unnamed: 0,title,raw_text,meal_type,cultural_restriction,calories,allergies,recipeId
0,Fruit Salad,"1. Fruit Salad: 70 calories per portion, 4 por...",breakfast,vegan,70.0,contains fruits only,food_0
1,Vegan Pancakes,"2. Vegan Pancakes: 150 calories per portion, 4...",breakfast,vegan,150.0,,food_1
2,Overnight Chia Pudding,3. Overnight Chia Pudding: 200 calories per po...,breakfast,vegan,200.0,contains nuts (almonds),food_2
3,Avocado Toast,"4. Avocado Toast: 250 calories per portion, 2 ...",breakfast,vegan,250.0,,food_3


In [44]:
processed_data.to_csv('/home/victor/Documents/Expectation_data_generation/src/recipes/processed_recipes_dataset_id.csv',
                     sep='|')