In [147]:
import pandas as pd 
import numpy as np 
import json 
import os 
import glob
from typing import List
import re

In [10]:
# json file pattern 
json_file_pattern = "{type_food}_{cultural_restriction}.json"
food_restrictions = ["vegan", "vegetarian", "halal", "kosher", "None"]
type_food_max_calories = {"breakfast":900,
                           "morning snacks": 300,
                           "afternoon snacks": 300,
                           "lunch": 1200,
                           "dinner": 600}

In [15]:
def load_json_file(path):
    json_data = {}
    with open(path, "r") as fp:
        json_data = json.load(fp)
    return json_data

In [25]:
def extract_text_from_json(json_data):
    list_choices = json_data["choices"]
    extracted_text = {}
    for choice, answer in enumerate(list_choices):
        text = answer["message"]["content"]
        extracted_text[f"{choice}"] = text
    return extracted_text

In [196]:
def text_processing(text: str, minimum_characters_per_line = 4):
    matches = re.findall(r"[0-9]+\.", text)
    splitted_text = []
    total_matches = len(matches)
    for i in range(0, total_matches):
        current_match = text.find(matches[i])
        if i != len(symbols)-1:
            next_match = text.find(matches[i+1])
        else:
            next_match = None
        splitted_text.append(text[current_match:next_match])
    return splitted_text

In [197]:
def text_analysis(text_raw: List[str]):
    title = list(map(lambda x: x.split(":")[0] if not "Recipe:" in x else x.split("\n")[0].split(":")[1], 
                     map(lambda x: x.split(".")[1], text_raw)))
    return title

In [198]:
def generate_raw_dataframe():
    json_file_pattern = "{type_food}_{cultural_restriction}.json"
    food_restrictions = ["vegan", "vegetarian", "halal", "kosher", "None"]
    type_food = ["breakfast", "morning snacks", "afternoon snacks", "lunch", "dinner"]
    raw_df = pd.DataFrame(data=[], columns=["title", "raw_text", "meal_type", "cultural_restriction"])
    raw_json = None
    for fr in food_restrictions:
        for tf in type_food:
            json_file = json_file_pattern.format(type_food = tf, cultural_restriction = fr)
            if os.path.isfile(json_file):
                with open(json_file, "r") as fp:
                    raw_json = json.load(fp)
            else:
                print(f"file: {json_file} not found")
                raw_json = None
                continue
            if raw_json is not None:
                text_choices = extract_text_from_json(raw_json)
                for choice, txt in text_choices.items():
                    try:
                        print(f"processing choice: {choice}, food type: {fr} meal_type: {tf}")
                        print("--------------------------------------------------------------------")
                        temp_df = pd.DataFrame(data=[], columns=["title", "raw_text", "meal_type", "cultural_restriction"])
                        recipes_raw = text_processing(txt)
                        titles = text_analysis(recipes_raw)
                        dict_text = {"title": titles, "raw_text": recipes_raw}
                        temp_df = pd.DataFrame.from_dict(dict_text)
                        temp_df["meal_type"] = tf
                        temp_df["cultural_restriction"] = fr
                        print(temp_df.head(3))
                        if choice ==  3:
                            break
                        raw_df = pd.concat([raw_df, temp_df])
                    except Exception as e:
                        print(f"Error: {e} processing choice: {choice}")
    return raw_df

In [199]:
raw_df = generate_raw_dataframe()

processing choice: 0, food type: vegan meal_type: breakfast
--------------------------------------------------------------------
Error: list index out of range processing choice: 0
processing choice: 1, food type: vegan meal_type: breakfast
--------------------------------------------------------------------
                     title                                           raw_text  \
0              Fruit Salad  1. Fruit Salad: 70 calories per portion, 4 por...   
1           Vegan Pancakes  2. Vegan Pancakes: 150 calories per portion, 4...   
2   Overnight Chia Pudding  3. Overnight Chia Pudding: 200 calories per po...   

   meal_type cultural_restriction  
0  breakfast                vegan  
1  breakfast                vegan  
2  breakfast                vegan  
processing choice: 2, food type: vegan meal_type: breakfast
--------------------------------------------------------------------
                              title  \
0                     Avocado Toast   
1             

In [200]:
raw_df.head()

Unnamed: 0,title,raw_text,meal_type,cultural_restriction
0,Fruit Salad,"1. Fruit Salad: 70 calories per portion, 4 por...",breakfast,vegan
1,Vegan Pancakes,"2. Vegan Pancakes: 150 calories per portion, 4...",breakfast,vegan
2,Overnight Chia Pudding,3. Overnight Chia Pudding: 200 calories per po...,breakfast,vegan
3,Avocado Toast,"4. Avocado Toast: 250 calories per portion, 2 ...",breakfast,vegan
4,Vegan Omelette,"5. Vegan Omelette: 300 calories per portion, 1...",breakfast,vegan


In [201]:
raw_df.tail()

Unnamed: 0,title,raw_text,meal_type,cultural_restriction
95,Mexican stuffed bell peppers with ground beef...,96. Mexican stuffed bell peppers with ground b...,dinner,
96,Caprese quinoa salad with grilled chicken - 3...,97. Caprese quinoa salad with grilled chicken ...,dinner,
97,Baked coconut shrimp with mango salsa and jas...,98. Baked coconut shrimp with mango salsa and ...,dinner,
98,One-pot creamy chicken and mushroom pasta - 4...,99. One-pot creamy chicken and mushroom pasta ...,dinner,
99,"Stuffed bell peppers with couscous, chickpeas...","100. Stuffed bell peppers with couscous, chick...",dinner,


In [207]:
raw_df.shape

(10200, 4)

In [225]:
obj = re.search(r"\d+\s*(calories|kcals)", '100. Stuffed bell peppers with couscous, chickpeas, and feta - 350 kcals per portion, 4 portions, vegetarian, contains lactose\n\n(Continued below...)')

In [227]:
obj.group()

'350 kcals'

In [236]:
def extract_calories(recipe_str: str):
    try:
        matches = re.search(r"\d+\s*(calories|kcals)", recipe_str)
        #print(f"matches: {matches}")
        matches_string = matches.group()
        numbers = re.findall(r'\d+', matches_string)
        return float(numbers[0])
    except Exception as e:
        print(f"Error extracting calories {e}")
        return -1

In [238]:
raw_df["calories"] = raw_df["raw_text"].apply(lambda x: extract_calories(x))

Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneType' object has no attribute 'group'
Error extracting calories 'NoneTyp

In [251]:
def extract_restrictions(recipe_string:str):
    text = ""
    try:
        transformed_string = recipe_string.replace("/n", ".")
        transformed_string = transformed_string.replace("-", ",")
        matches_obj = re.search(r"(contains\s+([^.,\n]+)|Allergen Warnings:\s+([^.,\n]+))", transformed_string)
        text = matches_obj.group()
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return text

In [255]:
raw_df["allergies"] =raw_df["raw_text"].apply(lambda x: extract_restrictions(x))

Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extracting text: 'NoneType' object has no attribute 'group'
Error extr

In [257]:
raw_df.to_csv("processed_recipes_dataset.csv", index=False, sep="|")

In [202]:
raw_df.to_csv("raw_recipes.csv", sep="|")

In [123]:
raw_df.shape

(7456, 4)

In [None]:
def process_raw_text(traw_text:str):
    pass 

In [146]:
for 

Unnamed: 0,title,raw_text,meal_type,cultural_restriction
7,Chicken Alfredo Pasta,8. Recipe: Chicken Alfredo Pasta\n Total Cal...,dinner,
8,Quinoa Stuffed Bell Peppers,9. Recipe: Quinoa Stuffed Bell Peppers\n Tot...,dinner,
9,Beef Tacos with Guacamole,10. Recipe: Beef Tacos with Guacamole\n Tot...,dinner,
10,Pineapple Fried Rice,11. Recipe: Pineapple Fried Rice\n Total Ca...,dinner,
11,Creamy Garlic Parmesan Chicken,12. Recipe: Creamy Garlic Parmesan Chicken\n ...,dinner,


In [145]:
raw_df.iloc[-1, 1].split("\n")

['12. Recipe: Creamy Garlic Parmesan Chicken',
 '    Total Calories per Portion: 450 kcals',
 '    Total Portions: 4',
 '    Origin: Italy',
 '    ',
 '13. Recipe: Lentil Soup',
 '    Total Calories per Portion: 200 kcals',
 '    Total Portions: 4',
 '    Origin: Various countries',
 '    ',
 '14. Recipe: Spinach and Feta Stuffed Chicken Breast',
 '    Total Calories per Portion: 350 kcals',
 '    Total Portions: 2',
 '    Origin: Greece',
 '    ',
 '15. Recipe: Honey Glazed Salmon with Roasted Vegetables',
 '    Total Calories per Portion: 400 kcals',
 '    Total Portions: 2',
 '    Origin: Various countries',
 '    ',
 '16. Recipe: Chicken Curry with Basmati Rice',
 '    Total Calories per Portion: 600 kcals',
 '    Total Portions: 3',
 '    Origin: India',
 '    ',
 '17. Recipe: Caprese Salad',
 '    Total Calories per Portion: 150 kcals',
 '    Total Portions: 2',
 '    Origin: Italy',
 '    ',
 '18. Recipe: Cilantro Lime Shrimp Tacos',
 '    Total Calories per Portion: 350 kcals',

In [148]:
test = open("./morning snacks_halal.json", "r")
test_data = json.load(test)

In [195]:
print(test_data["choices"][4]["message"]["content"])

6. Peanut butter on rice cakes: 180 calories per portion, 1 portion, contains peanuts, origin: United States
10. Apple slices with almond butter: 150 calories per portion, 1 portion, contains almonds, origin: Various
13. Greek yogurt with honey: 120 calories per portion, 1 portion, contains dairy, origin: Greece
18. Egg muffins with spinach and feta: 180 calories per portion, 2 portions, contains dairy, origin: Various
20. Almond milk chia pudding: 200 calories per portion, 1 portion, contains almonds, origin: Various
21. Broccoli and cheese mini quiches: 150 calories per portion, 2 portions, contains cheese, origin: Various
25. Bell pepper and cream cheese roll-ups: 120 calories per portion, 2 portions, contains dairy, origin: Various
27. Greek-style yogurt with pistachios and honey: 220 calories per portion, 1 portion, contains dairy and pistachios, origin: Greece
29. Smoked salmon on whole wheat crackers: 180 calories per portion, 2 portions, contains fish and gluten, origin: Variou

In [150]:
text = test_data["choices"][0]["message"]["content"]

In [172]:
answer = re.search(r"[0-9]+\.", text)

In [179]:
matches = re.finditer(r"[0-9]+\.", text)

In [180]:
symbols = []
for x in matches:
    symbols.append(x.group())
    print(x.group())

1.
2.
3.
4.
5.
6.
7.
8.
9.
10.
11.
12.
13.
14.
15.
16.
17.
18.
19.
20.
21.
22.
23.
24.
25.
26.
27.
28.
29.
30.
31.
32.
33.
34.
35.
36.
37.
38.
39.
40.
41.
42.
43.
44.
45.
46.
47.
48.
49.
50.
51.
52.
53.
54.
55.
56.
57.
58.
59.
60.
61.
62.
63.
64.
65.
66.
67.
68.
69.
70.
71.
72.
73.
74.
75.
76.
77.
78.
79.
80.
81.
82.
83.
84.
85.
86.
87.
88.
89.
90.
91.
92.
93.
94.
95.
96.
97.
98.
99.
100.


In [190]:
explited = []
for i in range(len(symbols)):
    c = text.find(symbols[i])
    if i != len(symbols)-1:
        n = text.find(symbols[i+1])
    else:
        n = None
    explited.append(text[c:n])

In [191]:
explited



In [184]:
text[291:403]



In [183]:
text.find(symbols[1])

403

In [169]:
answer.span()

(291, 293)

In [156]:
text[291:294]

'1. '

In [153]:
answer.start()

291

In [None]:
with open(f"{type_food}_{cultural_restriction}.json", "w", encoding='utf-8') as fp:
                json.dump(response, fp, ensure_ascii=False, indent=4)

In [8]:
# list files in the directory 
files = glob.glob("./*.json")

In [9]:
files

['./dinner_None.json',
 './lunch_vegan.json',
 './with_None_1500.json',
 './lunch_halal.json',
 './with_kosher_2000.json',
 './with_vegetarian_3000.json',
 './morning snacks_None.json',
 './morning snacks_kosher.json',
 './morning snacks_halal.json',
 './with_vegan_3000.json',
 './morning snacks_vegan.json',
 './without_restrictions.json',
 './dinner_halal.json',
 './breakfast_vegetarian.json',
 './dinner_vegetarian.json',
 './afternoon snacks_vegan.json',
 './with_halal_1500.json',
 './lunch_kosher.json',
 './with_halal_2500.json',
 './dinner_vegan.json',
 './with_None_1000.json',
 './afternoon snacks_vegetarian.json',
 './with_halal_2000.json',
 './afternoon snacks_None.json',
 './dinner_kosher.json',
 './with_kosher_1500.json',
 './with_None_2500.json',
 './afternoon snacks_halal.json',
 './breakfast_halal.json',
 './with_vegetarian_2500.json',
 './with_vegetarian_2000.json',
 './with_kosher_3000.json',
 './morning snacks_vegetarian.json',
 './afternoon snacks_kosher.json',
 './with