In [33]:
import pandas as pd
import numpy as np
import re 
import sys
import os 
import json
import string
from functools import partial
from typing import Dict, List, Optional
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer

In [107]:
from owlready2 import *

In [114]:
%load_ext autoreload
%autoreload 2
sys.path.insert(0, os.path.abspath('..'))
from recipe_gpt.preprocessing_utilities.preprocessing_functions import (check_nan_columns, 
                                                                        extract_nutritional_numerical,
                                                                        remove_punctuation,
                                                                        get_files,
                                                                        load_files, 
                                                                        replace_by_key)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [139]:
path = '/home/victor/Documents/recipe_gpt/output/raw_recipes/df_final_7000_normalized.csv'
df_recipes = pd.read_csv(path, index_col=0, sep='|')

In [140]:
df_recipes.columns

Index(['title', 'raw_text', 'cultural_restriction', 'calories', 'allergies',
       'recipeId', 'ingredients', 'preparation', 'carbs', 'fat', 'fiber',
       'protein', 'taste', 'cooking_style', 'meal_type', 'prep_time',
       'cuisine', 'price', 'ingredients_list'],
      dtype='object')

## Clear columns 

#### Fix allergies column 

In [142]:
df_recipes['allergies'].value_counts()

allergies
NotAllergens                                           3500
Milk                                                    627
contains gluten                                         330
contains dairy                                          258
Tree nuts                                               220
                                                       ... 
Soy, gluten (in some sauces)                              1
Crustacean shellfish, soy, mollusks (oyster sauce).       1
Wheat, soy, shellfish.                                    1
Gluten, shellfish, soy (in some wrappers)                 1
Gluten, soy                                               1
Name: count, Length: 360, dtype: int64

In [236]:
dict_allergens = {
    'gluten': ['gluten'],
    'lactose': ['lactose', 'milk', 'cow', 'cheese'],
    'eggs': ['eggs', 'egg'],
    'peanuts': ['peanuts'],
    'seafood': ['fish', 'seafood'],
    'wheat': ['wheat'],
    'dairy': ['dairy'],
    'tree nuts': ['tree', 'nut', 'nuts', 'trees', 'almonds'],
    'soy': ['soy', 'soybeans'],
    'shellfish': ['shellfish', 'crustacean', 'shellfish', 'mollusks', 'oyster', 'shrimp', 'crab', 'lobster'],
    'NotAllergens': ['notallergens', 'na', 'none', 'free', 'allergenfree', 'nutfree'],
    'sesame': ['sesame'],
    'garlic': ['garlic'],
    'poultry': ['poultry'],
    'meat': ['meat', 'beef'], 
    'pork': ['pork', 'porkbased'], 
    'legumes': ['legumes', 'chickpea', 'flour', 'olives'],
    'corn': ['corn'],
    'coconut': ['coconut'],
    'nightshade': ['nightshade', 'nightshades'],
    'mustard': ['mustard'],
    'citrus': ['citrus'],
    'spices': ['spices', 'turmeric'],
    'black beans': ['black', 'beans']
}

In [237]:
partial_replace = partial(replace_by_key, replace_dict=dict_allergens)

In [238]:
processed_tasted = df_recipes['allergies'].apply(lambda x: partial_replace(x.replace(';', ' ')))

In [239]:
processed_tasted.value_counts()

allergies
NotAllergens                  3523
lactose                        695
gluten                         460
tree nuts                      399
dairy                          382
                              ... 
gluten, soy, peanuts             1
pork, soy, gluten                1
sesame, eggs                     1
soy, shellfish, eggs             1
pork, NotAllergens, gluten       1
Name: count, Length: 86, dtype: int64

In [233]:
mask = processed_tasted == 'none'
sum(mask)

5

In [234]:
df_to_fix = df_recipes.loc[mask, :]

In [235]:
df_to_fix['allergies'].tolist()

['contains fruits only',
 'Vegan',
 'contains black beans (Mexico)',
 'contains cilantro (Mexico)',
 'contains rice']

#### Cultural restriction 

In [104]:
unique_values = df_recipes['cultural_restriction'].value_counts()

In [105]:
unique_values

cultural_restriction
vegetarian                                                                      1598
halal                                                                           1594
vegan                                                                           1543
kosher                                                                          1434
NotRestriction                                                                   271
Meat-based                                                                       238
meat-based                                                                       116
Vegetarian                                                                       104
grain-based                                                                       22
Grain-based                                                                       13
Dessert                                                                           13
veggie                                      

In [106]:
# Basic categories 
cultural_restriction_list = ['vegetarian', 'halal', 'vegan', 'kosher', 'NotRestriction',
                             'meat-based', 'grain-based', 'dessert', 'pescatarian',
                             'seafood-based', 'Beverage', 'Dairy', 'Alcoholic', 'Non-alcoholic',
                             'Non-vegetarian', 'alcohol-based', 'keto', 'Seasoning', 'Cocktail']

In [108]:
# Create an ontology for cultural restriction
onto = get_ontology("http://example.org/onto.owl")

In [111]:
with onto:
    # Create a basic class for each category
    class Vegetarian(Thing):
        pass
    
    class NotRestriction(Thing):
        pass
    
    class Beverage(Thing):
        pass
    
    # Create derivate classes 
    class Halal(NotRestriction):
        pass
    
    class Kosher(NotRestriction):
        pass
    
    class Keto(NotRestriction):
        pass
    
    class SeaFood(NotRestriction):
        pass
    
    class Pescatarian(NotRestriction):
        pass
    
    class MeatBased(NotRestriction):
        pass
    
    class Vegan(Vegetarian):
        pass
    
    class FruitBased(Vegan):
        pass
    
    class GrainBased(Vegetarian):
        pass
    
    class Diary(Vegetarian):
        pass
    
    class NonAlcoholic(Beverage):
        pass
    
    class Alcoholic(Beverage):
        pass
    
    class Cocktail(Alcoholic):
        pass

In [112]:
onto_path = '/home/victor/Documents/recipe_gpt/output/ontologies/cultural_restrictions.owl'
onto.save(file=onto_path, format='rdfxml')

In [123]:
translate_dict = {
    'vegetarian': ['vegetarian', 'veggie', 'veggies'], 
    'halal': ['halal'], 
    'vegan': ['vegan'], 
    'kosher': ['kosher'], 
    'NotRestriction': ['NotRestriction', 'non-vegetarian', 'notrestriction'],
    'meat-based': ['meat-based', 'meat'], 
    'grain-based': ['grain-based', 'grain'], 
    'dessert': ['dessert'], 
    'pescatarian': ['pescatarian', 'pescetarian', 'fish'],
    'seafood-based': ['seafood-based', 'fish', 'seafood'], 
    'beverage': ['beverage'], 
    'dairy': ['dairy'], 
    'alcohol-based': ['alcoholic', 'alcohol-based', 'cocktail', 'alcohol'], 
    'non-alcoholic': ['non-alcoholic'],
    'keto': []
}

In [124]:
partial_replace = partial(replace_by_key, replace_dict=translate_dict)

In [125]:
processed_tasted = df_recipes['cultural_restriction'].apply(lambda x: partial_replace(x))

In [136]:
processed_tasted.value_counts()

cultural_restriction
vegetarian                    1714
halal                         1594
vegan                         1543
kosher                        1436
meat-based                     358
NotRestriction                 272
grain-based                     37
dessert                         14
pescatarian                     13
seafood-based                    9
alcohol-based                    4
beverage                         2
pescatarian, seafood-based       2
dairy                            1
keto                             1
Name: count, dtype: int64

In [127]:
mask = processed_tasted == 'none'
sum(mask)

1

In [135]:
processed_tasted[mask] = 'NotRestriction'

In [137]:
df_recipes['cultural_restriction'] = processed_tasted

In [134]:
df_recipes.loc[mask, 'cultural_restriction'] = 'NotRestriction'

6932    Seasoning
Name: cultural_restriction, dtype: object

In [128]:
df_to_fix = df_recipes.loc[mask, :]

In [133]:
df_to_fix['title'].tolist()

['Vanilla Salt']

#### calories, carbs, fat, fiber, protein

In [100]:
df_recipes['calories']

0         70.0
1        300.0
2        400.0
3        550.0
4        600.0
         ...  
6995     922.0
6996     894.0
6997     319.0
6998    3813.0
6999    1171.0
Name: calories, Length: 7000, dtype: float64

#### Clear taste

In [18]:
basic_profiles = ['sweet', 'salty', 'sour', 'bitter', 'umami']

In [20]:
dict_tastes = dict(zip(basic_profiles+['other'], [[], [], [], [], [], [], []]))

In [64]:
dict_tastes['salty'] = ['salt', 'salty', 'savory']
dict_tastes['sweet'] = []
dict_tastes['sour'] = ['citrus-y', 'citrus']


In [22]:
values = df_recipes['taste'].value_counts()

In [115]:
# test replace function 
replace_by_key('sweet  in coconut chutney   salty  in idli and sambar   sour  in sambar with tamarind extract   umami  in sambar with lentils and vegetables', dict_tastes)

'umami, sweet, sour, salty'

In [66]:
# partial to apply 
partial_replace = partial(replace_by_key, replace_dict=dict_tastes)

In [67]:
processed_tasted = df_recipes['taste'].apply(lambda x: partial_replace(x))

In [90]:
processed_tasted.value_counts()

taste
salty                                4082
sweet                                1592
sour                                  859
umami                                 220
bitter                                 72
umami, salty                           61
sour, salty                             4
sweet, salty                            2
umami, sour, sweet, bitter, salty       2
bitter, umami, salty                    1
umami, sweet, sour, salty               1
bitter, sweet, salty                    1
sweet, umami                            1
umami, sour                             1
sweet, sour                             1
umami, sour, salty                      1
Name: count, dtype: int64

In [92]:
df_recipes['taste'] =  processed_tasted

In [69]:
mask = processed_tasted == 'none'
sum(mask)

99

In [70]:
df_to_fix = df_recipes.loc[mask, :]

In [73]:
# save to process recipes 
output_path = '/home/victor/Documents/recipe_gpt/output/raw_recipes'
df_to_fix.to_csv(os.path.join(output_path, 'df_to_fix_7000_with_ingredients_fixed.csv'), index=True, sep='|')

In [77]:
file_pattern = '/home/victor/Documents/recipe_gpt/output/raw_output/fixing_taste*.json'
file_list = get_files(file_pattern)

Found files number: 1


In [79]:
answer_dict = load_files(file_list)

In [81]:
df_calories = pd.DataFrame.from_dict(answer_dict, 
                                     orient='index',
                                     columns=['raw_text']
)

In [85]:
fix_patch = df_calories['raw_text'].apply(lambda x: partial_replace(x))

In [89]:
processed_tasted[mask] = fix_patch

In [80]:
processed_tasted[mask]

53      none
90      none
111     none
453     none
559     none
        ... 
6706    none
6801    none
6889    none
6907    none
6955    none
Name: taste, Length: 99, dtype: object

In [72]:
df_to_fix['title'].tolist()

['Vegan Breakfast Croissant',
 'Vegan Egg McMuffin',
 'Vegan Breakfast Rice Paper Rolls',
 'Instant Pot Vegan Yogurt',
 'Vegan Buffalo Cauliflower Tacos',
 'Avocado Toast ',
 'Spicy Thai Coconut Soup',
 'Thai Red Curry with Tofu and Vegetables',
 'Vegan Cauliflower Shawarma Wraps',
 'Vegan Roasted Cauliflower Tacos',
 'Tofu Tikka Masala',
 'Vegan Mapo Tofu',
 'Lentil Salad with Avocado Dressing',
 'Irish Soda Bread',
 'Hard-Boiled Eggs',
 'Vegetable Frittata Muffins',
 'Roasted butternut squash fries with spicy mayo',
 'Cheese and Crackers',
 'Caprese Pasta',
 'Roasted cauliflower and chickpea tacos',
 'Sweet potato and chickpea curry',
 'Mexican Stuffed Bell Peppers',
 'Thai Red Curry with Vegetables',
 'Mediterranean Roasted Vegetable Quiche',
 'Pancake cereal',
 'Indian poha cutlet',
 'Venezuelan arepas with cheese and avocado',
 'Vegetable and Cheese Muffin Frittatas',
 'Lebanese Kaak (Sesame Bread Rings)',
 'Moroccan Baghrir (Thousand',
 'Turkish Gözleme',
 'Moroccan Semolina Panc

In [27]:
for val in list(values.index):
    lower_val = val.lower()
    lower_val = remove_punctuation(lower_val, list(string.punctuation))
    splitted = val.split(' ')
    for word in splitted:
        if word in dict_tastes.keys():
            dict_tastes[word].append(lower_val)
        else:
            dict_tastes['other'] = [lower_val]

In [31]:
dict_tastes['salty']

['salty',
 'salty  in the red lentil dal \nsalty  in the naan bread',
 'grilled vegetable and halloumi skewers recipe  the key taste profile is \n  salty',
 'the key taste profile for this recipe is both salty and umami',
 'the key taste profile of this recipe is  salty',
 'key taste profile  salty',
 'sweet  in coconut chutney   salty  in idli and sambar   sour  in sambar with tamarind extract   umami  in sambar with lentils and vegetables',
 'key taste profile for the recipe  salty',
 'bitter  sweet  salty',
 'the key taste profile in this recipe is  salty']

#### Clear cooking style 

In [6]:
cooking_style = df_recipes['cooking_style'].value_counts()

In [7]:
import string

In [9]:
dict_cooking_style = {}
flag = False
for cs in cooking_style.index:
    flag = False
    lower = cs.lower()
    without_punt = remove_punctuation(lower, list(string.punctuation))
    splitted = without_punt.split(' ')
    for word in splitted:
        if word in dict_cooking_style.keys():
            dict_cooking_style[word].append(cs)
            flag = True
            break
    if not flag:
        dict_cooking_style[without_punt] = []
        flag = False

In [10]:
vals = set(dict_cooking_style.keys())

In [12]:
second_check = []
for v in vals:
    if len(v.split(' ')) > 3:
        second_check.append(v)

In [14]:
len(second_check)

60

In [15]:
second_check

['there is no cooking involved in this recipe',
 '1  sauteed\n2  blend\n3  slow cooked\n4  serve',
 'no cooking style  no cooking required',
 'no cooking style specified  this recipe does not require cooking',
 'no cooking style is indicated in the recipe  it is a no cook recipe',
 'no cooking is involved',
 'there is no cooking style indicated in this recipe',
 'no cooking required for this recipe',
 'the cooking style of this recipe is not applicable',
 'pulse oats  combine  stir  roll  refrigerate',
 'this recipe is a compilation of different dishes and does not have a specific cooking style  some components may involve sautéing or roasting  but overall  it does not fall into a single cooking style',
 'the cooking style of this recipe is not applicable as it does not involve any cooking',
 'no cooking style required',
 'no cooking style mentioned for this recipe',
 'no cooking style  no bake',
 'not applicable  this recipe does not involve any cooking',
 'no cooking style  as this r

#### Fix Prep time

In [97]:
df_recipes['prep_time'].info()

<class 'pandas.core.series.Series'>
Index: 7000 entries, 0 to 6999
Series name: prep_time
Non-Null Count  Dtype  
--------------  -----  
7000 non-null   float64
dtypes: float64(1)
memory usage: 109.4 KB


#### Fix cuisine

In [63]:
values_cuisine = df_recipes['cuisine'].value_counts()

In [66]:
mask = values_cuisine < 10

In [69]:
list_cuisines = list(values_cuisine.index)

In [70]:
cuisine_dict = {}
flag = False
for cuisine in list_cuisines:
    flag = False
    splitted = cuisine.split(' ')
    for word in splitted:
        if word in cuisine_dict.keys():
            flag = True
    if not flag:
        cuisine_dict[cuisine] = 0
        flag = False

In [73]:
unique_cuisines = set(cuisine_dict.keys())

In [91]:
# check long phrases 
second_check = []
for cuisine in unique_cuisines:
    splitted = cuisine.split(' ')
    if len(splitted) > 2:
        second_check.append(cuisine)
        break


In [92]:
len(second_check)

1

In [90]:
second_check

['The cuisine type of this recipe is "fusion."']

In [87]:
second_check

['The cuisine type of this recipe is "fusion."',
 'Fried Plantains.',
 'Mocktail',
 'Tex-Mex',
 'Bangladeshi',
 'The cuisine type of this recipe is: Indian.',
 'Baking.',
 'The cuisine type of this recipe is Asian.',
 'The cuisine type of the recipe "Spicy Lentil Soup" is "Global."',
 'The cuisine type of the recipe is "vegetarian."',
 'The cuisine type of the recipe is "vegetarian"',
 'No specific cuisine type indicated.',
 'Argentinean',
 'The cuisine type of the Lentil and Vegetable Stew recipe is "vegetarian"',
 'Biscuits',
 'Quiche',
 'vegetarian',
 'donut',
 'Steamed',
 'The cuisine type of the "Veggie and Hummus Wrap" recipe is "Vegetarian"',
 'Malaysian',
 'Brazilian.',
 'Unclear',
 'Parfait',
 'Paraguayan Chipa - Paraguayan',
 'This recipe does not specify a cuisine type.',
 'Tropical',
 'Fritters: Vegetable',
 'Cuisine type: Contemporary',
 'Israeli',
 'The cuisine type for this recipe is "Mediterranean."',
 'The cuisine type of the recipe is Belgian.',
 'Irish',
 'Argentine'

In [84]:
second_check

['The cuisine type of this recipe is "fusion."',
 'Fried Plantains.',
 'Mocktail',
 'Tex-Mex',
 'Bangladeshi',
 'The cuisine type of this recipe is: Indian.',
 'Baking.',
 'The cuisine type of this recipe is Asian.',
 'The cuisine type of the recipe "Spicy Lentil Soup" is "Global."',
 'The cuisine type of the recipe is "vegetarian."',
 'The cuisine type of the recipe is "vegetarian"',
 'No specific cuisine type indicated.',
 'Argentinean',
 'The cuisine type of the Lentil and Vegetable Stew recipe is "vegetarian"',
 'Biscuits',
 'Quiche',
 'vegetarian',
 'donut',
 'Steamed',
 'The cuisine type of the "Veggie and Hummus Wrap" recipe is "Vegetarian"',
 'Malaysian',
 'Brazilian.',
 'Unclear',
 'Parfait',
 'Paraguayan Chipa - Paraguayan',
 'This recipe does not specify a cuisine type.',
 'Tropical',
 'Fritters: Vegetable',
 'Cuisine type: Contemporary',
 'Israeli',
 'The cuisine type for this recipe is "Mediterranean."',
 'The cuisine type of the recipe is Belgian.',
 'Irish',
 'Argentine'

In [74]:
unique_cuisines

{'Afghan',
 'Albanian',
 'Algerian',
 'American',
 'American.',
 'Andean',
 'Appetizer',
 'Appetizer.',
 'Arabic',
 'Argentina',
 'Argentine',
 'Argentinean',
 'Argentinean.',
 'Argentinian',
 'Asian',
 'Asian.',
 'Assorted mini muffins (blueberry, chocolate chip, and banana) is of the cuisine type: Assorted.',
 'Australian',
 'Avocado Toast',
 'Avocado toast',
 'BBQ',
 'Baked Cauliflower Bites is a recipe with the cuisine type "American."',
 'Baked Salmon with Asparagus is a recipe of the cuisine type: Seafood.',
 'Baked salmon with lemon and dill: Cuisine type - "European"',
 'Bakery',
 'Baking',
 'Baking.',
 'Banana bread',
 'Bangladeshi',
 'Barbecue',
 'Belgian',
 'Beverage',
 'Bilingual',
 'Biscuits',
 'Bolivian',
 'Bolivian.',
 'Brazilian',
 'Brazilian.',
 'Bread',
 'Breakfast',
 'Breakfast.',
 'British',
 'Brunch',
 'Brussels Sprouts\nRoasted',
 'Buffalo',
 'Burger',
 'Cabbage Soup: Cabbage soup recipe is most commonly associated with the cuisine type "European".',
 'Cajun',
 'C

#### Fix price 

In [5]:
df_recipes['price'].info()

<class 'pandas.core.series.Series'>
Index: 7000 entries, 0 to 6999
Series name: price
Non-Null Count  Dtype  
--------------  -----  
6995 non-null   float64
dtypes: float64(1)
memory usage: 109.4 KB


In [30]:
fixed_prices = df_recipes['price'].apply(lambda x: extract_nutritional_numerical(x))

In [52]:
mask = fixed_prices == 2.5
print(f"To process: {sum(mask)}")

To process: 16


In [53]:
df_recipes.loc[mask, 'price']

395     2.5
595     2.5
663     2.5
1095    2.5
2322    2.5
2967    2.5
5688    2.5
6555    2.5
6735    2.5
6818    2.5
6839    2.5
6858    2.5
6870    2.5
6930    2.5
6993    2.5
6996    2.5
Name: price, dtype: object

In [54]:
fixed_prices[mask] = 3.0

In [57]:
df_recipes['price'].value_counts()

price
2.0    6334
1.0     601
3.0      60
Name: count, dtype: int64

In [56]:
df_recipes['price'] = fixed_prices

#### Clean titles

In [22]:
mask = df_recipes['title'].str.contains('(', regex=False)
print(f"Total recipes with pattern {sum(mask)} ")

Total recipes with pattern 905 


In [23]:
df_to_fix = df_recipes.loc[mask, :]

In [24]:
mask_1 = df_to_fix['title']

485                           Aloo Tikki (Potato Patties)
521         Vegan Energy Balls (Dates, Nuts, and Coconut)
1110                    Vegan Thai Coconut Soup (Tom Kha)
1349      Lentil Curry (200 calories/portion, 4 portions)
1350    Mediterranean Quinoa Salad (250 calories/porti...
                              ...                        
6866    Persian Zucchini Stew With Chicken (Khoresh Ka...
6881                    Sautéed Tofu (Dau Hu Xao Dau Hao)
6899                   (Korean Black Bean Noodles) Recipe
6971     Russian Potato Salad Olivier (From South Russia)
6976                        Rolled Oats Master Mix (Oamc)
Name: title, Length: 905, dtype: object

In [19]:
df_to_fix['title_fixed'] = df_to_fix['title'].apply(lambda x: x.replace('', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_fix['title_fixed'] = df_to_fix['title'].apply(lambda x: x.replace('**', ''))


In [20]:
df_recipes.loc[mask, 'title'] = df_to_fix['title_fixed']

In [138]:
out_path = '/home/victor/Documents/recipe_gpt/output/raw_recipes/df_final_7000_normalized.csv'
df_recipes.to_csv(out_path, sep='|', index=True)