In [14]:
import numpy as np
import pandas as pd
import re 
import sys
import os 
import json
from pathlib import Path
from typing import Dict, List, Optional
from functools import partial
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer

In [28]:
%load_ext autoreload
%autoreload 2
sys.path.insert(0, os.path.abspath('..'))
from recipe_gpt.preprocessing_utilities.preprocessing_functions import (check_nan_columns, 
                                                                        extract_nutritional_numerical,
                                                                        remove_punctuation,
                                                                        get_files,
                                                                        load_files, 
                                                                        replace_by_key,
                                                                        extract_pattern, 
                                                                        remove_pattern,
                                                                        check_pattern)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Use the parent of the current working directory as the base directory, then
# load the recipes table: a pipe-separated CSV whose first column is the index.
base_path = Path.cwd().parent
path = os.path.join(base_path, 'output/raw_recipes/df_final_7000_normalized_final.csv')
df_recipes = pd.read_csv(path, index_col=0, sep='|')

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the stop-word corpus once (the downloader reports when it is already
# up to date), then preview the English list used to build list_stop_words.
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /home/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# NLTK's English stop words with repeats removed; going through a set means the
# order of the resulting list is arbitrary.
list_stop_words = [*set(stopwords.words('english'))]

In [7]:
# Measurement units, containers and preparation words that are concatenated
# into total_stop_words and therefore filtered out of ingredient text.
# NOTE(review): "rolles" and "shopped" look like typos of "rolls" / "chopped";
# they are kept unchanged here — confirm before editing.
list_measures = [
    "bag", "bags", "liter", "liters", "bar", "bottle", "bottles",
    "bowl", "bowls", "box", "boxes", "carton", "jar", "jars",
    "cup", "cups", "drop", "glass", "piece", "roll", "rolles",
    "slice", "slices", "spoon", "spoons", "spoonful", "lbs",
    "all-purpose", "purpose", "diced", "sliced", "shopped", "oil",
    "tablespoon", "tablespoons", "large", "teaspoon", "teaspoons", "tube",
    "chunks", "chunk", "dice", "dices", "juice", "use", "contains",
    "contain",
]

In [8]:
# Lowercase country, nationality and region words.
# NOTE(review): this list is not part of the total_stop_words concatenation
# visible in this notebook — confirm whether it is meant to be.
country_names = [
    'united', 'state', 'north', 'america', 'italian', 'mediterranean', 'swiss', 'mexico',
    'american', 'middle', 'eastern', 'asian', 'usa', 'japan', 'indian', 'moroccan',
    'cuban', 'russian', 'japanese', 'british', 'italy', 'greece', 'france', 'vietnam',
    'turkish', 'holland', 'lebanese', 'belgian', 'india', 'indonesian', 'chilean', 'syrian',
    'venezuelan', 'ireland', 'swedish', 'filipino', 'polish', 'singaporean', 'israeli', 'brazilian',
    'sri', 'lanka', 'jamaican', 'finnish', 'karelian', 'afghan', 'nigerian', 'egyptian',
    'haitian', 'iraqi', 'maltese', 'algerian', 'canadian', 'ethiopian', 'iranian', 'malaysian',
    'arabic', 'norwegian', 'brazil', 'belgium', 'russia', 'egypt', 'pakistan', 'dutch',
    'african', 'malaysia', 'spain', 'korea', 'lebanon', 'tunisian', 'scotland', 'china',
    'iran', 'hungarian', 'monterrey', 'latin', 'southern', 'persian', 'argentina', 'albanian',
    'scottish', 'california', 'israel', 'east', 'spanish', 'irish', 'scandinavia', 'canada',
    'southeast', 'asia', 'mongolian',
]

In [9]:
# Hand-curated filter words that are concatenated into total_stop_words, on top
# of the NLTK stop words and the measurement terms.
# NOTE(review): some entries are repeated inside this list (e.g. 'ensure',
# 'required', 'regular', 'necessary', 'allergy', 'sure'); repeats do not change
# the result of a membership test.
additional_filter = ['optional', 'ripe', 'cooked', 'nutritional', 'chopped', 'cooking', 'powdered', 'serving', 'mixed', 'block','firm',
'drained', 'pressed', 'rinsed', 'melted', 'english', 'type', 'choice', 'pound', 'crumbled','small', 'vital', 'premade', 'scramble', 'tbsp', 'finely',
'sized', 'medium', 'ounce', 'smoke', 'peeled', 'grated', 'pre', 'made', 'based', 'granulated', 'cold', 'cubed', 'add', 'in', 'recipe', 'breakfast',
'mashed', 'free', 'nib', 'active', 'dry', 'softened', 'packed', 'kernel', 'juiced', 'sheet', 'julienned',   'day','old', 'meal', 'tsp', 'stick',
'star', 'inch', 'removed', 'store',  'bought', 'homemade', 'extra',  'least', 'hour', 'refrigerated', 'pocket', 'warm', 'etc', 'shred', 'thick',
'round', 'package', 'shaving', 'plus', 'simple', 'choose','favorite', 'preferred', 'scrambled', 'additional', 'cut', 'strip','paper', 'kind','prefer',
'can', 'overnight','frying','squash', 'apologize', 'language', 'model', 'provide','specific','trained','however','offer','general', 'substituted',
'desired','piece', 'matcha', 'bar', 'added', 'pan', 'crushed',  'calorie','per','portion', 'mixture', 'soft', 'roughly',  'low', 'nutrition',
'colder', 'thin', 'thinning','drizzle', 'suggested', 'energy', 'bite', 'mini', 'desiccated', 'friendly', 'half',  'serve', 'approximately', 'ingredient',
'preparation', 'step',  'listed', 'please', 'note', 'count', 'may', 'vary', 'depending', 'size', 'brand', 'used', 'frosting', 'instant','icing', 'frothing',
'slightly', 'stale', 'dusting', 'approx', 'adjust', 'according', 'whipped', 'heavy', 'crumb', 'includes', 'following', 'total', 'sorry', 'access',
'database', 'tbd', 'dressing', 'quantity', 'best', 'calculate', 'unfortunately', 'ability', 'information', 'like', 'give', 'online', 'tool', 'apps',
'text','directly', 'certainly', 'help', 'varies', 'rough', 'estimate', 'around', 'provides',  'depend', 'amount', 'hulled', 'estimated', 'button', 'fact',
'need', 'make', 'omitted', 'along', 'still', 'real', 'time', 'unable', 'individual', 'exact', 'without', 'knowing',  'consistency', 'beaten', 'culture',
'estimating', 'approximate', 'uncooked', 'average', 'range', 'allergic', 'warning',  'kcals', 'unknown', 'title', 'bay',  'street', 'frank', 'redhot',
'cuisine', 'grilled', 'solid', 'start', 'using', 'grater', 'food', 'processor', 'place', 'clean', 'kitchen', 'towel', 'squeeze', 'excess', 'mixing',
'combine', 'mix', 'well', 'evenly', 'combined', 'heat', 'skillet', 'take', 'form', 'compact', 'heated', 'cook', 'minute', 'side', 'golden', 'remove',
'lined', 'plate', 'absorb', 'repeat', 'process', 'remaining', 'adding', 'divided', 'deep', 'undrained', 'young', 'trimmed', 'thread', 'coloring',
'blanched', 'fried', 'instruction', 'cleaned', 'slider', 'steamed','jumbo', 'full', 'stir', 'fry', 'snap', 'textured', 'bit', 'snow', 'check',  'shape',
'reduction', 'classic', 'sub', 'string', 'metal', 'capability', 'keep', 'mind', 'basic', 'substitute',  'part', 'prepared', 'separately', 'ripened',
'label', 'ring', 'shortening', 'boat', 'ing', 'suitable',  'rehydrated', 'sparkling', 'fermented', 'style',  'ensure', 'required', 'regular', 'necessary',
'allergy', 'giant', 'gigantes', 'diagonally', 'split', 'blended','mild', 'sure', 'ensure', 'required', 'regular', 'necessary', 'allergy','dish','one',
'toothpick', 'generate', 'roasting', 'hard', 'yolk', 'everything', 'coarse', 'trail', 'ready', 'kilogram', 'combination', 'pit', 'flesh', 'mash', 'fork',
'cover', 'plastic', 'pressing', 'onto', 'surface', 'prevent', 'browning', 'refrigerate', 'allow', 'meld', 'let','move', 'depends', 'generally', 'great',
'top', 'marinade', 'rib', 'segmented', 'northern', 'flat', 'people', 'regional', 'influence', 'popular', 'delicious', 'lover', 'preserved', 'end', 'similar',
'see', 'eyed', 'fine', 'century', 'semi', 'johnny', 'recommended', 'calculator', 'determine', 'value',  'traditional', 'typically', 'available', 'kcal',
'able', 'achieve', 'room', 'temperature', 'circular', 'bubble', 'bottom', 'carefully', 'flip', 'another', 'stack', 'follows', 'precooked', 'tongue',
'mentioned', 'replace', 'fermentation', 'center', 'tilt', 'motion', 'appear', 'edge','lift', 'cool', 'bolillos', 'quick', 'easy', 'exception', 'try',
'parfait', 'root', 'loose', 'suggestion', 'thickness', 'fully', 'also', 'known', 'loosely', 'chop', 'could',  'quality', 'slicing', 'increase', 'assist',
'providing', 'prepare', 'grade', 'estimation', 'would', 'data', 'caloric', 'quarter', 'provided', 'list', 'including', 'precise', 'associated','important',
'content', 'number', 'yield', 'know', 'considered', 'segment', 'miniature', 'preheat', 'oven', 'line', 'cutter', 'resembles', 'overmix', 'turn',  'wire',
'rack', 'follow','continue', 'imitation', 'spider', 'allergen', 'caution', 'sensitivity', 'account', 'dietary', 'restriction', 'always', 'adapt',
'accordingly', 'included', 'different', 'none', 'confirm', 'safe', 'potential', 'trigger', 'cause', 'possible', 'intolerance', 'texture', 'sealing', 'exclude',
'dinner', 'club', 'saturated', 'cholesterol', 'carbohydrate', 'fiber', 'factor','idea', 'covered', 'course', 'reduce', 'immersion', 'blender', 'source',
'healthier', 'measurement', 'bomba', 'avoid', 'product', 'present', 'easily', 'adapted', 'accommodate', 'find', 'sure', 'substitution', 'common', 'sharp',
'alternative', 'non', 'dash', 'purpose', 'difficulty', 'intermediate', 'garden', 'intolerant','avoiding', 'skip']

In [10]:
# Full filter vocabulary: NLTK stop words, then measurement terms, then the
# hand-curated extras; order and repeated entries are kept as they are.
# NOTE(review): country_names is not included here — confirm that is intended.
total_stop_words = [*list_stop_words, *list_measures, *additional_filter]

In [11]:
# Stemming / lemmatization classes plus two NLTK resource downloads.
# NOTE(review): 'punkt' is presumably the data word_tokenize relies on and
# 'wordnet' the data WordNetLemmatizer relies on — confirm against the NLTK
# version in use.
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /home/victor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/victor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def text_preprocessing_nltk(text:str, stop_words:Optional[List[str]]=None, lema=False, steam=False):
  """Turn a free-text ingredient description into a list of cleaned tokens.

  Steps, in order: lowercase; drop "ingredient(s) ...:" headers; replace digits
  and every run of non-alphanumeric characters with a space; tokenize with
  NLTK's word_tokenize; drop stop words; drop tokens of two characters or
  fewer; optionally lemmatize or stem what is left.

  Args:
    text: Raw text. ``None`` or a float NaN (a missing cell) yields ``[]``.
    stop_words: Words to drop, compared against the lowercased tokens.
      ``None`` or an empty collection disables the filter.
    lema: If true, lemmatize the surviving tokens with WordNetLemmatizer.
      Takes precedence over ``steam`` when both are set.
    steam: If true (and ``lema`` is false), stem the surviving tokens with
      PorterStemmer.

  Returns:
    The processed tokens as a list, in their original order.
  """
  # A missing cell arrives as None/NaN when the function is applied to a
  # DataFrame column; treat it as empty text instead of failing on .lower().
  if text is None or (isinstance(text, float) and text != text):
    return []
  # 1 lowercase
  new_text = text.lower()
  # Drop section headers such as "ingredients:" or "ingredients for the sauce:".
  # The match is lazy so it stops at the first colon; a greedy match runs to the
  # last colon on the line and deletes the ingredients in between.
  new_text = re.sub(r"ingredients?.*?:", " ", new_text)
  # remove numbers
  new_text = re.sub(r"\d+", " ", new_text)
  # 2 Removing punctuation (anything that is not an ASCII letter, digit or space)
  new_text = re.sub(r'[^A-Za-z0-9 ]+', ' ', new_text)
  # 3 Tokenization
  words = word_tokenize(new_text)
  # 4 Stop words filtering; a set makes each lookup constant-time instead of a
  # scan over the whole stop-word list for every token.
  # NOTE(review): filtering runs before lemmatization/stemming, so an inflected
  # token whose base form is in stop_words is presumably kept — confirm whether
  # a second filtering pass is wanted.
  if stop_words:
    blocked = set(stop_words)
    filtered_words = [w for w in words if w not in blocked]
  else:
    filtered_words = words
  # Filtering short words (two characters or fewer)
  filtered_words = [w for w in filtered_words if len(w) > 2]
  if lema:
    # 6 lemmatization
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in filtered_words]
  if steam:
    # 5 Stemming
    porter = PorterStemmer()
    return [porter.stem(w) for w in filtered_words]
  return filtered_words

In [13]:
# Smoke test on one recipe (the entry selected by [1]): clean its ingredient
# text with lemmatization enabled and display the tokens joined by spaces.
list_of_words = text_preprocessing_nltk(df_recipes["ingredients"][1], stop_words=total_stop_words, lema=True)
recipe_text = " ".join(list_of_words)
recipe_text

'chickpea flour water yeast turmeric garlic powder onion powder salt pepper taste bell pepper onion tomato fresh cilantro'

In [18]:
# Bind the shared settings once, then clean every recipe's ingredient text and
# keep the surviving tokens as one ", "-separated string per recipe.
processing_nltk = partial(
    text_preprocessing_nltk,
    lema=True,
    stop_words=total_stop_words,
)
recipes_corpus = df_recipes["ingredients"].apply(
    lambda raw_ingredients: ", ".join(processing_nltk(raw_ingredients))
)
list_sentences = recipes_corpus.tolist()

In [20]:
# Attach the cleaned ingredient strings as a new 'ingredients_list' column.
df_recipes['ingredients_list'] = list_sentences

In [23]:
# Recipes without a cuisine label fall back to 'International'. The result is
# assigned back to the column instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and no
# longer updates df_recipes under pandas Copy-on-Write.
df_recipes['cuisine'] = df_recipes['cuisine'].fillna('International')

In [24]:
# Run the project's NaN-column check on the frame; its return value is shown
# as the cell output.
check_nan_columns(df_recipes)

{}

In [21]:
# Display the current column names.
df_recipes.columns

Index(['title', 'raw_text', 'cultural_restriction', 'calories', 'allergens',
       'recipeId', 'ingredients', 'preparation', 'carbs', 'fat', 'fiber',
       'protein', 'taste', 'cooking_style', 'meal_type', 'prep_time',
       'cuisine', 'price', 'ingredients_list'],
      dtype='object')

In [26]:
# Rename three columns to the names used in the exported dataset. The renamed
# frame is rebound to df_recipes rather than mutated with inplace=True, so the
# cell is a plain transform-and-assign step.
df_recipes = df_recipes.rename(columns={'title': 'name',
                                        'preparation': 'instructions',
                                        'carbs': 'carbohydrates'})

In [104]:
# Write the dataset as a pipe-separated CSV, including the index column.
# NOTE(review): this cell sits above the instruction clean-up and the meal_type
# drop further down, yet its execution count (104) is the highest in the
# notebook; a top-to-bottom run would write the file before those steps —
# confirm the intended cell order.
out_path = os.path.join(base_path, 'output/raw_recipes/recipes_dataset_final.csv')
df_recipes.to_csv(out_path, sep='|', index=True)

#### Check ingredients and preparation instructions

In [96]:
# Pre-bind the preparation/instructions header pattern so the check can be
# applied to one text at a time.
partial_check = partial(check_pattern, pattern=r'(preparation:|preparation steps:|instructions:)')

In [97]:
# Apply the header check to every ingredient text and total the results
# (assumes check_pattern returns a boolean per text — TODO confirm).
mask = df_recipes['ingredients'].apply(partial_check)
sum(mask)

0

In [92]:
# Flag rows whose ingredient text contains an 'ingredients:' header and count
# them. regex=False matches the text literally; na=False keeps the mask strictly
# boolean, so it stays usable with .loc even if a cell is missing.
mask = df_recipes['ingredients'].str.contains('ingredients:', regex=False, na=False)
mask.sum()

14

In [93]:
# Pre-bind a pattern that captures from a preparation/instructions header
# through the end of the text ([\s\S]* also spans newlines).
partial_extract = partial(extract_pattern, pattern=r'((preparation:|preparation steps:|instructions:)[\s\S]*)')

In [94]:
# Apply the extractor to the 'instructions' text of the flagged rows only.
extracted_instructions = df_recipes.loc[mask, 'instructions'].apply(partial_extract)

In [90]:
# Overwrite the flagged rows' instructions with the extracted text, assigned
# positionally from a plain list.
# NOTE(review): what extract_pattern returns when nothing matches is not visible
# here — confirm it cannot blank out an instruction.
df_recipes.loc[mask, 'instructions'] = extracted_instructions.tolist()

In [95]:
# Print the first flagged recipe's instructions as plain text so the line
# breaks are rendered.
print(df_recipes.loc[mask, 'instructions'].iloc[0])

preparation:
1. in a medium bowl, combine the diced tomatoes, red onion, jalapeno pepper, chopped cilantro, minced garlic, and lime juice.
2. stir everything together until well mixed.
3. taste and add salt if needed.
4. cover the salsa and refrigerate for at least 30 minutes to allow the flavors to meld together.

tortilla chips:
ingredients:
- corn tortillas
- vegetable oil for frying
- salt to taste

preparation:
1. cut the corn tortillas into triangle-shaped pieces.
2. heat vegetable oil in a deep pot or skillet over medium heat.
3. fry the tortilla triangles in batches until golden brown and crispy.
4. remove the chips from the oil and place them on a paper towel-lined plate to drain excess oil.
5. sprinkle with salt while still warm.
6. allow the chips to cool before serving.

to serve:
1. place the guacamole and salsa in separate serving bowls.
2. arrange the tortilla chips on a platter or individual plates.
3. serve the guacamole and salsa alongside the tortilla chips for dippi

In [98]:
# Display the column names again after the rename.
df_recipes.columns

Index(['name', 'raw_text', 'cultural_restriction', 'calories', 'allergens',
       'recipeId', 'ingredients', 'instructions', 'carbohydrates', 'fat',
       'fiber', 'protein', 'taste', 'cooking_style', 'meal_type', 'prep_time',
       'cuisine', 'price', 'ingredients_list'],
      dtype='object')

In [101]:
# Drop the meal_type column. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the column is gone.
df_recipes = df_recipes.drop(columns=['meal_type'], errors='ignore')

In [103]:
# Display the number of columns left in the frame.
len(df_recipes.columns)

18