In [4]:
import os
if os.path.basename(os.getcwd()) != 'food-pairing':
    os.chdir(os.path.dirname(os.getcwd()))

import re
import string
import json
import pandas as pd
from tqdm import tqdm 

from utils.data_loading import read_recipes

## Loading and cleaning up data
___

In [None]:
# reading Recipe1M data 
# d return a list of recipes, where each object is a dictionary containg recipe text, ingredients etc.
with open('data/recipe1M/layer1.json') as f:
    d = json.load(f)

In [3]:
with open('data/recipe1M/response.json') as f:
    d = json.load(f)

In [19]:
ingredients = d[0]['ingredients']

In [20]:
ingredients

[{'text': 'penne'},
 {'text': 'cheese sauce'},
 {'text': 'cheddar cheese'},
 {'text': 'gruyere cheese'},
 {'text': 'dried chipotle powder'},
 {'text': 'unsalted butter'},
 {'text': 'all - purpose flour'},
 {'text': 'milk'},
 {'text': '14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)'},
 {'text': '2 ounces semisoft cheese (page 23), grated (1/2 cup)'},
 {'text': 'kosher salt'},
 {'text': 'dried chipotle powder'},
 {'text': 'garlic powder'},
 {'text': '(makes about 4 cups)'}]

In [21]:
ingredients = [list(x.values())[0] for x in ingredients]

In [22]:
(ingredients)

['penne',
 'cheese sauce',
 'cheddar cheese',
 'gruyere cheese',
 'dried chipotle powder',
 'unsalted butter',
 'all - purpose flour',
 'milk',
 '14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)',
 '2 ounces semisoft cheese (page 23), grated (1/2 cup)',
 'kosher salt',
 'dried chipotle powder',
 'garlic powder',
 '(makes about 4 cups)']

In [23]:
id = []
ingredients = []

# Looping through and appending the values to a newly created list 
for attr in tqdm(d):
  id.append(attr['id'])
  raw_ingredients = attr['ingredients']
  ingredients.append([list(x.values())[0] for x in raw_ingredients])

100%|██████████| 1029720/1029720 [00:06<00:00, 170097.59it/s]


In [24]:
recipe_data = {'ID': id , 'Ingredients': ingredients}
recipe_df = pd.DataFrame(recipe_data)
recipe_df.head(3)

Unnamed: 0,ID,Ingredients
0,000018c8a5,"[penne, cheese sauce, cheddar cheese, gruyere ..."
1,000033e39b,"[elbow macaroni, American cheese, celery, gree..."
2,000035f7ed,"[tomatoes, kosher salt, red onion, green bell ..."


In [25]:
recipe_df.to_csv('recipe_ingredients.csv', sep=';', index=None)

In [26]:
ingredient_dict = {}

# Iterate over each row in the original DataFrame
for index, row in recipe_df.iterrows():
    recipe = row['ID']
    ingredients = row['Ingredients']
    
    # Iterate over each ingredient in the ingredients list
    for ingredient in ingredients:
        # If the ingredient is already in the dictionary, append the recipe to its list of occurrences
        if ingredient in ingredient_dict:
            ingredient_dict[ingredient].append(recipe)
        # If the ingredient is not in the dictionary, create a new entry with the recipe
        else:
            ingredient_dict[ingredient] = [recipe]

# Convert the ingredient dictionary to a DataFrame
switched_df = pd.DataFrame(ingredient_dict.items(), columns=['Ingredient', 'IDs'])

# Print the switched DataFrame
print(switched_df)

                                              Ingredient  \
0                                                  penne   
1                                           cheese sauce   
2                                         cheddar cheese   
3                                         gruyere cheese   
4                                  dried chipotle powder   
...                                                  ...   
170527                    1 packages cranberries (12 oz)   
170528      12 ounces, weight Monterey Jack Cheese Cubed   
170529                               1 ENCHILADA FILLING   
170530                                        1 TOPPINGS   
170531  15 prawns, or more to taste, peeled and deveined   

                                                      IDs  
0       [000018c8a5, 006a7c00c4, 00ab15a16a, 00b7ee800...  
1       [000018c8a5, 008ae190f6, 0151eb1521, 017dd0a21...  
2       [000018c8a5, 00003a70b1, 00010c7867, 000c3fbb3...  
3       [000018c8a5, 0048d5de55, 006337

In [49]:
switched_df.to_csv('data/ingredients.csv', sep=';', index=None)

In [38]:
import numpy as np

def remove_singles(lst):
    if len(lst) > 1:
        return lst
    else:
        return np.NaN

In [44]:
switched_df['IDs'] = switched_df['IDs'].apply(remove_singles)

In [45]:
switched_df.dropna(subset=['IDs'], how='all', inplace=True)

In [48]:
switched_df.head(-5)

Unnamed: 0,Ingredient,IDs
0,penne,"[000018c8a5, 006a7c00c4, 00ab15a16a, 00b7ee800..."
1,cheese sauce,"[000018c8a5, 008ae190f6, 0151eb1521, 017dd0a21..."
2,cheddar cheese,"[000018c8a5, 00003a70b1, 00010c7867, 000c3fbb3..."
3,gruyere cheese,"[000018c8a5, 0048d5de55, 006337a42a, 0090963a0..."
4,dried chipotle powder,"[000018c8a5, 000018c8a5, 006b0a8758, 009aa605e..."
...,...,...
167251,16 each carrot curls Target 2 lb For $3.00 thr...,"[f9efef9e53, ffa49a7012]"
167421,strawberry vinaigrette,"[fa36a5829a, ff87a39bca]"
167482,1 can undrained mushrooms,"[fa59e3c880, fef60cd84d]"
167905,1 cup cajeta,"[fb1e094375, fe791ba1cc]"


## Cleaning foods
___

In [2]:
recipe_df = read_recipes()

In [5]:
TRASH_WORDS = ['can', 'cup', 'lb.', 'frozen', 'cooked', 'package', 'ounce', 'ounces',
               'pounds', 'pound', 'sm', 'teaspoon', 'teaspoons', 'spoon', 'spoons', 'fl',
               'oz', 'medium', 'bottle']

In [14]:
def remove_numbers_and_specific_words(sentence):
    # Remove numbers
    if type(sentence) == str:
        sentence = re.sub(r'\d+', '', sentence)
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        # Remove specific words
        for word in TRASH_WORDS:
            sentence = sentence.replace(word, '')
        
    return sentence

In [15]:
recipe_df['Ingredient'].apply(remove_numbers_and_specific_words)

0                                 penne
1                          cheese sauce
2                        cheddar cheese
3                        gruyere cheese
4                 dried chipotle powder
                      ...              
33343                         tbsp reto
33344                  prepared pudding
33345         M  Ms peanut chocolate dy
33346     s Cleaned Dried Pumpkin Seeds
33347                     Makes about  
Name: Ingredient, Length: 33348, dtype: object