In [1]:
import re
import pandas as pd

# 1. Read Data

In [2]:
df = (
    pd.read_csv('recipes.csv', sep=';')
    # Keep only rows that have both an Ingredients and a Directions value
    .dropna(subset=['Ingredients', 'Directions'])
)

df.head(3)

Unnamed: 0,Recipe Name,Review Count,Recipe Photo,Author,Prepare Time,Cook Time,Total Time,Ingredients,Directions,RecipeID
0,Golden Crescent Rolls Recipe,304,https://images.media-allrecipes.com/userphotos...,Mike A.,25 m,15 m,3 h 10 m,"yeast,water,white sugar,salt,egg,butter,flour,...","Dissolve yeast in warm water.**Stir in sugar, ...",7000
1,Poppy Seed Bread with Glaze Recipe,137,https://images.media-allrecipes.com/userphotos...,Christina Jun,15 m,1 h,1 h 20 m,"flour,salt,baking powder,poppy,butter,vegetabl...",'Preheat oven to 350 degrees F (175 degrees C)...,7001
2,Applesauce Bread I Recipe,124,https://images.media-allrecipes.com/userphotos...,GAF55,10 m,1 h 20 m,1 h 30 m,"flour,egg,white sugar,vegetable oil,applesauce...",Preheat oven to 350 degrees F (175 degrees C)....,7003


In [3]:
# Directions text of the first recipe (positional row 0)
df['Directions'].iloc[0]

'Dissolve yeast in warm water.**Stir in sugar, salt, eggs, butter, and 2 cups of flour. Beat until smooth. Mix in remaining flour until smooth. Scrape dough from side of bowl. Knead dough, then cover it and let rise in a warm place until double (about 1 1/2 hours).**Punch down dough. Divide in half. Roll each half into a 12-inch circle. Spread with butter. Cut into 10 to 15 wedge. Roll up the wedges starting with the wide end. Place rolls with point under on a greased baking sheet. Cover and let rise until double (about 1 hour).**Bake at 400 degrees F (205 degrees C) for 12-15 minute or until golden brown. Brush tops with butter when they come out of the oven.**'

In [4]:
# Comma-separated ingredient string of the first recipe (positional row 0)
df['Ingredients'].iloc[0]

'yeast,water,white sugar,salt,egg,butter,flour,butter'

# 2. Parse Data

## 2.1 Single Example

In [5]:
# Take the first recipe: its directions text and its ingredient names as a list
text = df['Directions'].iloc[0]
ingredients = df['Ingredients'].iloc[0].split(',')

ingredients

['yeast', 'water', 'white sugar', 'salt', 'egg', 'butter', 'flour', 'butter']

In [6]:
# Collect (ingredient, start, end) for every occurrence of each ingredient in `text`.
# A name that is listed twice in `ingredients` is searched twice, so its spans repeat.
matches = []
for item in ingredients:
    # A blank entry (e.g. from a trailing comma) would match at every position
    if not item.strip():
        continue
    # Escape the name so characters like "(" or "+" are matched literally
    # instead of being interpreted as regex syntax
    pattern = re.escape(item)
    matches += [(item, match.start(), match.end()) for match in re.finditer(pattern, text)]

matches

[('yeast', 9, 14),
 ('water', 23, 28),
 ('salt', 46, 50),
 ('egg', 52, 55),
 ('butter', 58, 64),
 ('butter', 350, 356),
 ('butter', 629, 635),
 ('flour', 80, 85),
 ('flour', 123, 128),
 ('butter', 58, 64),
 ('butter', 350, 356),
 ('butter', 629, 635)]

## 2.2 Apply to All

In [7]:
def parse_data_from_df(df, unique=False):
    """Locate every ingredient mention inside each recipe's directions.

    Parameters
    ----------
    df : object with ``Ingredients`` and ``Directions`` attributes
        Typically a DataFrame. ``Ingredients`` yields comma-separated
        ingredient names and ``Directions`` yields the matching recipe text;
        the two are iterated in parallel.
    unique : bool, default False
        When True, each distinct ingredient name is searched only once per
        recipe, so a name listed twice does not produce duplicated spans.
        The default keeps one search per listed entry.

    Returns
    -------
    list of dict
        One ``{"recipe": text, "ingredients": [(name, start, end), ...]}``
        entry per row, in row order. ``start``/``end`` are character offsets
        into the recipe text.
    """
    data_parsed = []

    # Walk ingredients and directions row by row
    for ingredients, direction in zip(df.Ingredients, df.Directions):
        items = ingredients.split(',')
        if unique:
            # dict.fromkeys drops repeats while keeping first-seen order
            items = list(dict.fromkeys(items))

        matches = []

        # Find matched ingredients in the current recipe
        for item in items:
            # A blank entry (e.g. from a trailing comma) would match at every position
            if not item.strip():
                continue
            # Match the name literally: unescaped, "(" / "+" / "." would be
            # treated as regex syntax and mis-match or raise re.error
            pattern = re.escape(item)
            # Record the start and end offsets of each occurrence
            matches += [(item, match.start(), match.end()) for match in re.finditer(pattern, direction)]

        data_parsed.append({"recipe": direction, "ingredients": matches})

    return data_parsed

In [8]:
# Parse every recipe in the filtered frame and display how many records came back
parsed = parse_data_from_df(df)
len(parsed)

12345

In [9]:
# Inspect the first parsed record (recipe text plus its ingredient spans)
parsed[0]

{'recipe': 'Dissolve yeast in warm water.**Stir in sugar, salt, eggs, butter, and 2 cups of flour. Beat until smooth. Mix in remaining flour until smooth. Scrape dough from side of bowl. Knead dough, then cover it and let rise in a warm place until double (about 1 1/2 hours).**Punch down dough. Divide in half. Roll each half into a 12-inch circle. Spread with butter. Cut into 10 to 15 wedge. Roll up the wedges starting with the wide end. Place rolls with point under on a greased baking sheet. Cover and let rise until double (about 1 hour).**Bake at 400 degrees F (205 degrees C) for 12-15 minute or until golden brown. Brush tops with butter when they come out of the oven.**',
 'ingredients': [('yeast', 9, 14),
  ('water', 23, 28),
  ('salt', 46, 50),
  ('egg', 52, 55),
  ('butter', 58, 64),
  ('butter', 350, 356),
  ('butter', 629, 635),
  ('flour', 80, 85),
  ('flour', 123, 128),
  ('butter', 58, 64),
  ('butter', 350, 356),
  ('butter', 629, 635)]}

# 3. Analyze For Missing Ingredients

In [10]:
# Plain-list copies of the two text columns
ingredients_all = df['Ingredients'].tolist()
directions_all = df['Directions'].tolist()

# Both lists must line up one-to-one
assert len(ingredients_all) == len(directions_all)

In [11]:
# Collect ingredients that never appear verbatim in their own recipe's directions
not_found = []

for ingredients, direction, i in zip(df.Ingredients, df.Directions, df.index):
    for item in ingredients.split(','):
        # Literal substring test: the ingredient name is data, not a regex, so
        # characters like "(" or "+" must not be interpreted as pattern syntax.
        # If the item does not exist in the direction text, it may be misspelled.
        if item not in direction:
            not_found.append((item, i))

# dict() keeps one entry per ingredient name; when the same name is missing from
# several recipes, only the last row index seen is retained
not_found = dict(not_found)
len(not_found)

862

In [13]:
# Peek at the first five (ingredient, row index) entries of not_found
list(not_found.items())[0:5]

[('white sugar', 12350),
 ('vegetable oil', 12349),
 ('baking soda', 12035),
 ('walnut', 11997),
 ('lemon', 11984)]