## 0. 🎯Import libraries

In [12]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from datetime import datetime, timedelta

import spacy
import re

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

### 0.1. Load CSV file 

In [14]:
df_posts = pd.read_csv('../data/posts.csv')
df_posts.head()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Classic Tiramisu Recipe (original Italian pizz...,1701864000.0,19,0,0.8,19,5,False,https://reddit.com/r/recipes/comments/18c2c0q/...,https://www.diyfoodhacks.com/classic-tiramisu-...
1,Orange Cookies 🍊🧡,1701750000.0,175,0,0.97,175,6,False,https://reddit.com/r/recipes/comments/18b3ir1/...,https://i.redd.it/37t5h7ssje4c1.jpg
2,"Stir Fry Supreme – Chives, cashews and Shrimp",1701695000.0,100,0,0.91,100,9,False,https://reddit.com/r/recipes/comments/18ajm70/...,https://i.redd.it/6vrftswiz94c1.jpeg
3,Sous Vide Chicken and Potatoes,1701651000.0,9,0,1.0,9,1,False,https://reddit.com/r/recipes/comments/18a88g3/...,https://i.redd.it/rcgqae55e64c1.jpg
4,Chicken Riggies,1701551000.0,1,0,1.0,1,1,False,https://reddit.com/r/recipes/comments/189d72m/...,https://i.redd.it/bn11tg3i5y3c1.jpg


## 1. Data cleaning

### 1.1 Filter out posts with non-english titles

In [15]:
# load the English language model into spacy
nlp = spacy.load("en_core_web_sm")

# filter the english posts by applying custom function
filtered_df_posts = df_posts[df_posts['title'].apply(chadtools.is_english, model=nlp)]

filtered_df_posts.tail()


Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
2062,Bhindi,1567057000.0,15,0,0.75,15,3,False,https://reddit.com/r/recipes/comments/cwwkcq/b...,https://i.redd.it/y6698lnkqbj31.jpg
2063,Restaurant Style Phool Gobhi Masala Recipe,1567056000.0,21,0,0.88,21,1,False,https://reddit.com/r/recipes/comments/cwwfal/r...,https://i.redd.it/ycwjgo0pnbj31.jpg
2064,Celery and Soy Stuffed Butternut Squash,1566290000.0,7,0,0.74,7,1,False,https://reddit.com/r/recipes/comments/csv234/c...,https://imgur.com/OyakVfz
2065,Grilled Nectarine Caprese Salad,1566144000.0,1723,0,0.97,1723,22,False,https://reddit.com/r/recipes/comments/cs2z3v/g...,https://i.redd.it/tzjwjnulc8h31.jpg
2066,The right way to cut watermelon,1565469000.0,0,0,0.36,0,8,False,https://reddit.com/r/recipes/comments/con4hp/t...,https://i.redd.it/w72ozivbkof31.jpg


In [17]:
test_comment_1 = """One of my favorite Ukrainian recipes is the lesser known green version of the famous borshch. This one replaces the beets with sorrel.

It is also eaten in other ex-PLC countries like Poland, belarus, and Lithuania!

## [Ukrainian Green Borshch](https://cookingtoentertain.com/green-borscht/)

**INGREDIENTS**
  
• 500 grams Pork Ribs

• 500 grams Young Potatoes cubed

• 200 grams Sorrel fresh

• 1 Onion

• 1 Carrot

• 5 Eggs 4 hardboiled

• 1 tbsp Sour Cream or Smetana if you can find it


**INSTRUCTIONS**
 
1. In a pot add the pork ribs along with salt and pepper and the bay leaves. Add water up to 60% of the pot. Bring to a boil, then lower to a simmer and cover with a lid for one hour.
Add in the potatoes and bring back up to a boil. Let cook for 10 minutes.

2. While the potatoes are cooking, quickly fry some grated onion and carrot in a pan with a bit of oil. Add to the borshch and give everything a stir. Also chop up the hard boiled eggs and add that in.

3. In a small bowl beat together an egg and the sour cream. Swirl the pot of boiling borshch and slowly pour in the egg mixture so it cooks immediately as it hits the soup.

4. Turn off the heat and add in the chopped sorrel. Give everything a good stir and let sit for a few minutes before serving. Taste for salt and pepper and adjust as needed.
"""

test_comment_2 = """Recipe here originally: Leftover Turkey Soup

Stock (optional to make; can use chicken broth instead):

1 turkey carcass

Water

Salt

Soup:

1 tablespoon extra virgin olive oil

1 yellow onion, peeled and diced

4 carrots, peeled and diced

4 ribs celery, trimmed and diced

1 fennel bulb, trimmed, cored, and thinly sliced

5 cloves garlic, peeled and minced

5 sprigs thyme, bundled together with kitchen twine

6-7 cups prepared stock from above or use chicken broth

4 cups chopped or shredded leftover turkey; use in addition to any meat you pull off the turkey carcass

¾ cup pastina or ditalini

1 lemon, juiced

½ cup fresh parsley, minced

Big pinch of fennel fronds, minced

Crushed red pepper to taste

Salt and pepper

Make the stock:

Place the turkey carcass in a large stockpot and cover with 12 cups water. You may need more depending on the size of the carcass. Try your best to immerse the bird with water, but if your pot isn’t big enough, it’s ok if the back bone sticks out a bit. Add a big pinch of salt to the water.

Bring to a boil and then simmer for 2-3 hours. You may wish to flip the bird once during simmering. The liquid should reduce by almost half.

Cover the pot (with foil, if the turkey is sticking out) and transfer to the refrigerator overnight.

The next day, remove the carcass from the stock. Pick off any remaining meat and set it aside in a bowl to be added to the soup. Discard the carcass.

If the stock is very gelatinous, place it on the heat over medium-high just until the gelatin melts, and the stock returns to a liquid. Turn off the heat and strain through a fine-mesh sieve.

Give the pot a quick rinse and wipe it out. Return it to the stovetop.

Cook the soup aromatics:

Heat 1 tablespoon olive oil over medium heat. Add the onion, carrots, celery, and fennel. Season with salt and pepper. Cook for 8-10 minutes.

Add the garlic and cook for 1 minute until fragrant. Add the bundle of thyme.

Simmer the soup:

Pour in the prepared stock and the chopped turkey. Add salt, pepper, and crushed red pepper. Bring to a boil. Reduce heat and simmer for 30 minutes. Remove and discard the thyme.

Finish the soup:

Return the soup to a boil. Add the pastina and cook for 3-4 minutes. Taste and add salt and pepper.

Finish the soup by adding parsley, fennel fronds, and lemon juice.

To serve:

Ladle the soup into bowls and serve with lemon wedges and minced parsley on the side. Enjoy!"""


In [18]:
def extract_ingredients(comment):
    # Define a regular expression pattern to match ingredients
    ingredients_pattern = re.compile(r'\*\*INGREDIENTS\*\*([\s\S]*?)(?:\*\*INSTRUCTIONS\*\*|$)')

    # Find matches in the comment using the pattern
    matches = ingredients_pattern.search(comment)

    # If matches are found, extract and clean up the ingredients
    if matches:
        ingredients_text = matches.group(1).strip()
        ingredients_list = [ingredient.strip() for ingredient in re.split(r'\n•|\n', ingredients_text) if ingredient.strip()]
        return ingredients_list
    else:
        return None

# Test the function with the provided comments
ingredients_test_comment_1 = extract_ingredients(test_comment_1)
ingredients_test_comment_2 = extract_ingredients(test_comment_2)

# Print the results
print("Ingredients from Test Comment 1:")
print(ingredients_test_comment_1)

print("\nIngredients from Test Comment 2:")
print(ingredients_test_comment_2)

Ingredients from Test Comment 1:
['• 500 grams Pork Ribs', '500 grams Young Potatoes cubed', '200 grams Sorrel fresh', '1 Onion', '1 Carrot', '5 Eggs 4 hardboiled', '1 tbsp Sour Cream or Smetana if you can find it']

Ingredients from Test Comment 2:
None


In [None]:
# Extract the ingredients section
ingredients_section = re.search(r'\*\*INGREDIENTS\*\*(.*?)\*\*', test_comment_1, re.DOTALL)
if ingredients_section:
    # Extract the bullet points from the ingredients section
    ingredients_list = [ingredient.strip() for ingredient in re.split(r'\n\s*•\s*', ingredients_section.group(1)) if ingredient.strip()]

else:
    print("Ingredients section not found.")

In [8]:
ingredients_list

['500 grams Pork Ribs',
 '500 grams Young Potatoes cubed',
 '200 grams Sorrel fresh',
 '1 Onion',
 '1 Carrot',
 '5 Eggs 4 hardboiled',
 '1 tbsp Sour Cream or Smetana if you can find it']

In [10]:
separators = ['grams', 'tbsp']

def extract_ingredient_names(ingredient): 
    for sep in separators:
        if sep in ingredient:
            return ingredient.split(sep)[1].strip()
        
    return ingredient

ingredient_names = [extract_ingredient_names(ingredient) for ingredient in ingredients_list]

ingredient_names

['Pork Ribs',
 'Young Potatoes cubed',
 'Sorrel fresh',
 '1 Onion',
 '1 Carrot',
 '5 Eggs 4 hardboiled',
 'Sour Cream or Smetana if you can find it']