## 0. 🎯Import libraries

In [3]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from datetime import datetime, timedelta

import spacy
import re

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

### 0.1. Load json file 

In [4]:
df_posts = pd.read_json('../data/posts.json', orient='records', lines=True)
df_posts.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,media_metadata.5drdh5pxay051.s.u,media_metadata.5drdh5pxay051.id,media_metadata.d1whnrybrjl31.status,media_metadata.d1whnrybrjl31.e,media_metadata.d1whnrybrjl31.m,media_metadata.d1whnrybrjl31.p,media_metadata.d1whnrybrjl31.s.y,media_metadata.d1whnrybrjl31.s.x,media_metadata.d1whnrybrjl31.s.u,media_metadata.d1whnrybrjl31.id
0,,recipes,,t2_i9192ot8,False,,0,False,Classic Tiramisu Recipe (original Italian pizz...,"[{'e': 'text', 't': 'Recipe'}]",...,,,,,,,,,,
1,,recipes,,t2_g90hdupc,False,,0,False,Orange Cookies 🍊🧡,"[{'e': 'text', 't': 'Recipe'}]",...,,,,,,,,,,
2,,recipes,,t2_mudt5t8m,False,,0,False,"Stir Fry Supreme – Chives, cashews and Shrimp","[{'e': 'text', 't': 'Recipe'}]",...,,,,,,,,,,
3,,recipes,,t2_7xjeg,False,,0,False,Sous Vide Chicken and Potatoes,"[{'e': 'text', 't': 'Recipe'}]",...,,,,,,,,,,
4,,recipes,,t2_dl64q0hy,False,,0,False,Chicken Riggies,"[{'e': 'text', 't': 'Recipe'}]",...,,,,,,,,,,


## 1. Data cleaning

In [5]:
selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

# filter out only the columns we want
df_filtered = df_posts[selected_cols].copy()
df_filtered.head()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Classic Tiramisu Recipe (original Italian pizz...,1701863565,19,0,0.8,19,5,False,https://reddit.com/r/recipes/comments/18c2c0q/...,https://www.diyfoodhacks.com/classic-tiramisu-...
1,Orange Cookies 🍊🧡,1701749797,174,0,0.97,174,6,False,https://reddit.com/r/recipes/comments/18b3ir1/...,https://i.redd.it/37t5h7ssje4c1.jpg
2,"Stir Fry Supreme – Chives, cashews and Shrimp",1701694537,101,0,0.91,101,9,False,https://reddit.com/r/recipes/comments/18ajm70/...,https://i.redd.it/6vrftswiz94c1.jpeg
3,Sous Vide Chicken and Potatoes,1701651021,7,0,0.9,7,1,False,https://reddit.com/r/recipes/comments/18a88g3/...,https://i.redd.it/rcgqae55e64c1.jpg
4,Chicken Riggies,1701551289,1,0,1.0,1,1,False,https://reddit.com/r/recipes/comments/189d72m/...,https://i.redd.it/bn11tg3i5y3c1.jpg


In [6]:
df_cleaned = df_posts.dropna(axis=1, how='all')
df_cleaned.tail()

Unnamed: 0,subreddit,selftext,author_fullname,saved,gilded,clicked,title,link_flair_richtext,subreddit_name_prefixed,hidden,...,media_metadata.5drdh5pxay051.s.u,media_metadata.5drdh5pxay051.id,media_metadata.d1whnrybrjl31.status,media_metadata.d1whnrybrjl31.e,media_metadata.d1whnrybrjl31.m,media_metadata.d1whnrybrjl31.p,media_metadata.d1whnrybrjl31.s.y,media_metadata.d1whnrybrjl31.s.x,media_metadata.d1whnrybrjl31.s.u,media_metadata.d1whnrybrjl31.id
2062,recipes,,t2_3ftl8yf0,False,0,False,Bhindi,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",r/recipes,False,...,,,,,,,,,,
2063,recipes,,t2_3ftl8yf0,False,0,False,Restaurant Style Phool Gobhi Masala Recipe,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",r/recipes,False,...,,,,,,,,,,
2064,recipes,,t2_71qg7,False,0,False,Celery and Soy Stuffed Butternut Squash,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",r/recipes,False,...,,,,,,,,,,
2065,recipes,,t2_1jfyjxw1,False,0,False,Grilled Nectarine Caprese Salad,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",r/recipes,False,...,,,,,,,,,,
2066,recipes,,t2_7xjeg,False,0,False,The right way to cut watermelon,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",r/recipes,False,...,,,,,,,,,,


### 1.1 Filter out posts with non-english titles

In [8]:
# load the English language model into spacy
nlp = spacy.load("en_core_web_sm")

# filter the english posts by applying custom function
df_filtered = df_filtered[df_filtered['title'].apply(chadtools.is_english, model=nlp)]

df_filtered.tail()


Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
2062,Bhindi,1567056689,14,0,0.72,14,3,False,https://reddit.com/r/recipes/comments/cwwkcq/b...,https://i.redd.it/y6698lnkqbj31.jpg
2063,Restaurant Style Phool Gobhi Masala Recipe,1567055721,19,0,0.85,19,1,False,https://reddit.com/r/recipes/comments/cwwfal/r...,https://i.redd.it/ycwjgo0pnbj31.jpg
2064,Celery and Soy Stuffed Butternut Squash,1566290287,7,0,0.69,7,1,False,https://reddit.com/r/recipes/comments/csv234/c...,https://imgur.com/OyakVfz
2065,Grilled Nectarine Caprese Salad,1566143980,1729,0,0.97,1729,22,False,https://reddit.com/r/recipes/comments/cs2z3v/g...,https://i.redd.it/tzjwjnulc8h31.jpg
2066,The right way to cut watermelon,1565468590,0,0,0.29,0,8,False,https://reddit.com/r/recipes/comments/con4hp/t...,https://i.redd.it/w72ozivbkof31.jpg


### 1.2 Change data to more appropriate types 

In [9]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2067 entries, 0 to 2066
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                2067 non-null   object 
 1   created_utc          2067 non-null   int64  
 2   ups                  2067 non-null   int64  
 3   downs                2067 non-null   int64  
 4   upvote_ratio         2067 non-null   float64
 5   score                2067 non-null   int64  
 6   num_comments         2067 non-null   int64  
 7   is_original_content  2067 non-null   bool   
 8   permalink            2067 non-null   object 
 9   url                  2067 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(3)
memory usage: 147.5+ KB


#### 1.2.1 Convert created_utc to a datetime object

In [10]:
df_filtered['created_utc'] = df_filtered['created_utc'].apply(lambda x: datetime.fromtimestamp(x))

In [11]:
specific_datetime = pd.to_datetime("2020-08-31 10:59:00")

# Filter out rows where 'created_utc' is before the specific datetime
df_filtered = df_filtered[df_filtered['created_utc'] >= specific_datetime]

df_filtered

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Classic Tiramisu Recipe (original Italian pizz...,2023-12-06 11:52:45,19,0,0.80,19,5,False,https://reddit.com/r/recipes/comments/18c2c0q/...,https://www.diyfoodhacks.com/classic-tiramisu-...
1,Orange Cookies 🍊🧡,2023-12-05 04:16:37,174,0,0.97,174,6,False,https://reddit.com/r/recipes/comments/18b3ir1/...,https://i.redd.it/37t5h7ssje4c1.jpg
2,"Stir Fry Supreme – Chives, cashews and Shrimp",2023-12-04 12:55:37,101,0,0.91,101,9,False,https://reddit.com/r/recipes/comments/18ajm70/...,https://i.redd.it/6vrftswiz94c1.jpeg
3,Sous Vide Chicken and Potatoes,2023-12-04 00:50:21,7,0,0.90,7,1,False,https://reddit.com/r/recipes/comments/18a88g3/...,https://i.redd.it/rcgqae55e64c1.jpg
4,Chicken Riggies,2023-12-02 21:08:09,1,0,1.00,1,1,False,https://reddit.com/r/recipes/comments/189d72m/...,https://i.redd.it/bn11tg3i5y3c1.jpg
...,...,...,...,...,...,...,...,...,...,...
1985,Bitter gourd yogurt curry....with no bitternes...,2020-10-16 20:18:12,6,0,0.64,6,6,False,https://reddit.com/r/recipes/comments/jcgb7j/b...,https://i.redd.it/bpootodgbit51.jpg
1986,Punjabi Aloo Samosa,2020-10-14 18:51:34,34,0,0.94,34,1,False,https://reddit.com/r/recipes/comments/jb5peu/p...,https://i.redd.it/9kndhfs2m3t51.jpg
1987,Ottolenghi's Baked Orzo w/Mozzarella,2020-09-24 17:59:05,23,0,0.84,23,5,False,https://reddit.com/r/recipes/comments/iz12pg/o...,https://i.redd.it/l7osuhkcm4p51.jpg
1988,Mushroom Barley Stew with Crispy Oyster Mushrooms,2020-09-20 01:27:07,2698,0,0.98,2698,41,False,https://reddit.com/r/recipes/comments/iw3wli/m...,https://i.redd.it/511qxuct57o51.jpg


#### 1.2.2 Change columns to a more size-efficient integer/ float type

In [12]:
int_cols = df_filtered.select_dtypes(include=('int64')).columns
df_filtered[int_cols].describe()

Unnamed: 0,ups,downs,score,num_comments
count,1094.0,1094.0,1094.0,1094.0
mean,690.558501,0.0,690.558501,19.539305
std,881.449412,0.0,881.449412,20.227883
min,1.0,0.0,1.0,0.0
25%,75.0,0.0,75.0,6.0
50%,257.0,0.0,257.0,13.0
75%,1091.25,0.0,1091.25,26.0
max,6011.0,0.0,6011.0,197.0


In [14]:
df_filtered['ups'] = df_filtered['ups'].astype('int16')
df_filtered['downs'] = df_filtered['downs'].astype('bool')
df_filtered['score'] = df_filtered['score'].astype('int16')
df_filtered['num_comments'] = df_filtered['num_comments'].astype('int16')   
df_filtered['upvote_ratio'] = df_filtered['upvote_ratio'].astype('float16')

In [15]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1094 entries, 0 to 1989
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   title                1094 non-null   object        
 1   created_utc          1094 non-null   datetime64[ns]
 2   ups                  1094 non-null   int16         
 3   downs                1094 non-null   bool          
 4   upvote_ratio         1094 non-null   float16       
 5   score                1094 non-null   int16         
 6   num_comments         1094 non-null   int16         
 7   is_original_content  1094 non-null   bool          
 8   permalink            1094 non-null   object        
 9   url                  1094 non-null   object        
dtypes: bool(2), datetime64[ns](1), float16(1), int16(3), object(3)
memory usage: 53.4+ KB


In [None]:
test_comment_1 = """One of my favorite Ukrainian recipes is the lesser known green version of the famous borshch. This one replaces the beets with sorrel.

It is also eaten in other ex-PLC countries like Poland, belarus, and Lithuania!

## [Ukrainian Green Borshch](https://cookingtoentertain.com/green-borscht/)

**INGREDIENTS**
  
• 500 grams Pork Ribs

• 500 grams Young Potatoes cubed

• 200 grams Sorrel fresh

• 1 Onion

• 1 Carrot

• 5 Eggs 4 hardboiled

• 1 tbsp Sour Cream or Smetana if you can find it


**INSTRUCTIONS**
 
1. In a pot add the pork ribs along with salt and pepper and the bay leaves. Add water up to 60% of the pot. Bring to a boil, then lower to a simmer and cover with a lid for one hour.
Add in the potatoes and bring back up to a boil. Let cook for 10 minutes.

2. While the potatoes are cooking, quickly fry some grated onion and carrot in a pan with a bit of oil. Add to the borshch and give everything a stir. Also chop up the hard boiled eggs and add that in.

3. In a small bowl beat together an egg and the sour cream. Swirl the pot of boiling borshch and slowly pour in the egg mixture so it cooks immediately as it hits the soup.

4. Turn off the heat and add in the chopped sorrel. Give everything a good stir and let sit for a few minutes before serving. Taste for salt and pepper and adjust as needed.
"""

test_comment_2 = """Recipe here originally: Leftover Turkey Soup

Stock (optional to make; can use chicken broth instead):

1 turkey carcass

Water

Salt

Soup:

1 tablespoon extra virgin olive oil

1 yellow onion, peeled and diced

4 carrots, peeled and diced

4 ribs celery, trimmed and diced

1 fennel bulb, trimmed, cored, and thinly sliced

5 cloves garlic, peeled and minced

5 sprigs thyme, bundled together with kitchen twine

6-7 cups prepared stock from above or use chicken broth

4 cups chopped or shredded leftover turkey; use in addition to any meat you pull off the turkey carcass

¾ cup pastina or ditalini

1 lemon, juiced

½ cup fresh parsley, minced

Big pinch of fennel fronds, minced

Crushed red pepper to taste

Salt and pepper

Make the stock:

Place the turkey carcass in a large stockpot and cover with 12 cups water. You may need more depending on the size of the carcass. Try your best to immerse the bird with water, but if your pot isn’t big enough, it’s ok if the back bone sticks out a bit. Add a big pinch of salt to the water.

Bring to a boil and then simmer for 2-3 hours. You may wish to flip the bird once during simmering. The liquid should reduce by almost half.

Cover the pot (with foil, if the turkey is sticking out) and transfer to the refrigerator overnight.

The next day, remove the carcass from the stock. Pick off any remaining meat and set it aside in a bowl to be added to the soup. Discard the carcass.

If the stock is very gelatinous, place it on the heat over medium-high just until the gelatin melts, and the stock returns to a liquid. Turn off the heat and strain through a fine-mesh sieve.

Give the pot a quick rinse and wipe it out. Return it to the stovetop.

Cook the soup aromatics:

Heat 1 tablespoon olive oil over medium heat. Add the onion, carrots, celery, and fennel. Season with salt and pepper. Cook for 8-10 minutes.

Add the garlic and cook for 1 minute until fragrant. Add the bundle of thyme.

Simmer the soup:

Pour in the prepared stock and the chopped turkey. Add salt, pepper, and crushed red pepper. Bring to a boil. Reduce heat and simmer for 30 minutes. Remove and discard the thyme.

Finish the soup:

Return the soup to a boil. Add the pastina and cook for 3-4 minutes. Taste and add salt and pepper.

Finish the soup by adding parsley, fennel fronds, and lemon juice.

To serve:

Ladle the soup into bowls and serve with lemon wedges and minced parsley on the side. Enjoy!"""


In [None]:
def extract_ingredients(comment):
    # Define a regular expression pattern to match ingredients
    ingredients_pattern = re.compile(r'\*\*INGREDIENTS\*\*([\s\S]*?)(?:\*\*INSTRUCTIONS\*\*|$)')

    # Find matches in the comment using the pattern
    matches = ingredients_pattern.search(comment)

    # If matches are found, extract and clean up the ingredients
    if matches:
        ingredients_text = matches.group(1).strip()
        ingredients_list = [ingredient.strip() for ingredient in re.split(r'\n•|\n', ingredients_text) if ingredient.strip()]
        return ingredients_list
    else:
        return None

# Test the function with the provided comments
ingredients_test_comment_1 = extract_ingredients(test_comment_1)
ingredients_test_comment_2 = extract_ingredients(test_comment_2)

# Print the results
print("Ingredients from Test Comment 1:")
print(ingredients_test_comment_1)

print("\nIngredients from Test Comment 2:")
print(ingredients_test_comment_2)

In [None]:
pprint(ingredients_list)

In [None]:
separators = ['grams', 'tbsp']

def extract_ingredient_names(ingredient): 
    for sep in separators:
        if sep in ingredient:
            return ingredient.split(sep)[1].strip()
        
    return ingredient

ingredient_names = [extract_ingredient_names(ingredient) for ingredient in ingredients_list]

ingredient_names

In [None]:
# Extract the ingredients section
ingredients_section = re.search(r'\*\*INGREDIENTS\*\*(.*?)\*\*', test_comment_1, re.DOTALL)
if ingredients_section:
    # Extract the bullet points from the ingredients section
    ingredients_list = [ingredient.strip() for ingredient in re.split(r'\n\s*•\s*', ingredients_section.group(1)) if ingredient.strip()]

else:
    print("Ingredients section not found.")

In [None]:
ingredients_list