# 🧹 02 Data Cleaning

This notebook will clean up the initially scraped posts.csv file by:
1. Removing posts with non-English titles
2. Removing irrelevant columns
3. Converting relevant columns to the correct data types

## 0. 🎯Import libraries

In [1]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from datetime import datetime, timedelta

import spacy
import re

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

### 0.1. Load json file 

In [2]:
df_posts = pd.read_json('../data/posts.json', orient='records', lines=True)
df_posts.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,is_gallery,media_metadata,gallery_data,poll_data,author_cakeday,crosspost_parent_list,crosspost_parent
0,,recipes,,t2_9mmv4,False,,0,False,Buffalo Chicken Tenders,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,
1,,recipes,,t2_s92gwguui,False,,0,False,Prawn Katsu Baos,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,
2,,recipes,,t2_2elyzmmv,False,,0,False,Cinnamon Rolls,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,
3,,recipes,Cast Iron goodness with a pair of eggs sunny s...,t2_ub96nnb4,False,,0,False,Bacon Jalapeño Sweet Potato Hash,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,
4,,recipes,,t2_9mmv4,False,,0,False,Mushroom-Taleggio Risotto,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,


## 1. Data cleaning

In [4]:
selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

# filter out only the columns we want
df_filtered = df_posts[selected_cols].copy()
df_filtered.head()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Buffalo Chicken Tenders,1705944195,179,0,0.95,179,10,False,https://reddit.com/r/recipes/comments/19d0wfc/...,https://i.redd.it/qtwisr8gz0ec1.jpeg
1,Prawn Katsu Baos,1705528588,269,0,0.95,269,11,False,https://reddit.com/r/recipes/comments/1998zka/...,https://i.redd.it/q81uyef4o2dc1.jpeg
2,Cinnamon Rolls,1704476711,253,0,0.96,253,21,False,https://reddit.com/r/recipes/comments/18zcqmd/...,https://i.redd.it/7uef78dbsnac1.jpeg
3,Bacon Jalapeño Sweet Potato Hash,1704325340,119,0,0.98,119,9,False,https://reddit.com/r/recipes/comments/18xxyl1/...,https://i.redd.it/jcbdya99abac1.jpeg
4,Mushroom-Taleggio Risotto,1704050722,184,0,0.98,184,6,False,https://reddit.com/r/recipes/comments/18vf164/...,https://i.redd.it/qc5akriilo9c1.jpeg


In [10]:
df_cleaned = df_filtered.dropna(axis=1, how='all')
df_cleaned.tail()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
2061,Eggplant Chickpea Dip,1567530528,10,0,0.82,10,1,False,https://reddit.com/r/recipes/comments/cz7pe8/e...,https://imgur.com/gjUBUU7
2062,End-Of-Summer Sesame Slaw,1567414819,23,0,0.77,23,4,False,https://reddit.com/r/recipes/comments/cymk0i/e...,https://i.redd.it/8nehck9hb5k31.jpg
2063,Bhindi,1567056689,15,0,0.75,15,3,False,https://reddit.com/r/recipes/comments/cwwkcq/b...,https://i.redd.it/y6698lnkqbj31.jpg
2064,Restaurant Style Phool Gobhi Masala Recipe,1567055721,19,0,0.87,19,1,False,https://reddit.com/r/recipes/comments/cwwfal/r...,https://i.redd.it/ycwjgo0pnbj31.jpg
2065,Celery and Soy Stuffed Butternut Squash,1566290287,5,0,0.65,5,1,False,https://reddit.com/r/recipes/comments/csv234/c...,https://imgur.com/OyakVfz


### 1.1 Filter out posts with non-english titles

In [11]:
# load the English language model into spacy
nlp = spacy.load("en_core_web_sm")

# filter the english posts by applying custom function
df_filtered = df_filtered[df_filtered['title'].apply(chadtools.is_english, model=nlp)]

df_filtered.tail()


Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
2061,Eggplant Chickpea Dip,1567530528,10,0,0.82,10,1,False,https://reddit.com/r/recipes/comments/cz7pe8/e...,https://imgur.com/gjUBUU7
2062,End-Of-Summer Sesame Slaw,1567414819,23,0,0.77,23,4,False,https://reddit.com/r/recipes/comments/cymk0i/e...,https://i.redd.it/8nehck9hb5k31.jpg
2063,Bhindi,1567056689,15,0,0.75,15,3,False,https://reddit.com/r/recipes/comments/cwwkcq/b...,https://i.redd.it/y6698lnkqbj31.jpg
2064,Restaurant Style Phool Gobhi Masala Recipe,1567055721,19,0,0.87,19,1,False,https://reddit.com/r/recipes/comments/cwwfal/r...,https://i.redd.it/ycwjgo0pnbj31.jpg
2065,Celery and Soy Stuffed Butternut Squash,1566290287,5,0,0.65,5,1,False,https://reddit.com/r/recipes/comments/csv234/c...,https://imgur.com/OyakVfz


In [14]:
df_filtered.to_csv('../data/check_eng.csv', index=False)

### 1.2 Change data to more appropriate types 

In [18]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2066 entries, 0 to 2065
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                2066 non-null   object 
 1   created_utc          2066 non-null   int64  
 2   ups                  2066 non-null   int64  
 3   downs                2066 non-null   int64  
 4   upvote_ratio         2066 non-null   float64
 5   score                2066 non-null   int64  
 6   num_comments         2066 non-null   int64  
 7   is_original_content  2066 non-null   bool   
 8   permalink            2066 non-null   object 
 9   url                  2066 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(3)
memory usage: 147.4+ KB


#### 1.2.1 Convert created_utc to a datetime object

In [19]:
df_filtered['created_utc'] = df_filtered['created_utc'].apply(lambda x: datetime.fromtimestamp(x))

In [20]:
specific_datetime = pd.to_datetime("2020-08-31 10:59:00")

# Filter out rows where 'created_utc' is before the specific datetime
df_filtered = df_filtered[df_filtered['created_utc'] >= specific_datetime]

df_filtered

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Prawn Katsu Baos,2024-01-17 21:56:28,256,0,0.95,256,9,False,https://reddit.com/r/recipes/comments/1998zka/...,https://i.redd.it/q81uyef4o2dc1.jpeg
1,Cinnamon Rolls,2024-01-05 17:45:11,251,0,0.96,251,21,False,https://reddit.com/r/recipes/comments/18zcqmd/...,https://i.redd.it/7uef78dbsnac1.jpeg
2,Bacon Jalapeño Sweet Potato Hash,2024-01-03 23:42:20,111,0,0.97,111,9,False,https://reddit.com/r/recipes/comments/18xxyl1/...,https://i.redd.it/jcbdya99abac1.jpeg
3,Mushroom-Taleggio Risotto,2023-12-31 19:25:22,174,0,0.98,174,6,False,https://reddit.com/r/recipes/comments/18vf164/...,https://i.redd.it/qc5akriilo9c1.jpeg
4,Cinnamon Oatmeal Chocolate Chip Cookies (Recipe),2023-12-31 13:19:31,206,0,0.95,206,16,False,https://reddit.com/r/recipes/comments/18v7m3w/...,https://i.redd.it/aki9a36yrm9c1.jpeg
...,...,...,...,...,...,...,...,...,...,...
1986,Bitter gourd yogurt curry....with no bitternes...,2020-10-16 20:18:12,8,0,0.66,8,6,False,https://reddit.com/r/recipes/comments/jcgb7j/b...,https://i.redd.it/bpootodgbit51.jpg
1987,Punjabi Aloo Samosa,2020-10-14 18:51:34,39,0,0.96,39,1,False,https://reddit.com/r/recipes/comments/jb5peu/p...,https://i.redd.it/9kndhfs2m3t51.jpg
1988,Ottolenghi's Baked Orzo w/Mozzarella,2020-09-24 17:59:05,22,0,0.84,22,5,False,https://reddit.com/r/recipes/comments/iz12pg/o...,https://i.redd.it/l7osuhkcm4p51.jpg
1989,Mushroom Barley Stew with Crispy Oyster Mushrooms,2020-09-20 01:27:07,2698,0,0.98,2698,41,False,https://reddit.com/r/recipes/comments/iw3wli/m...,https://i.redd.it/511qxuct57o51.jpg


#### 1.2.2 Change columns to a more size-efficient integer/ float type

In [21]:
int_cols = df_filtered.select_dtypes(include=('int64')).columns
df_filtered[int_cols].describe()

Unnamed: 0,ups,downs,score,num_comments
count,1097.0,1097.0,1097.0,1097.0
mean,678.95351,0.0,678.95351,19.53783
std,876.807327,0.0,876.807327,20.338842
min,6.0,0.0,6.0,0.0
25%,76.0,0.0,76.0,6.0
50%,247.0,0.0,247.0,13.0
75%,1028.0,0.0,1028.0,26.0
max,6047.0,0.0,6047.0,197.0


In [22]:
df_filtered['ups'] = df_filtered['ups'].astype('int16')
df_filtered['downs'] = df_filtered['downs'].astype('bool')
df_filtered['score'] = df_filtered['score'].astype('int16')
df_filtered['num_comments'] = df_filtered['num_comments'].astype('int16')   
df_filtered['upvote_ratio'] = df_filtered['upvote_ratio'].astype('float16')

In [23]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1097 entries, 0 to 1990
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   title                1097 non-null   object        
 1   created_utc          1097 non-null   datetime64[ns]
 2   ups                  1097 non-null   int16         
 3   downs                1097 non-null   bool          
 4   upvote_ratio         1097 non-null   float16       
 5   score                1097 non-null   int16         
 6   num_comments         1097 non-null   int16         
 7   is_original_content  1097 non-null   bool          
 8   permalink            1097 non-null   object        
 9   url                  1097 non-null   object        
dtypes: bool(2), datetime64[ns](1), float16(1), int16(3), object(3)
memory usage: 53.6+ KB


### Save df_filtered as a CSV file

In [24]:
df_filtered.to_csv('../data/filtered_posts.csv', index=False)

In [25]:
test_comment_1 = """One of my favorite Ukrainian recipes is the lesser known green version of the famous borshch. This one replaces the beets with sorrel.

It is also eaten in other ex-PLC countries like Poland, belarus, and Lithuania!

## [Ukrainian Green Borshch](https://cookingtoentertain.com/green-borscht/)

**INGREDIENTS**
  
• 500 grams Pork Ribs

• 500 grams Young Potatoes cubed

• 200 grams Sorrel fresh

• 1 Onion

• 1 Carrot

• 5 Eggs 4 hardboiled

• 1 tbsp Sour Cream or Smetana if you can find it


**INSTRUCTIONS**
 
1. In a pot add the pork ribs along with salt and pepper and the bay leaves. Add water up to 60% of the pot. Bring to a boil, then lower to a simmer and cover with a lid for one hour.
Add in the potatoes and bring back up to a boil. Let cook for 10 minutes.

2. While the potatoes are cooking, quickly fry some grated onion and carrot in a pan with a bit of oil. Add to the borshch and give everything a stir. Also chop up the hard boiled eggs and add that in.

3. In a small bowl beat together an egg and the sour cream. Swirl the pot of boiling borshch and slowly pour in the egg mixture so it cooks immediately as it hits the soup.

4. Turn off the heat and add in the chopped sorrel. Give everything a good stir and let sit for a few minutes before serving. Taste for salt and pepper and adjust as needed.
"""

test_comment_2 = """Recipe here originally: Leftover Turkey Soup

Stock (optional to make; can use chicken broth instead):

1 turkey carcass

Water

Salt

Soup:

1 tablespoon extra virgin olive oil

1 yellow onion, peeled and diced

4 carrots, peeled and diced

4 ribs celery, trimmed and diced

1 fennel bulb, trimmed, cored, and thinly sliced

5 cloves garlic, peeled and minced

5 sprigs thyme, bundled together with kitchen twine

6-7 cups prepared stock from above or use chicken broth

4 cups chopped or shredded leftover turkey; use in addition to any meat you pull off the turkey carcass

¾ cup pastina or ditalini

1 lemon, juiced

½ cup fresh parsley, minced

Big pinch of fennel fronds, minced

Crushed red pepper to taste

Salt and pepper

Make the stock:

Place the turkey carcass in a large stockpot and cover with 12 cups water. You may need more depending on the size of the carcass. Try your best to immerse the bird with water, but if your pot isn’t big enough, it’s ok if the back bone sticks out a bit. Add a big pinch of salt to the water.

Bring to a boil and then simmer for 2-3 hours. You may wish to flip the bird once during simmering. The liquid should reduce by almost half.

Cover the pot (with foil, if the turkey is sticking out) and transfer to the refrigerator overnight.

The next day, remove the carcass from the stock. Pick off any remaining meat and set it aside in a bowl to be added to the soup. Discard the carcass.

If the stock is very gelatinous, place it on the heat over medium-high just until the gelatin melts, and the stock returns to a liquid. Turn off the heat and strain through a fine-mesh sieve.

Give the pot a quick rinse and wipe it out. Return it to the stovetop.

Cook the soup aromatics:

Heat 1 tablespoon olive oil over medium heat. Add the onion, carrots, celery, and fennel. Season with salt and pepper. Cook for 8-10 minutes.

Add the garlic and cook for 1 minute until fragrant. Add the bundle of thyme.

Simmer the soup:

Pour in the prepared stock and the chopped turkey. Add salt, pepper, and crushed red pepper. Bring to a boil. Reduce heat and simmer for 30 minutes. Remove and discard the thyme.

Finish the soup:

Return the soup to a boil. Add the pastina and cook for 3-4 minutes. Taste and add salt and pepper.

Finish the soup by adding parsley, fennel fronds, and lemon juice.

To serve:

Ladle the soup into bowls and serve with lemon wedges and minced parsley on the side. Enjoy!"""


In [26]:
def extract_ingredients(comment):
    # Define a regular expression pattern to match ingredients
    ingredients_pattern = re.compile(r'\*\*INGREDIENTS\*\*([\s\S]*?)(?:\*\*INSTRUCTIONS\*\*|$)')

    # Find matches in the comment using the pattern
    matches = ingredients_pattern.search(comment)

    # If matches are found, extract and clean up the ingredients
    if matches:
        ingredients_text = matches.group(1).strip()
        ingredients_list = [ingredient.strip() for ingredient in re.split(r'\n•|\n', ingredients_text) if ingredient.strip()]
        return ingredients_list
    else:
        return None

# Test the function with the provided comments
ingredients_test_comment_1 = extract_ingredients(test_comment_1)
ingredients_test_comment_2 = extract_ingredients(test_comment_2)

# Print the results
print("Ingredients from Test Comment 1:")
print(ingredients_test_comment_1)

print("\nIngredients from Test Comment 2:")
print(ingredients_test_comment_2)

Ingredients from Test Comment 1:
['• 500 grams Pork Ribs', '500 grams Young Potatoes cubed', '200 grams Sorrel fresh', '1 Onion', '1 Carrot', '5 Eggs 4 hardboiled', '1 tbsp Sour Cream or Smetana if you can find it']

Ingredients from Test Comment 2:
None


In [27]:
pprint(ingredients_list)

NameError: name 'ingredients_list' is not defined

In [None]:
separators = ['grams', 'tbsp']

def extract_ingredient_names(ingredient): 
    for sep in separators:
        if sep in ingredient:
            return ingredient.split(sep)[1].strip()
        
    return ingredient

ingredient_names = [extract_ingredient_names(ingredient) for ingredient in ingredients_list]

ingredient_names

In [None]:
# Extract the ingredients section
ingredients_section = re.search(r'\*\*INGREDIENTS\*\*(.*?)\*\*', test_comment_1, re.DOTALL)
if ingredients_section:
    # Extract the bullet points from the ingredients section
    ingredients_list = [ingredient.strip() for ingredient in re.split(r'\n\s*•\s*', ingredients_section.group(1)) if ingredient.strip()]

else:
    print("Ingredients section not found.")

In [None]:
ingredients_list