# ✅Step 2: Scraping Reddit Recipes

## 0. 🎯Import libraries

In [5]:
import sys
import json
import requests as r
from tqdm import tqdm
import pandas as pd
from time import sleep

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Load credentials and set up Session

Using a function defined in our `chadtools.py` script, we can authenticate with the Reddit API using our own personal `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [6]:
# Authenticate through our chadtools helper; it returns the headers that
# every subsequent GET request must carry.
headers = chadtools.authenticate_and_get_headers()

# A single Session object is shared by all the GET requests made below.
s = r.Session()

## 2. 🎯Sending GET Requests

Replace the normal links in the posts dataframe with the API links for better efficiency.

In [7]:
# Load the posts table, stored on disk as a JSON list of records.
posts_path = '../data/posts.json'
df = pd.read_json(posts_path, orient='records')

In [8]:
# Rewrite each permalink so that it points at the OAuth API host:
# "https://..." becomes "https://oauth....".
oauth_links = df["permalink"].replace({"https://": "https://oauth."}, regex=True)
df["comment_link"] = oauth_links
df["comment_link"]

0       https://oauth.reddit.com/r/recipes/comments/1a...
1       https://oauth.reddit.com/r/recipes/comments/1a...
2       https://oauth.reddit.com/r/recipes/comments/1a...
3       https://oauth.reddit.com/r/recipes/comments/1a...
4       https://oauth.reddit.com/r/recipes/comments/1a...
                              ...                        
2061    https://oauth.reddit.com/r/recipes/comments/cz...
2062    https://oauth.reddit.com/r/recipes/comments/cy...
2063    https://oauth.reddit.com/r/recipes/comments/cw...
2064    https://oauth.reddit.com/r/recipes/comments/cw...
2065    https://oauth.reddit.com/r/recipes/comments/cs...
Name: comment_link, Length: 2066, dtype: object

We can now get the comments from each post.

WARNING: THE FOLLOWING CODE TAKES ABOUT 20 MINUTES TO RUN. If testing, skip this cell and load the results from the JSON file instead.

In [9]:
# Fetch the comment listing of every post.
# all_contents ends up with exactly one entry per row of df, in the same order:
# the comment listing on success, or None when the request failed for any reason.
all_contents = []
for link in tqdm(df["comment_link"]):
    # Default to None so that a failure of any kind still yields a placeholder
    # and the list stays aligned with the dataframe rows.
    content = None
    try:
        # Send a GET request to the specified link with the necessary headers.
        # The timeout stops a single stalled connection from hanging the whole run.
        response = s.get(link, headers=headers, timeout=30)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # We only want element 1 because the response returns data on the
            # Post (which we don't want) and the Comments (which we want)
            content = response.json()[1]
        else:
            print(f"Error: {response.status_code}")
    except (r.exceptions.RequestException, ValueError, LookupError) as err:
        # Network failure, undecodable JSON, or a payload without element 1:
        # report it and move on instead of losing everything scraped so far.
        print(f"Error: {err!r} while fetching {link}")

    all_contents.append(content)
    sleep(0.3) # add delay due to reddit's rate limit

  0%|          | 0/2066 [00:00<?, ?it/s]

100%|██████████| 2066/2066 [22:34<00:00,  1.53it/s]


Write the result to a JSON file for testing, so that we don't have to scrape for 20 minutes every time.
WARNING: this will create a >100MB file, so don't commit it to GitHub. It has been added to `.gitignore`.

In [10]:
# Cache the raw scrape results on disk so that the slow scraping cell
# does not have to be re-run.
comments_path = '../data/all_comments.json'
with open(comments_path, 'w') as cache_file:
    json.dump(all_contents, cache_file, indent=4)

## 3. 🎯Navigating to OP's recipe comment

Open from json file if testing.

In [12]:
# Restore the cached scrape results from disk.
comments_path = '../data/all_comments.json'
with open(comments_path, 'r') as cache_file:
    all_contents = json.load(cache_file)

# Number of entries loaded
len(all_contents)

2066

We realised that under `["children"]` there exist other comments made by the OP which are not the recipe, i.e. replies to other commenters.
***Assumption**: the longest comment made by the OP is most likely to be the one containing the actual recipe.*

In [13]:
# Build one "recipe comment" per post, in the same order as all_contents:
#   - None  when the post's comments could not be fetched (entry is None)
#   - ""    when the post has no comment bodies at all
#   - otherwise the longest comment body found under ["data"]["children"]
# NOTE(review): no author filter is applied here, so this picks the longest
# comment from *any* user in the listing - confirm whether restricting the
# search to the OP's comments is intended.
recipe_comment_list = []
for listing in all_contents:
    if listing is None:
        # Keep a placeholder so the list stays aligned with the dataframe rows
        recipe_comment_list.append(None)
        continue

    # Collect the text of each comment; children that carry no "body"
    # (presumably Reddit's "more" placeholders - verify) are skipped
    # instead of raising a KeyError.
    bodies = [
        child["data"]["body"]
        for child in listing["data"]["children"]
        if "body" in child["data"]
    ]

    # Longest body wins; default="" covers posts without any comment, so a
    # stale value from the previous post can never be appended by mistake.
    recipe_comment_list.append(max(bodies, key=len, default=""))

# Place the new column immediately before "comment_link". If the cell is
# re-run and the column already exists, overwrite it rather than letting
# df.insert raise a ValueError.
if "recipe_comment" in df.columns:
    df["recipe_comment"] = recipe_comment_list
else:
    df.insert(df.columns.get_loc("comment_link"), "recipe_comment", recipe_comment_list)

Write to a json file in preparation for data cleanup, which will be done in the next notebook.

In [14]:
# Save the posts together with their extracted recipe comment as a JSON
# list of records.
output_path = '../data/posts_with_comments.json'
df.to_json(output_path, orient='records', indent=4)