# ✅Step 2: Scraping Reddit Recipes

## 0. 🎯Import libraries

In [1]:
import sys
import json
import requests as r

from scrapy import Selector

from pprint import pprint
from tqdm import tqdm

import numpy as np
import pandas as pd

from time import sleep

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Load credentials and set up Session

Using a function defined in our `chadtools.py` script, we can authenticate with the Reddit API using our own personal `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [4]:
# Authenticate against the Reddit API; the helper from chadtools returns the
# headers dict that accompanies the GET requests sent later on.
headers = chadtools.authenticate_and_get_headers()

# A single requests session shared by the rest of the notebook.
s = r.Session()

{'Authorization': 'bearer <REDACTED>',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by zichengliu'}

## 2. 🎯Sending Get Request

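For each post in the posts dataframe, build the OAuth API version of its permalink and store it in a new `comment_link` column (the original `permalink` is kept), so that comments can be fetched through the API more efficiently.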

In [5]:
# Load the posts dataset (JSON Lines: one record per line).
df = pd.read_json('../data/posts.json', orient='records', lines=True)

# Every permalink stored with the posts.
permalink_list = df['permalink'].tolist()

# Point each permalink at the OAuth host: strip the "https://" scheme,
# then prepend "https://oauth." to what remains.
usable_comment_links = [
    "https://oauth." + permalink.replace("https://", "")
    for permalink in permalink_list
]

# Put the new column immediately before the existing "permalink" column.
df.insert(df.columns.get_loc("permalink"), "comment_link", usable_comment_links)

We can now navigate through the variable content.

WARNING: THE FOLLOWING CODE TAKES ABOUT 20 MINUTES TO RUN. If testing, skip this chunk and load the data from the JSON file instead.

Write the result to a JSON file for testing, so that we don't have to scrape for 20 minutes every time.
WARNING: this will create a >100 MB file, so don't commit it to GitHub. It has been added to `.gitignore`.

## 3. 🎯Navigating to OP's recipe comment

Open from json file if testing.

In [6]:
# Load the cached comment data from disk instead of re-scraping it.
# The encoding is stated explicitly: without it open() falls back to a
# platform-dependent default, which can fail on non-ASCII comment text.
with open('../data/comments.json', 'r', encoding='utf-8') as f:
    all_contents = json.load(f)

# Number of loaded entries, displayed as the cell output.
len(all_contents)

We realised that under `["children"]` there exist other comments made by the OP which are not the recipe, i.e. replies to other commenters.
***Assumption**: the longest comment made by the OP is most likely to be the one containing the actual recipe.*

In [8]:
def _longest_comment_body(listing):
    """Return the longest comment body found in one entry of ``all_contents``.

    Parameters
    ----------
    listing : dict or None
        A dictionary whose ``["data"]["children"]`` is a list of comment
        dictionaries, or ``None`` when no data is available for the post.

    Returns
    -------
    str or None
        ``None`` when ``listing`` is ``None``; ``""`` when the listing holds
        no comment bodies; otherwise the longest body (the first one wins
        on ties).
    """
    if listing is None:
        return None

    # Children lacking a "body" key (presumably Reddit "more" placeholders —
    # TODO confirm) are skipped instead of raising KeyError.
    bodies = [
        child["data"]["body"]
        for child in listing["data"]["children"]
        if "body" in child["data"]
    ]

    # Assumption: the longest comment is the one most likely to hold the recipe.
    return max(bodies, key=len, default="")


# One result per entry of all_contents, in the same order.
ingredient_comment_list = [_longest_comment_body(x) for x in all_contents]

# DataFrame.insert raises ValueError if the column already exists, so drop a
# column left over from a previous run to keep this cell re-runnable.
if "ingredient_comment" in df.columns:
    df = df.drop(columns="ingredient_comment")

# Put the new column immediately before "comment_link".
df.insert(df.columns.get_loc("comment_link"), "ingredient_comment", ingredient_comment_list)

Write to a json file in preparation for data cleanup, which will be done in the next notebook.

In [9]:
# Persist the posts, now carrying the extracted comment, as an indented JSON
# array of records.
df.to_json(
    '../data/posts_with_comments.json',
    orient='records',
    indent=4,
)