# ✅Step 2: Scraping OP Comment

## 0. 🎯Import libraries

In [1]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

from time import sleep

## 1. 🎯Load credentials

In [2]:
credentials_file_path = "../credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

## 2. 🎯Obtaining a token

In [3]:
s = r.Session()

# Set up authentication parameters 
client_auth = r.auth.HTTPBasicAuth(credentials["app_client_id"], credentials["app_client_secret"])

# Send, via HTTP POST, your Reddit username and password
post_data = {"grant_type": "password",
             "username": credentials["reddit_username"],
             "password": credentials["reddit_password"]}

# Reddit API requests that we self-identify ourselves in the User-Agent
headers = {"User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

In [4]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST 
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzA1OTQ0ODQ4LjU3MTg2OSwiaWF0IjoxNzA1ODU4NDQ4LjU3MTg2OCwianRpIjoiaHU3MkNabk5FMDU5a0x3SVpIN2NqYTJPcFF2LW9RIiwiY2lkIjoibWhUbV82eEVUNzVkOWhmWkJrS0ZYQSIsImxpZCI6InQyXzE2ZmE0MiIsImFpZCI6InQyXzE2ZmE0MiIsImxjYSI6MTQ5MDI0NzcyNzAxMSwic2NwIjoiZUp5S1Z0SlNpZ1VFQUFEX193TnpBU2MiLCJmbG8iOjl9.UG2JFZGPur-HRtQtvk5goo1TFEnABT4p0H3jz5QRhs1bqHSKEcw6mSi6_zpdvDXHEH2FMdNvEl_0iVmQ-eZRYOf6VXWtWmxY2077iA-zx2xrWsXWpYkSKCUFcaMXbPby32hYC5RcerY87Ey-d3DBPPwx_e1vVgJuJT8rW1lGHnh7-mp5q1hVGV8gDtfEcWghxnkuY4f0w44RM2hL9_UQMflwajg0lvko1iYd_PaMw_IeuHpndDwzadeHpcPPA1f7dfbrmOZ1n6Sgw0oQmAKCXX_0Iyig0L7JwI3R2DR-qSjO-yVYi-Ji_0D9VSItSAu-9D7Jq519PfTPooO98--lBQ',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [5]:
my_token = response.json()['access_token']

In [6]:
headers = {"Authorization": f"bearer {my_token}",
           "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

## 3. 🎯Sending Get Request

Drawing from dataframe
Replacing the https:// with ''. then make it a usable link

In [7]:
df = pd.read_csv('../data/posts.csv')
#extracting permalink
permalink_list = df['permalink'].tolist()

# Convert to usable link
usable_comment_links = []
for link in permalink_list:
    modified_link = link.replace("https://", "")
    usable_comment_links.append("https://oauth." + modified_link)

pprint(usable_comment_links[:5])

# current link was https://reddit.com/r/recipes/comments/18b3ir1/orange_cookies/'


['https://oauth.reddit.com/r/recipes/comments/18c2c0q/classic_tiramisu_recipe_original_italian_pizzeria/',
 'https://oauth.reddit.com/r/recipes/comments/18b3ir1/orange_cookies/',
 'https://oauth.reddit.com/r/recipes/comments/18ajm70/stir_fry_supreme_chives_cashews_and_shrimp/',
 'https://oauth.reddit.com/r/recipes/comments/18a88g3/sous_vide_chicken_and_potatoes/',
 'https://oauth.reddit.com/r/recipes/comments/189d72m/chicken_riggies/']


Inspect link format

In [8]:
pprint(usable_comment_links[:3])

['https://oauth.reddit.com/r/recipes/comments/18c2c0q/classic_tiramisu_recipe_original_italian_pizzeria/',
 'https://oauth.reddit.com/r/recipes/comments/18b3ir1/orange_cookies/',
 'https://oauth.reddit.com/r/recipes/comments/18ajm70/stir_fry_supreme_chives_cashews_and_shrimp/']


we can now navigate through the variable content

In [9]:
all_contents = []
for link in tqdm(usable_comment_links[0:10]):
    # Send a GET request to the specified link with the necessary headers
    response = s.get(link, headers=headers)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Extract the content from the response
        # We only want element 1 because response returns data on Post (which we don't want) and Comment (which we want)
        content = response.json()[1]
        all_contents.append(content)
    else:
        print(f"Error: {response.status_code}")

# Save the consolidated content to a JSON file
with open("../data/test_comments.json", "w") as f:
    json.dump(all_contents, f)


100%|██████████| 10/10 [00:03<00:00,  2.88it/s]


## 3. 🎯Navigating through Content

In [10]:
# Specify the file path
file_path = "../data/test_comments.json"

# Open the file in read mode and use json.load to load the content into a variable
with open(file_path, "r") as f:
    content = json.load(f)

# 'content' contains the data from the JSON file
#pprint(content)
len(content)

10

Pathing to Comment from json with only 1 post

In [22]:
# Extract the value of the 'body' key
#print(len(content))
#body_value = content["data"]["children"][0]["data"]["body"]

We realise that under ["children"] there exists other posts made by the OP which are not the original comment (ingredients list)  
Assumption: the longest comment made by the OP is most likely to be the one containing the actual recipe 

In [23]:
# Iterate over each dictionary in the list
for x in content:
    # Extract comments from each dictionary
    comments = [comment["data"]["body"] for comment in x["data"]["children"]]

    # Find the longest comment
    ingredient_comment = max(comments, key=len)

    # Print the longest comment for each dictionary
    print(ingredient_comment)


**INGREDIENTS**

* ladyfinger cookies
* 3 egg yolks
* 1/3 cup sugar
* strong creamy espresso
* 8 ounces mascarpone
* cocoa

&amp;#x200B;

**INSTRUCTIONS**

* Beat the Egg Yolks and Sugar: Beat 3 egg yolks with sugar until light and creamy.
* Add Mascarpone: Mix the mascarpone into the egg yolk and sugar mixture until well combined.
* Dip the Ladyfingers: Briefly dip the ladyfinger cookies in the strong espresso. Arrange them in a dish.
* Add the Egg Mixture: Spoon a portion of the egg and mascarpone mixture over the ladyfingers. Repeat for multiple layers.
* Chill the Classic Tiramisu: Chill the classic tiramisu for several hours or preferably overnight.
* Serve the Classic Tiramisu: Sprinkle with cocoa, serve chilled, and enjoy.

[See recipe on DIYFoodhacks.com](https://www.diyfoodhacks.com/classic-tiramisu-recipe-mastering-this-italian-dessert-at-home/)
**Ingredients:**

, softened2 cup + 2 tablespoons) butter
* 100g (1/2 cup) granulated sugar
* 1 large egg
* 1 orange (using both the

In [24]:
# define a list of words