# ✅Step 2: Scraping OP Comment

## 0. 🎯Import libraries

In [14]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

from time import sleep

## 1. 🎯Load credentials

In [15]:
# Location of the JSON file that holds the Reddit app and account secrets
credentials_file_path = "../credentials.json"

# Read the whole file and parse its JSON text into `credentials`
with open(credentials_file_path, "r") as f:
    credentials = json.loads(f.read())

## 2. 🎯Obtaining a token

In [16]:
# Session object used for the token request and the later GET requests
s = r.Session()

# The app's client id and secret are sent as HTTP Basic authentication
client_auth = r.auth.HTTPBasicAuth(
    credentials["app_client_id"],
    credentials["app_client_secret"],
)

# Form fields for the HTTP POST: a password grant with the Reddit account's login
post_data = dict(
    grant_type="password",
    username=credentials["reddit_username"],
    password=credentials["reddit_password"],
)

# Reddit's API asks clients to identify themselves in the User-Agent header
headers = {
    "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"
}

In [17]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST carrying the app credentials (basic auth) and the account login (form data)
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)

# Fail loudly on an HTTP error instead of carrying a bad response into the next cells
response.raise_for_status()

# Display the response with the bearer token masked: cell outputs are stored inside the
# notebook file, so showing the raw JSON would leak a usable credential to every reader.
{
    key: ("<redacted>" if key == "access_token" else value)
    for key, value in response.json().items()
}

{'access_token': '<redacted>',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [18]:
# Parse the token response once so it can be checked before use
token_payload = response.json()

# Without this check a failed login only surfaces as a bare KeyError: 'access_token';
# raising with the returned payload shows what the server actually answered.
if "access_token" not in token_payload:
    raise RuntimeError(f"Reddit did not return an access token: {token_payload}")

my_token = token_payload["access_token"]

In [19]:
# Headers for authenticated requests: the bearer token plus the same self-identifying User-Agent
headers = {
    "Authorization": f"bearer {my_token}",
    "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}",
}

## 3. 🎯Sending GET Request

Load the post permalinks from the posts dataframe.
For each permalink, strip the leading `https://` and prepend `https://oauth.` to turn it into a usable link.

In [20]:
# Posts collected earlier, one row per post
df = pd.read_csv('../data/posts.csv')

# Pull the permalink column out as a plain Python list
permalink_list = df['permalink'].tolist()
# pprint(permalink_list)

# Rebuild every permalink on the oauth host,
# e.g. https://reddit.com/r/recipes/comments/18b3ir1/orange_cookies/
#   -> https://oauth.reddit.com/r/recipes/comments/18b3ir1/orange_cookies/
usable_comment_links = [
    "https://oauth." + link.replace("https://", "")
    for link in permalink_list
]


In [21]:
# Print every converted link to confirm each one now points at the oauth.reddit.com host
pprint(usable_comment_links)

['https://oauth.reddit.com/r/recipes/comments/18b3ir1/orange_cookies/',
 'https://oauth.reddit.com/r/recipes/comments/18ajm70/stir_fry_supreme_chives_cashews_and_shrimp/',
 'https://oauth.reddit.com/r/recipes/comments/18a88g3/sous_vide_chicken_and_potatoes/',
 'https://oauth.reddit.com/r/recipes/comments/1891x51/polish_krokiety_mushroom_sauerkraut_croquettes/',
 'https://oauth.reddit.com/r/recipes/comments/186osjd/festive_southern_jalapeno_pimento_cheese_dip/',
 'https://oauth.reddit.com/r/recipes/comments/1866xrq/quick_easy_nut_brittle/',
 'https://oauth.reddit.com/r/recipes/comments/183vmzc/green_borshch/',
 'https://oauth.reddit.com/r/recipes/comments/183st4x/leftover_turkey_and_pastina_soup/',
 'https://oauth.reddit.com/r/recipes/comments/1820b5b/nasi_goreng_indonesian_fried_rice_my_familys/',
 'https://oauth.reddit.com/r/recipes/comments/1813o0j/my_familys_apple_pie_recipe/',
 'https://oauth.reddit.com/r/recipes/comments/17zntw6/leek_and_beef_stir_fry/',
 'https://oauth.reddit.com

We can now navigate through the variable `content`.

In [58]:
# Seconds to wait for a reply before giving up on a request
REQUEST_TIMEOUT_SECONDS = 30
# Short pause after each request so consecutive calls do not hit the API back-to-back
REQUEST_PAUSE_SECONDS = 1

all_contents = []
# Only the first two links are fetched here; the result is saved as a test file below
for link in usable_comment_links[0:2]:
    # Send a GET request to the specified link with the necessary headers.
    # The timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = s.get(link, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Extract the content from the response
        # We only want element 1 because response returns data on Post (which we don't want) and Comment (which we want)
        content = response.json()[1]
        pprint(content)

        all_contents.append(content)
    else:
        # Name the failing link so the post can be identified and retried
        print(f"Error: {response.status_code} for {link}")

    sleep(REQUEST_PAUSE_SECONDS)

# Save the consolidated content to a JSON file
with open("../data/test_comments.json", "w") as f:
    json.dump(all_contents, f)


{'data': {'after': None,
          'before': None,
          'children': [{'data': {'all_awardings': [],
                                 'approved_at_utc': None,
                                 'approved_by': None,
                                 'archived': False,
                                 'associated_award': None,
                                 'author': 'elisetac',
                                 'author_flair_background_color': None,
                                 'author_flair_css_class': None,
                                 'author_flair_richtext': [],
                                 'author_flair_template_id': None,
                                 'author_flair_text': None,
                                 'author_flair_text_color': None,
                                 'author_flair_type': 'text',
                                 'author_fullname': 't2_g90hdupc',
                                 'author_is_blocked': False,
                                 'a

## 4. 🎯Navigating through Content

In [65]:
# Specify the file path
file_path = "../data/test_comments.json"

# Open the file in read mode and use json.load to load the content into a variable
with open(file_path, "r") as f:
    content = json.load(f)

# 'content' contains the data from the JSON file
#pprint(content)
#len(content)


Path to the comment body when the JSON contains only one post

In [None]:
# Not executed; kept for reference only.
# When the JSON holds a single listing (a dict rather than a list), the text of the
# first comment sits at data -> children[0] -> data -> body:
#print(len(content))
#body_value = content["data"]["children"][0]["data"]["body"]

However, we realise that under `["children"]` there are other comments made by OP which are not the original comment (the ingredients list).
Assumption: the original comment is the longest one, so we filter by length.

In [69]:
# Iterate over each dictionary in the list
for x in content:
    # Extract comments from each dictionary
    comments = [comment["data"]["body"] for comment in x["data"]["children"]]

    # Find the longest comment
    ingredient_comment = max(comments, key=len)

    # Print the longest comment for each dictionary
    print(ingredient_comment)


**Ingredients:**

, softened2 cup + 2 tablespoons) butter
* 100g (1/2 cup) granulated sugar
* 1 large egg
* 1 orange (using both the peel and the juice)
* 375g (3 cups) all-purpose flour
* 5g (1 teaspoon) baking powder
* 1 tsp vanilla extract (optional)
* 1 egg yolk and 1 teaspoon yogurt/milk for the top

**Instructions:** 

 In a mixing bowl, cream together the softened butter and granulated sugar. 
2. Beat in the egg until well combined. Add in the vanilla extract if using. 
 Feel free to use more than 1 orange if you wish (will need to adjust the amount of flour too). e juice will add moisture.
4. Add in the baking powder and then gradually add the flour, mixing until a dough forms. The amount of flour you use will depend on the amount of juice you got from the orange. If the dough is too sticky, add in more flour.
5. Place a sheet of baking paper (or baking mat) on a flat surface. Place the cookie dough on the paper and cover it with another sheet of baking paper. Roll out the doug

In [26]:
# define a list of words