# ✅Step 2: Scraping OP Comment

## 0. 🎯Import libraries

In [2]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. 🎯Load credentials

In [3]:
credentials_file_path = "../credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

## 2. 🎯Obtaining a token

In [4]:
s = r.Session()

# Set up authentication parameters 
client_auth = r.auth.HTTPBasicAuth(credentials["app_client_id"], credentials["app_client_secret"])

# Send, via HTTP POST, your Reddit username and password
post_data = {"grant_type": "password",
             "username": credentials["reddit_username"],
             "password": credentials["reddit_password"]}

# Reddit API requests that we self-identify ourselves in the User-Agent
headers = {"User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

In [5]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST 
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxODE5NDkzLjA2NDc1MiwiaWF0IjoxNzAxNzMzMDkzLjA2NDc1MiwianRpIjoieG5UVksyb0JWV08yY21PM2FXYW5SX0xDeGNFZnNBIiwiY2lkIjoiQmVvRVNfeUhwNDJXWXF0aUNBeHVhZyIsImxpZCI6InQyXzhwNHl1NzBrIiwiYWlkIjoidDJfOHA0eXU3MGsiLCJsY2EiOjE2MDQxNDY3NTU0MjQsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.AR-ynZfL1_cPt1fCIhHZK5ZD4AmAnYsdaiZCPjPzn9J4vdl0i90Bwbx4_hdxcRNncGRJngOs_s8KGlpDn14Ni3m_w2z0kwdHe4yV_D8blfcza8kgWl_RKNMns5f8tE2frhxC5cKH4whv5lolmT8s3DvrBux6ZUd3Qnso7ysZ9urUysSjVOvehRQ2sMQH9Stgg36BM_vkmPqPvobaBG_uFLW170-bgMpEkU-jjgjD8h_stnCim8YxzgAhsZIrwfSuUzravXujtCdVBjlQ_GRZ7yLLmfy34SH6oNRWxfXvW0-4PNKxEftaHswKTXoDYGgMVVMeAozaOvnSUO9HS_bsTg',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [6]:
my_token = response.json()['access_token']

In [7]:
headers = {"Authorization": f"bearer {my_token}",
           "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

## 3. 🎯Sending Get Request

Testing out with one link first, wait until yy and sy finish scraping the link into a dataframe

In [8]:
# assign variable to one random link first
test_link = 'https://oauth.reddit.com/r/recipes/comments/183vmzc/green_borshch/'



In [9]:
# Send a GET request to the specified link with the necessary headers
response = s.get(test_link, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Extract the content from the response
    # We only want element 1 because response returns data on Post (which we dont want) and Comment (which we want)
    content = response.json()[1]
    pprint(content)
else:
    print(f"Error: {response.status_code}")

with open("../data/test_comments.json", "w") as f:
    json.dump(content, f)

{'data': {'after': None,
          'before': None,
          'children': [{'data': {'all_awardings': [],
                                 'approved_at_utc': None,
                                 'approved_by': None,
                                 'archived': False,
                                 'associated_award': None,
                                 'author': 'CookingToEntertain',
                                 'author_flair_background_color': None,
                                 'author_flair_css_class': None,
                                 'author_flair_richtext': [],
                                 'author_flair_template_id': None,
                                 'author_flair_text': None,
                                 'author_flair_text_color': None,
                                 'author_flair_type': 'text',
                                 'author_fullname': 't2_w3ksjbaf',
                                 'author_is_blocked': False,
                         

we can now navigate through the variable content

## 3. 🎯Navigating through Content

In [10]:
# Specify the file path
file_path = "../data/test_comments.json"

# Open the file in read mode and use json.load to load the content into a variable
with open(file_path, "r") as f:
    content = json.load(f)

# 'content' contains the data from the JSON file
# pprint(content)


Pathing to Comment from json with only 1 post

In [11]:
# Extract the value of the 'body' key
print(len(content))
body_value = content["data"]["children"][0]["data"]["body"]
# print(body_value)


2


However, we realise that under ["children"] there exists other posts made by OP which are not the original comment (ingredients list)
Assumption: filter by length

In [22]:
for maggi in content["data"]["children"]:
    length_test = ''
    # print(maggi["data"]["body"])
    if len(maggi["data"]["body"]) >= len(length_test):
        length_test = maggi

pprint(length_test["data"]["body"])

"Real borsh, it's  RED . REAL UKRAINIAN BORSH 🍛 ."
