# ✅Step 2: Scraping OP Comment

## 0. 🎯Import libraries

In [48]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. 🎯Load credentials

In [49]:
# The secrets live in a JSON file one directory above the notebook,
# so they never appear in the notebook source itself.
credentials_file_path = "../credentials.json"

# Parse the JSON file; the cells below look up the app and user
# credentials from the resulting object by key.
with open(credentials_file_path, mode="r") as credentials_file:
    credentials = json.load(credentials_file)

## 2. 🎯Obtaining a token

In [50]:
# A single session object is reused for every request in this notebook
s = r.Session()

# HTTP Basic auth built from the app's client id and client secret
client_auth = r.auth.HTTPBasicAuth(
    credentials["app_client_id"],
    credentials["app_client_secret"],
)

# Form fields that will be POSTed: the grant type plus the Reddit
# username and password read from the credentials file
post_data = dict(
    grant_type="password",
    username=credentials["reddit_username"],
    password=credentials["reddit_password"],
)

# Identify the project and the account behind it in the User-Agent header
user_agent = f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"
headers = {"User-Agent": user_agent}

In [51]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST with the app credentials (basic auth) and the
# username/password form fields prepared in the previous cell
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)

# Fail right here on an HTTP error instead of with a confusing KeyError
# when a later cell looks up 'access_token'
response.raise_for_status()

# A successful HTTP status does not guarantee a token was issued, so check
# the payload explicitly before moving on
token_payload = response.json()
if "access_token" not in token_payload:
    raise RuntimeError(f"Reddit did not return an access token: {token_payload}")

# Display the response fields but mask the token itself: cell outputs are
# saved inside the notebook file, and the bearer token is a secret
{key: ("<redacted>" if key == "access_token" else value)
 for key, value in token_payload.items()}

{'access_token': '<redacted>',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [52]:
# Keep only the token string from the JSON body of the token response
token_response = response.json()
my_token = token_response["access_token"]

In [53]:
# Rebuild the request headers: the same self-identifying User-Agent as
# before, now accompanied by the bearer token in the Authorization header
user_agent = f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"
headers = {
    "Authorization": f"bearer {my_token}",
    "User-Agent": user_agent,
}

## 3. 🎯Sending Get Request

Testing with a single link first; the full run has to wait until yy and sy finish scraping the links into a dataframe.

In [54]:
# A single post URL (on the oauth.reddit.com host) used to try the
# request out before working with many links
test_link = (
    "https://oauth.reddit.com/r/recipes/comments/183vmzc/green_borshch/"
)



In [55]:
# Send a GET request to the specified link with the necessary headers
response = s.get(test_link, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Extract the content from the response
    content = response.json()
    pprint(content)

    # Save the snapshot only when this request succeeded. Writing outside
    # this branch would either raise NameError on a fresh kernel or
    # overwrite the file with stale `content` left over from an earlier run.
    with open("../data/test_comments.json", "w") as f:
        json.dump(content, f)
else:
    print(f"Error: {response.status_code}")

[{'data': {'after': None,
           'before': None,
           'children': [{'data': {'all_awardings': [],
                                  'allow_live_comments': False,
                                  'approved_at_utc': None,
                                  'approved_by': None,
                                  'archived': False,
                                  'author': 'CookingToEntertain',
                                  'author_flair_background_color': None,
                                  'author_flair_css_class': None,
                                  'author_flair_richtext': [],
                                  'author_flair_template_id': None,
                                  'author_flair_text': None,
                                  'author_flair_text_color': None,
                                  'author_flair_type': 'text',
                                  'author_fullname': 't2_w3ksjbaf',
                                  'author_is_blocked': False,
    

We can now navigate through the variable `content`.

## 4. 🎯Navigating through Content

In [56]:
# Location of the JSON snapshot written by the request cell
file_path = "../data/test_comments.json"

# Read the snapshot back from disk so the cells below work from the
# saved file and parse it into `content`
with open(file_path) as f:
    content = json.load(f)


Path to the comment body in the JSON for a single post

In [57]:
# Report how many top-level elements the loaded JSON has
print(len(content))

# Walk into the second element: first child -> its data -> the 'body' text.
# NOTE(review): this assumes the first child is the comment we want and that
# it carries a 'body' key - confirm before running over many posts.
first_child_data = content[1]["data"]["children"][0]["data"]
body_value = first_child_data["body"]
print(body_value)

2
One of my favorite Ukrainian recipes is the lesser known green version of the famous borshch. This one replaces the beets with sorrel.

It is also eaten in other ex-PLC countries like Poland, belarus, and Lithuania!

## [Ukrainian Green Borshch](https://cookingtoentertain.com/green-borscht/)

**INGREDIENTS**
  
• 500 grams Pork Ribs

• 500 grams Young Potatoes cubed

• 200 grams Sorrel fresh

• 1 Onion

• 1 Carrot

• 5 Eggs 4 hardboiled

• 1 tbsp Sour Cream or Smetana if you can find it


**INSTRUCTIONS**
 
1. In a pot add the pork ribs along with salt and pepper and the bay leaves. Add water up to 60% of the pot. Bring to a boil, then lower to a simmer and cover with a lid for one hour.
Add in the potatoes and bring back up to a boil. Let cook for 10 minutes.

2. While the potatoes are cooking, quickly fry some grated onion and carrot in a pan with a bit of oil. Add to the borshch and give everything a stir. Also chop up the hard boiled eggs and add that in.

3. In a small bowl beat