# ✅Step 2: Scraping OP Comment

## 0. 🎯Import libraries

In [63]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. 🎯Load credentials

In [64]:
# Relative location of the JSON file that holds the credentials
credentials_file_path = "../credentials.json"

# Read the file's text and parse the JSON into the `credentials` variable
with open(credentials_file_path, "r") as credentials_file:
    credentials = json.loads(credentials_file.read())

## 2. 🎯Obtaining a token

In [65]:
# One Session object, reused for the requests sent from this notebook
s = r.Session()

# HTTP Basic authentication built from the app's client id and client secret
client_auth = r.auth.HTTPBasicAuth(
    credentials["app_client_id"],
    credentials["app_client_secret"],
)

# Form fields sent via HTTP POST: the grant type plus the Reddit username and password
post_data = {
    "grant_type": "password",
    "username": credentials["reddit_username"],
    "password": credentials["reddit_password"],
}

# Reddit's API asks clients to identify themselves in the User-Agent header
headers = {
    "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}",
}

In [66]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST; the timeout stops the cell from hanging indefinitely
# if the server never answers
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data,
                  headers=headers, timeout=30)

# Stop here on an HTTP error status instead of failing later in another cell
response.raise_for_status()

# Display the reply with the token masked, so the secret is not stored in the
# notebook's saved output; `response` itself is untouched for the next cell
{key: ("<redacted>" if key == "access_token" else value)
 for key, value in response.json().items()}

{'access_token': '<redacted>',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [67]:
# Parse the token response once and make sure it really contains a token.
# NOTE(review): the endpoint may answer with an error payload instead of a
# token (e.g. for rejected credentials) -- confirm against Reddit's OAuth2 docs.
token_payload = response.json()
if "access_token" not in token_payload:
    # Show what the server sent back rather than failing with a bare KeyError
    raise RuntimeError(f"No access token in the response: {token_payload}")

my_token = token_payload["access_token"]

In [68]:
# Build the two header values first, then assemble the headers dict in one step
authorization_value = f"bearer {my_token}"
user_agent_value = f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"

# Headers for authenticated requests: the bearer token plus the User-Agent
headers = {
    "Authorization": authorization_value,
    "User-Agent": user_agent_value,
}

## 3. 🎯Sending Get Request

Test with a single link first, while waiting for yy and sy to finish scraping the links into a dataframe.

In [69]:
# Start with a single post URL (on the oauth.reddit.com host) to test the request
test_link = "https://oauth.reddit.com/r/recipes/comments/183vmzc/green_borshch/"



In [72]:
# Send a GET request to the specified link with the necessary headers;
# the timeout stops the cell from hanging indefinitely if the server never answers
response = s.get(test_link, headers=headers, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Extract the content from the response
    content = response.json()
    print(content)

    # Save the payload only after a successful request. Writing unconditionally
    # would either raise NameError (fresh kernel, `content` never assigned) after
    # truncating the file, or silently save a stale `content` from an earlier run.
    with open("../data/test_comments.json", "w") as f:
        json.dump(content, f)
else:
    print(f"Error: {response.status_code}")

[{'kind': 'Listing', 'data': {'after': None, 'dist': 1, 'modhash': None, 'geo_filter': '', 'children': [{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'recipes', 'selftext': '', 'user_reports': [], 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Green Borshch', 'link_flair_richtext': [{'e': 'text', 't': 'Recipe'}], 'subreddit_name_prefixed': 'r/recipes', 'hidden': False, 'pwls': 6, 'link_flair_css_class': 'recipe', 'downs': 0, 'top_awarded_type': None, 'parent_whitelist_status': 'all_ads', 'hide_score': False, 'name': 't3_183vmzc', 'quarantine': False, 'link_flair_text_color': 'dark', 'upvote_ratio': 0.94, 'author_flair_background_color': None, 'subreddit_type': 'public', 'ups': 56, 'total_awards_received': 0, 'media_embed': {}, 'author_flair_template_id': None, 'is_original_content': False, 'author_fullname': 't2_w3ksjbaf', 'secure_media': None, 'is_reddit_media_domain': True, 'is_meta': False, 'category': None, 'secure_media_embed': {

We can now navigate through the variable `content`.

## 4. 🎯Navigating through Content