# ✅Step 2: Scraping OP Comment

## 0. 🎯Import libraries

In [38]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

from time import sleep

## 1. 🎯Load credentials

In [39]:
# Location of the JSON file that holds the Reddit app and account credentials
credentials_file_path = "../credentials.json"

# Read the file and parse its JSON text; later cells look values up by key
with open(credentials_file_path, "r") as credentials_file:
    credentials = json.loads(credentials_file.read())

## 2. 🎯Obtaining a token

In [40]:
# A single requests session, reused for every HTTP call below
s = r.Session()

# HTTP Basic auth built from the Reddit app's client id and secret
client_auth = r.auth.HTTPBasicAuth(
    credentials["app_client_id"],
    credentials["app_client_secret"],
)

# Form fields sent with the token request: grant type plus the Reddit
# account's username and password
post_data = dict(
    grant_type="password",
    username=credentials["reddit_username"],
    password=credentials["reddit_password"],
)

# Reddit's API asks clients to identify themselves in the User-Agent header
user_agent = f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"
headers = {"User-Agent": user_agent}

In [41]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST 
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': '<redacted>',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [42]:
# Parse the token response once and keep only the bearer token.
token_payload = response.json()

# NOTE(review): presumably Reddit can answer a rejected login with a JSON body
# that carries an "error" key instead of "access_token" - confirm. Either way,
# report which keys arrived so the failure is explained, not just a bare KeyError.
if "access_token" not in token_payload:
    raise KeyError(
        "access_token missing from the token response; "
        f"keys received: {sorted(token_payload)}, error: {token_payload.get('error')!r}"
    )

my_token = token_payload["access_token"]

In [43]:
# Headers for the authenticated calls: the bearer token plus the same
# self-identifying User-Agent used for the token request
authorization_value = f"bearer {my_token}"
user_agent_value = f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"
headers = {"Authorization": authorization_value, "User-Agent": user_agent_value}

## 3. 🎯Sending GET Requests

We read the permalinks from the posts dataframe.
For each permalink we strip the leading `https://` and prepend `https://oauth.` so that the link can be requested with our token.

In [44]:
# Load the posts table
df = pd.read_csv('../data/posts.csv')

# Pull the permalink column out as a plain Python list
permalink_list = df['permalink'].tolist()

# Rewrite every permalink: drop "https://" and prepend "https://oauth."
usable_comment_links = [
    "https://oauth." + permalink.replace("https://", "")
    for permalink in permalink_list
]

# Place the new column immediately before "permalink"
permalink_position = df.columns.get_loc("permalink")
df.insert(permalink_position, "comment_link", usable_comment_links)

We now send a GET request to every comment link and save the responses to a JSON file, so that we can navigate through the content afterwards.

In [49]:
# Seconds to wait for a single response before giving up on that link
REQUEST_TIMEOUT = 30

all_contents = []
for link in tqdm(df["comment_link"]):
    # Start from an empty placeholder so all_contents keeps exactly one entry
    # per row of df, in the same order, even when a request fails
    content = {}
    try:
        # Send a GET request to the specified link with the necessary headers
        response = s.get(link, headers=headers, timeout=REQUEST_TIMEOUT)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # We only want element 1 because response returns data on Post
            # (which we don't want) and Comment (which we want)
            content = response.json()[1]
        else:
            print(f"Error: {response.status_code}")
    except (r.exceptions.RequestException, ValueError, IndexError, KeyError) as error:
        # A timeout, dropped connection or unexpected payload should cost one
        # post, not abort the whole run before anything is saved
        print(f"Error: {link}: {error!r}")
    all_contents.append(content)

    # Pause between requests
    sleep(0.5)

# Save the consolidated content to a JSON file
with open("../data/test_comments.json", "w") as f:
    json.dump(all_contents, f)


100%|██████████| 2067/2067 [28:14<00:00,  1.22it/s]


## 4. 🎯Navigating through Content

In [50]:
# Specify the file path
file_path = "../data/test_comments.json"

# Open the file in read mode and use json.load to load the content into a variable
with open(file_path, "r") as f:
    content = json.load(f)

# 'content' contains the data from the JSON file
#pprint(content)
len(content)

2067

In [53]:
# Peek at the first entry only (still wrapped in a list)
content[0:1]

[{'kind': 'Listing',
  'data': {'after': None,
   'dist': None,
   'modhash': None,
   'geo_filter': '',
   'children': [{'kind': 't1',
     'data': {'subreddit_id': 't5_2qh56',
      'approved_at_utc': None,
      'author_is_blocked': False,
      'comment_type': None,
      'awarders': [],
      'mod_reason_by': None,
      'banned_by': None,
      'author_flair_type': 'text',
      'total_awards_received': 0,
      'subreddit': 'recipes',
      'author_flair_template_id': None,
      'likes': None,
      'replies': {'kind': 'Listing',
       'data': {'after': None,
        'dist': None,
        'modhash': None,
        'geo_filter': '',
        'children': [{'kind': 't1',
          'data': {'subreddit_id': 't5_2qh56',
           'approved_at_utc': None,
           'author_is_blocked': False,
           'comment_type': None,
           'awarders': [],
           'mod_reason_by': None,
           'banned_by': None,
           'author_flair_type': 'text',
           'total_awards_recei

In [54]:
# Entries listed under the first post
first_post_children = content[0]["data"]["children"]

# Text body of each of those entries
comments = [child["data"]["body"] for child in first_post_children]

# Preview the first five
comments[0:5]

['**INGREDIENTS**\n\n* ladyfinger cookies\n* 3 egg yolks\n* 1/3 cup sugar\n* strong creamy espresso\n* 8 ounces mascarpone\n* cocoa\n\n&amp;#x200B;\n\n**INSTRUCTIONS**\n\n* Beat the Egg Yolks and Sugar: Beat 3 egg yolks with sugar until light and creamy.\n* Add Mascarpone: Mix the mascarpone into the egg yolk and sugar mixture until well combined.\n* Dip the Ladyfingers: Briefly dip the ladyfinger cookies in the strong espresso. Arrange them in a dish.\n* Add the Egg Mixture: Spoon a portion of the egg and mascarpone mixture over the ladyfingers. Repeat for multiple layers.\n* Chill the Classic Tiramisu: Chill the classic tiramisu for several hours or preferably overnight.\n* Serve the Classic Tiramisu: Sprinkle with cocoa, serve chilled, and enjoy.\n\n[See recipe on DIYFoodhacks.com](https://www.diyfoodhacks.com/classic-tiramisu-recipe-mastering-this-italian-dessert-at-home/)',
 'For being so good, that sounds ridiculously easy to make. Oh, god help my waistline.',
 'Where are the bea

In [55]:
# Find the greatest comment length, then take the first comment of that length
longest_length = max(len(text) for text in comments)
max_comment = next(text for text in comments if len(text) == longest_length)
max_comment

'**INGREDIENTS**\n\n* ladyfinger cookies\n* 3 egg yolks\n* 1/3 cup sugar\n* strong creamy espresso\n* 8 ounces mascarpone\n* cocoa\n\n&amp;#x200B;\n\n**INSTRUCTIONS**\n\n* Beat the Egg Yolks and Sugar: Beat 3 egg yolks with sugar until light and creamy.\n* Add Mascarpone: Mix the mascarpone into the egg yolk and sugar mixture until well combined.\n* Dip the Ladyfingers: Briefly dip the ladyfinger cookies in the strong espresso. Arrange them in a dish.\n* Add the Egg Mixture: Spoon a portion of the egg and mascarpone mixture over the ladyfingers. Repeat for multiple layers.\n* Chill the Classic Tiramisu: Chill the classic tiramisu for several hours or preferably overnight.\n* Serve the Classic Tiramisu: Sprinkle with cocoa, serve chilled, and enjoy.\n\n[See recipe on DIYFoodhacks.com](https://www.diyfoodhacks.com/classic-tiramisu-recipe-mastering-this-italian-dessert-at-home/)'

Path to the comment body in the JSON when it holds only one post

In [47]:
# NOTE(review): leftover exploration, kept commented out. The indexing below
# treats `content` as a single listing dict, but the loading cell reads
# `content` as a list, so this line would fail if re-enabled as written.
# Extract the value of the 'body' key
#print(len(content))
#body_value = content["data"]["children"][0]["data"]["body"]

We realise that under ["children"] there are other comments besides the one we want (the ingredients list), including replies from other users.  
Assumption: the longest top-level comment on a post is most likely to be the one containing the actual recipe (note that the code below does not filter by author).

In [56]:
# Pick one candidate "recipe" comment per post: the longest comment body found
# among the entries under ["data"]["children"] of that post's listing.
# NOTE(review): nothing here restricts the choice to comments written by the
# post's author; presumably Reddit comment data exposes an `is_submitter`
# flag that could be used for that - confirm before relying on it.
ingredient_comment_list = []
for x in content:
    # Failed requests were stored as {} by the scraping cell, so "data" and
    # "children" may be absent; treat those posts as having no comments
    children = x.get("data", {}).get("children", [])

    # Keep only entries that actually carry a text body
    # (presumably some listing entries, e.g. kind "more", have none - confirm)
    comments = [
        child["data"]["body"]
        for child in children
        if isinstance(child.get("data", {}).get("body"), str)
    ]

    # Longest body wins; "" marks a post without any usable comment, which the
    # filtering step later drops
    ingredient_comment = max(comments, key=len, default="")
    ingredient_comment_list.append(ingredient_comment)

In [57]:
# Put each post's chosen comment right before the "comment_link" column
comment_link_position = df.columns.get_loc("comment_link")
df.insert(comment_link_position, "ingredient_comment", ingredient_comment_list)

In [72]:
# Row filters: a comment was found, and it contains at least one newline
has_comment = df['ingredient_comment'].ne('')
is_multiline = df['ingredient_comment'].str.contains("\n").eq(True)

# Keep the rows passing both filters and write them out
df = df[has_comment & is_multiline]
df.to_csv('../data/posts_with_comments.csv', index=False)

# Rows and columns remaining
df.shape

(1743, 115)

In [None]:
# define a list of words