# ✅Step 2: Scraping OP Comment

## 0. 🎯Import libraries

In [1]:
import sys
import json
import requests as r

from scrapy import Selector

from pprint import pprint
from tqdm import tqdm

from time import sleep

## 1. 🎯Load credentials

In [2]:
credentials_file_path = "../credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

Using a function defined in our `utils.py` script, we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [3]:
s = r.Session()

# Set up authentication parameters 
client_auth = r.auth.HTTPBasicAuth(credentials["app_client_id"], credentials["app_client_secret"])

# Send, via HTTP POST, your Reddit username and password
post_data = {"grant_type": "password",
             "username": credentials["reddit_username"],
             "password": credentials["reddit_password"]}

# Reddit API requests that we self-identify ourselves in the User-Agent
headers = {"User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

In [4]:
headers = chadtools.authenticate_and_get_headers()
headers

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzA1OTYzNzI0LjE3NDM3LCJpYXQiOjE3MDU4NzczMjQuMTc0MzcsImp0aSI6InBpSVVhdm1hbVpaYnM2MVBEMmNJeGczdmpKUlJ3ZyIsImNpZCI6Im1oVG1fNnhFVDc1ZDloZlpCa0tGWEEiLCJsaWQiOiJ0Ml8xNmZhNDIiLCJhaWQiOiJ0Ml8xNmZhNDIiLCJsY2EiOjE0OTAyNDc3MjcwMTEsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.U2krrNDkIQYWeunz9YEjjf4d4E5W1K3LA4wQCpeEBj4X7EWhkcqPphXGpcu2EWh35JQz_VHfHfTVYXC4C2fdmz4X3vw_nYAGAsMvRa3nUAC0KQTja73e154NwMA4c9CohUm2vIzXYiMqiHWyMgAL70_2yAVsBK5l1V1ODfrlzyuyGguIgi7qX8nF9t5JQ-y86SL7htMYQGvcS3hYJ2CxmpYj4fG1S-LTTPitjAW8XYaC56TfUX-M8jnkcG6oSBVK2_a5yjPpPkyM88RqlOuzYk0Mv22cNbaHFWiaWvxjOFhjDV7evrKK7ezjlEZJyYBsXTM6NVkXT0x91iM-qp5xCQ',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [5]:
my_token = response.json()['access_token']

In [6]:
headers = {"Authorization": f"bearer {my_token}",
           "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

## 3. 🎯Sending Get Request

Drawing from dataframe
Replacing the https:// with ''. then make it a usable link

In [7]:
df = pd.read_csv('../data/posts.csv')
#extracting permalink
permalink_list = df['permalink'].tolist()

# Convert to usable link
usable_comment_links = []
for link in permalink_list:
    modified_link = link.replace("https://", "")
    usable_comment_links.append("https://oauth." + modified_link)

df.insert(df.columns.get_loc("permalink"), "comment_link", usable_comment_links)

we can now navigate through the variable content

In [8]:
all_contents = []
for link in tqdm(df["comment_link"]):
    # Send a GET request to the specified link with the necessary headers
    response = s.get(link, headers=headers)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Extract the content from the response
        # We only want element 1 because response returns data on Post (which we don't want) and Comment (which we want)
        content = response.json()[1]
        all_contents.append(content)
    else:
        print(f"Error: {response.status_code}")
        all_contents.append({})
    sleep(0.3)
# Save the consolidated content to a JSON file
with open("../data/test_comments.json", "w") as f:
    json.dump(all_contents, f)

100%|██████████| 2067/2067 [21:06<00:00,  1.63it/s]


## 3. 🎯Navigating through Content

In [9]:
# Specify the file path
file_path = "../data/test_comments.json"

# Open the file in read mode and use json.load to load the content into a variable
with open(file_path, "r") as f:
    content = json.load(f)

len(content)

2067

We realise that under ["children"] there exists other posts made by the OP which are not the recipe, i.e. replies to other commenters.  
***Assumption**: the longest comment made by the OP is most likely to be the one containing the actual recipe*

In [10]:
# Iterate over each dictionary in the list
ingredient_comment_list = []
for x in content:
    # Extract comments from each dictionary
    comments = [comment["data"]["body"] for comment in x["data"]["children"]]

    # Find the longest comment
    if len(comments) != 0:
        ingredient_comment = max(comments, key=len)
    else:
        ingredient_comment = ""
    # Print the longest comment for each dictionary
    ingredient_comment_list.append(ingredient_comment)
    # print(ingredient_comment)

df.insert(df.columns.get_loc("comment_link"), "ingredient_comment", ingredient_comment_list)

## 4. 🧹Data Cleanup

Some posts are not formatted properly or have been deleted. We will remove them from our dataframe by checking for newlines, which are present in all properly formatted recipes.

In [11]:
df = df[df['ingredient_comment'].str.contains("\n") == True]

df.shape

(1743, 115)

We clean up the posts.csv by removing irrelevant columns and renaming columns.

In [12]:
desired_columns = ['id',
                   'title',
                   'score',
                   'num_comments',
                   'created_utc', 
                   'upvote_ratio',
                   'link_flair_text',
                   'author',
                   'url',
                   'ingredient_comment',
                   'comment_link',
                   'permalink']

In [13]:
df.loc[:, desired_columns]

Unnamed: 0,id,title,score,num_comments,created_utc,upvote_ratio,link_flair_text,author,url,ingredient_comment,comment_link,permalink
0,18c2c0q,Classic Tiramisu Recipe (original Italian pizz...,22,5,1.701864e+09,0.83,Recipe,Altruistic_Set_3810,https://www.diyfoodhacks.com/classic-tiramisu-...,**INGREDIENTS**\n\n* ladyfinger cookies\n* 3 e...,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18c2c0q/...
2,18ajm70,"Stir Fry Supreme – Chives, cashews and Shrimp",102,9,1.701695e+09,0.91,Recipe,Served_With_Rice,https://i.redd.it/6vrftswiz94c1.jpeg,Full recipe : [https://servedwithrice.com/stir...,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18ajm70/...
3,18a88g3,Sous Vide Chicken and Potatoes,8,1,1.701651e+09,1.00,Recipe,hoosyourdaddyo,https://i.redd.it/rcgqae55e64c1.jpg,**Sous Vide Chicken and Potatoes**\n\n**Ingred...,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/18a88g3/...
4,189d72m,Chicken Riggies,3,2,1.701551e+09,1.00,Recipe,jimboslice122,https://i.redd.it/bn11tg3i5y3c1.jpg,Full recipe can be found:\n\nhttps://www.takes...,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/189d72m/...
5,1891x51,Polish Krokiety - Mushroom &amp; Sauerkraut Cr...,279,17,1.701517e+09,0.97,Recipe,mienczaczek,https://i.redd.it/lsazpcn8bv3c1.jpg,Check out my beloved recipe for Polish Krokiet...,https://oauth.reddit.com/r/recipes/comments/18...,https://reddit.com/r/recipes/comments/1891x51/...
...,...,...,...,...,...,...,...,...,...,...,...,...
2060,cz7pe8,Eggplant Chickpea Dip,9,1,1.567531e+09,0.81,Fruit\Vegetarian,mortoray,https://imgur.com/gjUBUU7,Recipe: [https://edaqaskitchen.com/recipe/eggp...,https://oauth.reddit.com/r/recipes/comments/cz...,https://reddit.com/r/recipes/comments/cz7pe8/e...
2061,cymk0i,End-Of-Summer Sesame Slaw,23,4,1.567415e+09,0.77,Fruit\Vegetarian,sweetpotatofamily,https://i.redd.it/8nehck9hb5k31.jpg,I just started a blog about food and homestead...,https://oauth.reddit.com/r/recipes/comments/cy...,https://reddit.com/r/recipes/comments/cymk0i/e...
2063,cwwfal,Restaurant Style Phool Gobhi Masala Recipe,21,1,1.567056e+09,0.88,Fruit\Vegetarian,mark30322,https://i.redd.it/ycwjgo0pnbj31.jpg,"Ingredients\n\n12 Cauliflower (gobi)\t, cut to...",https://oauth.reddit.com/r/recipes/comments/cw...,https://reddit.com/r/recipes/comments/cwwfal/r...
2064,csv234,Celery and Soy Stuffed Butternut Squash,7,1,1.566290e+09,0.71,Fruit\Vegetarian,mortoray,https://imgur.com/OyakVfz,Recipe: [https://edaqaskitchen.com/recipe/cele...,https://oauth.reddit.com/r/recipes/comments/cs...,https://reddit.com/r/recipes/comments/csv234/c...


In [14]:
filtered_df = df.loc[:, ['id', 'title', 'score', 'num_comments', 'created_utc', 'upvote_ratio']]
filtered_df

Unnamed: 0,id,title,score,num_comments,created_utc,upvote_ratio
0,18c2c0q,Classic Tiramisu Recipe (original Italian pizz...,22,5,1.701864e+09,0.83
2,18ajm70,"Stir Fry Supreme – Chives, cashews and Shrimp",102,9,1.701695e+09,0.91
3,18a88g3,Sous Vide Chicken and Potatoes,8,1,1.701651e+09,1.00
4,189d72m,Chicken Riggies,3,2,1.701551e+09,1.00
5,1891x51,Polish Krokiety - Mushroom &amp; Sauerkraut Cr...,279,17,1.701517e+09,0.97
...,...,...,...,...,...,...
2060,cz7pe8,Eggplant Chickpea Dip,9,1,1.567531e+09,0.81
2061,cymk0i,End-Of-Summer Sesame Slaw,23,4,1.567415e+09,0.77
2063,cwwfal,Restaurant Style Phool Gobhi Masala Recipe,21,1,1.567056e+09,0.88
2064,csv234,Celery and Soy Stuffed Butternut Squash,7,1,1.566290e+09,0.71


In [15]:
filtered_df = df.loc[:, desired_columns]
filtered_df.rename(columns={'url': 'image_link'}, inplace=True)
comment_df = filtered_df.pop('ingredient_comment')
filtered_df.insert(df.columns[-1], "ingredient_comment", comment_df)
filtered_df.to_csv('../data/posts_with_comments.csv', index=False)

TypeError: loc must be int