# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [19]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

import spacy
# import plotnine
# import altair
# import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Authenticate with Reddit API

Using a function defined in our `utils.py` script, we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [20]:
headers = chadtools.authenticate_and_get_headers()
headers

{'Authorization': 'bearer eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxOTEzMzg4LjU4NjMyOSwiaWF0IjoxNzAxODI2OTg4LjU4NjMyOSwianRpIjoiR2JXckJLUE9HZkM5NzBMekxJV3dlYnpVRjJBS3l3IiwiY2lkIjoibWhUbV82eEVUNzVkOWhmWkJrS0ZYQSIsImxpZCI6InQyXzE2ZmE0MiIsImFpZCI6InQyXzE2ZmE0MiIsImxjYSI6MTQ5MDI0NzcyNzAxMSwic2NwIjoiZUp5S1Z0SlNpZ1VFQUFEX193TnpBU2MiLCJmbG8iOjl9.hlm8N_Lcpoq5_ckZYCsYAJGccPzFTgH7ScVddiZ4rmC-wWlzW9X7kQ7BpiI74vyyOhR02fzUyXKS59vsIopj7qdFOCAOeF--UPeBu-GZG2M4vTZUZBg2TNZA3crGrC-heLVeuVVdJ562px8u4zVbY2De-Nl6oHhX2jI4tXCbx0ITZ5yCLrQUGHrjv4iSIrf9rlTq2Xsb6uRQz2WVt_zu1mdNlnf1v_OIGUD7OUmu2xN5vRVU0IT_oDDPNNDJFY6Sj9csoNDLDPb9EyR95NWqzVALxnAeJ2Y5shuR6iY0vhIAsZG3k9U593lT8nKKpVOEP64_v6yfOMBOPtN8pukg0Q',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by zichengliu'}

## 2. 🎯Sending our GET requests

### 2.1 Prepare GET request

In [21]:
s = r.Session()
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\Vegetarian']
subreddit_name = 'recipes'

# flair_query = ' OR '.join(f'flair_name:"{flair}"' for flair in flair_name)
flair_query = 'flair_name:"Recipe"'

# specify earliest time to search from
specific_date_time = datetime(2020, 8, 31, 10, 59, 0) # Aug 31 2020 is the earliest date
timestamp = int(specific_date_time.timestamp())
params = {'q': flair_query,
          'limit': 100,
          'restrict_sr': 1,
          'sort': 'new',
          'timestamp': timestamp}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

In [22]:
print(response.json())

{'kind': 'Listing', 'data': {'modhash': None, 'dist': 100, 'facets': {}, 'after': 't3_15ok2bb', 'geo_filter': '', 'children': [{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'recipes', 'selftext': '', 'author_fullname': 't2_g90hdupc', 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Orange Cookies 🍊🧡', 'link_flair_richtext': [{'e': 'text', 't': 'Recipe'}], 'subreddit_name_prefixed': 'r/recipes', 'hidden': False, 'pwls': 6, 'link_flair_css_class': 'recipe', 'downs': 0, 'thumbnail_height': 93, 'top_awarded_type': None, 'hide_score': False, 'name': 't3_18b3ir1', 'quarantine': False, 'link_flair_text_color': 'dark', 'upvote_ratio': 0.96, 'author_flair_background_color': None, 'ups': 115, 'total_awards_received': 0, 'media_embed': {}, 'thumbnail_width': 140, 'author_flair_template_id': None, 'is_original_content': False, 'user_reports': [], 'secure_media': None, 'is_reddit_media_domain': True, 'is_meta': False, 'category': None, 'secure_media

### 2.2 Paginate through all search results

We will be using the `after` ID given by the reddit API to paginate through until the last post matching the search query.

In [23]:
# Initialize an empty list to store the data from all pages
all_data_in_recipe_flair = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_recipe_flair.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_recipe_flair) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_recipe_flair.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3


In [24]:
len(all_data_in_recipe_flair)

249

## 3. 🎯Saving the data 

### 3.1 Save the data as a JSON file 

In [25]:
with open("../data/all_data_flair_is_recipe.json", "w") as f:
    json.dump(all_data_in_recipe_flair, f)

### 3.2 Load the JSON file as a Python dictionary

In [26]:
with open("../data/all_data_flair_is_recipe.json", "r") as file:
    posts = json.load(file)

### 3.3 Create a dataframe of all posts 

Comment: @yuyaobai add some comments in the cells below to explain the code a bit more

In [27]:
df_posts = pd.json_normalize(posts, max_level=0)
df_posts['data'] = df_posts['data'].apply(lambda x: {} if pd.isna(x) else x)    # handle NaN values

#explanation needed here
df_posts = pd.concat([df_posts.drop(['data'], axis=1), pd.json_normalize(df_posts['data'])], axis=1)
selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = df_posts[selected_cols].copy()
df_posts['permalink'] = "https://reddit.com" + df_posts['permalink']     # add prefix to each permalink 

df_posts.head()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Orange Cookies 🍊🧡,1701750000.0,115,0,0.96,115,1,False,https://reddit.com/r/recipes/comments/18b3ir1/...,https://i.redd.it/37t5h7ssje4c1.jpg
1,"Stir Fry Supreme – Chives, cashews and Shrimp",1701695000.0,91,0,0.9,91,7,False,https://reddit.com/r/recipes/comments/18ajm70/...,https://i.redd.it/6vrftswiz94c1.jpeg
2,Sous Vide Chicken and Potatoes,1701651000.0,1,0,1.0,1,1,False,https://reddit.com/r/recipes/comments/18a88g3/...,https://i.redd.it/rcgqae55e64c1.jpg
3,Polish Krokiety - Mushroom &amp; Sauerkraut Cr...,1701517000.0,274,0,0.97,274,17,False,https://reddit.com/r/recipes/comments/1891x51/...,https://i.redd.it/lsazpcn8bv3c1.jpg
4,Festive Southern Jalapeno Pimento Cheese Dip,1701263000.0,177,0,0.95,177,11,False,https://reddit.com/r/recipes/comments/186osjd/...,https://i.redd.it/i0lvs10aba3c1.jpeg


### 3.4 Remove duplicates

We identify duplicates by checking the permalink of the pages. Comment: is this step redundant? @yuyaobai

In [28]:
duplicate_posts = df_posts[df_posts.duplicated(subset='permalink', keep=False)]
duplicate_posts

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url


In [29]:
duplicate_posts.to_csv('../data/duplicates.csv', index = False)

### Remove non-English posts

To make the data easier to analyse using NLP techniques, we will filter out posts that are not in English.

In [30]:
# load the English language model into spacy
nlp = spacy.load("en_core_web_sm")

# filter the english posts by applying custom function
filtered_df_posts = df_posts[df_posts['title'].apply(chadtools.is_english, model=nlp)]

filtered_df_posts.tail()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
244,Zurek - Polish Easter Soup,1680433000.0,378,0,0.98,378,18,True,https://reddit.com/r/recipes/comments/129hxb0/...,https://i.redd.it/p3b62d3mdgra1.jpg
245,Sweet &amp; Crunchy Maple Popcorn,1680362000.0,603,0,0.97,603,24,True,https://reddit.com/r/recipes/comments/128p3mz/...,https://i.redd.it/883p3zahgara1.jpg
246,Matcha Fudge Chocolate Chip Marble Cookies (Re...,1680281000.0,1407,0,0.97,1407,28,False,https://reddit.com/r/recipes/comments/127rf0v/...,https://i.redd.it/5r1vvzc7t3ra1.jpg
247,Greek Yogurt Chicken Salad with Apples and Alm...,1680043000.0,23,0,0.97,23,2,False,https://reddit.com/r/recipes/comments/1254xfn/...,https://i.redd.it/kbpojb4e5kqa1.jpg
248,Louisiana Crawfish Boil,1680008000.0,1212,0,0.96,1212,56,True,https://reddit.com/r/recipes/comments/124nr2l/...,https://i.redd.it/sgmiy9z18hqa1.jpg


### 3.4 Save dataframe as a CSV file

In [31]:
filtered_df_posts.to_csv('../data/posts.csv', index = False)

In [32]:
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''

'\n# Specify the subreddit and flair\nsubreddit_name = \'recipes\'\nflair_name = \'recipe\'  # Change this to the desired flair\n\n# Reddit API endpoint for searching posts in a subreddit\nurl = f\'https://www.reddit.com/r/{subreddit_name}/search.json\'\n\n# Define parameters for the search query\nparams = {\n    \'q\': f\'flair_name:"{flair_name}"\',\n    \'restrict_sr\': \'on\',  # Restrict the search to the specified subreddit\n    \'sort\': \'new\',       # Sort by new to get all posts\n    \'syntax\': \'cloudsearch\'\n}\n\n# Make the API request\nresponse = s.get(url, params=params, headers={\'User-agent\': \'your_user_agent\'})\n\n# Check if the request was successful (status code 200)\nif response.status_code == 200:\n    # Parse the JSON response\n    data = response.json()\n    \n    # Get the number of posts\n    num_posts = data[\'data\'][\'dist\']\n    \n    print(f"Number of posts with \'{flair_name}\' flair in r/{subreddit_name}: {num_posts}")\nelse:\n    print(f"Error: {