# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [32]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. 🎯Load credentials

This loads the `credentials.json` file in each of our local repos.

In [33]:
credentials_file_path = "../credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

## 2. 🎯Obtaining a token

In [34]:
s = r.Session()

# Set up authentication parameters 
client_auth = r.auth.HTTPBasicAuth(credentials["app_client_id"], credentials["app_client_secret"])

# Send, via HTTP POST, your Reddit username and password
post_data = {"grant_type": "password",
             "username": credentials["reddit_username"],
             "password": credentials["reddit_password"]}

# Reddit API requests that we self-identify ourselves in the User-Agent
headers = {"User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

In [35]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST 
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxNTIxODk1LjY2MjMxOSwiaWF0IjoxNzAxNDM1NDk1LjY2MjMxOSwianRpIjoidUR2Z3BTU3dxQ0FvSHhfcXBDU3BYSTdUSjg1OHBRIiwiY2lkIjoiZmVpckFYYmVWakEzOFN3cVRQT05LdyIsImxpZCI6InQyX2hleXBhN2ZhIiwiYWlkIjoidDJfaGV5cGE3ZmEiLCJsY2EiOjE2MzkxMjU5ODA5NzQsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.EZttUZyzDxhC9QtUapkPv03uWON0YH72972D9dWHDtq7W76CmI4aPOWmH_OI-Q8yZks8rU0RMKhaX2u-S-XhvGFBYGZhhGADQCs3dv0ncByg_Fn2ILdjl9Zf1NKSK7yJIQ70TkmLVAw5nYw8eCG-Abdzc8XWIJYClWH1Aoj8fKqPR6MUNT1xsCzN4-s_V0j0HCAJthQduKCaXsRLxx2v4yiyVs8tAn3AQBmcjp3yNiUenPub8PiHLVKaeTceq8IwihPV3i6ro7eLjwzzLG50es9IbXjpfQMhLnGUjIi3HXhqfIgPAU5E_1HQ0FV48Zp1aWMebyU0Hk6sqIs-6KIcAw',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [36]:
my_token = response.json()['access_token']

From now on, all requests need to be followed by these HTTP HEADERS:

In [37]:
headers = {"Authorization": f"bearer {my_token}",
           "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

## 3. 🎯Sending our first request

We will limit our search to 3 posts first, to test whether our GET request works.

In [38]:
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

params = {'q': f'flair_name:"{flair_name}"',
          'limit': 100,
          'restrict_sr': 1,
          'sort': 'new'}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

# response.json()

We will try paginating 3 times first, before increasing the number of page or paginating to the end.

In [39]:
# Initialize an empty list to store the data from all pages
all_data = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

# while data['data']['after'] is not None:
for i in range(3):
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {i+2}")
    data = response.json()

    # Process the data from the current page
    all_data.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3
Requesting Page 4


In [40]:
len(all_data)

350

## 4. 🎯Saving the data to JSON

In [41]:
with open("../data/all_data_flair_is_recipe.json", "w") as f:
    json.dump(all_data, f)

In [42]:
with open("../data/all_data_flair_is_recipe.json", "r") as file:
    posts = json.load(file)

In [43]:
df_posts = pd.json_normalize(posts, max_level=0)

df_posts['data'] = df_posts['data'].apply(lambda x: {} if pd.isna(x) else x)  # handle NaN values
df_posts = pd.concat([df_posts.drop(['data'], axis=1), pd.json_normalize(df_posts['data'])], axis=1)

selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = df_posts[selected_cols].copy()

df_posts['permalink'] = df_posts['permalink'].apply(lambda x: 'reddit.com' + x)     # add prefix to each permalink 

df_posts

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Festive Southern Jalapeno Pimento Cheese Dip,1.701263e+09,140,0,0.95,140,9,False,reddit.com/r/recipes/comments/186osjd/festive_...,https://i.redd.it/i0lvs10aba3c1.jpeg
1,Quick &amp; Easy Nut Brittle,1.701206e+09,99,0,0.94,99,10,False,reddit.com/r/recipes/comments/1866xrq/quick_ea...,https://i.redd.it/elfhdi81n53c1.jpg
2,Green Borshch,1.700952e+09,53,0,0.93,53,9,False,reddit.com/r/recipes/comments/183vmzc/green_bo...,https://i.redd.it/s8aaslwrnk2c1.jpg
3,Leftover Turkey and Pastina Soup,1.700944e+09,250,0,0.94,250,13,False,reddit.com/r/recipes/comments/183st4x/leftover...,https://i.redd.it/0p0rwigu0k2c1.jpg
4,Nasi Goreng (Indonesian Fried Rice) - My famil...,1.700743e+09,70,0,0.97,70,11,False,reddit.com/r/recipes/comments/1820b5b/nasi_gor...,https://i.redd.it/wj2shpc2e32c1.jpg
...,...,...,...,...,...,...,...,...,...,...
345,Roast Squash and Apples,1.691972e+09,95,0,0.91,95,2,False,reddit.com/r/recipes/comments/15qep4i/roast_sq...,https://i.redd.it/5g34xvimwyhb1.jpg
346,One-Pot Chickpeas and Rice,1.691786e+09,130,0,0.99,130,9,False,reddit.com/r/recipes/comments/15ok2bb/onepot_c...,https://i.redd.it/bpyflftlljhb1.jpg
347,Lemon Polenta Cake (Recipe),1.691780e+09,81,0,0.97,81,4,False,reddit.com/r/recipes/comments/15ohf16/lemon_po...,https://i.redd.it/u52poxxm3jhb1.png
348,Korean style Prawn Scallion Pancakes,1.691764e+09,95,0,0.98,95,10,False,reddit.com/r/recipes/comments/15oaixi/korean_s...,https://i.redd.it/p9lf4a2xrhhb1.jpg


In [44]:
df_posts.to_csv('../data/posts.csv', index = False)

In [45]:
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''

Number of posts with 'recipe' flair in r/recipes: 25
