# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [15]:
import json
import sys
from pathlib import Path
from pprint import pprint

import numpy
import pandas as pd
import requests as r
from scrapy import Selector
from tqdm import tqdm

# import plotnine
# import altair
# import matplotlib.pyplot as plt

# Import our own modules
sys.path.append("../scripts/")
import scripts.utils as u

## 1. 🎯Authenticate with Reddit API

Using a function defined in our `utils.py` script, we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [16]:
headers = u.authenticate_and_get_headers()

# Display a redacted copy only: the Authorization value is a bearer token, and
# echoing `headers` directly would store it in the saved notebook output.
{
    name: "bearer <REDACTED>" if name.lower() == "authorization" else value
    for name, value in headers.items()
}

{'Authorization': 'bearer <REDACTED>',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by zichengliu'}

## 2. 🎯Sending our first request

We will limit our search to 3 posts first, to test whether our GET request works.

In [17]:
# One session is shared by every request in this notebook.
s = r.Session()

BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

# Query-string parameters for the search endpoint:
#   q           - search query, here filtering on the post flair
#   limit       - posts per page (kept small while testing the request)
#   restrict_sr - keep the search inside `subreddit_name`
#   sort        - ordering of the results
params = {'q': f'flair_name:"{flair_name}"',
          'limit': 3,
          'restrict_sr': 1,
          'sort': 'new'}

# timeout (seconds): without one, a stalled connection would hang the cell indefinitely.
response = s.get(
    f"{BASE_ENDPOINT}/r/{subreddit_name}/search",
    params=params,
    headers=headers,
    timeout=30,
)

# Fail right here with the HTTP status if the request was rejected (for example
# because the token expired), rather than later on an unexpected JSON body.
response.raise_for_status()

## 3. 🎯Paginating through the results

We will first fetch 3 pages in total (the page requested above plus 2 more), before increasing the number of pages or paginating to the end.

In [18]:
# Number of pages to request after the first one (3 pages in total for this trial run).
N_EXTRA_PAGES = 2

# Page 1 was fetched by the previous request; start the collection from it.
data = response.json()
all_data = list(data['data']['children'])

for i in range(N_EXTRA_PAGES):
    # `after` is the cursor pointing at the next page; None means there is no next page.
    after_id = data['data']['after']
    if after_id is None:
        # Requesting anyway would drop the None-valued `after` parameter and
        # fetch page 1 again, silently duplicating posts in `all_data`.
        print("No more pages available")
        break

    print(f"Requesting Page {i+2}")

    # Use per-page copies and leave `params` / `response` untouched, so that
    # re-running this cell always starts again from page 1.
    page_params = {**params, "after": after_id}
    page_response = s.get(
        f"{BASE_ENDPOINT}/r/{subreddit_name}/search",
        params=page_params,
        headers=headers,
        timeout=30,  # seconds; avoids hanging forever on a stalled connection
    )
    # Stop on an HTTP error instead of failing on an unexpected JSON body below.
    page_response.raise_for_status()
    data = page_response.json()

    # Add the posts from the current page to the collection.
    all_data.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3


In [19]:
# Sanity check: total number of items collected across all requested pages.
len(all_data)

9

## 4. 🎯Saving the data to JSON

In [20]:
output_path = Path("../data/all_data_flair_is_recipe.json")

# Create the target folder if needed so the save does not fail when `../data/`
# does not exist yet (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(all_data, f)