# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [3]:
import sys
import requests as r

import pandas as pd
from datetime import datetime

from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Authenticate with Reddit API

Using a function defined in our `chadtools` module (imported above from `../scripts/`), we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [5]:
# Authenticate once; the resulting headers are sent with every GET request below.
headers = chadtools.authenticate_and_get_headers()

# Show a redacted copy: echoing `headers` directly would store the live bearer
# token in the notebook's saved output.
{name: ('<REDACTED>' if name.lower() == 'authorization' else value) for name, value in headers.items()}

{'Authorization': '<REDACTED>',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by zichengliu'}

## 2. 🎯Sending our GET requests

### 2.1 Prepare GET request for all Flairs + Paginate through all search results

To get all the posts from the subreddit, we will iterate through a list of all the flairs in the subreddit and send a GET request for each flair. We will then paginate through the search results using the `after` ID given by Reddit's API to get all the posts for each flair.

In [6]:
s = r.Session()
BASE_ENDPOINT = "https://oauth.reddit.com"
# Raw string for the last flair: it contains a literal backslash, and a bare
# '\V' is an invalid escape sequence that newer Python versions warn about.
flair_names = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', r'Fruit\Vegetarian']
subreddit_name = 'recipes'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell


def fetch_posts_for_flair(session, flair, request_headers, subreddit=subreddit_name,
                          page_size=100, timeout=REQUEST_TIMEOUT):
    """Collect every search result for one flair, following the `after` cursor.

    Args:
        session: `requests.Session` used to send the GET requests.
        flair: Flair name to search for (sent as `flair_name:"<flair>"`).
        request_headers: Headers sent with every request (the authenticated headers).
        subreddit: Subreddit whose search endpoint is queried.
        page_size: Value of the `limit` query parameter for each page.
        timeout: Per-request timeout in seconds.

    Returns:
        A list with the `children` entries of every page, in the order received.

    Raises:
        requests.HTTPError: If any page comes back with an error status code.
    """
    search_url = f"{BASE_ENDPOINT}/r/{subreddit}/search"
    params = {
        'q': f'flair_name:"{flair}"',
        'limit': page_size,
        'restrict_sr': 1,
        'sort': 'new'
    }

    posts = []
    while True:
        response = session.get(search_url, params=params, headers=request_headers, timeout=timeout)
        # Fail loudly on 401/429/5xx instead of hitting a confusing KeyError below.
        response.raise_for_status()
        listing = response.json()['data']
        posts.extend(listing['children'])

        # A missing/None cursor means this was the last page. An empty page is
        # also treated as the end so a repeated cursor cannot loop forever.
        after_id = listing.get('after')
        if after_id is None or not listing['children']:
            break
        params['after'] = after_id

    return posts


all_data_for_all_flairs = []
all_data_by_flair = {}

for flair in tqdm(flair_names):
    # Keep the per-flair results as well as the combined list of all posts.
    all_data_by_flair[flair] = fetch_posts_for_flair(s, flair, headers)
    all_data_for_all_flairs.extend(all_data_by_flair[flair])

len(all_data_for_all_flairs)

100%|██████████| 10/10 [00:26<00:00,  2.63s/it]


2065

## 3. 🎯Saving the data 

### 3.1 Create a dataframe of all posts 

In [4]:
# Every scraped item keeps the post itself under its 'data' key. Turn those
# payloads into columns without flattening nested values (max_level=0).
post_payloads = pd.DataFrame(all_data_for_all_flairs)['data']
df_posts = pd.json_normalize(post_payloads, max_level=0).assign(
    # Prefix the site address so each permalink is a full URL.
    permalink=lambda posts: "https://reddit.com" + posts['permalink']
)

df_posts.tail()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,is_gallery,media_metadata,gallery_data,author_cakeday,poll_data,author_cakeday.1,crosspost_parent_list,crosspost_parent
2060,,recipes,,t2_71qg7,False,,0,False,Eggplant Chickpea Dip,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,,
2061,,recipes,,t2_3hz99hdf,False,,0,False,End-Of-Summer Sesame Slaw,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,,
2062,,recipes,,t2_3ftl8yf0,False,,0,False,Bhindi,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,,
2063,,recipes,,t2_3ftl8yf0,False,,0,False,Restaurant Style Phool Gobhi Masala Recipe,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,,
2064,,recipes,,t2_71qg7,False,,0,False,Celery and Soy Stuffed Butternut Squash,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,,


In [11]:
# Summarise the frame: number of rows and columns, dtype counts and memory usage.
df_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2065 entries, 0 to 2064
Columns: 113 entries, approved_at_utc to crosspost_parent
dtypes: bool(30), float64(3), int64(10), object(70)
memory usage: 1.4+ MB


### 3.2 Save dataframe as a JSON file

In [5]:
# Write the posts to disk as a list of row records, pretty-printed with a
# 4-space indent. The HTML section below reads this file back.
df_posts.to_json('../data/posts.json', orient='records', indent=4)

### 3.3 Saving Dataframe as HTML for webpage

In [4]:
# Reload the posts from the JSON file saved in section 3.2.
# NOTE(review): presumably this lets the section run without repeating the scrape - confirm.
df_posts = pd.read_json('../data/posts.json')

In [11]:
# Style a 10-row preview of the posts for the project webpage.
df_posts_styled = (
    df_posts.head(10)
    .style
    .set_table_styles([
        {'selector': 'thead', 'props': [('background-color', '#f2f2f2')]},
        {'selector': 'th', 'props': [('border', '1px solid #dddddd')]},
        {'selector': 'td', 'props': [('border', '1px solid #dddddd')]},
    ])
    # `render_links=True` and `index=False` are DataFrame.to_html options.
    # Styler.to_html forwards unknown keyword arguments to its template, where
    # they have no effect, so the same intent is expressed on the Styler itself.
    .format(hyperlinks='html')   # turn URL strings into clickable <a> tags
    .hide(axis='index')          # leave the row index out of the table
)

df_posts_styled.to_html('../docs/posts.html')