# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [1]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

import spacy
# import plotnine
# import altair
# import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Authenticate with Reddit API

Using a function defined in our own `chadtools` module (imported above from `../scripts/`), we can authenticate with the Reddit API using our own `credentials.json` file and get a `dict` of headers to be used in all subsequent GET requests.

In [2]:
headers = chadtools.authenticate_and_get_headers()

# Confirm the headers were built without echoing the bearer token: cell outputs
# are stored inside the notebook file, so displaying `headers` directly would
# leak the credential to anyone who can read the notebook.
{
    name: "<redacted>" if name.lower() == "authorization" else value
    for name, value in headers.items()
}

{'Authorization': '<redacted>',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by zichengliu'}

## 2. 🎯Sending our GET requests

### 2.1 Prepare GET request for all Flairs + Paginate through all search results

We loop over every flair with a `for` loop and send one search query per flair.

We use the `after` ID returned by the Reddit API to paginate through the results until we reach the last post matching the search query.

In [3]:
s = r.Session()
BASE_ENDPOINT = "https://oauth.reddit.com"
# The last flair name contains a literal backslash. It is written as "\\" so that
# Python does not parse "\V" as an (invalid) escape sequence.
flair_names = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\\Vegetarian']
subreddit_name = 'recipes'

# Computed once: the value does not depend on the flair being queried.
# NOTE(review): the datetime is naive, so `.timestamp()` is interpreted in the
# local timezone of the machine running the notebook.
specific_date_time = datetime(2020, 8, 31, 10, 59, 0)
timestamp = int(specific_date_time.timestamp())


def _fetch_all_posts_for_flair(session, flair, request_headers):
    """Return every search result for one flair, following `after` pagination.

    Parameters
    ----------
    session : requests.Session
        Session used to send the GET requests.
    flair : str
        Flair name to search for within `subreddit_name`.
    request_headers : dict
        Headers (including the OAuth bearer token) sent with every request.

    Returns
    -------
    list
        The concatenated `data.children` entries of every result page.

    Raises
    ------
    requests.HTTPError
        If any page request returns a 4xx/5xx status.
    """
    search_url = f"{BASE_ENDPOINT}/r/{subreddit_name}/search"
    params = {
        'q': f'flair_name:"{flair}"',
        'limit': 100,
        'restrict_sr': 1,
        'sort': 'new',
        # NOTE(review): `timestamp` does not appear among the documented
        # parameters of the search endpoint; kept as-is, confirm it has an effect.
        'timestamp': timestamp
    }

    children = []
    while True:
        response = session.get(search_url, params=params, headers=request_headers)
        # Fail loudly on HTTP errors instead of hitting a confusing KeyError /
        # JSON decoding error further down.
        response.raise_for_status()
        listing = response.json()['data']
        children.extend(listing['children'])

        # `after` is missing or None once the last page has been reached.
        after_id = listing.get('after')
        if after_id is None:
            break
        params['after'] = after_id

    return children


all_data_for_all_flairs = []
all_data_by_flair = {}

for flair in flair_names:
    all_data_by_flair[flair] = _fetch_all_posts_for_flair(s, flair, headers)
    all_data_for_all_flairs.extend(all_data_by_flair[flair])

len(all_data_for_all_flairs)

2066

## 3. 🎯Saving the data 

### 3.1 Save the data as a JSON file 

In [4]:
# Write the combined list of raw posts to disk so that later steps can reload it
# without querying the Reddit API again.
with open("../data/all_data_for_all_flairs.json", mode="w") as raw_out:
    json.dump(obj=all_data_for_all_flairs, fp=raw_out)

### 3.2 Load the JSON file as a Python list of dictionaries

In [5]:
# Read the saved file back and parse it into Python objects.
with open("../data/all_data_for_all_flairs.json", mode="r") as raw_in:
    posts = json.loads(raw_in.read())

### 3.3 Create a dataframe of all posts 

In [6]:
# Each entry of `posts` keeps the post fields under its 'data' key; expand that
# level into columns (max_level=0 leaves nested values unflattened) and turn the
# relative permalink into a full URL.
# All columns are kept here; no column subset is selected at this stage.
df_posts = (
    pd.json_normalize(pd.DataFrame(posts)['data'], max_level=0)
    .assign(permalink=lambda frame: "https://reddit.com" + frame['permalink'])
)
df_posts.tail()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,is_gallery,media_metadata,gallery_data,poll_data,crosspost_parent_list,crosspost_parent,author_cakeday
2061,,recipes,,t2_71qg7,False,,0,False,Eggplant Chickpea Dip,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,
2062,,recipes,,t2_3hz99hdf,False,,0,False,End-Of-Summer Sesame Slaw,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,
2063,,recipes,,t2_3ftl8yf0,False,,0,False,Bhindi,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,
2064,,recipes,,t2_3ftl8yf0,False,,0,False,Restaurant Style Phool Gobhi Masala Recipe,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,
2065,,recipes,,t2_71qg7,False,,0,False,Celery and Soy Stuffed Butternut Squash,"[{'e': 'text', 't': 'Fruit\Vegetarian'}]",...,0,,False,,,,,,,


### Remove non-English posts

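To make the data easier to analyse using NLP techniques, we will filter out posts that are not in English.

> **TODO:** no filtering code is present in this section yet, so the dataframe saved in 3.4 below still contains all scraped posts.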

### 3.4 Save dataframe as a JSON Lines file (one JSON record per line)

In [7]:
# Write one JSON object per line, one line per post.
df_posts.to_json(
    path_or_buf='../data/posts.json',
    lines=True,
    orient='records',
)