# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [18]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

import spacy
# import plotnine
# import altair
# import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Authenticate with Reddit API

Using a function defined in our `utils.py` script, we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [19]:
headers = chadtools.authenticate_and_get_headers()
headers

{'Authorization': 'bearer eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxOTA2MTYyLjQ1NzI2NiwiaWF0IjoxNzAxODE5NzYyLjQ1NzI2NiwianRpIjoia1RJQ2Z3a25iVmV3OTZ6eTVqWkFCSjlGN2hUbEtBIiwiY2lkIjoibWhUbV82eEVUNzVkOWhmWkJrS0ZYQSIsImxpZCI6InQyXzE2ZmE0MiIsImFpZCI6InQyXzE2ZmE0MiIsImxjYSI6MTQ5MDI0NzcyNzAxMSwic2NwIjoiZUp5S1Z0SlNpZ1VFQUFEX193TnpBU2MiLCJmbG8iOjl9.Ke2WGFXH-AQpDV2CFPZOQmuS_CxrDxkjSXExoXJXb_y7Oh1OLyiwtpHbRinNWKJDlfz_hS8W9ExiwrfZzE2_ilaju_uopZoHNxHXZsaK6R--0pIDvsA9Eb0iTnOBlWduBVSUyUN55sHxKCuIDbcWhQ6C_QBEqKYuipPxC7vea9sBTItJVQt8pDWCIAsmM1W0MfGcr5sYzkBvJ32E-tnSCmlmD2dzArT5DnjWuLzGQFL035yWp20dAB4mJJjNavcNf7tDWfwlQ5MmX0pB7iYbly_Tpqz_EV9CbdqGK2SsY1ybx7msyDMo9ZmL_Uq8AHpUWBFgNsRh6T9vEdwq6Z9-5w',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by zichengliu'}

## 2. 🎯Sending our first request

In [20]:
s = r.Session()
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\Vegetarian']
subreddit_name = 'recipes'
flair_query = ' OR '.join(f'flair_name:"{flair}"' for flair in flair_name)

# specify earliest time to search from
specific_date_time = datetime(2020, 8, 31, 10, 59, 0)
timestamp = int(specific_date_time.timestamp())

params = {'q': flair_query,
          'limit': 100,
          'restrict_sr': 0,
          'sort': 'new',
          'timestamp': timestamp}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

In [21]:
print(response.json())

{'kind': 'Listing', 'data': {'modhash': None, 'dist': 100, 'facets': {}, 'after': 't3_18a27ou', 'geo_filter': '', 'children': [{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'PhillyWiki', 'selftext': 'From Smalls, to Shamecca, &amp; Tiffany?', 'author_fullname': 't2_a4oeoo0gd', 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Why do you guys think so many woman in Philly are getting shot and murdered?', 'link_flair_richtext': [{'e': 'text', 't': 'BEEF'}], 'subreddit_name_prefixed': 'r/PhillyWiki', 'hidden': False, 'pwls': None, 'link_flair_css_class': '', 'downs': 0, 'thumbnail_height': None, 'top_awarded_type': None, 'hide_score': True, 'name': 't3_18bp6jf', 'quarantine': False, 'link_flair_text_color': 'dark', 'upvote_ratio': 0.25, 'author_flair_background_color': None, 'subreddit_type': 'public', 'ups': 0, 'total_awards_received': 0, 'media_embed': {}, 'thumbnail_width': None, 'author_flair_template_id': None, 'is_original_content': F

In [22]:
# Initialize an empty list to store the data from all pages
all_data_in_subreddit = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_subreddit.extend(data['data']['children'])

# Continue paginating until there is no more data 
while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_subreddit) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_subreddit.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3


In [23]:
len(all_data_in_subreddit)

249

We will limit our search to 3 posts first, to test whether our GET request works.

In [24]:
s = r.Session()

BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

params = {'q': f'flair_name:"{flair_name}"',
          'limit': 100,
          'restrict_sr': 1,
          'sort': 'new'}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

# response.json()

We will try paginating 3 times first, before increasing the number of page or paginating to the end.

In [25]:
'''# Initialize an empty list to store the data from all pages
all_data = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

# while data['data']['after'] is not None:
for i in range(3):
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {i+2}")
    data = response.json()

    # Process the data from the current page
    all_data.extend(data['data']['children'])'''

'# Initialize an empty list to store the data from all pages\nall_data = []\n\n#page 01 data\ndata = response.json()\n\n# Process the data from the first page\nall_data.extend(data[\'data\'][\'children\'])\n\n# Continue paginating until there is no more data (or paginate for a set number of times)\n\n# while data[\'data\'][\'after\'] is not None:\nfor i in range(3):\n    after_id = data[\'data\'][\'after\']\n    params["after"] = after_id\n    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)\n    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)\n    print(f"Requesting Page {i+2}")\n    data = response.json()\n\n    # Process the data from the current page\n    all_data.extend(data[\'data\'][\'children\'])'

In [26]:
# Initialize an empty list to store the data from all pages
all_data_in_recipe_flair = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_recipe_flair.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_recipe_flair) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_recipe_flair.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3


In [27]:
len(all_data_in_recipe_flair)

249

## 3. 🎯Saving the data 

### 3.1 Saving the data as a JSON file 

In [28]:
with open("../data/all_data_flair_is_recipe.json", "w") as f:
    json.dump(all_data_in_recipe_flair, f)

### 3.2 Load the JSON file as a Python dictionary

In [29]:
with open("../data/all_data_flair_is_recipe.json", "r") as file:
    posts = json.load(file)

### 3.3 Create a dataframe of all posts 

In [30]:
df_posts = pd.json_normalize(posts, max_level=0)

df_posts['data'] = df_posts['data'].apply(lambda x: {} if pd.isna(x) else x)    # handle NaN values
df_posts = pd.concat([df_posts.drop(['data'], axis=1), pd.json_normalize(df_posts['data'])], axis=1)

selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = df_posts[selected_cols].copy()

df_posts['permalink'] = "reddit.com" + df_posts['permalink']     # add prefix to each permalink 

df_posts

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Orange Cookies 🍊🧡,1.701750e+09,103,0,0.96,103,1,False,reddit.com/r/recipes/comments/18b3ir1/orange_c...,https://i.redd.it/37t5h7ssje4c1.jpg
1,"Stir Fry Supreme – Chives, cashews and Shrimp",1.701695e+09,91,0,0.90,91,7,False,reddit.com/r/recipes/comments/18ajm70/stir_fry...,https://i.redd.it/6vrftswiz94c1.jpeg
2,Sous Vide Chicken and Potatoes,1.701651e+09,1,0,1.00,1,1,False,reddit.com/r/recipes/comments/18a88g3/sous_vid...,https://i.redd.it/rcgqae55e64c1.jpg
3,Polish Krokiety - Mushroom &amp; Sauerkraut Cr...,1.701517e+09,274,0,0.97,274,17,False,reddit.com/r/recipes/comments/1891x51/polish_k...,https://i.redd.it/lsazpcn8bv3c1.jpg
4,Festive Southern Jalapeno Pimento Cheese Dip,1.701263e+09,177,0,0.95,177,11,False,reddit.com/r/recipes/comments/186osjd/festive_...,https://i.redd.it/i0lvs10aba3c1.jpeg
...,...,...,...,...,...,...,...,...,...,...
244,Zurek - Polish Easter Soup,1.680433e+09,381,0,0.98,381,18,True,reddit.com/r/recipes/comments/129hxb0/zurek_po...,https://i.redd.it/p3b62d3mdgra1.jpg
245,Sweet &amp; Crunchy Maple Popcorn,1.680362e+09,606,0,0.97,606,24,True,reddit.com/r/recipes/comments/128p3mz/sweet_cr...,https://i.redd.it/883p3zahgara1.jpg
246,Matcha Fudge Chocolate Chip Marble Cookies (Re...,1.680281e+09,1404,0,0.97,1404,28,False,reddit.com/r/recipes/comments/127rf0v/matcha_f...,https://i.redd.it/5r1vvzc7t3ra1.jpg
247,Greek Yogurt Chicken Salad with Apples and Alm...,1.680043e+09,23,0,0.97,23,2,False,reddit.com/r/recipes/comments/1254xfn/greek_yo...,https://i.redd.it/kbpojb4e5kqa1.jpg


In [31]:
duplicate_posts = df_posts[df_posts.duplicated(subset='permalink', keep=False)]
duplicate_posts

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url


In [32]:
duplicate_posts.to_csv('../data/duplicates.csv', index = False)

In [33]:

'''
FAILED method
# Function to check if text is in English
def is_english(text):
    language, confidence = langid.classify(text)
    return language == 'en' and confidence > 0.00000000001  # Adjust confidence threshold as needed

# Apply the function to filter rows
filtered_df = df_posts[df_posts['title'].apply(is_english)]

print(filtered_df)
print(len(filtered_df))
'''



"\nFAILED method\n# Function to check if text is in English\ndef is_english(text):\n    language, confidence = langid.classify(text)\n    return language == 'en' and confidence > 0.00000000001  # Adjust confidence threshold as needed\n\n# Apply the function to filter rows\nfiltered_df = df_posts[df_posts['title'].apply(is_english)]\n\nprint(filtered_df)\nprint(len(filtered_df))\n"

In [34]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

def is_english(text, model=nlp):
    
    # Process the text using spaCy
    doc = nlp(text)
    
    # Check if the detected language is English
    return doc.lang_ == 'en'

filtered_df_posts = df_posts[df_posts['title'].apply(is_english)]

filtered_df_posts

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Orange Cookies 🍊🧡,1.701750e+09,103,0,0.96,103,1,False,reddit.com/r/recipes/comments/18b3ir1/orange_c...,https://i.redd.it/37t5h7ssje4c1.jpg
1,"Stir Fry Supreme – Chives, cashews and Shrimp",1.701695e+09,91,0,0.90,91,7,False,reddit.com/r/recipes/comments/18ajm70/stir_fry...,https://i.redd.it/6vrftswiz94c1.jpeg
2,Sous Vide Chicken and Potatoes,1.701651e+09,1,0,1.00,1,1,False,reddit.com/r/recipes/comments/18a88g3/sous_vid...,https://i.redd.it/rcgqae55e64c1.jpg
3,Polish Krokiety - Mushroom &amp; Sauerkraut Cr...,1.701517e+09,274,0,0.97,274,17,False,reddit.com/r/recipes/comments/1891x51/polish_k...,https://i.redd.it/lsazpcn8bv3c1.jpg
4,Festive Southern Jalapeno Pimento Cheese Dip,1.701263e+09,177,0,0.95,177,11,False,reddit.com/r/recipes/comments/186osjd/festive_...,https://i.redd.it/i0lvs10aba3c1.jpeg
...,...,...,...,...,...,...,...,...,...,...
244,Zurek - Polish Easter Soup,1.680433e+09,381,0,0.98,381,18,True,reddit.com/r/recipes/comments/129hxb0/zurek_po...,https://i.redd.it/p3b62d3mdgra1.jpg
245,Sweet &amp; Crunchy Maple Popcorn,1.680362e+09,606,0,0.97,606,24,True,reddit.com/r/recipes/comments/128p3mz/sweet_cr...,https://i.redd.it/883p3zahgara1.jpg
246,Matcha Fudge Chocolate Chip Marble Cookies (Re...,1.680281e+09,1404,0,0.97,1404,28,False,reddit.com/r/recipes/comments/127rf0v/matcha_f...,https://i.redd.it/5r1vvzc7t3ra1.jpg
247,Greek Yogurt Chicken Salad with Apples and Alm...,1.680043e+09,23,0,0.97,23,2,False,reddit.com/r/recipes/comments/1254xfn/greek_yo...,https://i.redd.it/kbpojb4e5kqa1.jpg


### 3.4 Save dataframe as a CSV file

In [35]:
filtered_df_posts.to_csv('../data/posts.csv', index = False)

In [36]:
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''

'\n# Specify the subreddit and flair\nsubreddit_name = \'recipes\'\nflair_name = \'recipe\'  # Change this to the desired flair\n\n# Reddit API endpoint for searching posts in a subreddit\nurl = f\'https://www.reddit.com/r/{subreddit_name}/search.json\'\n\n# Define parameters for the search query\nparams = {\n    \'q\': f\'flair_name:"{flair_name}"\',\n    \'restrict_sr\': \'on\',  # Restrict the search to the specified subreddit\n    \'sort\': \'new\',       # Sort by new to get all posts\n    \'syntax\': \'cloudsearch\'\n}\n\n# Make the API request\nresponse = s.get(url, params=params, headers={\'User-agent\': \'your_user_agent\'})\n\n# Check if the request was successful (status code 200)\nif response.status_code == 200:\n    # Parse the JSON response\n    data = response.json()\n    \n    # Get the number of posts\n    num_posts = data[\'data\'][\'dist\']\n    \n    print(f"Number of posts with \'{flair_name}\' flair in r/{subreddit_name}: {num_posts}")\nelse:\n    print(f"Error: {