# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [40]:
import sys
import json
import requests as r
import langid

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

# import plotnine
# import altair
# import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools as chadtools

## 1. 🎯Authenticate with Reddit API

Using a function defined in our `utils.py` script, we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [27]:
headers = chadtools.authenticate_and_get_headers()
headers

{'Authorization': 'bearer eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxODE5MzYyLjEwMTc3MSwiaWF0IjoxNzAxNzMyOTYyLjEwMTc3MSwianRpIjoieWFtSzBhaHhrYUs4Tkd4VlJnVk5zdWFGTVVCTHVRIiwiY2lkIjoiZmVpckFYYmVWakEzOFN3cVRQT05LdyIsImxpZCI6InQyX2hleXBhN2ZhIiwiYWlkIjoidDJfaGV5cGE3ZmEiLCJsY2EiOjE2MzkxMjU5ODA5NzQsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.DUlCUjeW8PVbq_n3vjsOAB3epb4qUyDd9NbNDicvg3NwDHUTJwFXZ8oqQugBX64ioBbxIFF3Pv-18Z6v-LGa-Xj8L1FFaMd63cZlGLXX6QPlq1r7_k5FrfwRa94TJByOSgchJX8Dfi0z7XqvvAL36KM2YpKFwBYTpjY4KIKz3zn6JdFnWjQ0ItEF3a5WZngstci6G9FDdGoXvCa7L4iwicljUhiF2p_vPKPjL1tsyBsXEVa1w9042rCF3oyuPmL1coGJNuAx9_fCpy-o_HMol2TC0SSRi3KHPvJTXeTlDhbz1gNc18EoDzs0GRLOdaYsIhaXjZaKCLd47lXPRyfOYg',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by ilovedatasci'}

## 2. 🎯Sending our first request

In [None]:
s = r.Session()
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\Vegetarian']
subreddit_name = 'recipes'
flair_query = ' OR '.join(f'flair_name:"{flair}"' for flair in flair_name)

# specify earliest time to search from
specific_date_time = datetime(2020, 8, 31, 10, 59, 0)
timestamp = int(specific_date_time.timestamp())

params = {'q': flair_query,
          'limit': 100,
          'restrict_sr': 0,
          'sort': 'new',
          'timestamp': timestamp}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

In [None]:
print(response.json())


In [None]:
# Initialize an empty list to store the data from all pages
all_data_in_subreddit = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_subreddit.extend(data['data']['children'])

# Continue paginating until there is no more data 
while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_subreddit) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_subreddit.extend(data['data']['children'])

In [None]:
len(all_data_in_subreddit)

We will limit our search to 3 posts first, to test whether our GET request works.

In [28]:
s = r.Session()

BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

params = {'q': f'flair_name:"{flair_name}"',
          'limit': 100,
          'restrict_sr': 1,
          'sort': 'new'}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

# response.json()

We will try paginating 3 times first, before increasing the number of page or paginating to the end.

In [29]:
# Initialize an empty list to store the data from all pages
all_data = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

# while data['data']['after'] is not None:
for i in range(3):
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {i+2}")
    data = response.json()

    # Process the data from the current page
    all_data.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3
Requesting Page 4


In [30]:
# Initialize an empty list to store the data from all pages
all_data_in_subreddit = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_subreddit.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_subreddit) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_subreddit.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3


In [31]:
len(all_data_in_subreddit)

249

## 3. 🎯Saving the data 

### 3.1 Saving the data as a JSON file 

In [36]:
with open("../data/all_data_flair_is_recipe.json", "w") as f:
    json.dump(all_data_in_subreddit, f)

### 3.2 Load the JSON file as a Python dictionary

In [37]:
with open("../data/all_data_flair_is_recipe.json", "r") as file:
    posts = json.load(file)

### 3.3 Create a dataframe of all posts 

In [38]:
df_posts = pd.json_normalize(posts, max_level=0)

df_posts['data'] = df_posts['data'].apply(lambda x: {} if pd.isna(x) else x)    # handle NaN values
df_posts = pd.concat([df_posts.drop(['data'], axis=1), pd.json_normalize(df_posts['data'])], axis=1)

selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = df_posts[selected_cols].copy()

df_posts['permalink'] = "reddit.com" + df_posts['permalink']     # add prefix to each permalink 

df_posts

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,"Stir Fry Supreme – Chives, cashews and Shrimp",1.701695e+09,59,0,0.90,59,6,False,reddit.com/r/recipes/comments/18ajm70/stir_fry...,https://i.redd.it/6vrftswiz94c1.jpeg
1,Polish Krokiety - Mushroom &amp; Sauerkraut Cr...,1.701517e+09,260,0,0.97,260,17,False,reddit.com/r/recipes/comments/1891x51/polish_k...,https://i.redd.it/lsazpcn8bv3c1.jpg
2,Festive Southern Jalapeno Pimento Cheese Dip,1.701263e+09,172,0,0.95,172,11,False,reddit.com/r/recipes/comments/186osjd/festive_...,https://i.redd.it/i0lvs10aba3c1.jpeg
3,Quick &amp; Easy Nut Brittle,1.701206e+09,118,0,0.95,118,11,False,reddit.com/r/recipes/comments/1866xrq/quick_ea...,https://i.redd.it/elfhdi81n53c1.jpg
4,Green Borshch,1.700952e+09,62,0,0.96,62,9,False,reddit.com/r/recipes/comments/183vmzc/green_bo...,https://i.redd.it/s8aaslwrnk2c1.jpg
...,...,...,...,...,...,...,...,...,...,...
244,Matcha Fudge Chocolate Chip Marble Cookies (Re...,1.680281e+09,1404,0,0.97,1404,28,False,reddit.com/r/recipes/comments/127rf0v/matcha_f...,https://i.redd.it/5r1vvzc7t3ra1.jpg
245,Greek Yogurt Chicken Salad with Apples and Alm...,1.680043e+09,24,0,0.97,24,2,False,reddit.com/r/recipes/comments/1254xfn/greek_yo...,https://i.redd.it/kbpojb4e5kqa1.jpg
246,Louisiana Crawfish Boil,1.680008e+09,1214,0,0.96,1214,56,True,reddit.com/r/recipes/comments/124nr2l/louisian...,https://i.redd.it/sgmiy9z18hqa1.jpg
247,Drums of heaven,1.680000e+09,82,0,0.94,82,5,False,reddit.com/r/recipes/comments/124kk3b/drums_of...,https://i.redd.it/97e0rjrm2iqa1.jpg


In [39]:
duplicate_posts = df_posts[df_posts.duplicated(subset='permalink', keep=False)]
duplicate_posts

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url


In [47]:
# Function to check if text is in English
def is_english(text):
    language, confidence = langid.classify(text)
    return language == 'en' and confidence > 0.00000000001  # Adjust confidence threshold as needed

# Apply the function to filter rows
filtered_df = df_posts[df_posts['title'].apply(is_english)]

print(filtered_df)
print(len(filtered_df))


                                       title   created_utc   ups  downs  \
3               Quick &amp; Easy Nut Brittle  1.701206e+09   118      0   
4                              Green Borshch  1.700952e+09    62      0   
18                           Al Pastor Tacos  1.699102e+09   304      0   
24                              Lasagna Soup  1.698345e+09   324      0   
30                       Quick ravioli salad  1.697988e+09    65      0   
34                                 Apple Jam  1.697819e+09   103      0   
38                        Beef Ribs two ways  1.697563e+09    39      0   
39                           Classic Lasagna  1.697451e+09   183      0   
45      Canadian Maple French Toast (Recipe)  1.696952e+09   117      0   
47    Autumn Apple Cinnamon Cookies (Recipe)  1.696610e+09   312      0   
65                      Quick Beef Rice Bowl  1.695222e+09   124      0   
82   Apple Oatmeal Molasses Cookies (Recipe)  1.693594e+09   120      0   
83                       

In [None]:
duplicate_posts.to_csv('../data/duplicates.csv', index = False)

### 3.4 Save dataframe as a CSV file

In [None]:
df_posts.to_csv('../data/posts.csv', index = False)

In [None]:
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''