# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [1]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

# import plotnine
# import altair
# import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import scripts.utils as u

## 1. 🎯Authenticate with Reddit API

Using a function defined in our `utils.py` script, we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [8]:
headers = u.authenticate_and_get_headers()
headers

AttributeError: module 'utils' has no attribute 'authenticate_and_get_headers'

## 2. 🎯Sending our first request

In [2]:
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\Vegetarian']
subreddit_name = 'recipes'
flair_query = ' OR '.join(f'flair_name:"{flair}"' for flair in flair_name)

# specify earliest time to search from
specific_date_time = datetime(2020, 8, 31, 10, 59, 0)
timestamp = int(specific_date_time.timestamp())

params = {'q': flair_query,
          'limit': 100,
          'restrict_sr': 0,
          'sort': 'new',
          'timestamp': timestamp}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

In [43]:
print(response.json())


{'kind': 'Listing', 'data': {'modhash': None, 'dist': 100, 'facets': {}, 'after': 't3_189j5bx', 'geo_filter': '', 'children': [{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'chips', 'selftext': 'The new recipe is essentially a Cheeto. Distinctive cheese doodle crunch, gone. Burning raw the roof of your mouth after eating some, gone. Immediately opened a bag and thought I had accidentally grabbed Cheetos by mistake. \n\nThose were my fav rip', 'author_fullname': 't2_ctddil9g', 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Rip to my beloved childhood favorite, the wise cheez doodle', 'link_flair_richtext': [], 'subreddit_name_prefixed': 'r/chips', 'hidden': False, 'pwls': 6, 'link_flair_css_class': '', 'downs': 0, 'thumbnail_height': None, 'top_awarded_type': None, 'hide_score': True, 'name': 't3_18augqv', 'quarantine': False, 'link_flair_text_color': 'dark', 'upvote_ratio': 1.0, 'author_flair_background_color': None, 'subreddit_type':

In [44]:
# Initialize an empty list to store the data from all pages
all_data_in_subreddit = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_subreddit.extend(data['data']['children'])

# Continue paginating until there is no more data 
while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_subreddit) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_subreddit.extend(data['data']['children'])

Requesting Page 2
Requesting Page 3


In [45]:
len(all_data_in_subreddit)

249

We will limit our search to 3 posts first, to test whether our GET request works.

In [17]:
s = r.Session()

BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

params = {'q': f'flair_name:"{flair_name}"',
          'limit': 100,
          'restrict_sr': 1,
          'sort': 'new'}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

# response.json()

We will try paginating 3 times first, before increasing the number of page or paginating to the end.

In [18]:
# Initialize an empty list to store the data from all pages
all_data = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

# while data['data']['after'] is not None:
for i in range(3):
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {i+2}")
    data = response.json()

    # Process the data from the current page
    all_data.extend(data['data']['children'])

In [None]:
# Initialize an empty list to store the data from all pages
all_data_in_subreddit = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_subreddit.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_subreddit) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_subreddit.extend(data['data']['children'])

In [None]:
len(all_data_in_subreddit)

## 4. 🎯Saving the data 

### 4.1 Saving the data as a JSON file 

In [20]:
with open("../data/all_data_flair_is_recipe.json", "w") as f:
    json.dump(all_data, f)

### 4.2 Load the JSON file as a Python dictionary

In [None]:
with open("../data/all_data_flair_is_recipe.json", "r") as file:
    posts = json.load(file)

### 4.3 Create a dataframe of all posts 

In [None]:
df_posts = pd.json_normalize(posts, max_level=0)

df_posts['data'] = df_posts['data'].apply(lambda x: {} if pd.isna(x) else x)    # handle NaN values
df_posts = pd.concat([df_posts.drop(['data'], axis=1), pd.json_normalize(df_posts['data'])], axis=1)

selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = df_posts[selected_cols].copy()

df_posts['permalink'] = df_posts['permalink'].apply(lambda x: 'reddit.com' + x)     # add prefix to each permalink 

df_posts

In [None]:
duplicate_posts = df_posts[df_posts.duplicated(subset='permalink', keep=False)]
duplicate_posts

In [None]:
duplicate_posts.to_csv('../data/duplicates.csv', index = False)

### 4.4 Save dataframe as a CSV file

In [None]:
df_posts.to_csv('../data/posts.csv', index = False)

In [None]:
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''