# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [44]:
import os
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

import plotnine
import altair
import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

## 1. 🎯Load credentials

This loads the `credentials.json` file in each of our local repos.

In [45]:
credentials_file_path = "../credentials.json"

# open the file and load the data into a variable
with open(credentials_file_path, "r") as f:
    credentials = json.load(f)

## 2. 🎯Obtaining a token

In [46]:
s = r.Session()

# Set up authentication parameters 
client_auth = r.auth.HTTPBasicAuth(credentials["app_client_id"], credentials["app_client_secret"])

# Send, via HTTP POST, your Reddit username and password
post_data = {"grant_type": "password",
             "username": credentials["reddit_username"],
             "password": credentials["reddit_password"]}

# Reddit API requests that we self-identify ourselves in the User-Agent
headers = {"User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

In [47]:
# From Reddit's API documentation, this is the endpoint I need
ACCESS_TOKEN_ENDPOINT = "https://www.reddit.com/api/v1/access_token"

# Send a HTTP POST 
response = s.post(ACCESS_TOKEN_ENDPOINT, auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzAxNzI4MTI5LjU4Nzk4MiwiaWF0IjoxNzAxNjQxNzI5LjU4Nzk4MiwianRpIjoieVc0SVB2ZWFXMWRZY2VzdGpXZVFHSklrRGpIUk1nIiwiY2lkIjoiVFJicTdUNUZLby1kTU1iSk5vMTdEQSIsImxpZCI6InQyXzJpOWF4eDh3IiwiYWlkIjoidDJfMmk5YXh4OHciLCJsY2EiOjE1NDA4OTAxODI0NTUsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.mxePHDhOYMxBGhRqbYgluu3NwaOyx0TatEtVCB1HWXtLx6BtBzxMEbTJkL6EeN0eWAfqsqGhudCCd8XN2RSN25GqZEqUuERu2G_1YPPJF9Gt9NkFzg125dWJmcDD4K2f1HiO6OJChs_hWGL0GA80jydJVuhCwE4NviybvXXxOCxiOvRaAfDjRy2HBp8IcXVv3Mlc22A7Dw3XeSNnNDZ41lABK3UJmVvAJKMR13DIRKbJLQBeS51gHVyHoxodUwuWkI6Vr5PZnz7G0usARJl8Zm2c8sCq1QxbLEY0mabdb4B7T9dApHPVv3xtwkHJCOkEHd5oU_IQwDK4dilAwK_LIA',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Save our token

In [48]:
my_token = response.json()['access_token']

From now on, all requests need to be followed by these HTTP HEADERS:

In [49]:
headers = {"Authorization": f"bearer {my_token}",
           "User-Agent": f"LSE DS105A Recipe Scraping Project by {credentials['reddit_username']}"}

## 3. 🎯Sending our first request

In [50]:
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\Vegetarian']
subreddit_name = 'recipes'
flair_query = '|'.join(f'flair_name:"{flair}"' for flair in flair_name)
specific_date_time = datetime(2020, 8, 31, 10, 59, 0)
timestamp = int(specific_date_time.timestamp())

params = {'q': flair_query,
          'limit': 100,
          'restrict_sr': 1,
          'sort': 'new',
          'timestamp': timestamp}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

In [55]:
print(response.json())


{'kind': 'Listing', 'data': {'modhash': None, 'dist': 0, 'facets': {}, 'after': None, 'geo_filter': '', 'children': [], 'before': None}}


In [None]:
# Initialize an empty list to store the data from all pages
all_data_in_subreddit = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_subreddit.extend(data['data']['children'])

# Continue paginating until there is no more data 
while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_subreddit) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_subreddit.extend(data['data']['children'])

In [None]:
len(all_data_in_subreddit)

We will limit our search to 3 posts first, to test whether our GET request works.

In [None]:
BASE_ENDPOINT = "https://oauth.reddit.com"
flair_name = 'Recipe'
subreddit_name = 'recipes'

params = {'q': f'flair_name:"{flair_name}"',
          'limit': 100,
          'restrict_sr': 1,
          'sort': 'new'}

response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)

# response.json()

We will try paginating 3 times first, before increasing the number of page or paginating to the end.

In [None]:
# Initialize an empty list to store the data from all pages
all_data = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

# while data['data']['after'] is not None:
for i in range(3):
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {i+2}")
    data = response.json()

    # Process the data from the current page
    all_data.extend(data['data']['children'])

In [None]:
# Initialize an empty list to store the data from all pages
all_data_in_subreddit = []

#page 01 data
data = response.json()

# Process the data from the first page
all_data_in_subreddit.extend(data['data']['children'])

# Continue paginating until there is no more data (or paginate for a set number of times)

while 'after' in data['data'] and data['data']['after'] is not None:
    after_id = data['data']['after']
    params["after"] = after_id
    # response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/hot?limit=100&after={after_id}/", params=params, headers=headers)
    response = s.get(f"{BASE_ENDPOINT}/r/{subreddit_name}/search", params=params, headers=headers)
    print(f"Requesting Page {len(all_data_in_subreddit) // params['limit'] + 1}")
    data = response.json()

    # Process the data from the current page
    all_data_in_subreddit.extend(data['data']['children'])

In [None]:
len(all_data_in_subreddit)

## 4. 🎯Saving the data 

### 4.1 Saving the data as a JSON file 

In [None]:
with open("../data/all_data_flair_is_recipe.json", "w") as f:
    json.dump(all_data, f)

### 4.2 Load the JSON file as a Python dictionary

In [None]:
with open("../data/all_data_flair_is_recipe.json", "r") as file:
    posts = json.load(file)

### 4.3 Create a dataframe of all posts 

In [None]:
df_posts = pd.json_normalize(posts, max_level=0)

df_posts['data'] = df_posts['data'].apply(lambda x: {} if pd.isna(x) else x)    # handle NaN values
df_posts = pd.concat([df_posts.drop(['data'], axis=1), pd.json_normalize(df_posts['data'])], axis=1)

selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = df_posts[selected_cols].copy()

df_posts['permalink'] = df_posts['permalink'].apply(lambda x: 'reddit.com' + x)     # add prefix to each permalink 

df_posts

In [None]:
duplicate_posts = df_posts[df_posts.duplicated(subset='permalink', keep=False)]
duplicate_posts

In [None]:
duplicate_posts.to_csv('../data/duplicates.csv', index = False)

### 4.4 Save dataframe as a CSV file

In [None]:
df_posts.to_csv('../data/posts.csv', index = False)

In [None]:
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''