# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [1]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

import spacy
# import plotnine
# import altair
# import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Authenticate with Reddit API

Using a function defined in our `chadtools` module (imported from `../scripts/`), we authenticate with the Reddit API using our own `credentials.json` file and get a `dict` of headers to be used in all subsequent GET requests.

In [2]:
# Authenticate once; `headers` is reused by every request further down.
headers = chadtools.authenticate_and_get_headers()

# Cell outputs are stored inside the notebook file, so echoing the raw dict
# would publish the bearer token to anyone who can read the notebook.
# Display the header names with the credential masked instead.
{
    name: ('<redacted>' if name.lower() == 'authorization' else value)
    for name, value in headers.items()
}

{'Authorization': '<redacted>',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by rainshake'}

## 2. 🎯Sending our GET requests

### 2.1 Prepare GET request for all Flairs + Paginate through all search results

We loop over every flair with a `for` loop, running the same search once per flair.

Within each flair, we use the `after` ID returned by the Reddit API to paginate until we reach the last post matching the search query.

In [3]:
s = r.Session()
BASE_ENDPOINT = "https://oauth.reddit.com"
# The backslash is written as '\\' because '\V' is not a recognised escape
# sequence; the resulting string is still Fruit\Vegetarian.
flair_names = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\\Vegetarian']
subreddit_name = 'recipes'
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging forever on a stalled connection

# Computed once: the value is identical for every flair.
# NOTE(review): a naive datetime is interpreted in the machine's local timezone,
# and 'timestamp' does not appear among the search parameters in Reddit's API
# docs, so it may simply be ignored -- confirm before relying on it.
specific_date_time = datetime(2020, 8, 31, 10, 59, 0)
timestamp = int(specific_date_time.timestamp())


def fetch_posts_for_flair(session, flair, request_headers):
    """Collect every search result for one flair, following `after` cursors.

    Parameters
    ----------
    session : requests.Session
        Session used to send the GET requests.
    flair : str
        Flair name inserted into the `flair_name:"..."` search query.
    request_headers : dict
        Headers sent with every request (authorization and user agent).

    Returns
    -------
    list
        The `children` entries of every page, in the order they were returned.

    Raises
    ------
    requests.HTTPError
        If any page comes back with an error status. Without this check an
        error body would surface later as a confusing KeyError on 'data'.
    """
    params = {
        'q': f'flair_name:"{flair}"',
        'limit': 100,
        'restrict_sr': 1,
        'sort': 'new',
        'timestamp': timestamp
    }
    search_url = f"{BASE_ENDPOINT}/r/{subreddit_name}/search"

    collected = []
    while True:
        response = session.get(
            search_url,
            params=params,
            headers=request_headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        listing = response.json()['data']
        collected.extend(listing['children'])

        # A missing or null `after` means this was the last page.
        after_id = listing.get('after')
        if after_id is None:
            return collected
        params['after'] = after_id


# Results grouped per flair, plus one flat list in flair order for saving.
all_data_by_flair = {
    flair: fetch_posts_for_flair(s, flair, headers) for flair in flair_names
}
all_data_for_all_flairs = [
    post for flair in flair_names for post in all_data_by_flair[flair]
]

len(all_data_for_all_flairs)



2067


## 3. 🎯Saving the data 

### 3.1 Save the data as a JSON file 

In [4]:
# Persist the raw API results so later steps can work from the file on disk.
with open("../data/all_data_for_all_flairs.json", mode="w") as json_out:
    json.dump(obj=all_data_for_all_flairs, fp=json_out)

### 3.2 Load the JSON file back into Python (a list of post dictionaries)

In [5]:
# Read the saved scrape back in; `posts` is what the DataFrame cells consume.
with open("../data/all_data_for_all_flairs.json", mode="r") as saved_scrape:
    posts = json.load(fp=saved_scrape)

### 3.3 Create a dataframe of all posts 

In [6]:
# Columns kept for the analysis; everything else in the payload is dropped.
selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

# First pass keeps each post's 'data' payload as a single dict column,
# second pass expands that payload into one column per field.
df_posts = (
    pd.json_normalize(pd.json_normalize(posts, max_level=0)['data'])
    .loc[:, selected_cols]
    .copy()
)
df_posts.head()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Classic Tiramisu Recipe (original Italian pizz...,1701864000.0,19,0,0.8,19,5,False,/r/recipes/comments/18c2c0q/classic_tiramisu_r...,https://www.diyfoodhacks.com/classic-tiramisu-...
1,Orange Cookies 🍊🧡,1701750000.0,175,0,0.97,175,6,False,/r/recipes/comments/18b3ir1/orange_cookies/,https://i.redd.it/37t5h7ssje4c1.jpg
2,"Stir Fry Supreme – Chives, cashews and Shrimp",1701695000.0,100,0,0.91,100,9,False,/r/recipes/comments/18ajm70/stir_fry_supreme_c...,https://i.redd.it/6vrftswiz94c1.jpeg
3,Sous Vide Chicken and Potatoes,1701651000.0,9,0,1.0,9,1,False,/r/recipes/comments/18a88g3/sous_vide_chicken_...,https://i.redd.it/rcgqae55e64c1.jpg
4,Chicken Riggies,1701551000.0,1,0,1.0,1,1,False,/r/recipes/comments/189d72m/chicken_riggies/,https://i.redd.it/bn11tg3i5y3c1.jpg


In [7]:
# Columns kept for the analysis.
selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = (
    # Top-level flatten only: each row keeps its 'data' payload as one value.
    pd.json_normalize(posts, max_level=0)
    # Missing payloads become empty dicts so they can still be expanded.
    .assign(data=lambda frame: frame['data'].apply(lambda x: {} if pd.isna(x) else x))
    # Swap the 'data' column for one column per payload field.
    .pipe(lambda frame: pd.concat([frame.drop(columns='data'), pd.json_normalize(frame['data'])], axis=1))
    .loc[:, selected_cols]
    # Turn the site-relative permalink into a full URL.
    .assign(permalink=lambda frame: "https://reddit.com" + frame['permalink'])
)

df_posts.head()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
0,Classic Tiramisu Recipe (original Italian pizz...,1701864000.0,19,0,0.8,19,5,False,https://reddit.com/r/recipes/comments/18c2c0q/...,https://www.diyfoodhacks.com/classic-tiramisu-...
1,Orange Cookies 🍊🧡,1701750000.0,175,0,0.97,175,6,False,https://reddit.com/r/recipes/comments/18b3ir1/...,https://i.redd.it/37t5h7ssje4c1.jpg
2,"Stir Fry Supreme – Chives, cashews and Shrimp",1701695000.0,100,0,0.91,100,9,False,https://reddit.com/r/recipes/comments/18ajm70/...,https://i.redd.it/6vrftswiz94c1.jpeg
3,Sous Vide Chicken and Potatoes,1701651000.0,9,0,1.0,9,1,False,https://reddit.com/r/recipes/comments/18a88g3/...,https://i.redd.it/rcgqae55e64c1.jpg
4,Chicken Riggies,1701551000.0,1,0,1.0,1,1,False,https://reddit.com/r/recipes/comments/189d72m/...,https://i.redd.it/bn11tg3i5y3c1.jpg


### Remove non-English posts

To make the data easier to analyse using NLP techniques, we will filter out posts that are not in English.

In [8]:
# spaCy's small English pipeline, passed to the project helper as `model`.
nlp = spacy.load("en_core_web_sm")

# Keep only the rows whose title passes chadtools.is_english
# (presumably True for English titles -- see the helper for the exact rule).
filtered_df_posts = df_posts.loc[
    lambda frame: frame['title'].apply(chadtools.is_english, model=nlp)
]
filtered_df_posts.tail()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
2062,Bhindi,1567057000.0,15,0,0.75,15,3,False,https://reddit.com/r/recipes/comments/cwwkcq/b...,https://i.redd.it/y6698lnkqbj31.jpg
2063,Restaurant Style Phool Gobhi Masala Recipe,1567056000.0,21,0,0.88,21,1,False,https://reddit.com/r/recipes/comments/cwwfal/r...,https://i.redd.it/ycwjgo0pnbj31.jpg
2064,Celery and Soy Stuffed Butternut Squash,1566290000.0,7,0,0.74,7,1,False,https://reddit.com/r/recipes/comments/csv234/c...,https://imgur.com/OyakVfz
2065,Grilled Nectarine Caprese Salad,1566144000.0,1723,0,0.97,1723,22,False,https://reddit.com/r/recipes/comments/cs2z3v/g...,https://i.redd.it/tzjwjnulc8h31.jpg
2066,The right way to cut watermelon,1565469000.0,0,0,0.36,0,8,False,https://reddit.com/r/recipes/comments/con4hp/t...,https://i.redd.it/w72ozivbkof31.jpg


### 3.4 Save dataframe as a CSV file

In [12]:
# Write the filtered posts to disk without the DataFrame index column.
filtered_df_posts.to_csv(path_or_buf='../data/posts.csv', index=False)

In [10]:
# Disabled experiment kept for reference: the code below is wrapped in a
# triple-quoted string, so none of it runs. Because that string is the cell's
# only expression, Jupyter echoes its repr as the cell output.
# The snippet queries the public search.json endpoint for one flair and reads
# the count from data['data']['dist'] of that response.
# NOTE(review): delete this cell or turn it into a real function if the count
# is still needed; as written it only adds noise to the notebook.
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''

'\n# Specify the subreddit and flair\nsubreddit_name = \'recipes\'\nflair_name = \'recipe\'  # Change this to the desired flair\n\n# Reddit API endpoint for searching posts in a subreddit\nurl = f\'https://www.reddit.com/r/{subreddit_name}/search.json\'\n\n# Define parameters for the search query\nparams = {\n    \'q\': f\'flair_name:"{flair_name}"\',\n    \'restrict_sr\': \'on\',  # Restrict the search to the specified subreddit\n    \'sort\': \'new\',       # Sort by new to get all posts\n    \'syntax\': \'cloudsearch\'\n}\n\n# Make the API request\nresponse = s.get(url, params=params, headers={\'User-agent\': \'your_user_agent\'})\n\n# Check if the request was successful (status code 200)\nif response.status_code == 200:\n    # Parse the JSON response\n    data = response.json()\n    \n    # Get the number of posts\n    num_posts = data[\'data\'][\'dist\']\n    \n    print(f"Number of posts with \'{flair_name}\' flair in r/{subreddit_name}: {num_posts}")\nelse:\n    print(f"Error: {