# ✅Step 1: Data Scraping

## 0. 🎯Import libraries

In [30]:
import sys
import json
import requests as r

import numpy
import pandas as pd
from scrapy import Selector
from datetime import datetime, timedelta

import spacy
# import plotnine
# import altair
# import matplotlib.pyplot as plt

from pprint import pprint
from tqdm import tqdm

# Import our own modules
sys.path.append("../scripts/")
import chadtools

## 1. 🎯Authenticate with Reddit API

Using a function defined in our own `chadtools` module (imported above from `../scripts/`), we can authenticate with the Reddit API using our own `credentials.json` file, and get a `dict` of headers to be used in all subsequent GET requests.

In [6]:
# Authenticate once; `headers` is reused by every GET request in this notebook.
headers = chadtools.authenticate_and_get_headers()

# Show which headers were obtained without echoing the bearer token itself:
# cell outputs are saved with the notebook, so a displayed token would be leaked
# to anyone the notebook is shared with.
{key: ("<redacted>" if key.lower() == "authorization" else value) for key, value in headers.items()}

{'Authorization': '<redacted>',
 'User-Agent': 'LSE DS105A Recipe Scraping Project by Due_Chef1909'}

## 2. 🎯Sending our GET requests

### 2.1 Prepare GET request for all Flairs + Paginate through all search results

We loop over every flair with a `for` loop and send one search query per flair.

We use the `after` ID returned by the Reddit API to paginate through the results, stopping once `after` is missing or `null`, i.e. after the last post matching the search query.

In [7]:
s = r.Session()
BASE_ENDPOINT = "https://oauth.reddit.com"
# Fail instead of hanging forever if Reddit stops responding.
REQUEST_TIMEOUT_SECONDS = 30
# The last flair contains a literal backslash. It is escaped explicitly because
# '\V' is not a valid escape sequence and triggers a warning on recent Python versions.
flair_names = ['Recipe', 'Dessert', 'Pasta', 'Poultry', 'Vegetarian', 'Drink', 'Beef', 'Pork', 'Seafood', 'Fruit\\Vegetarian']
subreddit_name = 'recipes'

# Same value for every flair, so it is computed once rather than inside the loop.
specific_date_time = datetime(2020, 8, 31, 10, 59, 0)
timestamp = int(specific_date_time.timestamp())


def fetch_posts_for_flair(session, subreddit, flair, headers):
    """Return every search result for one flair, following pagination to the end.

    Parameters
    ----------
    session : requests.Session
        Session used to send the GET requests.
    subreddit : str
        Subreddit name, without the ``r/`` prefix.
    flair : str
        Flair name, inserted into a ``flair_name:"..."`` search query.
    headers : dict
        Request headers sent with every request (authorisation + user agent).

    Returns
    -------
    list
        The concatenated ``data.children`` entries of every result page.

    Raises
    ------
    requests.HTTPError
        If any page comes back with an HTTP error status.
    """
    url = f"{BASE_ENDPOINT}/r/{subreddit}/search"
    params = {
        'q': f'flair_name:"{flair}"',
        'limit': 100,
        'restrict_sr': 1,
        'sort': 'new',
        # NOTE(review): 'timestamp' does not appear among the documented search
        # parameters, so it is presumably ignored by the API -- confirm before
        # relying on it as a date filter.
        'timestamp': timestamp
    }

    posts_for_flair = []
    while True:
        response = session.get(url, params=params, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        # Surface HTTP errors directly rather than as a KeyError on the JSON below.
        response.raise_for_status()
        listing = response.json()['data']
        posts_for_flair.extend(listing['children'])

        # A missing or null 'after' means this was the last page.
        after_id = listing.get('after')
        if after_id is None:
            break
        params['after'] = after_id

    return posts_for_flair


all_data_for_all_flairs = []
all_data_by_flair = {}

for flair in flair_names:
    # Keep the per-flair results as well as the combined list.
    all_data_by_flair[flair] = fetch_posts_for_flair(s, subreddit_name, flair, headers)
    all_data_for_all_flairs.extend(all_data_by_flair[flair])

pprint(len(all_data_for_all_flairs))



2066


## 3. 🎯Saving the data 

### 3.1 Save the data as a JSON file 

In [8]:
# Serialise the combined list of posts into a single JSON file on disk.
with open("../data/all_data_for_all_flairs.json", mode="w") as json_out:
    json.dump(obj=all_data_for_all_flairs, fp=json_out)

### 3.2 Load the JSON file as a Python list of dictionaries

In [9]:
# Read the saved posts back from disk into `posts`.
with open("../data/all_data_for_all_flairs.json", "r") as file:
    posts = json.load(file)



### 3.3 Create a dataframe of all posts 

In [None]:
# First pass: normalise only the top level of each post (max_level=0),
# which leaves a 'data' column to be expanded separately.
posts_top_level = pd.json_normalize(posts, max_level=0)

# Second pass: expand the 'data' column into one column per field.
posts_expanded = pd.json_normalize(posts_top_level['data'])

# Columns kept for the analysis.
selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']

df_posts = posts_expanded[selected_cols].copy()
df_posts.head()

In [None]:
# normalize JSON data in 'posts' and create a DataFrame
df_posts = pd.json_normalize(posts, max_level=0)

# handle NaN values in the 'data' column by replacing them with an empty dictionary
df_posts['data'] = df_posts['data'].apply(lambda x: {} if pd.isna(x) else x)    

# concatenate the original DataFrame with a new DataFrame created from normalizing the 'data' column
df_posts = pd.concat([df_posts.drop(['data'], axis=1), pd.json_normalize(df_posts['data'])], axis=1)

# select specific columns from the dataframe
selected_cols = ['title', 'created_utc', 'ups', 'downs', 'upvote_ratio', 'score', 'num_comments', 'is_original_content', 'permalink', 'url']
df_posts = df_posts[selected_cols].copy()

# add a prefix to the 'permalink' column
df_posts['permalink'] = "https://reddit.com" + df_posts['permalink']   

df_posts.head()

### Remove non-English posts

To make the data easier to analyse using NLP techniques, we will filter out posts that are not in English.

In [13]:
# load the English language model into spacy
nlp = spacy.load("en_core_web_sm")

# filter the english posts by applying custom function
filtered_df_posts = df_posts[df_posts['title'].apply(chadtools.is_english, model=nlp)]

filtered_df_posts.tail()

Unnamed: 0,title,created_utc,ups,downs,upvote_ratio,score,num_comments,is_original_content,permalink,url
244,Zurek - Polish Easter Soup,1680433000.0,378,0,0.98,378,18,True,https://reddit.com/r/recipes/comments/129hxb0/...,https://i.redd.it/p3b62d3mdgra1.jpg
245,Sweet &amp; Crunchy Maple Popcorn,1680362000.0,606,0,0.97,606,24,True,https://reddit.com/r/recipes/comments/128p3mz/...,https://i.redd.it/883p3zahgara1.jpg
246,Matcha Fudge Chocolate Chip Marble Cookies (Re...,1680281000.0,1405,0,0.97,1405,28,False,https://reddit.com/r/recipes/comments/127rf0v/...,https://i.redd.it/5r1vvzc7t3ra1.jpg
247,Greek Yogurt Chicken Salad with Apples and Alm...,1680043000.0,23,0,0.97,23,2,False,https://reddit.com/r/recipes/comments/1254xfn/...,https://i.redd.it/kbpojb4e5kqa1.jpg
248,Louisiana Crawfish Boil,1680008000.0,1215,0,0.96,1215,56,True,https://reddit.com/r/recipes/comments/124nr2l/...,https://i.redd.it/sgmiy9z18hqa1.jpg


### 3.4 Save dataframe as a CSV file

In [None]:
# Write the filtered posts to CSV, leaving out the DataFrame index.
posts_csv_path = '../data/posts.csv'
filtered_df_posts.to_csv(posts_csv_path, index=False)

In [None]:
# NOTE: this entire cell is a single bare triple-quoted string, so none of the code
# inside it is executed; running the cell only echoes the string back as its output.
# It holds a disabled experiment: one search request to the www.reddit.com
# `search.json` endpoint for a single flair, printing the `dist` value of the response.
'''
# Specify the subreddit and flair
subreddit_name = 'recipes'
flair_name = 'recipe'  # Change this to the desired flair

# Reddit API endpoint for searching posts in a subreddit
url = f'https://www.reddit.com/r/{subreddit_name}/search.json'

# Define parameters for the search query
params = {
    'q': f'flair_name:"{flair_name}"',
    'restrict_sr': 'on',  # Restrict the search to the specified subreddit
    'sort': 'new',       # Sort by new to get all posts
    'syntax': 'cloudsearch'
}

# Make the API request
response = s.get(url, params=params, headers={'User-agent': 'your_user_agent'})

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Get the number of posts
    num_posts = data['data']['dist']
    
    print(f"Number of posts with '{flair_name}' flair in r/{subreddit_name}: {num_posts}")
else:
    print(f"Error: {response.status_code}")
'''

'\n# Specify the subreddit and flair\nsubreddit_name = \'recipes\'\nflair_name = \'recipe\'  # Change this to the desired flair\n\n# Reddit API endpoint for searching posts in a subreddit\nurl = f\'https://www.reddit.com/r/{subreddit_name}/search.json\'\n\n# Define parameters for the search query\nparams = {\n    \'q\': f\'flair_name:"{flair_name}"\',\n    \'restrict_sr\': \'on\',  # Restrict the search to the specified subreddit\n    \'sort\': \'new\',       # Sort by new to get all posts\n    \'syntax\': \'cloudsearch\'\n}\n\n# Make the API request\nresponse = s.get(url, params=params, headers={\'User-agent\': \'your_user_agent\'})\n\n# Check if the request was successful (status code 200)\nif response.status_code == 200:\n    # Parse the JSON response\n    data = response.json()\n    \n    # Get the number of posts\n    num_posts = data[\'data\'][\'dist\']\n    \n    print(f"Number of posts with \'{flair_name}\' flair in r/{subreddit_name}: {num_posts}")\nelse:\n    print(f"Error: {