In [None]:
%load_ext autoreload
%autoreload 2

# Loads environment variables from .env file
%load_ext dotenv
%dotenv

In [None]:
# default_exp pushshift
# all_local

# Collect all nattyorjuice post URLs to manually gather data

## Install/import stuff

In [None]:
# export

import os
import requests
import time
import datetime
import pickle
from requests.adapters import HTTPAdapter
import webbrowser

In [None]:
from fastcore.test import *

## Data discovery/test

In [None]:
# Pushshift REST endpoint used to search the submissions of a subreddit.
pushshift = "https://api.pushshift.io/reddit/"
pushshift_submission_url = pushshift + "submission/search"

# Query for the discovery request below: one small page of submissions.
params = {"subreddit": "nattyorjuice", "size": "25"}

# Unix timestamp nine (365-day) years before now.
# Note: `before` is computed here but is not part of `params`.
one_year_seconds = 365 * 24 * 60 * 60
before = int(time.time() - 9 * one_year_seconds)

In [None]:
# Fetch one page of submissions to explore the shape of the API response.
# A timeout keeps the cell from hanging forever if the API does not answer.
response = requests.get(pushshift_submission_url, params=params, timeout=30)
response.raise_for_status()

# Parse the JSON body once instead of re-parsing it for every inspection.
submissions_page = response.json()['data']
first_submission = submissions_page[0]

# Only the last expression of a cell is rendered; reorder these lines to
# inspect a different field of the first submission.
len(submissions_page)
first_submission
first_submission['permalink']
first_submission['full_link']
first_submission['url']

In [None]:
# Sanity check: the discovery request asked for 25 submissions ("size": "25").
test_eq(len(response.json()["data"]), 25)

## Crawling/downloading data from Pushshift

In [None]:
class SubredditCrawler(object):
    """
    Crawls the submissions of a single subreddit through the Pushshift API.

    Borrowed heavily from here: https://www.textjuicer.com/2019/07/crawling-all-submissions-from-a-subreddit/
    """

    pushshift = "https://api.pushshift.io/reddit/"
    pushshift_submission_url = pushshift + "submission/search"

    def __init__(self, subreddit, file_path, max_submissions=200):
        """
        Set up the crawler state and an HTTP session that retries requests.

        :param subreddit: Name of the subreddit to crawl.
        :param file_path: Path of the file the crawled pages are written to.
        :param max_submissions: Number of submissions after which crawling stops.
        """
        self.subreddit = subreddit
        self.file_path = file_path
        self.max_submissions = max_submissions
        # Accumulates every submission downloaded by this crawler.
        self.submissions = []

        self.session = requests.Session()
        # The API URL above uses https, so the retrying adapter must be mounted
        # on the https prefix to have any effect. The original http prefix is
        # kept as well so plain-http requests keep their retries.
        for prefix in ('https://api.pushshift.io/', 'http://api.pushshift.io/'):
            self.session.mount(prefix, HTTPAdapter(max_retries=5))

In [None]:
class SubredditCrawler(SubredditCrawler):
    def crawl_page(self, last_page=None):
        """
        Crawl a page of results from the crawler's subreddit.

        Pages are requested with up to 100 submissions each, sorted by
        ``created_utc`` in descending order. When a previous page is given,
        the request resumes before the ``created_utc`` of its last entry.

        :param last_page: The last downloaded page, or None to request the
            first page.

        :return: A page of results (the ``data`` entry of the JSON response),
            or an empty list when ``last_page`` was empty.
        :raises Exception: If the server answers with a non-OK status code.
        """
        params = {
            "subreddit": self.subreddit,
            "size": 100,
            "sort": "desc",
            "sort_type": "created_utc",
        }
        if last_page is not None:
            if len(last_page) > 0:
                # resume from where we left at the last page
                params["before"] = last_page[-1]["created_utc"]
            else:
                # the last page was empty, we are past the last page
                return []

        # Use the class attribute rather than a notebook-level global so the
        # method does not depend on another cell having been run first.
        results = self.session.get(
            self.pushshift_submission_url,
            params=params,
            timeout=30,  # seconds; avoids hanging forever on a stalled request
        )

        if not results.ok:
            # something wrong happened
            raise Exception(
                "Server returned status code {}".format(results.status_code)
            )
        return results.json()["data"]

In [None]:
class SubredditCrawler(SubredditCrawler):
    def crawl_subreddit(self):
        """
        Crawl submissions from the crawler's subreddit and pickle them to disk.

        Pages are fetched with ``crawl_page`` until ``self.max_submissions`` is
        reached or an empty page is returned. Each page is appended to
        ``self.submissions`` and dumped as its own pickle object into
        ``self.file_path``, so the file holds one pickled list per page.

        Isn't strictly correct on the number of submissions it grabs (whole
        pages are added, so the total can exceed ``max_submissions``) but
        doesn't matter too much for our purposes.

        :return: None. The downloaded submissions are in ``self.submissions``.
        """
        last_page = None

        print(f"Started {datetime.datetime.now()}")

        with open(self.file_path, "wb") as file:

            while len(self.submissions) < self.max_submissions:
                last_page = self.crawl_page(last_page)

                if not last_page:
                    # An empty page means there is nothing left to download.
                    # Stop here: indexing last_page[-1] below would raise
                    # IndexError, and an empty page is not worth pickling.
                    print("---- no more posts to fetch ------")
                    break

                self.submissions += last_page
                pickle.dump(last_page, file)
                print(f"---- pickled {len(self.submissions)} posts so far ------")
                print(f"Last post title: {last_page[-1]['title']}")

                # Pause between requests to go easy on the API.
                time.sleep(3)

        print("--------------------------------------")
        print(f"Finished {datetime.datetime.now()}")
        print(f"PICKLED {len(self.submissions)} SUBMISSIONS to file:")
        print(f"{self.file_path}")
        print("--------------------------------------")

In [None]:
# Keep the path creation and the crawler construction together in this cell:
# the timestamp in the file name is regenerated on every run, so the dump of
# an earlier crawl is never overwritten.
submissions_path = f'data/submissions.{int(time.time())}.pkl'
crawler = SubredditCrawler(
    subreddit='nattyorjuice',
    file_path=submissions_path,
    max_submissions=100,
)
crawler.crawl_subreddit()

# One full page is expected for this limit.
assert len(crawler.submissions) == 100

In [None]:
def open_list(urls):
    '''
    Opens a list of urls in web browser for convenience in viewing data

    :param urls: An iterable of URL strings. A single URL string is also
        accepted and is treated as a one-element list.

    :return: None.
    '''
    if isinstance(urls, str):
        # Iterating a bare string would try to open every single character.
        urls = [urls]

    # A plain loop instead of a list comprehension: the calls are made only
    # for their side effect, so no throwaway list of results is built.
    for url in urls:
        webbrowser.open(url)