# Collect all r/nattyorjuice post URLs to manually gather data

## Install/import stuff

In [None]:
#hide
!pip install .[dev]

#!jupyter nbextension install https://github.com/drillan/jupyter-black/archive/master.zip --user
#!jupyter nbextension enable jupyter-black-master/jupyter-black

!nbdime extensions --enable --user

In [None]:
# hide

## Loads environment variables from .env file

%load_ext dotenv
%dotenv
import os
import requests
import time
import datetime
import sys
import pickle
from requests.adapters import HTTPAdapter
from collections import Counter
import webbrowser

## Data discovery

In [None]:
# Base URL of the Pushshift Reddit API and its submission-search endpoint.
pushshift = "https://api.pushshift.io/reddit/"
pushshift_submission_url = f"{pushshift}submission/search"

# Unix timestamp nine (365-day) years before now.
# Note: `before` is not part of `params` below.
one_year_seconds = 365 * 24 * 60 * 60
before = int(time.time() - 9 * one_year_seconds)

# Query parameters for a small sample request against the subreddit.
params = dict(subreddit="nattyorjuice", size="25")

In [None]:
# Fetch one sample page of submissions using the query parameters in `params`.
# The commented-out t0/t1/total lines are leftover timing instrumentation
# for this single request.
# t0 = time.time()
response = requests.get(pushshift_submission_url, params=params)
# t1 = time.time()

# total = t1-t0

In [None]:
# total

In [None]:
# Display the Response object returned by the sample request.
response

In [None]:
# Number of submissions in the 'data' list of the sample response.
len(response.json()['data'])

In [None]:
# Inspect the first submission in the sample response to see which fields it has.
response.json()['data'][0]

In [None]:
# Same expression as the preceding cell: the first submission of the sample response.
response.json()['data'][0]

In [None]:
# Inspect the 'permalink' field of the first submission.
response.json()['data'][0]['permalink']

In [None]:
# Inspect the 'full_link' field of the first submission.
response.json()['data'][0]['full_link']

In [None]:
# Inspect the 'url' field of the first submission.
response.json()['data'][0]['url']

## Crawling/downloading data from Pushshift

In [None]:
class SubredditCrawler:
    """
    Holds the state needed to download submissions of one subreddit from Pushshift.

    Borrowed heavily from here: https://www.textjuicer.com/2019/07/crawling-all-submissions-from-a-subreddit/
    """

    # Pushshift API base URL and its submission-search endpoint.
    pushshift = "https://api.pushshift.io/reddit/"
    pushshift_submission_url = pushshift + "submission/search"

    def __init__(self, subreddit, file_path, max_submissions=200):
        """
        :param subreddit: Name of the subreddit to crawl.
        :param file_path: Path stored for crawl_subreddit, which writes the pickled pages there.
        :param max_submissions: Number of submissions after which crawling stops.
        """
        self.subreddit = subreddit
        self.file_path = file_path
        self.max_submissions = max_submissions
        self.submissions = []

        self.session = requests.Session()
        # Requests picks the adapter by URL prefix. The endpoints above are
        # https://, so an adapter mounted only on http:// would never be used
        # and no retries would happen. Mount on https:// (and keep http://).
        for prefix in ('https://api.pushshift.io/', 'http://api.pushshift.io/'):
            self.session.mount(prefix, HTTPAdapter(max_retries=5))

In [None]:
# Re-declaring the class as a subclass of itself adds crawl_page to the
# SubredditCrawler defined in an earlier cell.
class SubredditCrawler(SubredditCrawler):
    def crawl_page(self, last_page=None):
        """
        Crawl a page of results from self.subreddit.

        Requests pages of 100 submissions sorted by "created_utc" in
        descending order.

        :param last_page: The last downloaded page, or None to fetch the first page.

        :return: A page of results (the "data" list of the response), or an
            empty list when last_page was empty.
        :raises Exception: If the server answers with a non-OK status code.
        """
        if last_page is not None and not last_page:
            # the last page was empty, we are past the last page
            return []

        params = {
            "subreddit": self.subreddit,
            "size": 100,
            "sort": "desc",
            "sort_type": "created_utc",
        }
        if last_page:
            # resume from where we left at the last page
            params["before"] = last_page[-1]["created_utc"]

        # Use the endpoint declared on the class rather than a notebook global,
        # so the method does not depend on an unrelated cell having been run.
        results = self.session.get(self.pushshift_submission_url, params=params)

        if not results.ok:
            # something wrong happened
            raise Exception(
                "Server returned status code {}".format(results.status_code)
            )
        return results.json()["data"]

In [None]:
# Re-declaring the class as a subclass of itself adds crawl_subreddit to the
# SubredditCrawler defined in earlier cells.
class SubredditCrawler(SubredditCrawler):
    def crawl_subreddit(self, erase_self_submissions=False):
        """
        Crawl submissions from self.subreddit and pickle them page by page.

        Isn't strictly correct on the number of submissions it grabs but doesn't matter too much
        for our purposes: whole pages are appended, so the total can exceed
        self.max_submissions.

        Each downloaded page is appended to self.submissions and written as
        its own pickle record to self.file_path (the file is overwritten).

        :param erase_self_submissions: If True, reset self.submissions before crawling.

        :return: None. The submissions are available in self.submissions.
        """
        last_page = None

        if erase_self_submissions:
            self.submissions = []

        print(f"Started {datetime.datetime.now()}")

        with open(self.file_path, "wb") as file:

            while len(self.submissions) < self.max_submissions:
                last_page = self.crawl_page(last_page)

                if not last_page:
                    # An empty page means there is nothing left to download.
                    # Stop here: indexing last_page[-1] below would raise
                    # IndexError on an empty page.
                    break

                self.submissions += last_page
                pickle.dump(last_page, file)
                print(f"---- pickled {len(self.submissions)} posts so far ------")
                print(f"Last post title: {last_page[-1]['title']}")

                # Pause between requests.
                time.sleep(3)

        print("--------------------------------------")
        print(f"Finished {datetime.datetime.now()}")
        print(f"PICKLED {len(self.submissions)} SUBMISSIONS to file:")
        print(f"{self.file_path}")
        print("--------------------------------------")

In [None]:
def open_list(urls):
    """
    Opens a list of urls in web browser for convenience in viewing data

    :param urls: Iterable of URL strings; each one is passed to webbrowser.open.

    :return: None.
    """
    # Plain loop: the calls are made for their side effect only, so there is
    # no reason to build a throwaway list of their return values.
    for url in urls:
        webbrowser.open(url)

## Unpickling, cleaning downloaded pushshift data

In [None]:
# The file holds several pickle records back to back, each one a list of
# posts (presumably the pages written by SubredditCrawler.crawl_subreddit).
# Keep loading records until the file is exhausted and flatten them.
# NOTE: pickle.load can execute arbitrary code - only read files you created yourself.
pickled_posts = []
with open('data/all_nattyorjuice_submissions.pkl', "rb") as file:
    while True:
        try:
            pickled_posts.extend(pickle.load(file))
        except EOFError:
            # No more records in the file.
            break

len(pickled_posts)

### Remove invalid posts

In [None]:
# Keep only posts with at least two comments; posts below that threshold are
# the ones this section treats as invalid.
mostly_valid = [
    submission
    for submission in pickled_posts
    if submission['num_comments'] >= 2
]
len(mostly_valid)

### Most common post link domains

In [None]:
# Tally the 'domain' field over the remaining posts, most frequent first.
Counter(submission['domain'] for submission in mostly_valid).most_common()

### i.redd.it posts AKA just the images

In [None]:
# Posts whose link domain is exactly 'i.redd.it'.
i_redd_it = [
    submission
    for submission in mostly_valid
    if submission['domain'] == 'i.redd.it'
]
len(i_redd_it)

### Remove posts with certain flair, e.g. Meme

In [None]:
# Not every post carries a 'link_flair_text' key; select the ones that do.
flaired = [
    submission
    for submission in i_redd_it
    if 'link_flair_text' in submission
]
len(flaired)

In [None]:
# Tally the flair text over the flaired posts, most frequent first.
Counter(submission['link_flair_text'] for submission in flaired).most_common()

In [None]:
def exclude_irrelevant_posts(posts):
    """
    Return the posts that are not flaired as 'Meme', in their original order.

    Memes are not useful for our purposes. Posts without a 'link_flair_text'
    key, or with any other flair value, are kept.
    """
    return [
        candidate
        for candidate in posts
        if not (
            'link_flair_text' in candidate
            and candidate['link_flair_text'] == 'Meme'
        )
    ]

In [None]:
# Drop 'Meme'-flaired posts from the i.redd.it posts; posts without flair are kept.
pictures_no_memes = exclude_irrelevant_posts(i_redd_it)
len(pictures_no_memes)

### Investigate how to exclude deleted posts

In [None]:
# Dump the full JSON of the sample response to look for fields that could mark deleted posts.
response.json()