In [None]:
# Load IPython's autoreload extension and set it to mode 2, so that imported
# modules are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp pushshift

# Collect all r/nattyorjuice post URLs for manual data gathering

## Install/import stuff

In [None]:
#hide
!pip install .[dev]

#!jupyter nbextension install https://github.com/drillan/jupyter-black/archive/master.zip --user
#!jupyter nbextension enable jupyter-black-master/jupyter-black

!nbdime extensions --enable --user

In [None]:
# Enable module auto-reloading.
# NOTE(review): the first cell of the notebook already loads autoreload, so
# these two lines look redundant — confirm before removing.
%load_ext autoreload
%autoreload 2

# Load the dotenv IPython extension and run its %dotenv magic.
# Presumably this reads a local .env file into os.environ; the PRAW cell
# further down reads REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET from there.
%load_ext dotenv
%dotenv

In [None]:
# export

# Loads environment variables from .env file

import os
import praw
import requests
import time
import datetime
import sys
import pickle
from requests.adapters import HTTPAdapter
from collections import Counter
import webbrowser

In [None]:
# PRAW (Reddit API) settings. The credentials are read from the environment so
# that no secret is hard-coded in the notebook; either value is None when the
# corresponding variable is not set.
client_id = os.environ.get('REDDIT_CLIENT_ID')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET')
user_agent = 'User-Agent: Steroid detector bot by /u/thetreecycle'

# Confirm that both credentials were found WITHOUT echoing their values into
# the saved notebook output (cell outputs get committed along with the code).
bool(client_id) and bool(client_secret)

## Data discovery

In [None]:
# Pushshift endpoints used by the exploratory request in the next cell.
pushshift = "https://api.pushshift.io/reddit/"
pushshift_submission_url = f"{pushshift}submission/search"

# Unix timestamp nine (365-day) years before now. It is computed here but is
# not included in `params`.
one_year_seconds = 365 * 24 * 60 * 60
before = int(time.time() - 9 * one_year_seconds)

# Query parameters: a sample of 25 submissions from r/nattyorjuice.
params = dict(subreddit="nattyorjuice", size="25")

In [None]:
# Fetch one sample page from Pushshift and look at the link-related fields.
# The timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(pushshift_submission_url, params=params, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error body

# Parse the body once instead of re-parsing it for every lookup.
sample_posts = response.json()['data']
print(f"{len(sample_posts)} posts returned")

# First post of the page (an empty dict when the query matched nothing);
# inspect `first_post` on its own to see every field.
first_post = sample_posts[0] if sample_posts else {}

# The three candidate link fields, side by side.
{key: first_post.get(key) for key in ('permalink', 'full_link', 'url')}

## Crawling / downloading data from Pushshift

In [None]:
class SubredditCrawler(object):
    """
    Crawl the submissions of one subreddit through the Pushshift API.

    Borrowed heavily from here: https://www.textjuicer.com/2019/07/crawling-all-submissions-from-a-subreddit/
    """

    pushshift = "https://api.pushshift.io/reddit/"
    pushshift_submission_url = pushshift + "submission/search"

    def __init__(self, subreddit, file_path, max_submissions=200):
        """
        :param subreddit: Name of the subreddit to crawl.
        :param file_path: Path of the file that crawled pages are pickled to.
        :param max_submissions: Stop crawling once at least this many
            submissions have been collected.
        """
        self.subreddit = subreddit
        self.file_path = file_path
        self.max_submissions = max_submissions
        self.submissions = []

        self.session = requests.Session()
        # Adapters are selected by URL prefix. The API URL above is https, so
        # the retrying adapter has to be mounted on the https prefix to have
        # any effect; the http prefix is kept for backward compatibility.
        for prefix in ('https://api.pushshift.io/', 'http://api.pushshift.io/'):
            self.session.mount(prefix, HTTPAdapter(max_retries=5))

In [None]:
class SubredditCrawler(SubredditCrawler):
    def crawl_page(self, last_page=None):
        """
        Crawl one page of results from this crawler's subreddit.

        Asks Pushshift for 100 submissions sorted by ``created_utc`` in
        descending order. When ``last_page`` is given, only submissions
        created before its last entry are requested.

        :param last_page: The last downloaded page, or None for the first page.

        :return: A page of results (the ``"data"`` list of the JSON response),
                 or an empty list when ``last_page`` was empty.
        :raises Exception: If the server answers with a non-OK status code.
        """
        params = {
            "subreddit": self.subreddit,
            "size": 100,
            "sort": "desc",
            "sort_type": "created_utc",
        }
        if last_page is not None:
            if len(last_page) > 0:
                # resume from where we left at the last page
                params["before"] = last_page[-1]["created_utc"]
            else:
                # the last page was empty, we are past the last page
                return []
        # Use the class attribute rather than a notebook-level global of the
        # same name, so the crawler does not depend on another cell having run.
        results = self.session.get(self.pushshift_submission_url, params=params)

        if not results.ok:
            # something wrong happened
            raise Exception(
                "Server returned status code {}".format(results.status_code)
            )
        return results.json()["data"]

In [None]:
class SubredditCrawler(SubredditCrawler):
    def crawl_subreddit(self, erase_self_submissions=False):
        """
        Crawl submissions from the subreddit and pickle them page by page.

        Each non-empty page is appended to ``self.submissions`` and dumped as
        its own pickle record into ``self.file_path``. Crawling stops when a
        page comes back empty or once ``self.max_submissions`` is reached.
        Isn't strictly correct on the number of submissions it grabs (whole
        pages are added) but doesn't matter too much for our purposes.

        Note: the output file is opened in ``"wb"`` mode, so any existing
        content is truncated on every call — even when ``self.submissions``
        already holds ``max_submissions`` entries and nothing new is crawled.

        :param erase_self_submissions: If True, clear ``self.submissions``
            before crawling.

        :return: None. Results are in ``self.submissions`` and the file.
        """
        last_page = None

        if erase_self_submissions:
            self.submissions = []

        print(f"Started {datetime.datetime.now()}")

        with open(self.file_path, "wb") as file:

            while len(self.submissions) < self.max_submissions:
                last_page = self.crawl_page(last_page)

                if not last_page:
                    # An empty page means we are past the oldest submission.
                    # Stop here: indexing last_page[-1] below would raise
                    # IndexError, and there is nothing to pickle or wait for.
                    break

                self.submissions += last_page
                pickle.dump(last_page, file)
                print(f"---- pickled {len(self.submissions)} posts so far ------")
                print(f"Last post title: {last_page[-1]['title']}")

                # Pause between requests to go easy on the API.
                time.sleep(3)

        print("--------------------------------------")
        print(f"Finished {datetime.datetime.now()}")
        print(f"PICKLED {len(self.submissions)} SUBMISSIONS to file:")
        print(f"{self.file_path}")
        print("--------------------------------------")

In [None]:
def open_list(urls):
    '''
    Open each URL of ``urls`` in the web browser, in order.

    Convenience helper for eyeballing the data; returns nothing.
    '''
    for address in urls:
        webbrowser.open(address)

## Unpickling, cleaning downloaded pushshift data

In [None]:
# export
def unpickle(path):
    """
    Read every pickle record stored back-to-back in ``path`` and return the
    items of all records as one flat list.

    Each record is expected to be an iterable (e.g. one list per crawled
    page). Prints how many items were loaded.

    Only use this on files you created yourself: unpickling untrusted data
    can execute arbitrary code.
    """
    combined = []
    with open(path, "rb") as stream:
        while True:
            try:
                chunk = pickle.load(stream)
            except EOFError:
                # No more records in the file.
                break
            combined.extend(chunk)

    print(f'Unpickled {len(combined)} objects.')
    return combined

In [None]:
# Load all pickled submission pages from disk into one flat list.
# (Presumably this file was written by SubredditCrawler.crawl_subreddit — verify.)
pickled_posts = unpickle('data/all_nattyorjuice_submissions.pkl')

Unpickled 41725 posts.


### Most common post link domains

In [None]:
# Tally how many posts link to each domain, most frequent first.
domain_counts = Counter(post['domain'] for post in pickled_posts)
domain_counts.most_common()

### Remove irrelevant posts

In [None]:
#export
def select_mostly_useful(posts):
    '''
    Return the posts that are most likely usable, dropping mostly invalid ones.

    A post is kept only if all of the following hold:
    - it has at least 2 comments (fewer means no label); a post without a
      `num_comments` field counts as having none,
    - it has a `selftext` field that is neither '[deleted]' nor '[removed]',
    - its `removed_by_category` is missing or None,
    - its link flair is not 'Meme',
    - its domain is 'i.redd.it'.

    It's not a complete solution, as the pushshift data
    doesn't get updated when the post changes, so there
    are many posts which were deleted after the fact.

    :param posts: Iterable of submission dicts.
    :return: New list with the selected posts, in their original order.
    '''
    unusable_selftext = ('[deleted]', '[removed]')
    return [
        post for post in posts
        if post.get('num_comments', 0) >= 2
        and 'selftext' in post
        and post['selftext'] not in unusable_selftext
        and post.get('removed_by_category') is None
        and post.get('link_flair_text') != 'Meme'
        and post.get('domain') == 'i.redd.it'
    ]
    
    
# 34258

In [None]:
# Apply the filter to the full dump and show how many posts remain.
mostly_useful = select_mostly_useful(pickled_posts)
len(mostly_useful)

In [None]:
# Collect the selftext of every remaining post (one entry per post).
selftext = [post['selftext'] for post in mostly_useful]
len(selftext)

In [None]:
# Show that it's only image data: tally the distinct selftext values of the
# remaining posts, most frequent first.
Counter(selftext).most_common()

### Investigate how to exclude deleted posts

In [None]:
# Create the PRAW client from the credentials and user agent defined in the
# "PRAW stuff" cell.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

# Handle for the subreddit under study.
nattyorjuice = reddit.subreddit('nattyorjuice')

In [None]:
# One hand-picked example submission per removal state, keyed by that state.
posts = {
    'deleted': reddit.submission(url='https://www.reddit.com/r/nattyorjuice/comments/oxsr5t/kenneth_the_ii_is_he_natty_or_juice/'),

    'mod_removed': reddit.submission(url='https://www.reddit.com/r/nattyorjuice/comments/owvbpl/chul_soon/'),

    'normal': reddit.submission(url='https://www.reddit.com/r/nattyorjuice/comments/p17okr/how_long_does_it_take_to_get_this_natty_physique/'),

    'spam_removed': reddit.submission(url='https://www.reddit.com/r/nattyorjuice/comments/oqwkmd/55_230lbs_what_do_we_think/'),
}

# PRAW objects are lazy: until an attribute is accessed they only hold what was
# passed to the constructor, so `vars()` would not show the fields returned by
# Reddit (such as `removed_by_category`). Touch one attribute on each
# submission to force the fetch before inspecting them.
for submission in posts.values():
    submission.title

In [None]:
# Dump the attribute dict of the example stored under the 'deleted' key.
vars(posts['deleted'])

In [None]:
# Dump the attribute dict of the example stored under the 'mod_removed' key.
vars(posts['mod_removed'])

In [None]:
# Dump the attribute dict of the example stored under the 'normal' key.
vars(posts['normal'])

In [None]:
# Dump the attribute dict of the example stored under the 'spam_removed' key.
vars(posts['spam_removed'])

If the `removed_by_category` field on a submission is `None`, the submission has usually not been removed or deleted.

Caveat: this is not fully reliable. The selftext is sometimes set to `[removed]` or `[deleted]` without `removed_by_category` being set, so both fields need to be checked.