In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp submissionshandler
# all_local

In [None]:
# export
import praw
import pickle
from collections import Counter
from pprint import pprint
from steroidsornot.firsttry import PrawClient

In [None]:
from fastcore.test import *

## Unpickling, cleaning downloaded pushshift data

In [None]:
# export
class SubmissionsHandler():
    '''
    Load pickled pushshift submissions and pick out the mostly useful ones.

    Attributes:
        path: the pickle file the submissions were read from.
        submissions: every submission dict found in the file.
        mostly_useful: the subset of `submissions` that passes the
            filters described in `_select_mostly_useful`.
    '''

    # Filter settings, named here so they can be tuned in one place
    # (or overridden in a subclass) instead of being buried in the condition.
    MIN_COMMENTS = 2
    REQUIRED_DOMAIN = 'i.redd.it'
    EXCLUDED_FLAIR = 'Meme'
    TOMBSTONE_SELFTEXTS = ('[deleted]', '[removed]')

    def __init__(self, path):
        '''
        Read all submissions from `path` and immediately filter them.

        path: location of a pickle file written by one or more
            consecutive `pickle.dump` calls, each dumping an iterable
            of submission dicts.
        '''
        self.path = path
        self.submissions = []
        self.mostly_useful = []

        self._unpickle(path)
        self._select_mostly_useful()

    def _unpickle(self, path):
        '''
        Fill `self.submissions` with every object pickled into `path`.

        The file may hold several pickled batches back to back, so keep
        loading until the end of the file is reached.
        '''
        self.submissions = []
        # NOTE(security): pickle.load can execute arbitrary code, so only
        # point this at files you produced yourself.
        with open(path, "rb") as file:
            while True:
                try:
                    self.submissions.extend(pickle.load(file))
                except EOFError:
                    # Raised once there are no more pickled batches to read.
                    break

        print(f'Unpickled {len(self.submissions)} objects.')

    def _is_mostly_useful(self, post):
        '''Return True when `post` passes every filter listed in `_select_mostly_useful`.'''
        return (
            # A missing or empty comment count is treated as zero comments.
            (post.get('num_comments') or 0) >= self.MIN_COMMENTS
            and 'selftext' in post
            and post['selftext'] not in self.TOMBSTONE_SELFTEXTS
            and post.get('removed_by_category') is None
            and post.get('link_flair_text') != self.EXCLUDED_FLAIR
            and post.get('domain') == self.REQUIRED_DOMAIN
        )

    def _select_mostly_useful(self):
        '''
        Remove mostly invalid posts, stuff like:
        few comments (so no label),
        no selftext are often not useful
        post was deleted or removed
        post is flaired as a meme
        post does not link to the required domain (i.redd.it)

        It's not a complete solution, as the pushshift data
        doesn't get updated when the post changes, so there
        are many posts which were deleted after fact.
        '''
        self.mostly_useful = [
            post for post in self.submissions if self._is_mostly_useful(post)
        ]
        print(f'{len(self.mostly_useful)} are mostly useful.\n')

        # Show that none are deleted
        selftext = [post['selftext'] for post in self.mostly_useful]
        print('This should show no [deleted] or [removed] entries:')
        print(Counter(selftext).most_common())

    def common_domains(self):
        '''
            Show the number of posts that link to each domain

            Posts without a `domain` key are counted under `None`.
        '''
        pprint(Counter(post.get('domain') for post in self.submissions).most_common())
        

In [None]:
# Load and filter the full r/nattyorjuice dump; the handler prints its own summary.
SUBMISSIONS_PICKLE = 'data/all_nattyorjuice_submissions.pkl'
submission_handler = SubmissionsHandler(SUBMISSIONS_PICKLE)

In [None]:
# Regression check: the filters are expected to keep exactly this many posts
# from the pickle loaded above.
n_mostly_useful = len(submission_handler.mostly_useful)
test_eq(n_mostly_useful, 16260)

### Investigate how to exclude deleted posts

In [None]:
# Build the project's PRAW wrapper, then ask it for the Reddit client used below.
praw_client = PrawClient()
reddit = praw_client.reddit()

In [None]:
# One example thread per case under investigation, keyed by a label for that case.
example_urls = {
    'deleted': 'https://www.reddit.com/r/nattyorjuice/comments/oxsr5t/kenneth_the_ii_is_he_natty_or_juice/',
    'mod_removed': 'https://www.reddit.com/r/nattyorjuice/comments/owvbpl/chul_soon/',
    'normal': 'https://www.reddit.com/r/nattyorjuice/comments/p17okr/how_long_does_it_take_to_get_this_natty_physique/',
    'spam_removed': 'https://www.reddit.com/r/nattyorjuice/comments/oqwkmd/55_230lbs_what_do_we_think/',
}

# Same labels, each mapped to the submission object for its URL.
posts = {
    label: reddit.submission(url=thread_url)
    for label, thread_url in example_urls.items()
}

In [None]:
# A cell only renders its final expression, so print every example explicitly
# instead of silently dropping the first three.
for label, submission in posts.items():
    # PRAW submissions are lazy: reading an attribute forces the fetch.
    # Without it, vars() only shows the few fields set at construction time.
    submission.title
    print(f'--- {label} ---')
    pprint(vars(submission))

If the `removed_by_category` field on a submission is `None`, the submission has not been removed or deleted.

**Caveat:** this only holds most of the time. The selftext is sometimes set to `[removed]` or `[deleted]` even though `removed_by_category` is not set, so both fields need to be checked.