# A Scraper for the WallStreetBets subreddit created using PRAW and PSAW

This scraper produces JSON files of submissions from the WallStreetBets subreddit. Each JSON file contains a list of dictionaries with relevant information about the submission for my analysis: post title, text, label, upvotes, upvote ratio, id and URL. I initially used this to scrape 200k WallStreetBets posts.

I had to combine PRAW and PSAW because some of the specific Reddit submission search fields that I needed (link_flair_text, score) were not accurate in PSAW, but PRAW's time-range search functionality has been disabled. This combined scraper first performs a time-range search on the WallStreetBets subreddit using PSAW. The submission ids from those results are then put into PRAW, which mines the relevant submission information.

PRAW (https://praw.readthedocs.io/en/latest/) and PSAW were both used to create this scraper. 



In [1]:
import json
import praw
# PRAW client; get_more_posts() uses it to load each submission by id.
# NOTE(review): the three 'SECRET' values look like redacted placeholders --
# supply real Reddit API credentials before running.
reddit = praw.Reddit(client_id='SECRET', client_secret='SECRET', user_agent='SECRET')

In [2]:
from psaw import PushshiftAPI
# Pushshift client; get_more_posts() uses it for the time-bounded submission search.
api = PushshiftAPI()
# Handle to r/wallstreetbets.
# NOTE(review): not referenced by any other cell visible in this notebook.
subreddit = reddit.subreddit('wallstreetbets')

import datetime

In [3]:
def _attr_or_none(obj, name):
    """Return ``getattr(obj, name)``, or ``None`` if reading the attribute fails."""
    try:
        return getattr(obj, name)
    except Exception:
        return None


def get_more_posts(start = datetime.date(2021,2,15), lim=1000):
    """Fetch up to ``lim`` r/wallstreetbets submissions created before ``start``.

    PSAW is used only for the time-bounded search; every hit is then re-read
    through PRAW by id and flattened into a plain dictionary.

    Args:
        start: Upper time bound, passed unchanged as PSAW's ``before``
            argument. Callers in this notebook pass either a
            ``datetime.date`` or an integer timestamp.
        lim: Maximum number of submissions requested from PSAW.

    Returns:
        A tuple ``(posts, last_created_utc)``. ``posts`` holds one dict per
        submission that could be read through PRAW (submissions that fail are
        skipped, so it may be shorter than the search result).
        ``last_created_utc`` is the ``created_utc`` of the last search hit;
        feed it back in as ``start`` to keep paging.
        NOTE(review): paging backwards in time assumes PSAW returns
        newest-first results -- confirm.

    Raises:
        IndexError: If the PSAW search returns no submissions at all.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)

    # search for posts from before the specified time in psaw
    results = list(api.search_submissions(before=start,
                                subreddit='wallstreetbets', #change subreddit
                                filter=['url','num_comments','created_utc','id'], #change traits returned
                                limit=lim))

    # An empty search used to fail on results[-1] with a bare "list index out
    # of range"; keep the exception type but say what actually happened.
    if not results:
        raise IndexError("PSAW returned no submissions before {!r}".format(start))

    posts = []

    for result in results:

        try:
            # insert the id of the search hit into PRAW
            submission = praw.models.Submission(reddit, id=result.id)

            # Walk the comment listing once and reuse it for ids, bodies and scores.
            comments = list(submission.comments)

            post_dict = {}
            post_dict["title"] = submission.title
            post_dict["text"] = submission.selftext
            post_dict["label"] = submission.link_flair_text
            post_dict['score'] = submission.score
            post_dict['ups'] = submission.ups
            post_dict['downs'] = submission.downs
            post_dict['upvote_ratio'] = submission.upvote_ratio
            post_dict['id'] = submission.id
            post_dict['url'] = submission.url
            post_dict['comments_id'] = [str(comment) for comment in comments]
            # Entries without a readable body/score are recorded as None so
            # the three comment lists stay index-aligned.
            post_dict['comments_text'] = [_attr_or_none(comment, "body") for comment in comments]
            # NOTE: this key is "comment_scores" (singular "comment"), unlike
            # its two siblings; left unchanged so existing JSON consumers keep working.
            post_dict["comment_scores"] = [_attr_or_none(comment, "score") for comment in comments]
            post_dict['date'] = submission.created_utc
            # Counts only the comments collected above, not Reddit's own total.
            post_dict['num_comments'] = len(post_dict['comments_id'])

        except Exception as err:
            # Best-effort scrape: skip a submission that cannot be read, but
            # let KeyboardInterrupt/SystemExit through so a long run can be
            # stopped (a bare ``except:`` swallowed those too).
            log.debug("skipping submission %s: %r", getattr(result, "id", "?"), err)
            continue

        posts.append(post_dict)

    #return list of posts and the timestamp of the last post in the search. you use this to iterate further back in time
    return posts, results[-1].created_utc


In [9]:
### test one search
# Requests a single submission created before 2020-01-14 to check that the PSAW client responds.
# NOTE(review): this queries 'rateme', not 'wallstreetbets', and `results` is
# not read by any later cell visible here.
results = list(api.search_submissions(before=datetime.date(2020,1,14),
                                subreddit='rateme', #change subreddit
                                filter=['url','num_comments','created_utc','id','comments','body'], #change traits returned
                                limit=1))  

In [10]:
### test function once 
# Smoke test: request a single submission from before 2021-02-15. The call's
# return value, a (posts, last_created_utc) tuple, is echoed as the cell output.

get_more_posts(datetime.date(2021,2,15),lim=1)

([{'title': 'Created a cliff for myself to jump off of',
   'text': '[deleted]',
   'label': 'Loss',
   'score': 1,
   'ups': 1,
   'downs': 0,
   'upvote_ratio': 1.0,
   'id': 'lk0rzi',
   'url': 'https://i.redd.it/aq6r2ogz7jh61.jpg',
   'comments_id': ['gnhbg9b'],
   'comments_text': ["Screenshots of your positions must show equity or gains/losses of more than $2,500 or $10,000 USD for options or stocks respectively. \n\nWe don't just want a % change chart. Tell us what you traded, when, for how much, and why!\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/wallstreetbets) if you have any questions or concerns.*"],
   'comments_scores': [1],
   'date': 1613347186.0,
   'num_comments': 1},
  {'title': 'I will ship this to the biggest loss porn from last week. Let’s see some loss porn baby!!!',
   'text': '',
   'label': 'Loss',
   'score': 1,
   'ups': 1,
   'downs': 0,
   'upvote_ratio': 1.0,
   'i

In [4]:
target = 10000000 #the number of posts you want to acquire
post_list = [] #list of posts

time = 1611781244 ## starting time; each batch replaces it with the timestamp get_more_posts returns

# Set when the temporary one-day cutoff ends the run. As before, that exit
# path leaves only the checkpoint files and writes no "final" files.
reached_cutoff = False

# Keep calling get_more_posts until post_list is long enough.
while len(post_list) < target:

    print(time) #optional for seeing when a new loop starts (and in case you need the time to iterate back further)

    ### get 1000 posts (the max number needed)
    try:
        (posts, time) = get_more_posts(time, 1000)
    except IndexError:
        # get_more_posts raises IndexError when the search comes back empty:
        # nothing is left to fetch, so stop and save what we have instead of
        # crashing before the final files are written.
        break
    post_list.extend(posts)

    ### checkpoint after every batch. Note this dumps the whole accumulated
    ### list (not just this batch); the file name carries the running total.
    with open('wsb_psaw_praw_{}.json'.format(str(len(post_list))), 'w') as json_file:
        json.dump(post_list, json_file)

    if time < 1611723584: ## temporary for extracting one day
        reached_cutoff = True
        break

### target reached (or the search ran dry): write the final outputs
if not reached_cutoff:

    with open('wsb_psaw_praw_final_{}.json'.format(str(len(post_list))), 'w') as json_file: #save final file
        json.dump(post_list, json_file)

    with open("final_time.json",'w') as json_file: #save final timestamp in case you need to run some more
        json.dump(time,json_file)

    print(time) #optional for seeing the last timestamp
    

1611781244
1611781028
1611780820
1611780624
1611780435
1611780256
1611780085
1611779901
1611779729
1611779513
1611779320
1611779129
1611778950
1611778781
1611778602
1611778464
1611778291
1611778151
1611777986
1611777834
1611777691
1611777567
1611777422
1611777264
1611777100
1611776939
1611776775
1611776595
1611776419
1611776182
1611775931
1611775612
1611775396
1611775164
1611774984
1611774794
1611774614
1611774376
1611774085
1611773902
1611773757
1611773610
1611773423
1611773231
1611772995
1611772889
1611772797
1611772692
1611772595
1611772486
1611772402
1611772309
1611772205
1611772103
1611772021
1611771924
1611771824
1611771720
1611771616
1611770894
1611770800
1611770712




1611770631
1611770531
1611770452
1611770359
1611770269
1611770181
1611770089
1611770005
1611769904
1611769809
1611769709
1611769625
1611769535
1611769447
1611769373
1611769293
1611769207
1611769120
1611769037
1611768957
1611768888
1611768813
1611768736
1611768661
1611768581
1611768509
1611768428
1611768346
1611768260
1611768176
1611768090
1611768013
1611767937
1611767870
1611767791
1611767714
1611767645
1611767574
1611767495
1611767421
1611767345
1611767265
1611767175
1611767098
1611767029
1611766940
1611766866
1611766797
1611766713
1611766636
1611766564
1611766482
1611766400
1611766324
1611766233
1611766152
1611766077
1611766013
1611765929
1611765831
1611765736
1611765646
1611765567
1611765480
1611765375
1611765290
1611765208
1611765115
1611765030
1611764947
1611764853
1611764751
1611764658
1611764564
1611764476
1611764391
1611764297
1611764215
1611764120
1611764024
1611763931
1611763825
1611763736
1611763636
1611763519
1611763404
1611763301
1611763204
1611763120
1611763017
1611762915