In [1]:
import numpy as np
import pandas as pd
import requests
import time
import datetime as dt
import json
from bs4 import BeautifulSoup 

### Using Pushshift Search Function

In [2]:
# References used to create this code: 
# https://github.com/pushshift/api
# DSI instructor Chuck Dye
# https://pushshift.io/author/stuck_in_the_matrix/

def pushshift(subreddit, post_type='submission', loops=1, size=500, skip=50):
# subreddit: str, name of subreddit to search for
# post_type: {'submission', 'comment'}, type of post to search for
# loops: int, number of times to request posts
# size: int, number of posts per request (max 500 per pushshift api guide)
# skip: int, number of days back to search in each loop 
        # increase if too many duplicate posts are returned, decrease if you want to skip fewer posts
    
    # data fields to return for submissions
    subfields = ['author', 'author_fullname', 'created_utc', 'id', 'num_comments', 'permalink', 
                 'score', 'selftext', 'subreddit', 'title', 'url', 'is_self']  
    
    # data fields to return for comments
    comfields = ['author', 'author_fullname', 'body', 'created_utc', 'id', 'parent_id', 
                'permalink', 'score', 'subreddit']
    
    # instantiate list for posts data
    list_posts = [] 
    url_stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(post_type, subreddit, size)
    # skip a minimum of 1 day
    after = 1    
    
    
    # check before requesting data
    if post_type not in ['comment', 'submission']:
        print("post_type must be 'comment', 'submission'")
        return None
    
    
    for i in range(loops):
        # add parameters to url to skip posts (after could be used to match up to post at end of previous loop if skip = 0)
        url = '{}&after={}d'.format(url_stem, skip * i + after) 
        # monitor status as loops run
        print(i, url)
        # get data
        res = requests.get(url)
        # add dictionaries for posts to list_posts
        list_posts.extend(res.json()['data']) 
        # sleep
        #Suspend execution of the calling thread for the given number of seconds. 
        #The argument may be a floating point number to indicate a more precise sleep time. 
        #The actual suspension time may be less than that requested because any caught signal will terminate the sleep() following execution of that signal’s catching routine. 
        #Also, the suspension time may be longer than requested by an arbitrary amount because of the scheduling of other activity in the system.
        time.sleep(1) 
        
    # turn list_posts (a list of dictionaries where each dictionary contains data on one post) into a dataframe
    df_posts = pd.DataFrame.from_dict(list_posts) 

    # filter fields for submissions or comments
    if post_type == 'submission':
        df_posts = df_posts[subfields]
    elif post_type == 'comment':
        df_posts = df_posts[comfields]  
#     else:
#         print("post_type must be 'submission' or 'comment'")
#         return None

    # drop any duplicates
    df_posts.drop_duplicates(inplace=True)
    # add a field identifying submissions or comments
    df_posts['post_type'] = post_type
    
    return df_posts

### Get Reddit posts and save to csv

In [3]:
page = requests.get("https://www.reddit.com/r/elonmusk/")

In [4]:
elon_subs = pushshift('elonmusk', post_type='submission', loops=50, skip=1)
print('shape', elon_subs.shape)
elon_subs.to_csv('elon-pushshift.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=2d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=3d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=4d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=5d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=6d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=7d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=8d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=9d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=10d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=elonmusk&size=500&after=11d
11 http

In [5]:
futurology = pushshift('Futurology', post_type='submission', loops=50, skip=1)
futurology.to_csv('futurology-pushshift.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=2d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=3d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=4d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=5d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=6d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=7d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=8d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=9d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size=500&after=10d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=Futurology&size

In [18]:
print('shape', futurology.shape)

shape (3944, 13)


In [6]:
page2 = requests.get("https://www.reddit.com/r/https://www.reddit.com/r/Futurology/")

In [7]:
elon_coms = pushshift('elonmusk', post_type='comment', loops=20, skip=1)
print('shape', elon_coms.shape)
elon_coms.to_csv('elon_coms-pushshift.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=11d
11 https://api.pushshift.io/reddit/searc

In [8]:
future_coms = pushshift('Futurology', post_type='comment', loops=20, skip=1)
print('shape', future_coms.shape)
future_coms.to_csv('future_coms.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=11d
11 https://api.pus

In [9]:
def pushshift_2(subreddit, post_type='submission', loops=1, size=500, skip=75):
# subreddit: str, name of subreddit to search for
# post_type: {'submission', 'comment'}, type of post to search for
# loops: int, number of times to request posts
# size: int, number of posts per request (max 500 per pushshift api guide)
# skip: int, number of days back to search in each loop 
        # increase if too many duplicate posts are returned, decrease if you want to skip fewer posts
    
    # data fields to return for submissions
    subfields = ['author', 'author_fullname', 'created_utc', 'id', 'num_comments', 'permalink', 
                 'score', 'selftext', 'subreddit', 'title', 'url', 'is_self']  
    
    # data fields to return for comments
    comfields = ['author', 'author_fullname', 'body', 'created_utc', 'id', 'parent_id', 
                'permalink', 'score', 'subreddit']
    
    # instantiate list for posts data
    list_posts = [] 
    url_stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(post_type, subreddit, size)
    # skip a minimum of 1 day
    after = 1    
    
    
    # check before requesting data
    if post_type not in ['comment', 'submission']:
        print("post_type must be 'comment', 'submission'")
        return None
    
    
    for i in range(loops):
        # add parameters to url to skip posts (after could be used to match up to post at end of previous loop if skip = 0)
        url = '{}&after={}d'.format(url_stem, skip * i + after) 
        # monitor status as loops run
        print(i, url)
        # get data
        res = requests.get(url)
        # add dictionaries for posts to list_posts
        list_posts.extend(res.json()['data']) 
        # sleep
        time.sleep(1) 
        
    # turn list_posts (a list of dictionaries where each dictionary contains data on one post) into a dataframe
    df_posts = pd.DataFrame.from_dict(list_posts) 

    # filter fields for submissions or comments
    if post_type == 'submission':
        df_posts = df_posts[subfields]
    elif post_type == 'comment':
        df_posts = df_posts[comfields]  
#     else:
#         print("post_type must be 'submission' or 'comment'")
#         return None

    # drop any duplicates
    df_posts.drop_duplicates(inplace=True)
    # add a field identifying submissions or comments
    df_posts['post_type'] = post_type
    
    return df_posts

In [10]:
elon_coms_2 = pushshift_2('elonmusk', post_type='comment', loops=20, skip=1)
print('shape', elon_coms_2.shape)
elon_coms_2.to_csv('elon_coms-pushshift2.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=11d
11 https://api.pushshift.io/reddit/searc

In [11]:
future_coms_2 = pushshift_2('Futurology', post_type='comment', loops=20, skip=1)
print('shape', future_coms_2.shape)
future_coms_2.to_csv('future_coms_2.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=11d
11 https://api.pus

In [12]:
def pushshift_3(subreddit, post_type='submission', loops=1, size=500, skip=200):
# subreddit: str, name of subreddit to search for
# post_type: {'submission', 'comment'}, type of post to search for
# loops: int, number of times to request posts
# size: int, number of posts per request (max 500 per pushshift api guide)
# skip: int, number of days back to search in each loop 
        # increase if too many duplicate posts are returned, decrease if you want to skip fewer posts
    
    # data fields to return for submissions
    subfields = ['author', 'author_fullname', 'created_utc', 'id', 'num_comments', 'permalink', 
                 'score', 'selftext', 'subreddit', 'title', 'url', 'is_self']  
    
    # data fields to return for comments
    comfields = ['author', 'author_fullname', 'body', 'created_utc', 'id', 'parent_id', 
                'permalink', 'score', 'subreddit']
    
    # instantiate list for posts data
    list_posts = [] 
    url_stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(post_type, subreddit, size)
    # skip a minimum of 1 day
    after = 1    
    
    
    # check before requesting data
    if post_type not in ['comment', 'submission']:
        print("post_type must be 'comment', 'submission'")
        return None
    
    
    for i in range(loops):
        # add parameters to url to skip posts (after could be used to match up to post at end of previous loop if skip = 0)
        url = '{}&after={}d'.format(url_stem, skip * i + after) 
        # monitor status as loops run
        print(i, url)
        # get data
        res = requests.get(url)
        # add dictionaries for posts to list_posts
        list_posts.extend(res.json()['data']) 
        # sleep
        time.sleep(1) 
        
    # turn list_posts (a list of dictionaries where each dictionary contains data on one post) into a dataframe
    df_posts = pd.DataFrame.from_dict(list_posts) 

    # filter fields for submissions or comments
    if post_type == 'submission':
        df_posts = df_posts[subfields]
    elif post_type == 'comment':
        df_posts = df_posts[comfields]  
#     else:
#         print("post_type must be 'submission' or 'comment'")
#         return None

    # drop any duplicates
    df_posts.drop_duplicates(inplace=True)
    # add a field identifying submissions or comments
    df_posts['post_type'] = post_type
    
    return df_posts

In [13]:
elon_coms_3 = pushshift_3('elonmusk', post_type='comment', loops=20, skip=1)
print('shape', elon_coms_3.shape)
elon_coms_3.to_csv('elon_coms-pushshift3.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=elonmusk&size=500&after=11d
11 https://api.pushshift.io/reddit/searc

In [14]:
future_coms_3 = pushshift_3('Futurology', post_type='comment', loops=20, skip=1)
print('shape', future_coms_3.shape)
future_coms_3.to_csv('future_coms_3.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=Futurology&size=500&after=11d
11 https://api.pus

### Create CSV for analysis of comment body text only

In [15]:
df = pd.concat([elon_coms[['body', 'subreddit']], 
                future_coms[['body', 'subreddit']], 
                elon_coms_2[['body', 'subreddit']], 
                 future_coms_2[['body', 'subreddit']],
                 elon_coms_3[['body', 'subreddit']], 
                 future_coms_3[['body', 'subreddit']]], 
                ignore_index=True)
df.to_csv('comments.csv', index=False)

In [16]:
df.shape

(11261, 2)

In [17]:
df.head()

Unnamed: 0,body,subreddit
0,They shared.\nThey did not share.\nThey resist...,elonmusk
1,"Sorry to break it to you, but corporate monopo...",elonmusk
2,And wearing his Elon's Musk,elonmusk
3,This is where it starts getting a little weird,elonmusk
4,[removed],elonmusk
