In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(42)

In [2]:
def query_pushshift (subreddit, kind='submission', skip=60, times=10, 
                      subfields=['title', 'score', 'is_self', 'selftext', 'created_utc', 'num_comments'],
                    comfields=['body', 'score', 'created_utc']):
    
    
    # Ensure we only input a valid search type
    assert (kind in ['submission', 'comment']), "Invalid kind of search!"
    
    # Initalize the list holding all the query results
    mylist = []
    
    # Create stem of the API query string
    stem="https://api.pushshift.io/reddit/search/"+kind+"/?subreddit="+subreddit+"&size=500" 
    
    # Iterate through queries and convert each query to a dictionary
    for i in range(1, times):
        # Skipping backwards in time, each query looks for older records
        URL=stem+"&before="+str(skip*i)+"d"
        print (URL)
        response = requests.get(URL)
        mine= response.json()['data']
        df=pd.DataFrame.from_dict(mine)
        mylist.append(df)
        time.sleep(3)
        
    # Stitch the query frames back together
    full=pd.concat(mylist)
    
    if kind == 'submission':
        # Limit the features to the ones we're interested in
        full=full[subfields]
        
        #De-duplicate the observations
        full=full.drop_duplicates()
        
        #Drop out entries that don't contain body text
        full=full.loc[full['is_self']==True]
    else:
        
        # Limit the features to the ones we're interested in
        full=full[comfields]
        
        #De-duplicate the observations
        full=full.drop_duplicates()
    
    # Convert the time to a Pandas DateTime Object (courtsey of 
    #http://www.storybench.org/how-to-scrape-reddit-with-python/)
    
    def get_date(created):
        return dt.datetime.fromtimestamp(created)
    
    _timestamp = full["created_utc"].apply(get_date)
    
    full['timestamp']=_timestamp
    
    
    #Confirm the shape of the output
    print (full.shape)
    
    return full
    
    


In [3]:
prochoice=query_pushshift('prochoice')

https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=360d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=420d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=480d
https://api.pushshift.io/reddit/search/submission/?subreddit=prochoice&size=500&before=540d
(377, 7)


In [4]:
prolife=query_pushshift('Prolife')

https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=360d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=420d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=480d
https://api.pushshift.io/reddit/search/submission/?subreddit=Prolife&size=500&before=540d
(665, 7)


In [5]:
evolution=query_pushshift('DebateEvolution')

https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=360d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=420d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=480d
https://api.pushshift.io/reddit/search/submission/?subreddit=DebateEvolution&size=500&before=540d
(944, 7)


In [6]:
creation=query_pushshift('Creation')

https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=180d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=240d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=300d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=360d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=420d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=480d
https://api.pushshift.io/reddit/search/submission/?subreddit=Creation&size=500&before=540d
(321, 7)


In [7]:
prochoice.to_csv('./Prochoice.csv')

prolife.to_csv('./Prolife.csv')

evolution.to_csv('./DebateEvolution.csv')

creation.to_csv('./Creation.csv')

In [8]:
prochoicecom=query_pushshift('prochoice', kind='comment')

https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=60d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=120d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=180d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=240d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=300d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=360d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=420d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=480d
https://api.pushshift.io/reddit/search/comment/?subreddit=prochoice&size=500&before=540d
(4499, 4)


In [9]:
prolifecom=query_pushshift('prolife', kind='comment')

https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=60d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=120d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=180d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=240d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=300d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=360d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=420d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=480d
https://api.pushshift.io/reddit/search/comment/?subreddit=prolife&size=500&before=540d
(4499, 4)


In [10]:
evolutioncom=query_pushshift('DebateEvolution', kind='comment')

https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=60d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=120d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=180d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=240d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=300d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=360d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=420d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=480d
https://api.pushshift.io/reddit/search/comment/?subreddit=DebateEvolution&size=500&before=540d
(4499, 4)


In [11]:
creationcom=query_pushshift('Creation', kind='comment')

https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=60d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=120d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=180d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=240d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=300d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=360d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=420d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=480d
https://api.pushshift.io/reddit/search/comment/?subreddit=Creation&size=500&before=540d
(3499, 4)


In [12]:
prochoicecom.to_csv('./ProchoiceComments.csv', encoding='utf-8')

prolifecom.to_csv('./ProlifeComments.csv', encoding='utf-8')

evolutioncom.to_csv('./DebateEvolutionComments.csv', encoding='utf-8')

creationcom.to_csv('./CreationComments.csv', encoding='utf-8')

In [2]:
# Modified query function to just pull titles of recently 
# submitted submissions

def mod_query_pushshift (skip=1, 
                         times=15, 
                         subfields=['title', 'score', 'subreddit', 
                                    'created_utc', 
                                    'num_comments']):
    
    # Initalize the list holding all the query results
    mylist = []
    
    # Create stem of the API query string
    stem="https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500" 
    
    # Iterate through queries and convert each query to a dictionary
    for i in range(1, times):
        # Skipping backwards in time, each query looks for older records
        URL=stem+"&before="+str(skip*i)+"d"
        print (URL)
        response = requests.get(URL)
        mine= response.json()['data']
        df=pd.DataFrame.from_dict(mine)
        mylist.append(df)
        time.sleep(3)
        
    # Stitch the query frames back together
    full=pd.concat(mylist)
    
    # Limit the features to the ones we're interested in
    full=full[subfields]
        
    #De-duplicate the observations
    full=full.drop_duplicates()
    
    # Convert the time to a Pandas DateTime Object (courtsey of 
    #http://www.storybench.org/how-to-scrape-reddit-with-python/)
    
    def get_date(created):
        return dt.datetime.fromtimestamp(created)
    
    _timestamp = full["created_utc"].apply(get_date)
    
    full['timestamp']=_timestamp
    
    
    #Confirm the shape of the output
    print (full.shape)
    
    return full
    

In [3]:
general=mod_query_pushshift()

https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=1d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=2d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=3d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=4d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=5d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=6d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=7d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=8d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=9d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=10d
https://api.pushshift.io/reddit/search/submission/?num_comments=>100&size=500&before=11d
https://api.pushshift.io/reddi

In [4]:
general['num_comments'].median()


169.0

In [5]:
general['num_comments'].describe()

count     7000.000000
mean       310.812286
std        640.331762
min        101.000000
25%        125.000000
50%        169.000000
75%        281.000000
max      25236.000000
Name: num_comments, dtype: float64

In [6]:
general['num_comments'].value_counts()

104      94
103      88
111      86
108      84
102      84
101      83
109      83
105      77
107      76
110      74
106      73
121      72
112      72
119      70
120      69
113      68
115      67
114      66
134      66
116      65
129      64
117      62
118      61
125      60
124      58
133      56
128      56
126      54
123      53
122      53
         ..
1665      1
1641      1
1601      1
1589      1
7730      1
1545      1
5627      1
1505      1
3540      1
9657      1
1417      1
7882      1
1753      1
3816      1
1965      1
10407     1
2139      1
2131      1
2115      1
2099      1
1977      1
1937      1
1781      1
25236     1
1917      1
3928      1
1833      1
1805      1
1785      1
2049      1
Name: num_comments, Length: 951, dtype: int64

In [7]:
general.head()

Unnamed: 0,title,score,subreddit,created_utc,num_comments,timestamp
0,Tay-K's Manager Confirms the Rapper Isn't Faci...,1,hiphopheads,1527191819,296,2018-05-24 15:56:59
1,If the Capitals win the Stanley Cup Ovechkins ...,1,hockey,1527191792,174,2018-05-24 15:56:32
2,Pro way to check if both brake light bulbs work.,1,motorcycles,1527191778,202,2018-05-24 15:56:18
3,"Otto per mille, alla Chiesa Cattolica un milia...",1,italy,1527191759,107,2018-05-24 15:55:59
4,Reminder: Why Lebron's Supporting Cast Being B...,1,nba,1527191751,272,2018-05-24 15:55:51


In [8]:
general.to_csv('./general.csv', encoding='utf-8')