<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import datetime as dt
import time
import requests

In [2]:
# Pushshift submission-search endpoint, restricted to the r/biology subreddit
url = (
    "https://api.pushshift.io/reddit/search/submission"
    "?subreddit=biology"
)

In [3]:
# Fetch one page of results. requests applies no timeout unless asked, so an
# explicit one keeps a stalled connection from hanging the kernel forever.
res = requests.get(url, timeout=30)

In [4]:
# HTTP status code of the response (200 means the request succeeded)
res.status_code

200

In [5]:
# `assert` raises an AssertionError when its condition is false, so this cell
# stops execution here if the request did not come back with HTTP 200
assert res.status_code == 200

In [6]:
# Confirm what kind of object requests.get() handed back
type(res)

requests.models.Response

In [7]:
# Decode the JSON body of the response into plain Python objects
json_data = res.json()
# NOTE(review): displaying the whole payload prints every field of every
# returned post; the following cells inspect it piece by piece instead
json_data

{'data': [{'all_awardings': [],
   'allow_live_comments': False,
   'author': 'Pistil_Pete',
   'author_flair_css_class': None,
   'author_flair_richtext': [],
   'author_flair_text': None,
   'author_flair_type': 'text',
   'author_fullname': 't2_7a98d',
   'author_patreon_flair': False,
   'author_premium': False,
   'awarders': [],
   'can_mod_post': False,
   'contest_mode': False,
   'created_utc': 1580342485,
   'domain': 'self.biology',
   'full_link': 'https://www.reddit.com/r/biology/comments/evvx4o/editing_a_word_doc_of_a_dna_sequence/',
   'gildings': {},
   'id': 'evvx4o',
   'is_crosspostable': True,
   'is_meta': False,
   'is_original_content': False,
   'is_reddit_media_domain': False,
   'is_robot_indexable': True,
   'is_self': True,
   'is_video': False,
   'link_flair_background_color': '',
   'link_flair_css_class': 'question',
   'link_flair_richtext': [],
   'link_flair_template_id': '9312ad9e-07ae-11e3-a456-22000ab3216d',
   'link_flair_text': 'question',
   'li

In [8]:
# Number of records returned under the 'data' key by this single request
len(json_data['data'])

25

In [9]:
# What does one submission look like?
# json_data['data'] is a list of dictionaries, one per submission
json_data['data'][0]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'Pistil_Pete',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_7a98d',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1580342485,
 'domain': 'self.biology',
 'full_link': 'https://www.reddit.com/r/biology/comments/evvx4o/editing_a_word_doc_of_a_dna_sequence/',
 'gildings': {},
 'id': 'evvx4o',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_css_class': 'question',
 'link_flair_richtext': [],
 'link_flair_template_id': '9312ad9e-07ae-11e3-a456-22000ab3216d',
 'link_flair_text': 'question',
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': F

In [10]:
# Turn the list of submission dicts into a table:
# one row per submission, one column per JSON field
submissions = json_data["data"]
results_df = pd.DataFrame(submissions)
results_df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,thumbnail_height,thumbnail_width,crosspost_parent,crosspost_parent_list,media,media_embed,secure_media,secure_media_embed,author_flair_background_color,author_flair_text_color
0,[],False,Pistil_Pete,,[],,text,t2_7a98d,False,False,...,,,,,,,,,,
1,[],False,Gladari,,[],,text,t2_1p68ub3z,False,False,...,85.0,140.0,,,,,,,,
2,[],False,Gladari,,[],,text,t2_1p68ub3z,False,False,...,111.0,140.0,,,,,,,,
3,[],False,DarlingGirl83,,[],,text,t2_3x7owutg,False,False,...,,,,,,,,,,
4,[],False,Explorer-2020,,[],,text,t2_5jy3rq9b,False,False,...,105.0,140.0,,,,,,,,


In [11]:
def query_pushshift(subreddit, kind = 'submission', day_window = 500, n = 100):
    """Download posts for one subreddit from the Pushshift search API.

    Issues ``n`` GET requests, pausing two seconds after each. Request ``i``
    (1-based) is sent with ``after={day_window * i}d``. All non-empty
    responses are combined into a single DataFrame.

    Parameters
    ----------
    subreddit : str
        Subreddit name; inserted verbatim into the query string.
    kind : str, default 'submission'
        Endpoint segment of the URL (``.../reddit/search/{kind}``).
    day_window : int, default 500
        Step, in days, between the ``after`` values of consecutive requests.
    n : int, default 100
        Number of requests to send. With ``n <= 0`` no request is made.

    Returns
    -------
    pandas.DataFrame
        When ``kind == "submission"``: only the SUBFIELDS columns, with
        duplicate rows dropped and only rows whose ``is_self`` is True.
        Otherwise: every column the API returned. In both cases a
        ``timestamp`` column (``datetime.date`` built from ``created_utc``)
        is appended. If no data was received at all, an empty DataFrame with
        the expected column names is returned.

    Raises
    ------
    AssertionError
        If any request returns a status code other than 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    # requests waits forever by default; cap it so one stalled call cannot
    # block the whole loop
    TIMEOUT_SECONDS = 60

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    # Requests 2980 rows per call.
    # NOTE(review): the previous comment claimed a max of 500 -- confirm
    # whether the server caps `size` below what is requested here.
    stem = f"{BASE_URL}?subreddit={subreddit}&size=2980"

    # one DataFrame per non-empty response page
    posts = []

    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL, timeout=TIMEOUT_SECONDS)
        # Explicit raise instead of `assert`: still an AssertionError for
        # existing callers, but not stripped under `python -O`, and the
        # message says which request failed.
        if response.status_code != 200:
            raise AssertionError(
                f"Pushshift returned HTTP {response.status_code} for {URL}"
            )
        page = pd.DataFrame(response.json()['data'])
        # Skip empty pages so they cannot influence dtypes during concat
        if not page.empty:
            posts.append(page)
        # be polite to the API between calls
        time.sleep(2)

    # Nothing came back (n <= 0 or every page empty): pd.concat([]) would
    # raise, so hand back an empty frame with the expected columns instead.
    if not posts:
        empty_columns = SUBFIELDS if kind == "submission" else ["created_utc"]
        print("Query Complete!")
        return pd.DataFrame(columns=empty_columns + ["timestamp"])

    # ignore_index avoids every page contributing its own 0..k row labels
    full = pd.concat(posts, sort=False, ignore_index=True)

    # if submission
    if kind == "submission":
        # select desired columns and drop duplicates; non-inplace so we never
        # mutate a slice of the concatenated frame
        full = full[SUBFIELDS].drop_duplicates()
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column; assign() returns a new frame, which avoids
    # SettingWithCopyWarning on the filtered slice.
    # NOTE(review): date.fromtimestamp converts using the machine's local
    # time zone although the source column is named created_utc -- confirm
    # that local dates are what downstream code expects.
    full = full.assign(timestamp=full["created_utc"].map(dt.date.fromtimestamp))

    print("Query Complete!")
    return full

In [12]:
# Scrape r/biology with the function's defaults (day_window=500, n=100).
# NOTE(review): that is 100 requests with a 2-second pause after each, and the
# last one asks for after=50000d (roughly 137 years back). Presumably most of
# the older windows return overlapping rows that are later de-duplicated --
# consider passing a smaller n.
results = query_pushshift("biology")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=1000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=1500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=2000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=2500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=3000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=3500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=4000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=4500d
Querying from: https://api.pushshift.io/reddit/search/su

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=39500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=40000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=40500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=41000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=41500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=42000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=42500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=43000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=biology&size=2980&after=43500d
Querying from: https://api.pushshift.io/reddit

In [13]:
# (rows, columns) of the collected posts
results.shape

(3405, 9)

In [14]:
# Peek at the first rows of the collected posts.
# NOTE(review): the output recorded for this cell is a per-column count Series
# (all zeros, dtype int64), not a head() table -- it looks stale from a
# different expression; re-run the cell so code and output agree.
results.head()

title           0
selftext        0
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [15]:
# Persist the scraped posts to disk (path is relative to the notebook's
# working directory).
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column
# by default; pass index=False if no downstream reader relies on it -- confirm.
results.to_csv('../data/subreddit_biology.csv')