In [2]:
import requests
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timezone
import time
import os
from IPython.display import clear_output

In [3]:
# Pushshift search endpoints: `url` serves submissions, `url_com` serves comments.
url, url_com = (
    'https://api.pushshift.io/reddit/search/' + endpoint
    for endpoint in ('submission', 'comment')
)

# Function definitions:

In [3]:
def get_sub_df(sub_name, num_to_pull, start_utc = None, min_comments = 0, drop_removed = True):
    """Download up to ``num_to_pull`` submissions from one subreddit into a DataFrame.

    Submissions are requested from the module-level ``url`` endpoint in pages of
    at most 100, walking backwards in time: each page asks for posts created
    before the ``created_utc`` of the last post on the previous page.

    Parameters
    ----------
    sub_name : str
        Subreddit name; sent as the ``subreddit`` query parameter and written
        into the ``subreddit`` column of the result.
    num_to_pull : int
        Number of submissions to collect. Fewer are returned when the API runs
        out of matching posts (or when removed posts are dropped).
    start_utc : int, optional
        Epoch timestamp; only submissions before it are requested. Defaults to
        the current time.
    min_comments : int, optional
        Minimum ``num_comments`` a submission must have (inclusive).
    drop_removed : bool, optional
        When True, rows whose ``removed_by_category`` is not null are dropped.
        This can shrink the sample noticeably for heavily moderated subreddits.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``id``, ``title``, ``selftext``, ``num_comments``,
        ``created_utc`` and ``subreddit``. ``None`` is returned when a request
        comes back with a status code other than 200.
    """
    wanted_columns = ['id', 'title', 'removed_by_category', 'created_utc', 'selftext', 'num_comments']

    if start_utc is None:
        # If the start timestamp isn't specified, use the current time.
        start_utc = int(datetime.now(timezone.utc).timestamp())

    # The API filter is a strict "greater than", so subtract one to make
    # min_comments an inclusive lower bound.
    min_comment_count = '>' + str(min_comments - 1)

    post_count = 0    # Number of posts actually received so far
    res_count = 0     # Number of requests made (printed for debugging)
    data = []         # Raw records; turned into a DataFrame after the loop

    while post_count < num_to_pull:
        # Ask for at most 100 per request, and only what is still missing.
        get_size = min(100, num_to_pull - post_count)

        timer = time.time()  # Request start, used to throttle and avoid 429 responses

        res = requests.get(url, params = {'subreddit' : sub_name, 'size' : get_size, 'before' : start_utc, 'num_comments' : min_comment_count})
        res_count += 1

        clear_output(wait=True) #reference: https://stackoverflow.com/a/24818304
        print(f'{100 * post_count/num_to_pull}%')

        # Leave at least one second between the starts of consecutive requests.
        remaining = (timer + 1) - time.time()
        if remaining > 0:
            time.sleep(remaining)

        if res.status_code != 200:
            print(f'Status Code: {res.status_code}')
            print(res_count)
            return None

        new_data = res.json()['data']

        if not new_data:
            # Nothing older is left to fetch: keep what we have instead of
            # failing on new_data[-1] and returning a raw list.
            print('No more submissions returned; stopping early.')
            break

        data.extend(new_data)
        post_count += len(new_data)  # Count what was received, not what was requested

        if post_count < num_to_pull:  # Only needed when another request will follow
            next_start = new_data[-1].get('created_utc')
            if next_start is None:
                # Without a timestamp the next request would fetch the same page again.
                print("Failed to get UTC of last Post. Printing list element that caused failure.")
                print(new_data[-1])
                break
            start_utc = next_start  # Starting point for grabbing more posts

    print(res_count)

    # reindex (rather than [[...]]) tolerates columns missing from every record,
    # e.g. 'removed_by_category', and an entirely empty download.
    df = pd.DataFrame(data).reindex(columns=wanted_columns)
    df['subreddit'] = sub_name  # Subreddit name - eventually, the target for the model

    if drop_removed:
        # .copy() so the assignment below does not write into a slice of df.
        df = df[df['removed_by_category'].isna()].copy()

    # NOTE(review): datetime.fromtimestamp yields naive *local* time although the
    # column is named created_utc - confirm whether UTC was intended.
    df['created_utc'] = df['created_utc'].apply(datetime.fromtimestamp)

    # The removed_by_category helper column is dropped before returning.
    return df[['id', 'title', 'selftext', 'num_comments', 'created_utc', 'subreddit']]

In [5]:
#Function to run the 'get_sub_df' function, but automatically check before re-downloading data, and save data as csv

def get_titles_text(sub_name, number = 10_000):
    """Download a subreddit's submissions once and cache them as a CSV file.

    The data is written to ``./data/<sub_name>_data.csv``. If that file already
    exists nothing is downloaded, so the API is only used when necessary.

    Parameters
    ----------
    sub_name : str
        Subreddit name, passed to ``get_sub_df`` and used in the file name.
    number : int, optional
        Number of submissions to request from ``get_sub_df``.

    Returns
    -------
    None
        The function works through its side effects (printing and writing the CSV).
    """
    data_dir = './data'
    csv_path = os.path.join(data_dir, sub_name + '_data.csv')

    # Step 1: see whether the data exists already. A missing data folder simply
    # means "not downloaded yet" rather than an error.
    if os.path.exists(csv_path):
        print(f"That data ({sub_name}) is already in the data folder. Delete the file if you want to get the data again.")
        return None

    df = get_sub_df(sub_name, number)

    # get_sub_df signals a failed download by not returning a DataFrame;
    # do not crash on .to_csv or leave a bogus cache file behind in that case.
    if not isinstance(df, pd.DataFrame):
        print(f"Download for {sub_name} did not produce a DataFrame; nothing was saved.")
        return None

    os.makedirs(data_dir, exist_ok=True)
    df.to_csv(csv_path)

In [26]:
# Fetch (or reuse the cached CSV for) each subreddit, in the same order as before.
for subreddit_name in ('physics', 'chemistry', 'biology'):
    get_titles_text(subreddit_name)

99.0%
100
