In [41]:
import requests
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from datetime import timezone
import time
import os
from IPython.display import clear_output

In [2]:
url = 'https://api.pushshift.io/reddit/search/submission'
url_com = 'https://api.pushshift.io/reddit/search/comment'

res = requests.get(url, params = {'subreddit' : 'moviesuggestions'})
res.status_code

200

In [130]:
#Function created for Project 3, modified for better functionality and capstone project needs

def get_sub_df(sub_name, num_to_pull, start_utc = None, min_comments = 0, drop_removed = True):
    #Returns a dataframe containing num_to_pull submissions from the specified subreddit
    #Dataframe has ID, Title, selftext and num_comments
    
    post_count = 0    #Keeps track of how many posts have been pulled
    data = []         #Holds all the data to create the DF after the while loop
    
    res_count = 0     #for debugging
    
    if start_utc == None:
        start_utc = int(datetime.now(timezone.utc).timestamp()) #If the start timestamp isn't specified, use the current time
        
    min_comment_count = '>' + str(min_comments-1) #paramater to select only submissions with comments greater than the number specified
    
    timer = 0 
    
    while post_count < num_to_pull:
        #To make sure to get the specific number pulled, check to see if the remaining count is less than 100, if not, just get 100
        if (num_to_pull - post_count) < 100:
            get_size = (num_to_pull - post_count)
        else:
            get_size = 100
            
        timer = time.time() #Timer to avoid 429 codes from making too many requests
        
        res = requests.get(url, params = {'subreddit' : sub_name, 'size' : get_size, 'before' : start_utc, 'num_comments' : min_comment_count})
        res_count += 1
        
        clear_output(wait=True) #reference: https://stackoverflow.com/a/24818304
        
        print(f'{100 * post_count/num_to_pull}%')
        
        # Wait for ONE second, but in 0.1s increments (saves time because the request may have taken >0.1s)
        # This shouldn't ever come close to exceeding the limit of 200 requests/minute
        while time.time() < (timer + 1):
            time.sleep(0.1)
        
        if res.status_code != 200:
            print(f'Status Code: {res.status_code}')
            print(res_count)
            return None
            
        new_data = res.json()['data']
            
        data.extend(new_data)
        
        post_count += 100 #not a problem to go over num_to_pull
        
        if post_count < num_to_pull:
            try:
                #This is the starting point for grabbing more posts
                start_utc = new_data[-1]['created_utc']
            except:
                #If that doesn't work, try to find out what went wrong:
                print("Failed to get UTC of last Post. Printing list element that caused failure.")
                try:
                    post_count = num_to_pull
                    print(new_data[-1])
                except:
                    print("Printing last list element failed.")
    
    print(res_count)
    
    columns_for_df = ['id', 'title', 'selftext', 'removed_by_category', 'created_utc', 'num_comments', 'link_flair_css_class']
    
    df = pd.DataFrame(data)
    #select the columns, but only if they exist in the dataframe - the res won't return data when there are 0 non-null values
    df = df[df.columns & columns_for_df]
    
    #Filter out submissions that have been removed. Could result in small samples for heavily moderated forums, so there is the option to keep them in.
    if drop_removed and 'removed_by_category' in df.columns:
        df = df[df['removed_by_category'].isna()]
    
    #Drop this column, it is no longer needed
    if 'removed_by_category' in df.columns:
        df.drop(columns = 'removed_by_category', inplace = True)
    
    #Convert timestamps to datetimes
    df['created_utc'] = df['created_utc'].apply(datetime.fromtimestamp)    
    
    df.reset_index(inplace=True)
    
    return df

In [145]:
def get_comments(df):
    #function to get dictionary from json data - using submission id numbers from the passed dataframe
    #This function should work on any size df*, but may take a looong time if there are a lot of comments to get 
    # - if new comments are being made on the submissions, this function might not get all the comments
    #  * unless there is a limit in the API on how many ids can be passed
    
    #First, get a list of id's from submissions in the dataframe: 
    
    id_string = ''
    
    for i in df['id']:
        id_string += i + ', ' #needs to be formated like: 'id1234, id1235, id1236'
        
    id_string = id_string[:-2] #drop the last ', '
    
    #Get the first batch of data and create the list that will ultimately be returned
    com_res = requests.get(url_com, params = {'link_id' : id_string, 'size' : 500})
    com_data = com_res.json()['data']
    
    #Check to make sure that the request worked:
    if com_res.status_code != 200:
        print(f'Status code: {com_res.status_code}')
        return None
    
    #This is the goal - the number of comments we want to end up with (ideally)
    num_comments = df['num_comments'].sum() 
    
    last_length = -1
    comment_count = 0
    timer = 0
    
    res_counter = 0
    
    # Handling the case when there are no comments to get:
    if len(com_data) == 0:
        return com_data
    
    while (len(com_data) < num_comments) & (last_length != len(com_data)):
        #This makes sure that the while loop doesn't continue forever if something goes wrong
        last_length = len(com_data)
        
        try:
            #This is the starting point for grabbing more comments
            last_comment_utc = com_data[-1]['created_utc'] 
        except:
            print("Failed to get UTC of last comment. Printing list element that caused failure.")
            print(com_data[-1])
        
        timer = time.time()
        
        new_res = requests.get(url_com, params = {'link_id' : id_string,
                                                  'size' : 500,
                                                  'before' : last_comment_utc,
                                                  'limit' : 500})
        
        res_counter += 1
        
        comment_count += 500
        
        print(f'{100 * comment_count/num_comments}%')
        
        
        while time.time() < (timer + 1):
            time.sleep(0.1)
        
        if new_res.status_code == 200:
            
            new_data = new_res.json()['data']
            
            for new_comment in new_data:
                com_data.append(new_comment)
                
        else:
            print(f'Problem when pulling new comments: Code {new_res.status_code}')
            
    print(res_counter)
            
    return com_data

In [112]:
def assign_comments(com_data, df):
    #Take comment data ( .json()['data'] ) and make a new column for the dataframe with the comment texts
    
    df_c = df.copy() #make a copy of the df, just to be safe/explicit. This copy is what is returned by the function.

    #Make a zip opject with submission ids and comment text
    com_zip = zip([com_data[i]['link_id'][-6:] for i in range(len(com_data))], [com_data[i]['body'] for i in range(len(com_data))])

    # create a list of empty lists and make that list the new column
    df_c['comments'] = [[] for _ in range(len(df_c))]
    
    # how many comments are assigned to each row in the dataframe
    assignments = np.zeros_like(df_c['num_comments'])

    #List of ids, to check if the comment's submission id is in the dataframe
    id_list = df_c['id'].values

    #counts the total number of comments that can't be assigned to a row
    unassigned = 0

    #Loop through the zip object, appending comments to the correct row on the 'comments' columns created above, then add 1 to assignment list (to be column later)
    for idx, com in com_zip:
        if idx in id_list:
            df_c[df_c['id'] == idx]['comments'].item().append(com)
            assignments[df_c[df_c['id'] == idx].index.item()] += 1
        else:
            unassigned += 1

    if(unassigned > 0):
        print(f'There are {unassigned} comments that could not be assigned to a submission!')
    
    df_c['assigned_comments'] = assignments
    
    return df_c

In [137]:
def get_and_assign_comments(df):
    #This function grabs comments in chunks and assigns them to the submissions
    #
    
    df_copy = df.copy()
    
    comment_data = []
    
    total_length = df_copy.shape[0]
    
    chunk = 0
    chunk_size = 25
    
    while chunk * chunk_size < total_length:
        start_row = chunk * chunk_size
        end_row = (chunk + 1) * chunk_size
        
        if end_row < total_length:
            comment_data += get_comments(df_copy[start_row : end_row + 1])
        else:
            comment_data += get_comments(df_copy[start_row :])
            
        chunk += 1
    
    df_copy = assign_comments(comment_data, df_copy)
    
    return df_copy

In [188]:
def get_titles_text(sub_name, number = 10_000, start_time = None):
    if sub_name + '_data.csv' in os.listdir('./data'):
        print(f"That data ({sub_name}) is already in the data folder. Delete the file if you want to get the data again.")
        return None
    else:
        df = get_sub_df(sub_name, number, start_utc = start_time)
        df = get_and_assign_comments(df)
        # -----> Will be moved to new DF and CSV, assignment will be handled differently
        df.to_csv('./data/' + sub_name + '_data.csv', index = False)

In [143]:
today = datetime.now(timezone.utc)
yesterday = today - timedelta(days = 1)
yesterday_timestamp = int(datetime.timestamp(yesterday))

In [189]:
get_titles_text('moviesuggestions', number = 500, start_time = yesterday_timestamp)

80.0%
5
82.64462809917356%
165.28925619834712%
2
143.26647564469914%
286.5329512893983%
2
88.65248226950355%
177.3049645390071%
2
70.32348804500704%
140.64697609001408%
2
65.61679790026247%
131.23359580052494%
2
81.16883116883118%
162.33766233766235%
2
91.91176470588235%
183.8235294117647%
2
103.95010395010395%
207.9002079002079%
2
80.64516129032258%
161.29032258064515%
241.93548387096774%
3
112.35955056179775%
224.7191011235955%
2
101.01010101010101%
202.02020202020202%
2
118.76484560570071%
237.52969121140143%
2
222.22222222222223%
444.44444444444446%
2


In [183]:
df = pd.read_csv('./data/moviesuggestions_data.csv')

In [185]:
df['link_flair_css_class'].value_counts(normalize=True)

request    0.861736
suggest    0.138264
Name: link_flair_css_class, dtype: float64

In [186]:
df.groupby(by = 'link_flair_css_class').mean()

Unnamed: 0_level_0,index,num_comments
link_flair_css_class,Unnamed: 1_level_1,Unnamed: 2_level_1
request,252.402985,21.902985
suggest,230.046512,10.209302


In [187]:
df.columns

Index(['index', 'created_utc', 'id', 'link_flair_css_class', 'num_comments',
       'selftext', 'title'],
      dtype='object')

#### More work:

Currently the csv has all the comments in a single cell.

This is a problem because they are saved as one long string, and without any data such as author, score, nest level, etc., which will likely be important as this project goes on. (such as removing negative-score comments from consideration, or collecting data from helpful bots)

I will need to create a new database for comments and use that alongside the title/text data. Keeping them together in the same csv won't work for this project.