In [1]:
import requests
import pandas as pd
import time
import datetime as dt

In [6]:
# Function to pull data from a subreddit via the Pushshift API and append it to an existing CSV file (adapted from the in-class Pushshift API demo)

def pushshift_query(full_df_path, subreddit, sub_type, iters = 5, delay = 5):
    """Pull older items from a subreddit via Pushshift and append them to a CSV.

    Each pull re-reads the CSV at ``full_df_path``, requests up to 100 items
    created before the oldest ``created_utc`` already stored, appends them and
    writes the CSV back, so progress is saved after every successful pull.

    Parameters
    ----------
    full_df_path : str
        Path to an existing CSV containing a ``created_utc`` column. The file
        is overwritten in place after each successful pull.
    subreddit : str
        Name of the subreddit to query.
    sub_type : str
        Either ``'submission'`` or ``'comment'``.
    iters : int, default 5
        Maximum number of pulls to attempt.
    delay : float, default 5
        Seconds to wait after every request (successful or not).

    Raises
    ------
    ValueError
        If ``sub_type`` is neither ``'submission'`` nor ``'comment'``.
    """
    # Columns kept for each kind of pull
    columns_by_type = {
        'submission': ['title', 'author', 'created_utc', 'selftext', 'subreddit'],
        'comment': ['author', 'created_utc', 'body', 'subreddit'],
    }
    # Fail fast, before any file or network access, instead of printing a
    # message and then crashing later on an undefined temp_df
    if sub_type not in columns_by_type:
        raise ValueError(f'This is not a valid subreddit type: {sub_type!r}')
    keep_cols = columns_by_type[sub_type]

    url = f'https://api.pushshift.io/reddit/search/{sub_type}/'

    # Cycle through the number of iterations and add rows to the file
    for pull in range(iters):
        full_df = pd.read_csv(full_df_path)

        # Let requests build and encode the query string
        params = {'subreddit': subreddit, 'size': 100}
        oldest = full_df['created_utc'].min()
        # An empty file has no oldest timestamp; leave `before` out in that case.
        # Cast to int so a float column is not sent as e.g. "1580000000.0".
        if pd.notna(oldest):
            params['before'] = int(oldest)

        # A timeout keeps a stalled connection from hanging the notebook forever
        res = requests.get(url, params = params, timeout = 30)

        # If the page does not return a valid response, skip this pull but
        # still wait so failed requests are not retried back-to-back
        if res.status_code != 200:
            print(f'Pull {pull + 1} failed with status {res.status_code}; skipping')
            time.sleep(delay)
            continue

        # Turn the json into a list
        post_list = res.json().get('data', [])
        if not post_list:
            # `before` would not change, so further pulls would be empty as well
            print('No more results to pull')
            break

        # reindex (rather than [[...]]) tolerates a batch that lacks one of the
        # expected keys: the missing column is filled with NaN
        temp_df = pd.DataFrame(post_list).reindex(columns = keep_cols)

        # Add this data to the existing data frame
        full_df = pd.concat([full_df, temp_df])
        # Print out statuses to make sure it's running correctly
        print(full_df.shape)
        print(full_df.nunique())

        # Output the data to a csv before waiting, so an interrupt during the
        # delay does not lose this pull
        full_df.to_csv(full_df_path, index = False)

        print('Waiting until next pull...')
        # Delay between requests so the IP does not get blocked
        time.sleep(delay)

In [9]:
# Pull r/tinder comments into the tinder CSV (up to 25 pulls)

pushshift_query(
    full_df_path = '../data/tinder_posts.csv',
    subreddit = 'tinder',
    sub_type = 'comment',
    iters = 25,
)

(8200, 6)
title          5336
author         5707
created_utc    8076
selftext        726
subreddit         1
body           2383
dtype: int64
Waiting until next pull...
(8300, 6)
title          5336
author         5767
created_utc    8175
selftext        726
subreddit         1
body           2475
dtype: int64
Waiting until next pull...
(8400, 6)
title          5336
author         5826
created_utc    8273
selftext        726
subreddit         1
body           2563
dtype: int64
Waiting until next pull...
(8500, 6)
title          5336
author         5866
created_utc    8371
selftext        726
subreddit         1
body           2655
dtype: int64
Waiting until next pull...
(8600, 6)
title          5336
author         5924
created_utc    8468
selftext        726
subreddit         1
body           2751
dtype: int64
Waiting until next pull...
(8700, 6)
title          5336
author         5979
created_utc    8565
selftext        726
subreddit         1
body           2847
dtype: int64
Waiting

In [12]:
# Pull r/bumble comments into the bumble CSV (up to 20 pulls)

pushshift_query(
    full_df_path = '../data/bumble_posts.csv',
    subreddit = 'bumble',
    sub_type = 'comment',
    iters = 20,
)

(8200, 6)
title          4929
author         4579
created_utc    8186
selftext       1884
subreddit         1
body           3028
dtype: int64
Waiting until next pull...
(8300, 6)
title          4929
author         4597
created_utc    8286
selftext       1884
subreddit         1
body           3125
dtype: int64
Waiting until next pull...
(8400, 6)
title          4929
author         4616
created_utc    8385
selftext       1884
subreddit         1
body           3221
dtype: int64
Waiting until next pull...
(8500, 6)
title          4929
author         4640
created_utc    8485
selftext       1884
subreddit         1
body           3318
dtype: int64
Waiting until next pull...
(8600, 6)
title          4929
author         4672
created_utc    8585
selftext       1884
subreddit         1
body           3417
dtype: int64
Waiting until next pull...
(8700, 6)
title          4929
author         4694
created_utc    8685
selftext       1884
subreddit         1
body           3515
dtype: int64
Waiting