In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests

# Show up to 100 columns when displaying DataFrames. The fully-qualified key is
# required: the short pattern 'max_columns' matches more than one option on
# recent pandas versions and raises OptionError.
pd.set_option('display.max_columns', 100)

In [2]:
# Pushshift search endpoint for Reddit submissions; used for every request below.
url = "https://api.pushshift.io/reddit/search/submission"

In [3]:
# Pull up to 10 pages x 100 posts from each subreddit, paging backwards in time
# with the `before` parameter, and combine everything into a single DataFrame `df`.
subreddits = ['Tomorrowland', 'PremierLeague']
columns = ['title', 'selftext', 'subreddit', 'created_utc']
dfs = []

for subreddit in subreddits:
    # Paging cursor: None means "no `before` filter" (first page of this subreddit).
    last_time = None
    for i in range(10):
        # build the params to get the posts
        params = {
            'subreddit': subreddit,
            'size': 100
        }
        # only page backwards once a previous page has given us a timestamp
        if last_time is not None:
            params['before'] = last_time
        # get the response; the timeout stops a stalled connection from
        # hanging the cell forever
        res = requests.get(url, params=params, timeout=30)
        # fail loudly on HTTP errors instead of trying to parse an error page
        res.raise_for_status()
        # turn the response into JSON
        data = res.json()
        records = data.get('data', [])
        # an empty page means there is nothing older to fetch for this subreddit
        if not records:
            break
        # keep only the columns we care about; reindex (rather than [[...]])
        # yields NaN instead of a KeyError when a page lacks one of the fields
        posts = pd.DataFrame(records).reindex(columns=columns)
        # the next request asks for posts before the last one on this page
        last_time = int(posts['created_utc'].iloc[-1])
        # collect this page
        dfs.append(posts)
        time.sleep(3) # slow down three seconds in between each loop

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no page returned any posts
df = pd.concat(dfs) if dfs else pd.DataFrame(columns=columns)

In [4]:
# Give every row a unique 0..n-1 label; the old labels move into a new 'index' column.
df = df.reset_index()

In [5]:
# Remove the leftover 'index' column produced by reset_index(). errors='ignore'
# makes the cell safe to re-run after the column has already been dropped.
df.drop(columns='index', inplace=True, errors='ignore')

In [6]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,title,selftext,subreddit,created_utc
0,Genreverse,"Hi everyone, \n\nHere at Genreverse our ultima...",Tomorrowland,1637031457
1,Tomorrowland and it's bass line-up,So obviously Tomorrowland has always had a gre...,Tomorrowland,1637002849
2,Hand crafts 2021,,Tomorrowland,1636935255
3,Winter TML lodging,hello! I have the 4 day pass without lodging a...,Tomorrowland,1636922669
4,Was Tomorrowland around the world 2020 a speci...,,Tomorrowland,1636904090


In [7]:
# Number of distinct created_utc values (unique timestamps, not columns).
# dropna=False keeps the count identical to len(Series.unique()).
df['created_utc'].nunique(dropna=False)

2000

In [8]:
# Frequency of each post title, most common first.
df.loc[:, 'title'].value_counts()

r/PremierLeague Daily Discussion                                        12
Weekly "Who should I root for" discussion thread                         4
r/PremierLeague Midweek Musings                                          3
Match, Team and Player of the weekend!                                   3
MIXTAPE MARZO 2021 - 20 minutos (MEDUZA, SONNY FODERA, DOM DOLLA...)     3
                                                                        ..
Who do you think will win the next year's world cup?                     1
/r/tomorrowland enters TOP 5000 subreddits                               1
Excited for tonight. Wish you all a happy new year 2021! Rave on!        1
ATW Decoration Ideas                                                     1
Wanted: Wallpaper High Resolution Mainstage 2017/2018/2019               1
Name: title, Length: 1956, dtype: int64

In [9]:
# Frequency of each post body (selftext), most common first.
df.loc[:, 'selftext'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     884
[removed]                                                                                                                                                                                                                                                                                                                      

In [10]:
# Save the combined posts to CSV without the row index. The output folder is
# created first because to_csv fails when the target directory does not exist.
out_path = Path('Data/Prem-Tomorrowland_posts_11-16.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)