In [1]:
import os
import time

import pandas as pd
import requests

In [2]:
# Subreddits to scrape, each paired with a short label made of its first
# three characters.
subreddits = ['science', 'psychology']
abbr = [name[:3] for name in subreddits]
subreddits_abbr = {name: short for name, short in zip(subreddits, abbr)}

In [3]:
# Endpoint that every scraping request in this notebook is sent to.
url = 'https://api.pushshift.io/reddit/search/submission/'

In [4]:
# Prefix for the per-subreddit CSV paths. The trailing slash is required
# because the file path is built by plain string concatenation.
data_folder = './datasets/'

In [5]:
# HTTP headers passed to requests.get on every API call.
myheader = {'User-agent': 'Ali bot 1.0'}

In [6]:
# Scrape up to Npost pages of page_size submissions from each subreddit and
# write one CSV per subreddit (columns: id, title, text, subreddit).
Npost = 20        # maximum number of pages requested per subreddit
page_size = 500   # number of posts requested per page

# Create the output directory up front so a long scrape cannot fail at the
# final write just because the folder is missing.
os.makedirs(data_folder, exist_ok=True)

for subreddit in subreddits:
    print('Scraping subreddit:', subreddit, '\n')

    posts = []
    # Paging cursor: None on the first request (requests leaves None-valued
    # params out of the query string), afterwards the created_utc of the
    # last post received.
    before = None

    for i in range(Npost):
        print('Scraping {} posts ...'.format((i+1)*page_size))

        params = {'size': page_size, 'subreddit': subreddit,
                  'before': before, 'sort': 'desc', 'sort_type': 'created_utc'}

        try:
            # The timeout keeps a stalled connection from hanging the kernel.
            res = requests.get(url, params=params, headers=myheader, timeout=30)
        except requests.RequestException as err:
            # Stop paging but keep whatever has been collected so far.
            print('Request failed:', err)
            break

        if res.status_code != 200:
            print(res.status_code)
            break

        page = res.json().get('data', [])
        if not page:
            # An empty page means there is nothing older to fetch. Without
            # this guard the first page would raise IndexError below and any
            # later page would re-send the identical query.
            print('No more posts returned; stopping early.')
            break

        posts.extend(page)
        # Results are requested newest-first, so the last post on the page is
        # expected to be the oldest one (assuming the API honours the sort).
        before = page[-1]['created_utc']

        # Pause between consecutive requests.
        time.sleep(1)

    # Keep only the posts that carry a 'selftext' field.
    records = [
        {'id': post['id'], 'title': post['title'], 'text': post['selftext']}
        for post in posts
        if 'selftext' in post
    ]

    # Explicit columns keep the CSV header stable even when nothing was scraped.
    df = pd.DataFrame(records, columns=['id', 'title', 'text'])
    df['subreddit'] = subreddits_abbr[subreddit]

    print('\nDataframe size:', df.shape, '\n')

    csv_file = data_folder + 'data_' + subreddits_abbr[subreddit] + '.csv'

    df.to_csv(csv_file, index=False)

Scraping subreddit: science 

Scraping 500 posts ...
Scraping 1000 posts ...
Scraping 1500 posts ...
Scraping 2000 posts ...
Scraping 2500 posts ...
Scraping 3000 posts ...
Scraping 3500 posts ...
Scraping 4000 posts ...
Scraping 4500 posts ...
Scraping 5000 posts ...
Scraping 5500 posts ...
Scraping 6000 posts ...
Scraping 6500 posts ...
Scraping 7000 posts ...
Scraping 7500 posts ...
Scraping 8000 posts ...
Scraping 8500 posts ...
Scraping 9000 posts ...
Scraping 9500 posts ...
Scraping 10000 posts ...

Dataframe size: (10000, 4) 

Scraping subreddit: psychology 

Scraping 500 posts ...
Scraping 1000 posts ...
Scraping 1500 posts ...
Scraping 2000 posts ...
Scraping 2500 posts ...
Scraping 3000 posts ...
Scraping 3500 posts ...
Scraping 4000 posts ...
Scraping 4500 posts ...
Scraping 5000 posts ...
Scraping 5500 posts ...
Scraping 6000 posts ...
Scraping 6500 posts ...
Scraping 7000 posts ...
Scraping 7500 posts ...
Scraping 8000 posts ...
Scraping 8500 posts ...
Scraping 9000 posts 