In [1]:
import requests
import json
import pandas as pd
import time
from tqdm import tqdm
from datetime import datetime

In [2]:
#change this to be whatever subreddit you want to scrape
subbreddit = 'worldnews'

In [3]:
def get_pushshift_data(data_type, **kwargs):
    """
    Gets data from the pushshift api.
 
    data_type can be 'comment' or 'submission'
    The rest of the args are interpreted as payload.
 
    Read more: https://github.com/pushshift/api
    
    Credit: https://www.jcchouinard.com/how-to-use-reddit-api-with-python/
    """
 
    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    payload = kwargs
    request = requests.get(base_url, params=payload)
    return request.json()

In [4]:
def extract_text(x):
    try:
        text = x['selftext']
        return text
    except:
        return 0
    
def extract_time(x):
    try:
        time_utc = x['created_utc']
        time = datetime.utcfromtimestamp(time_utc).strftime('%d%b%Y')
        return time
    except:
        return 0

def extract_score(x):
    try:
        score = x['score']
        return score
    except:
        return 0

In [5]:
day_increment = 172800
day_length = 86399
for year in tqdm(range(2017, 2021)):
    if year == 2017:
        base_date = 1483228800
    elif year == 2018:
        base_date = 1514764800
    elif year == 2019:
        base_date = 1546300800
    elif year == 2020:
        base_date = 1577836800
    for i in tqdm(range(0, 183)):
        begin = base_date + i*day_increment
        end = begin + day_length
        day = datetime.utcfromtimestamp(begin).strftime('%d%b%Y')
        begin = str(begin)
        end = str(end)
        print(f'SCRAPING {day} =====================================')
        success = False
        empty_day = False
        while not success:
            try:
                data = get_pushshift_data(data_type='submission',
                                          before=end,
                                          after=begin,
                                          subreddit=f'{subreddit}',
                                          sort_type='created_utc',
                                          sort='asc',
                                          size=1000)
                df = pd.DataFrame.from_dict(data)
                if df.empty:
                    empty_day = True
                    break
                else:
                    success = True
                
            except:
                print('exception encountered...continuing')
                time.sleep(1)
                continue
    
        if empty_day == True:
            continue
        timestamp = str(df.data.iloc[-1]['created_utc'])
        while timestamp < end:
            try:
                data = get_pushshift_data(data_type='submission',
                                      before=end,
                                      after=timestamp,
                                      subreddit=f'{subreddit}',
                                      sort_type='created_utc',
                                      sort='asc',
                                      size=1000)
                df_new = df.append(pd.DataFrame.from_dict(data))
                if df_new.data.iloc[-1]['created_utc'] == df.data.iloc[-1]['created_utc']:
                    break
                else:
                    df = df_new
                    timestamp = str(df.data.iloc[-1]['created_utc'])
                    time.sleep(1)

            except:
                time.sleep(1)
                continue
    
        text_df = df.data.apply(extract_text).reset_index()['data'].rename('text')
        time_df = df.data.apply(extract_time).reset_index()['data'].rename('time')
        score_df = df.data.apply(extract_score).reset_index()['data'].rename('score')
        full_df = pd.concat([text_df, time_df, score_df], axis=1)
        indexNames = full_df[(full_df['text'] == '[deleted]') | (full_df['text'] == '[removed]') | (full_df['text'] == '')].index
        full_df.drop(indexNames , inplace=True)
        full_df.to_csv(f'{subreddit}_data/raw_data_{year}/{day}.csv')

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/183 [00:00<?, ?it/s][A

exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountere

exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountere

exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountere

exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountere

exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountere

exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountered...continuing
exception encountere

  0%|          | 0/183 [21:29<?, ?it/s]
  0%|          | 0/4 [21:29<?, ?it/s]


KeyboardInterrupt: 