In [6]:
import json
import os
import time

import pandas as pd
import requests
from tqdm import tqdm

In [7]:
def get_pushshift_data(data_type, **kwargs):
    """
    Query the pushshift search endpoint and return the decoded JSON body.

    Parameters
    ----------
    data_type : str
        Interpolated into the URL path; can be 'comment' or 'submission'.
    **kwargs
        Forwarded unchanged as the query-string parameters of the request.

    Returns
    -------
    The value produced by ``response.json()`` for the HTTP response.

    Raises
    ------
    requests.RequestException
        On connection problems, on timeout, or (via ``raise_for_status``)
        when the server answers with a 4xx/5xx status.
    ValueError
        When the response body is not valid JSON.

    Read more: https://github.com/pushshift/api

    Credit: https://www.jcchouinard.com/how-to-use-reddit-api-with-python/
    """

    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    # requests waits indefinitely by default; a timeout keeps a stalled
    # connection from hanging the whole notebook.
    response = requests.get(base_url, params=kwargs, timeout=30)
    # Fail here, with the real HTTP status, instead of later with an opaque
    # JSON-decoding or missing-key error on an error page.
    response.raise_for_status()
    return response.json()

In [8]:
def extract_title(x):
    """
    Return ``x['title']``, or the sentinel ``0`` when it cannot be looked up.

    ``x`` is expected to be a mapping-like record. A missing key, or a value
    that does not support subscripting (e.g. ``None``), yields ``0``.
    Anything else -- notably ``KeyboardInterrupt`` -- propagates to the caller.
    """
    try:
        return x['title']
    except (LookupError, TypeError):
        # 0 (not '') is kept as the "no title" marker for compatibility with
        # existing consumers of the output.
        return 0
    
def extract_text(x):
    """
    Return ``x['selftext']``, or the sentinel ``0`` when it cannot be looked up.

    ``x`` is expected to be a mapping-like record. A missing key, or a value
    that does not support subscripting (e.g. ``None``), yields ``0``.
    Anything else -- notably ``KeyboardInterrupt`` -- propagates to the caller.
    """
    try:
        return x['selftext']
    except (LookupError, TypeError):
        # 0 (not '') is kept as the "no text" marker for compatibility with
        # existing consumers of the output.
        return 0

In [9]:
# Sampling windows: the half-open spans [1st, 8th) and [15th, 22nd) of every
# month (i.e. every other week). Each entry maps a label to
# [window_start, window_end], both given as epoch-second strings; every
# window is exactly seven days long, so only the start is listed below.
_WEEK_SECONDS = 7 * 24 * 60 * 60
dates = {
    label: [str(start), str(start + _WEEK_SECONDS)]
    for label, start in (
        ('Feb1', 1580515200),
        ('Feb2', 1581724800),
        ('Mar1', 1583020800),
        ('Mar2', 1584230400),
        ('Apr1', 1585699200),
        ('Apr2', 1586908800),
        ('May1', 1588291200),
        ('May2', 1589500800),
        ('Jun1', 1590969600),
        ('Jun2', 1592179200),
        ('Jul1', 1593561600),
        ('Jul2', 1594771200),
        ('Aug1', 1596240000),
        ('Aug2', 1597449600),
        ('Sep1', 1598918400),
        ('Sep2', 1600128000),
        ('Oct1', 1601510400),
        ('Oct2', 1602720000),
        ('Nov1', 1604188800),
        ('Nov2', 1605398400),
        ('Dec1', 1606780800),
        ('Dec2', 1607990400),
        ('Jan1', 1609459200),
        ('Jan2', 1610668800),
    )
}

In [None]:
# --- scraping configuration -------------------------------------------------
SUBREDDIT = 'MentalHealth'
PAGE_SIZE = 1000      # sent as the `size` query parameter of every request
MAX_RETRIES = 5       # attempts per page before that page is abandoned
PAUSE_SECONDS = 1     # pause between successful pages and between retries
OUTPUT_DIR = 'raw_data'


def _fetch_page(after, before):
    """
    Request one page of submissions created between `after` and `before`.

    Retries up to MAX_RETRIES times on request/decoding failures. Returns the
    list found under the response's 'data' key (possibly empty), or None when
    every attempt failed. Only request- and payload-related errors are caught,
    so interrupting the kernel still stops the cell.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = get_pushshift_data(data_type='submission',
                                          before=before,
                                          after=after,
                                          subreddit=SUBREDDIT,
                                          sort_type='created_utc',
                                          sort='asc',
                                          size=PAGE_SIZE)
            return list(response['data'])
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            print(f'exception encountered ({err!r})...retry {attempt}/{MAX_RETRIES}')
            time.sleep(PAUSE_SECONDS)
    return None


def _scrape_window(begin, end):
    """
    Collect the submissions of one [begin, end] window, oldest first.

    Pages forward by re-issuing the query with `after` set to the
    'created_utc' of the newest record seen so far, until the window end is
    reached, a page comes back empty, or the cursor stops advancing.
    Assumes every record carries a numeric 'created_utc' -- TODO confirm
    against the API response schema.
    """
    window_end = int(end)
    posts = []
    cursor = begin
    # Compare as integers: comparing the timestamp strings only works while
    # both happen to have the same number of digits.
    while int(cursor) < window_end:
        page = _fetch_page(after=cursor, before=end)
        if page is None:
            print(f'giving up after {MAX_RETRIES} failed attempts; '
                  f'keeping the {len(posts)} submissions collected so far')
            break
        if not page:
            break
        newest = int(page[-1]['created_utc'])
        if posts and newest <= int(cursor):
            # No forward progress: stop instead of requesting the same page forever.
            break
        # Accumulate plain records; growing a DataFrame page by page is
        # quadratic and DataFrame.append no longer exists in pandas >= 2.0.
        posts.extend(page)
        cursor = str(newest)
        time.sleep(PAUSE_SECONDS)
    return posts


# Create the output folder up front so a missing directory cannot fail the
# write after a whole window has already been downloaded.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for week, (begin, end) in tqdm(dates.items()):
    print(f'SCRAPING {week} =====================================')
    posts = _scrape_window(begin, end)
    if not posts:
        print(f'no submissions retrieved for {week}; skipping')
        continue

    # One row per submission, raw record kept in the 'data' column.
    df = pd.DataFrame({'data': posts})
    title_df = df['data'].apply(extract_title).rename('title')
    text_df = df['data'].apply(extract_text).rename('text')
    full_df = pd.concat([title_df, text_df], axis=1)
    full_df.to_csv(os.path.join(OUTPUT_DIR, f'{week}.csv'))

  0%|          | 0/24 [00:00<?, ?it/s]

1580578465


  4%|▍         | 1/24 [00:40<15:35, 40.67s/it]

1581764085


  8%|▊         | 2/24 [01:38<18:40, 50.95s/it]

1583068948


 12%|█▎        | 3/24 [02:16<15:45, 45.04s/it]

1584281082


 17%|█▋        | 4/24 [02:52<13:49, 41.45s/it]

1585741792


 21%|██        | 5/24 [03:39<13:45, 43.46s/it]

1586950996


 25%|██▌       | 6/24 [04:23<13:04, 43.58s/it]

1588327428


 29%|██▉       | 7/24 [05:16<13:12, 46.64s/it]

1589532715


 33%|███▎      | 8/24 [06:10<13:05, 49.09s/it]

1591009458


 38%|███▊      | 9/24 [07:03<12:31, 50.08s/it]

1592210391


 42%|████▏     | 10/24 [07:51<11:32, 49.47s/it]

1593589018


 46%|████▌     | 11/24 [08:48<11:13, 51.78s/it]

1594807401


 50%|█████     | 12/24 [09:50<10:59, 54.98s/it]

1596274772


 54%|█████▍    | 13/24 [10:51<10:23, 56.69s/it]

1597480399


 58%|█████▊    | 14/24 [11:47<09:24, 56.44s/it]

1598940128


 62%|██████▎   | 15/24 [12:54<08:56, 59.61s/it]

1600154060


 67%|██████▋   | 16/24 [13:49<07:47, 58.48s/it]

1601539301


 71%|███████   | 17/24 [14:41<06:34, 56.42s/it]

1602755759


 75%|███████▌  | 18/24 [15:35<05:33, 55.62s/it]

1604224779


 79%|███████▉  | 19/24 [16:26<04:30, 54.20s/it]

1605437639


 83%|████████▎ | 20/24 [17:18<03:34, 53.74s/it]

1606808699


 88%|████████▊ | 21/24 [18:01<02:30, 50.33s/it]

1608029906
