# Data Collection

We used `pushshift.io`'s API to obtain Reddit posts from the subreddit r/MentalHealth. In each `.csv` file, the data are listed in ascending order based on timestamp (earliest to latest).

In [14]:
import json
import os
import time

import pandas as pd
import requests
from tqdm import tqdm

In [15]:
def get_pushshift_data(data_type, **kwargs):
    """
    Query the pushshift Reddit search API and return the decoded JSON body.

    Parameters
    ----------
    data_type : str
        Endpoint to search; 'comment' or 'submission'.
    **kwargs
        Forwarded unchanged as the URL query parameters of the request.

    Returns
    -------
    The parsed JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or a 4xx/5xx HTTP status.
    ValueError
        If the response body is not valid JSON.

    Read more: https://github.com/pushshift/api

    Credit: https://www.jcchouinard.com/how-to-use-reddit-api-with-python/
    """
    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    # Without a timeout, requests waits forever on a stalled connection and the
    # whole notebook hangs; 30 seconds is a generous upper bound for one page.
    response = requests.get(base_url, params=kwargs, timeout=30)
    # Surface rate-limit / server errors as exceptions instead of handing an
    # error payload to the caller as if it were search results.
    response.raise_for_status()
    return response.json()

In [16]:
def extract_title(x):
    """
    Return the 'title' entry of a post record.

    Parameters
    ----------
    x : mapping
        A single post record (anything supporting ``x['title']``).

    Returns
    -------
    The value stored under 'title', or 0 when the lookup fails (missing key,
    or ``x`` is not subscriptable by that key).
    """
    try:
        return x['title']
    except Exception:
        # Best-effort fallback for malformed records. `Exception` rather than a
        # bare `except` so KeyboardInterrupt / SystemExit still propagate.
        return 0
    
def extract_text(x):
    """
    Return the 'selftext' entry (the post body) of a post record.

    Parameters
    ----------
    x : mapping
        A single post record (anything supporting ``x['selftext']``).

    Returns
    -------
    The value stored under 'selftext', or 0 when the lookup fails (missing
    key, or ``x`` is not subscriptable by that key).
    """
    try:
        return x['selftext']
    except Exception:
        # Best-effort fallback for malformed records. `Exception` rather than a
        # bare `except` so KeyboardInterrupt / SystemExit still propagate.
        return 0

In [17]:
# Sampling windows: posts from [1st-8th) and [15th-22nd) of every month
# (every other week), February 2020 through January 2021.
# Each entry maps a label such as 'Feb1' to [begin, end] epoch seconds (as
# strings), derived from the month's first midnight (UTC) listed below.
_SECONDS_PER_DAY = 86400
_MONTH_STARTS_UTC = (
    ('Feb', 1580515200),  # 2020-02-01
    ('Mar', 1583020800),  # 2020-03-01
    ('Apr', 1585699200),  # 2020-04-01
    ('May', 1588291200),  # 2020-05-01
    ('Jun', 1590969600),  # 2020-06-01
    ('Jul', 1593561600),  # 2020-07-01
    ('Aug', 1596240000),  # 2020-08-01
    ('Sep', 1598918400),  # 2020-09-01
    ('Oct', 1601510400),  # 2020-10-01
    ('Nov', 1604188800),  # 2020-11-01
    ('Dec', 1606780800),  # 2020-12-01
    ('Jan', 1609459200),  # 2021-01-01
)

dates = {
    f'{month}{half}': [
        str(month_start + day_offset * _SECONDS_PER_DAY),
        str(month_start + (day_offset + 7) * _SECONDS_PER_DAY),
    ]
    for month, month_start in _MONTH_STARTS_UTC
    # half 1 starts on the 1st (offset 0), half 2 on the 15th (offset 14)
    for half, day_offset in ((1, 0), (2, 14))
}

In [18]:
OUTPUT_DIR = 'raw_data'      # one CSV per sampling window is written here
PAGE_SIZE = 1000             # `size` requested from pushshift per call
RETRY_DELAY_SECONDS = 1      # pause after every request, successful or not


def _fetch_window_posts(begin, end):
    """
    Collect every r/MentalHealth submission pushshift returns for one window.

    Pages through the results in ascending `created_utc` order: each request
    asks for posts after the newest timestamp seen so far and before `end`.

    Parameters
    ----------
    begin, end : str or int
        Window bounds in epoch seconds, passed to pushshift as `after` /
        `before`.

    Returns
    -------
    list
        The raw post records (the items of each response's 'data' list), in
        the order they were received. Empty when the window has no posts.

    Notes
    -----
    Network and decoding failures are retried indefinitely after a short
    pause, matching the notebook's original best-effort behaviour; interrupt
    the kernel to stop. Any other exception propagates.
    """
    window_end = int(end)
    cursor = int(begin)   # compared numerically, not as strings
    posts = []

    while cursor < window_end:
        try:
            response = get_pushshift_data(data_type='submission',
                                          before=end,
                                          after=cursor,
                                          subreddit='MentalHealth',
                                          sort_type='created_utc',
                                          sort='asc',
                                          size=PAGE_SIZE)
            # A body without 'data' is treated like a failed request and retried.
            page = response['data']
        except (requests.exceptions.RequestException, ValueError, KeyError) as err:
            # Only transport / decoding problems are retried. A bare `except`
            # here would also trap KeyboardInterrupt and programming errors,
            # turning them into an endless retry loop.
            print(f'exception encountered ({err!r})...continuing')
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        if not page:
            break   # no more posts in this window

        newest = int(page[-1]['created_utc'])
        if newest <= cursor:
            break   # page did not advance the cursor; stop rather than loop forever

        posts.extend(page)
        cursor = newest
        time.sleep(RETRY_DELAY_SECONDS)   # be polite to the API between pages

    return posts


# `to_csv` does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for week, (begin, end) in tqdm(dates.items()):
    print(f'SCRAPING {week} =====================================')
    posts = _fetch_window_posts(begin, end)

    # Build the frame once from the accumulated records (DataFrame.append was
    # removed in pandas 2.0 and grew the frame quadratically).
    full_df = pd.DataFrame({
        'title': [extract_title(post) for post in posts],
        'text': [extract_text(post) for post in posts],
    })
    full_df.to_csv(os.path.join(OUTPUT_DIR, f'{week}.csv'))

  0%|          | 0/24 [00:00<?, ?it/s]



  4%|▍         | 1/24 [00:36<14:05, 36.76s/it]



  8%|▊         | 2/24 [01:12<13:19, 36.34s/it]



 12%|█▎        | 3/24 [01:42<12:06, 34.62s/it]



 17%|█▋        | 4/24 [02:15<11:19, 33.97s/it]



 21%|██        | 5/24 [02:51<11:00, 34.77s/it]



 25%|██▌       | 6/24 [03:31<10:55, 36.39s/it]



 29%|██▉       | 7/24 [04:18<11:10, 39.45s/it]



 33%|███▎      | 8/24 [05:00<10:43, 40.21s/it]



 38%|███▊      | 9/24 [05:47<10:32, 42.14s/it]



 42%|████▏     | 10/24 [06:31<10:00, 42.87s/it]



 46%|████▌     | 11/24 [07:25<10:00, 46.20s/it]



 50%|█████     | 12/24 [08:20<09:44, 48.68s/it]



 54%|█████▍    | 13/24 [09:12<09:06, 49.68s/it]



 58%|█████▊    | 14/24 [10:02<08:18, 49.82s/it]



 62%|██████▎   | 15/24 [10:57<07:42, 51.39s/it]



 67%|██████▋   | 16/24 [11:52<07:01, 52.64s/it]



 71%|███████   | 17/24 [12:38<05:54, 50.65s/it]



 75%|███████▌  | 18/24 [13:15<04:38, 46.45s/it]



 79%|███████▉  | 19/24 [13:50<03:35, 43.10s/it]



 83%|████████▎ | 20/24 [14:31<02:49, 42.41s/it]



 88%|████████▊ | 21/24 [15:08<02:02, 40.86s/it]



 92%|█████████▏| 22/24 [15:45<01:19, 39.70s/it]



 96%|█████████▌| 23/24 [16:26<00:39, 40.00s/it]



100%|██████████| 24/24 [17:02<00:00, 42.59s/it]


In [19]:
# List the CSV files in raw_data/ (the scraping loop writes one per key of `dates`).
!ls raw_data/

Apr1.csv  Aug2.csv  Feb1.csv  Jan2.csv	Jun1.csv  Mar2.csv  Nov1.csv  Oct2.csv
Apr2.csv  Dec1.csv  Feb2.csv  Jul1.csv	Jun2.csv  May1.csv  Nov2.csv  Sep1.csv
Aug1.csv  Dec2.csv  Jan1.csv  Jul2.csv	Mar1.csv  May2.csv  Oct1.csv  Sep2.csv
