# Data Collection

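We used the `pushshift.io` API to collect Reddit submissions from the subreddit r/MentalHealth, sampling the first seven days of each month (from the 1st up to, but not including, the 8th, in UTC) from February 2020 through January 2021. Each month is saved to its own `.csv` file under `raw_data/`; in each `.csv` file, the data are listed in ascending order based on timestamp (earliest to latest).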

In [144]:
import json
import time
from pathlib import Path

import pandas as pd
import requests

In [145]:
def get_pushshift_data(data_type, **kwargs):
    """
    Fetch one response from the pushshift search API and return its decoded JSON.

    data_type can be 'comment' or 'submission'; it selects the endpoint.
    The rest of the args are sent unchanged as the query-string payload
    (e.g. subreddit, after, before, sort, size).

    Returns the parsed JSON body of the response.

    Raises requests.exceptions.RequestException when the connection fails,
    the request times out, or the server answers with an HTTP 4xx/5xx status,
    and ValueError (or a subclass) when the body is not valid JSON.

    Read more: https://github.com/pushshift/api

    Credit: https://www.jcchouinard.com/how-to-use-reddit-api-with-python/
    """

    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    payload = kwargs
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would freeze the notebook cell.
    response = requests.get(base_url, params=payload, timeout=30)
    # Surface rate-limit / server errors as exceptions instead of handing an
    # error document to the caller as if it were search results.
    response.raise_for_status()
    return response.json()

In [146]:
def extract_title(x):
    """
    Return the 'title' entry of one record, or 0 when it cannot be read.

    x is expected to be subscriptable by the key 'title' (e.g. a dict).
    The fallback value 0 is returned when the key is missing or when x does
    not support that lookup at all (for example None or a float).
    """
    try:
        return x['title']
    # Only lookup failures are treated as "no title"; a bare except would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (KeyError, IndexError, TypeError):
        return 0
    
def extract_text(x):
    """
    Return the 'selftext' entry of one record, or 0 when it cannot be read.

    x is expected to be subscriptable by the key 'selftext' (e.g. a dict).
    The fallback value 0 is returned when the key is missing or when x does
    not support that lookup at all (for example None or a float).
    """
    try:
        return x['selftext']
    # Only lookup failures are treated as "no text"; a bare except would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (KeyError, IndexError, TypeError):
        return 0

In [147]:
# Sampling window for every month: posts from [1st-8th).
# Each entry maps a month label to [window_start, window_end], both Unix
# timestamps in seconds kept as strings. A window opens at 00:00:00 UTC on the
# 1st of the month and closes exactly seven days later. 'Feb' through 'Dec'
# fall in 2020; 'Jan' is January 2021.
dates = {
    label: [str(window_start), str(window_start + 7 * 24 * 60 * 60)]
    for label, window_start in (
        ('Feb', 1580515200),
        ('Mar', 1583020800),
        ('Apr', 1585699200),
        ('May', 1588291200),
        ('Jun', 1590969600),
        ('Jul', 1593561600),
        ('Aug', 1596240000),
        ('Sep', 1598918400),
        ('Oct', 1601510400),
        ('Nov', 1604188800),
        ('Dec', 1606780800),
        ('Jan', 1609459200),
    )
}

In [148]:
def fetch_submissions(begin, end, subreddit='MentalHealth', page_size=1000,
                      max_retries=10, pause=1):
    """
    Collect the submissions of `subreddit` created between `begin` and `end`.

    Results are requested page by page in ascending `created_utc` order; each
    request asks for posts after the newest creation time collected so far.
    Paging stops when that time reaches `end`, when a page comes back empty,
    or when a page no longer advances the newest creation time.

    begin, end  -- window bounds as Unix timestamps (int or numeric string).
    subreddit   -- subreddit name passed to the API.
    page_size   -- value of the API `size` parameter for every request.
    max_retries -- attempts per page before giving up.
    pause       -- seconds to sleep after each page and after each failure.

    Returns a list with the raw submission records (one dict per post); the
    list is empty when the window contains no posts.

    Raises RuntimeError when a single page fails `max_retries` times in a row.
    """
    # Compare timestamps numerically; string comparison is only correct while
    # both values happen to have the same number of digits.
    end_ts = int(end)
    after = int(begin)
    last_seen = None
    posts = []

    while after < end_ts:
        page = None
        for _ in range(max_retries):
            try:
                response = get_pushshift_data(data_type='submission',
                                              before=end_ts,
                                              after=after,
                                              subreddit=subreddit,
                                              sort_type='created_utc',
                                              sort='asc',
                                              size=page_size)
                page = response['data']
                break
            # `Exception` (not a bare except) keeps the cell interruptible
            # with Ctrl-C / "Interrupt kernel" while still retrying on
            # network, HTTP and JSON errors.
            except Exception as exc:
                print('exception encountered...continuing', repr(exc))
                time.sleep(pause)
        if page is None:
            raise RuntimeError(
                f'giving up after {max_retries} failed requests '
                f'(after={after}, before={end_ts})'
            )

        if not page:
            break
        newest = int(page[-1]['created_utc'])
        if last_seen is not None and newest <= last_seen:
            # The page did not move past what is already collected; stop
            # instead of requesting the same page again.
            break

        posts.extend(page)
        last_seen = after = newest
        print(after, end)
        time.sleep(pause)

    return posts


# Make sure the output directory exists before spending time on scraping.
RAW_DATA_DIR = Path('raw_data')
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

for month, (begin, end) in dates.items():
    print(f'SCRAPING {month} =====================================')
    posts = fetch_submissions(begin, end)

    # One row per submission, raw record kept in the 'data' column. Building
    # the frame once from a list replaces the repeated DataFrame.append calls
    # (removed in pandas 2.0, and a full copy on every page).
    df = pd.DataFrame({'data': posts})
    title_df = df['data'].apply(extract_title).rename('title')
    text_df = df['data'].apply(extract_text).rename('text')
    full_df = pd.concat([title_df, text_df], axis=1)
    full_df.to_csv(RAW_DATA_DIR / f'{month}.csv')

1580616900 1581120000
1580664571 1581120000
1580700421 1581120000
1580748638 1581120000
1580780699 1581120000
1580831773 1581120000
1580864447 1581120000
1580913381 1581120000
1580944891 1581120000
1580976552 1581120000
1581026808 1581120000
1581070611 1581120000
1581115854 1581120000
1581119828 1581120000
1583110367 1583625600
1583156232 1583625600
1583190683 1583625600
1583219666 1583625600
1583267890 1583625600
1583311569 1583625600
1583357756 1583625600
1583385115 1583625600
1583440384 1583625600
1583476829 1583625600
exception encountered...continuing
1583533654 1583625600
1583587809 1583625600
1583625438 1583625600
1585776122 1586304000
1585813522 1586304000
1585863641 1586304000
1585893418 1586304000
1585945242 1586304000
1585977679 1586304000
1586014853 1586304000
1586048355 1586304000
1586089877 1586304000
1586125657 1586304000
1586153986 1586304000
1586197189 1586304000
1586226874 1586304000
1586272915 1586304000
1586303677 1586304000
1588369662 1588896000
1588391097 15888960

In [149]:
# List the contents of raw_data/ to check the files written by the scraping
# loop (it saves one raw_data/<month>.csv per key in `dates`).
!ls raw_data/

Apr.csv  Dec.csv  Jan.csv  Jun.csv  May.csv  Oct.csv
Aug.csv  Feb.csv  Jul.csv  Mar.csv  Nov.csv  Sep.csv
