## Target Scraping

The purpose of this notebook is to identify the users who have posted in r/SuicideWatch since the start of 2018 and then collect each user's Reddit history (their posts and comments, together with the associated metadata).

In [2]:
import os
import pickle
import random
from collections import Counter, OrderedDict
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import praw
import seaborn as sns
from psaw import PushshiftAPI
from tqdm import tqdm

In [3]:
# Pushshift client, used below for the bulk search of r/SuicideWatch submissions.
api = PushshiftAPI()

# set up praw
# Credentials are read from the environment (REDDIT_CLIENT_ID /
# REDDIT_CLIENT_SECRET) so they are never saved inside the notebook. The
# empty-string fallback keeps the previous placeholder values when unset.
client_id = os.environ.get("REDDIT_CLIENT_ID", "")
secret = os.environ.get("REDDIT_CLIENT_SECRET", "")
user_agent = "Mental Health Scraper"

reddit = praw.Reddit(client_id=client_id, client_secret=secret, user_agent=user_agent)

Version 7.1.0 of praw is outdated. Version 7.2.0 was released Wednesday February 24, 2021.


In [4]:
# helpers
def ut_to_dt(created):
    '''
    Convert a Unix timestamp to a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    created: seconds since the epoch (int, float or integer string); the value
             is passed through int(), so fractional seconds are truncated.
    Returns the formatted UTC time as a str.
    '''
    ts = int(created)
    # Timezone-aware replacement for datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12. The formatted output is unchanged.
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# initialize a starting date. We will scrape all posts starting from 2018.
# The datetime is pinned to UTC: a naive datetime(...).timestamp() is interpreted
# in the machine's local timezone, which would move the cut-off by several hours
# depending on where the notebook is run.
start_epoch = int(datetime(2018, 1, 1, tzinfo=timezone.utc).timestamp())

In [None]:
#DO NOT RUN THIS IF FILE IS ALREADY SAVED
# Query Pushshift for every r/SuicideWatch submission created after
# `start_epoch`, requesting only the listed fields, and materialise the
# results into a list.
res = list(
    api.search_submissions(
        subreddit='SuicideWatch',
        after=start_epoch,
        filter=["author", "title", "selftext", "created", "num_comments", "score", "upvote_ratio"],
    )
)


In [None]:
# Flatten the Pushshift results into rows of
# [author, title, text, created, num_comments, score].
posts = []
for post in tqdm(res):
    # The original fallback implies `selftext` can be missing on a result, so
    # default only that field to ''. The previous bare `except:` around the
    # whole row also hid unrelated errors.
    text = getattr(post, 'selftext', '')
    posts.append([post.author, post.title, text, ut_to_dt(post.created_utc), post.num_comments, post.score])
posts = pd.DataFrame(posts, columns=["author", "title", "text", "created", "num_comments", "score"])
# ut_to_dt returns strings; convert the column to real datetimes.
posts['created'] = pd.to_datetime(posts['created'])

posts.to_csv("data/raw_sw_2018.csv", index = False) # this is a big file and will be available upon request
display(posts.head(), posts.shape)

In [None]:
# Number of collected submissions per author (counts non-null titles).
count_df = (
    posts.groupby('author')['title']
    .count()
    .reset_index()
    .rename(columns={'title': 'num_posts'})
)
# Share of authors with exactly one post. Test `num_posts == 1` explicitly:
# value_counts().values[0] is the share of the *most common* post count, which
# only matches the message below when that most common count happens to be 1.
perc = (count_df.num_posts == 1).mean() * 100
str(np.round(perc, 2)) + "% of users collected made only 1 post."

## 77% of users collected made only one post

In [None]:
# For every author keep the row of their earliest collected submission, then
# attach that author's total post count (`num_posts`) from count_df.
first_posts = (
    posts
    .loc[posts.groupby('author')['created'].idxmin()]
    .merge(count_df, on='author')
)

In [None]:
# Usernames to exclude from the cohort: every SuicideWatch moderator, plus the
# '[deleted]' placeholder (no use from deleted accounts).
rm_list = [str(mod) for mod in reddit.subreddit("SuicideWatch").moderator()] + ['[deleted]']

In [None]:
# Drop every author listed in rm_list (moderators and '[deleted]') and report
# how many users were removed and how many remain.
first_posts_fin = first_posts.loc[~first_posts['author'].isin(rm_list)]
print(f"{len(first_posts) - len(first_posts_fin)} users removed, total {len(first_posts_fin)} users remaining.")

## Users That Posted on MH Subs Before SW

In [5]:
# Reload the saved scrape, parse `created` as datetimes and drop rows whose
# title is exactly '[removed]'.
# NOTE(review): the filter is on `title`; if removed submissions carry the
# '[removed]' marker in `text` instead, they are not filtered here - confirm.
posts = (
    pd.read_csv("data/raw_sw_2018.csv")
    .assign(created=lambda frame: pd.to_datetime(frame['created']))
    .loc[lambda frame: frame['title'] != '[removed]']
)

In [6]:
# Sanity check of the reloaded frame: first rows, (rows, columns) shape and column dtypes.
display(posts.head(), posts.shape, posts.dtypes)

Unnamed: 0,author,title,text,created,num_comments,score
0,Buckeye-o,I don't want to live anymore,I've struggled with depression since middle sc...,2021-03-17 15:49:37,0,1
1,ScientificialBot,Either I'll kill myself or kill you all,I am very furious and happy right now. I don't...,2021-03-17 15:48:02,0,1
2,anarchy517,I think about killing myself constantly.,"Everytime I see my cuts, I want it to end. Eve...",2021-03-17 15:45:12,5,1
3,LoneSoul66,Tired of playing the game of life.,"I never plan on taking my own life, but the th...",2021-03-17 15:45:01,0,1
4,Purrrista,I just want to be okay.,I dont want to think of killing myself on a da...,2021-03-17 15:41:47,1,1


(375010, 6)

author                  object
title                   object
text                    object
created         datetime64[ns]
num_comments             int64
score                    int64
dtype: object

In [16]:
# Earliest `created` timestamp per author, as a two-column frame
# (author, first_post). Note this rebinds `first_posts`, which earlier in the
# notebook held the full first-post rows.
# The "min" alias replaces np.min: passing the NumPy callable to groupby.agg
# emits a FutureWarning on recent pandas, and the alias gives the same result.
first_posts = posts.groupby('author').created.agg(first_post="min").reset_index()
first_posts.shape

(192726, 2)

In [17]:
 # Names of the subreddits treated as mental-health ("MH") communities in this
 # section ("Users That Posted on MH Subs Before SW"). How the list is consumed
 # is not visible in this part of the notebook.
 MH_subs = ['depression',
 'depression_help',
 'antidepressants',
 'depressed',
 'AnxietyDepression',
 'AdultDepression',
 'Anxiety',
 'AnxietyDepressionsocialanxiety',  # NOTE(review): looks like 'AnxietyDepression' and 'socialanxiety' fused into one name - confirm
 'Anxietyhelp',
 'adhd_anxiety',
 'PanicAttack',
 'BPD',
 'bipolar',
 'BipolarReddit',
 'CPTSD',
 'ptsd',
 'addiction',
 'alcoholism',
 'alcohol',
 'cripplingalcoholism',
 'alcoholicsanonymous',
 'opiates',
 'heroin',
 'cocaine',
 'trees',
 'Drugs',
 'askdrugs',
 'leaves',
 'schizophrenia',
 'BingeEatingDisorder',
 'EatingDisorders',
 'fuckeatingdisorders',
 'eating_disorders',
 'bulimia',
 'BreakUps',
 'heartbreak',
 'selfharm',
 'SelfHarmScars',
 'lonely',
 'ForeverAlone',
 'rape',
 'rapecounseling',
 'sexualassault',
 'relationship_advice',
 'relationships',
 'abusiverelationships',
 'survivinginfidelity',
 'cheating_stories',
 'NarcissisticAbuse',
 'abusiveparents',
 'survivorsofabuse',
 'raisedbynarcissists',
 'insaneparents',
 'lossofalovedone',
 'LostALovedOne']

In [34]:
# list of all users in our cohort: one entry per distinct author in first_posts
all_users = first_posts['author'].unique().tolist()

In [82]:
def get_comment_meta(username):
    '''
    Given a username, get all comments of the user
    Returns a tuple of:
    1. list of comments and the metadata associated with them; each row is
       [username, 0, subreddit, created (UTC string), body, ups, downs, number of replies]
       where the constant 0 marks the row as a comment (post rows use 1)
    2. Counter mapping each subreddit name to the number of comments made there
    If anything fails while fetching, ([], empty Counter) is returned instead.
    '''
    comment_subs = []
    comment_meta = []
    try:
        for comment in reddit.redditor(username).comments.new(limit = None):
            subreddit = str(comment.subreddit)
            comment_subs.append(subreddit)
            # Record the `username` argument. The previous code read a global
            # named `user`, which only worked when the caller happened to
            # define a variable of that name.
            # NOTE(review): `comment.replies` may be empty for comments taken
            # from a redditor listing unless they are refreshed - confirm.
            comment_meta.append([username, 0, subreddit, ut_to_dt(comment.created_utc), comment.body, comment.ups, comment.downs, len(comment.replies)])
        return comment_meta, Counter(comment_subs)
    except Exception:
        # Best effort: a user that cannot be fetched contributes nothing.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long scrape.
        return [], Counter([])
    
def get_post_meta(username):
    '''
    Given a username, get all posts of the user
    Returns a tuple of:
    1. list of posts and the metadata associated with them; each row is
       [username, 1, subreddit, created (UTC string), title + ' ' + selftext, ups, downs, num_comments]
       where the constant 1 marks the row as a post (comment rows use 0)
    2. Counter mapping each subreddit name to the number of posts made there
    If anything fails while fetching, ([], empty Counter) is returned instead.
    '''
    post_subs = []
    post_meta = []
    try:
        for submission in reddit.redditor(username).submissions.new(limit = None):
            subreddit = str(submission.subreddit)
            post_subs.append(subreddit)
            # Record the `username` argument. The previous code read a global
            # named `user`, which only worked when the caller happened to
            # define a variable of that name.
            post_meta.append([username, 1, subreddit, ut_to_dt(submission.created_utc), submission.title + ' ' + submission.selftext, submission.ups, submission.downs, submission.num_comments])

        return post_meta, Counter(post_subs)

    except Exception:
        # Best effort: a user that cannot be fetched contributes nothing.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long scrape.
        return [], Counter([])
    

In [90]:
# shuffle all users collected for randomness

# Seed by *calling* random.seed. The previous `random.seed = 42` replaced the
# function with the integer 42 and left the generator unseeded, so the shuffle
# order changed on every run. If that assignment already ran in this kernel,
# restart the kernel first, otherwise `random.seed` is still an int.
random.seed(42)
all_users = first_posts.author.unique().tolist()
random.shuffle(all_users)

In [99]:
len(all_users) # number of distinct authors in the shuffled cohort list

192726

In [None]:
# This cell takes very long 145,826 users took 62 hours to scrape.

# Accumulates the comment and post rows of every retained user.
metadata_total = []

# enumerate() supplies the index; the previous manual `i = 0` / `i += 1`
# bookkeeping was dead code because the for-loop reassigned `i` each iteration.
for i, user in enumerate(tqdm(all_users)):

    comment_meta, comment_dict = get_comment_meta(user)
    post_meta, post_dict = get_post_meta(user)

    # Keep a user's history only when at least one of the submissions fetched
    # for them was made in SuicideWatch.
    if 'SuicideWatch' in post_dict:
        metadata_total.extend(comment_meta)
        metadata_total.extend(post_meta)

    # Periodic checkpoint. The list is never cleared, so each file holds
    # everything collected so far, not just the latest 5000 users.
    if i % 5000 == 0:
        with open('metadata18v2/met'+str(i)+'.pkl', 'wb') as f:
            pickle.dump(metadata_total, f)

# Final checkpoint: without it, users processed after the last multiple of 5000
# would exist only in memory and never reach disk.
with open('metadata18v2/met'+str(len(all_users))+'.pkl', 'wb') as f:
    pickle.dump(metadata_total, f)

 76%|███████████████████████████████████████████████████▍                | 145826/192726 [62:38:06<18:02:01,  1.38s/it]