# Data Collection
This code is to collect data and update our datasets based on the newest posts from reddit. We are using Reddit's API to collect this data. The two subreddits that the data is from r/depression and r/SuicideWatch. This will allow our classifier to classify data in 3 broad classes, either depression or suicidal, or none. After that, we will develop a model to determine the stages of depression and suicide the user is going through. 

In [1]:
import requests
import time
import pandas as pd
from random import randint

In [2]:
# Begin scraping of the two subreddits
url_1 = "https://www.reddit.com/r/CasualConversation.json"

In [3]:
# creating user agent
headers = {"User-agent" : "Sam He"}
res = requests.get(url_1, headers=headers)
res.status_code

200

In [4]:
# Preview of our data
depress_json = res.json()
depress_json

{'kind': 'Listing',
 'data': {'modhash': '',
  'dist': 27,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'CasualConversation',
     'selftext': "Let's chat!",
     'author_fullname': 't2_5r8xp',
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': 'r/CasualConversation Lounge',
     'link_flair_richtext': [{'a': ':chat:',
       'e': 'emoji',
       'u': 'https://emoji.redditmedia.com/04fpiw4fukg21_t5_323oy/chat'},
      {'e': 'text', 't': ' Just Chatting'}],
     'subreddit_name_prefixed': 'r/CasualConversation',
     'collections': [{'permalink': 'https://www.reddit.com/r/CasualConversation/collection/53a1d90a-04b0-49c4-90ab-5aac2c3fb696',
       'link_ids': ['t3_g9diu6',
        't3_g9ch8v',
        't3_hg9ypn',
        't3_hk0i8l',
        't3_hv7zs7',
        't3_i9pr7m'],
       'description': 'Follow to get notified when a new chat post is live!',
       'title': 'Chats',
       'create

this data is long an extensive. It can be called whenever we want to create a new dataset.

In [5]:
sorted(depress_json["data"].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [6]:
depress_json["data"]["after"]

't3_inh5mt'

In [7]:
[post["data"]["name"] for post in depress_json["data"]["children"]]

['t3_i9pr7m',
 't3_iko4ik',
 't3_in6uob',
 't3_imxuxc',
 't3_inbwp7',
 't3_inaulo',
 't3_in8zt2',
 't3_inb3bt',
 't3_infd19',
 't3_in13qg',
 't3_in8vc5',
 't3_imn0qv',
 't3_in9uqe',
 't3_incws7',
 't3_incsla',
 't3_inbrug',
 't3_in4pok',
 't3_infcy8',
 't3_inh5nw',
 't3_inbp0l',
 't3_inh7qi',
 't3_inh4zy',
 't3_imskm7',
 't3_ind084',
 't3_inamio',
 't3_inha5t',
 't3_inh5mt']

In [8]:
# checking posts per page
len(depress_json["data"]["children"])

27

In [9]:
# dataframe the posts
pd.DataFrame(depress_json["data"]["children"])

Unnamed: 0,kind,data
0,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
1,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
2,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
3,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
4,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
5,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
6,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
7,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
8,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."
9,t3,"{'approved_at_utc': None, 'subreddit': 'Casual..."


In [10]:
# view real data
depress_json["data"]["children"][0]["data"]

{'approved_at_utc': None,
 'subreddit': 'CasualConversation',
 'selftext': "Let's chat!",
 'author_fullname': 't2_5r8xp',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': 'r/CasualConversation Lounge',
 'link_flair_richtext': [{'a': ':chat:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/04fpiw4fukg21_t5_323oy/chat'},
  {'e': 'text', 't': ' Just Chatting'}],
 'subreddit_name_prefixed': 'r/CasualConversation',
 'collections': [{'permalink': 'https://www.reddit.com/r/CasualConversation/collection/53a1d90a-04b0-49c4-90ab-5aac2c3fb696',
   'link_ids': ['t3_g9diu6',
    't3_g9ch8v',
    't3_hg9ypn',
    't3_hk0i8l',
    't3_hv7zs7',
    't3_i9pr7m'],
   'description': 'Follow to get notified when a new chat post is live!',
   'title': 'Chats',
   'created_at_utc': 1587697288.144,
   'subreddit_id': 't5_323oy',
   'author_name': 'tizorres',
   'collection_id': '53a1d90a-04b0-49c4-90ab-5aac2c3fb696',
   'author_id': 't2_5r8xp',
   'last_update_u

In [11]:
# automate a function to scrape reddit

def reddit_scrape(url_string, number_of_scrapes, output_list):
    #scraped posts outputted as lists
    after = None 
    for _ in range(number_of_scrapes):
        if _ == 0:
            print("SCRAPING {}\n--------------------------------------------------".format(url_string))
            print("<<<SCRAPING COMMENCED>>>") 
            print("Downloading Batch {} of {}...".format(1, number_of_scrapes))
        elif (_+1) % 5 ==0:
            print("Downloading Batch {} of {}...".format((_ + 1), number_of_scrapes))
        
        if after == None:
            params = {}
        else:
            #THIS WILL TELL THE SCRAPER TO GET THE NEXT SET AFTER REDDIT'S after CODE
            params = {"after": after}             
        res = requests.get(url_string, params=params, headers=headers)
        if res.status_code == 200:
            the_json = res.json()
            output_list.extend(the_json["data"]["children"])
            after = the_json["data"]["after"]
        else:
            print(res.status_code)
            break
        time.sleep(randint(1,6))
    
    print("<<<SCRAPING COMPLETED>>>")
    print("Number of posts downloaded: {}".format(len(output_list)))
    print("Number of unique posts: {}".format(len(set([p["data"]["name"] for p in output_list]))))

In [12]:
# call function for depression subreddit
casual_scraped = []
reddit_scrape("https://www.reddit.com/r/CasualConversation.json", 50, casual_scraped)

SCRAPING https://www.reddit.com/r/CasualConversation.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50...
Downloading Batch 5 of 50...
Downloading Batch 10 of 50...
Downloading Batch 15 of 50...
Downloading Batch 20 of 50...
Downloading Batch 25 of 50...
Downloading Batch 30 of 50...
Downloading Batch 35 of 50...
Downloading Batch 40 of 50...
Downloading Batch 45 of 50...
Downloading Batch 50 of 50...
<<<SCRAPING COMPLETED>>>
Number of posts downloaded: 1241
Number of unique posts: 839


In [13]:
# output list of unique posts
def create_unique_list(original_scrape_list, new_list_name):
    data_name_list=[]
    for i in range(len(original_scrape_list)):
        if original_scrape_list[i]["data"]["name"] not in data_name_list:
            new_list_name.append(original_scrape_list[i]["data"])
            data_name_list.append(original_scrape_list[i]["data"]["name"])
    #CHECKING IF THE NEW LIST IS OF SAME LENGTH AS UNIQUE POSTS
    print("LIST NOW CONTAINS {} UNIQUE SCRAPED POSTS".format(len(new_list_name)))

In [14]:
# call function on our data
casual_scraped_unique = []
create_unique_list(casual_scraped, casual_scraped_unique)

LIST NOW CONTAINS 839 UNIQUE SCRAPED POSTS


In [15]:
# input depression data to dataframe and csv
casualConvo = pd.DataFrame(casual_scraped_unique)
casualConvo["is_suicide"] = 2
casualConvo.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,poll_data,author_cakeday,is_suicide
0,,CasualConversation,Let's chat!,t2_5r8xp,False,,0,False,r/CasualConversation Lounge,"[{'a': ':chat:', 'e': 'emoji', 'u': 'https://e...",...,True,https://www.reddit.com/r/CasualConversation/co...,1345377,1597424000.0,0,,False,,,2
1,,CasualConversation,Monthly Meta is back - [follow the collection]...,t2_5r8xp,False,,0,False,September Monthly Meta - r/CasualConversation ...,"[{'a': ':shield:', 'e': 'emoji', 'u': 'https:/...",...,True,https://www.reddit.com/r/CasualConversation/co...,1345377,1598980000.0,0,,False,,,2
2,,CasualConversation,I have two older sisters. The oldest one has a...,t2_1pnmt8re,False,,0,False,I am 60 years old. But somehow I am still the ...,"[{'a': ':chat:', 'e': 'emoji', 'u': 'https://e...",...,False,https://www.reddit.com/r/CasualConversation/co...,1345377,1599333000.0,0,,False,,,2
3,,CasualConversation,"I’ve done it I had an idea, executed on the id...",t2_ydfsr2h,False,,0,False,"Something BIG just happened in my life, it’ll ...","[{'a': ':ididit:', 'e': 'emoji', 'u': 'https:/...",...,False,https://www.reddit.com/r/CasualConversation/co...,1345377,1599296000.0,0,,False,,,2
4,,CasualConversation,"I have hard time recognising people, once a un...",t2_dowhm,False,,0,False,I have face blindness and face masks have made...,[],...,False,https://www.reddit.com/r/CasualConversation/co...,1345377,1599350000.0,0,,False,,,2


In [20]:
casualConvo.to_csv('../data/casual_conversation8-29.csv', index = False)

In [15]:
# call function on our data
depress_scraped_unique = []
create_unique_list(depress_scraped, depress_scraped_unique)

LIST NOW CONTAINS 990 UNIQUE SCRAPED POSTS


In [16]:
# input depression data to dataframe and csv
depression = pd.DataFrame(depress_scraped_unique)
depression["is_suicide"] = 0
depression.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,is_suicide
0,,depression,We understand that most people who reply immed...,t2_1t70,False,,0,False,Our most-broken and least-understood rules is ...,[],...,/r/depression/comments/doqwow/our_mostbroken_a...,no_ads,True,https://www.reddit.com/r/depression/comments/d...,645134,1572361000.0,0,,False,0
1,,depression,Welcome to /r/depression's check-in post - a p...,t2_64qjj,False,,0,False,Regular Check-In Post,[],...,/r/depression/comments/exo6f1/regular_checkin_...,no_ads,True,https://www.reddit.com/r/depression/comments/e...,645134,1580649000.0,0,,False,0
2,,depression,It’s getting to the point where it’s every sin...,t2_3y9b6w15,False,,0,False,(20F) Every day I wake up I wish I was dead,[],...,/r/depression/comments/h7ivz1/20f_every_day_i_...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,645134,1591961000.0,0,,False,0
3,,depression,I heard other people that do this but Ive neve...,t2_1zagvt9s,False,,0,False,Am I the only one who constantly imagines livi...,[],...,/r/depression/comments/h7g5g7/am_i_the_only_on...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,645134,1591947000.0,1,,False,0
4,,depression,There’s no middle ground. I’m never content. O...,t2_3pdsi85b,False,,0,False,I alternate between not caring about life at a...,[],...,/r/depression/comments/h7nd4k/i_alternate_betw...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,645134,1591977000.0,0,,False,0


In [17]:
# calling scraping on suicidewatch data
suicide_scraped = []
reddit_scrape("https://www.reddit.com/r/SuicideWatch.json", 50, suicide_scraped)

SCRAPING https://www.reddit.com/r/SuicideWatch.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50...
Downloading Batch 5 of 50...
Downloading Batch 10 of 50...
Downloading Batch 15 of 50...
Downloading Batch 20 of 50...
Downloading Batch 25 of 50...
Downloading Batch 30 of 50...
Downloading Batch 35 of 50...
Downloading Batch 40 of 50...
Downloading Batch 45 of 50...
Downloading Batch 50 of 50...
<<<SCRAPING COMPLETED>>>
Number of posts downloaded: 1237
Number of unique posts: 987


In [18]:
# using unique function on suicide data
suicide_scraped_unique = []
create_unique_list(suicide_scraped, suicide_scraped_unique)

LIST NOW CONTAINS 987 UNIQUE SCRAPED POSTS


In [19]:
# inputting suicidewatch data into dataframe and csv
suicide_watch = pd.DataFrame(suicide_scraped_unique)
suicide_watch["is_suicide"] = 1
suicide_watch.head() #CHECK IF THERE ARE 100 COLS AND LAST DUMMY is_suicide COL

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,author_cakeday,is_suicide
0,,SuicideWatch,We've been seeing a worrying increase in pro-s...,t2_1t70,False,,1,False,New wiki on how to avoid accidentally encourag...,[],...,no_ads,True,https://www.reddit.com/r/SuicideWatch/comments...,206303,1567526000.0,0,,False,,1
1,,SuicideWatch,"If you want to recognise an occasion, please d...",t2_1t70,False,,0,False,Reminder: Absolutely no activism of any kind i...,[],...,no_ads,True,https://www.reddit.com/r/SuicideWatch/comments...,206303,1568093000.0,0,,False,,1
2,,SuicideWatch,im afraid it will be a painfull and long death...,t2_3p5hg1gw,False,,0,False,wanna die but dont have the balls to kill myself,[],...,no_ads,False,https://www.reddit.com/r/SuicideWatch/comments...,206303,1591981000.0,0,,False,,1
3,,SuicideWatch,Hi! I'm 16 and I'm from India. I have been pra...,t2_6wd2ewwk,False,,0,False,WHY I SELF HARM,[],...,no_ads,False,https://www.reddit.com/r/SuicideWatch/comments...,206303,1591945000.0,0,,False,,1
4,,SuicideWatch,,t2_6wbqtjyb,False,,0,False,Have attempted suicide 8 times and i have alwa...,[],...,no_ads,False,https://www.reddit.com/r/SuicideWatch/comments...,206303,1591967000.0,0,,False,,1


### Collection Complete
This function should be called very often. The point is to improve our dataset and thus improve our model. Since new posts are always made, periodically scraping the subreddits will allow for more comprehensive datasets and thus more comprehensive models. 

In [20]:
# saving models
suicide_watch.to_csv('../data/suicide_watch6-13.csv', index = False)
depression.to_csv('../data/depression6-13.csv', index = False)