In [8]:
# Imports
import pandas as pd
import requests

# Parsing Library: Beautiful Soup
from bs4 import BeautifulSoup

# Built-in Time Access and Conversions Module
import time

In [9]:
# Setup url as a string for API call
url = 'https://api.pushshift.io'
submission_url = '/reddit/search/submission'
comment_url = '/reddit/search/comment'

In [15]:
# Setup df list
tech = []       # Initialize empty list to append with data
oldest = 0      # Establish time-zero to reference submissions chronologically
num_reqs = 0    # No. of requests to Reddit server
max_reqs = 1#000 # Maximum No. of requests to be made to Reddit server
sub = 1         # 0 for 'depression', 1 for 'SuicideWatch'
subreddit = ['depression','SuicideWatch']

In [16]:
# Verify subreddit before sending request
subreddit[sub]

'depression'

In [17]:
# Obtain 100,000 subreddit submissions by making 1,000 requests of 100 submissions each
while num_reqs <= max_reqs:
    
    # Setup parameters for initial request
    if oldest == 0:
        req_params = {
            'subreddit': subreddit[sub], # ID the subreddit of interest
            'size': 100 # Maximum submissions allowed by Reddit
        }
    # Setup params for all subsequent requests
    else:
        req_params = {
            'subreddit': subreddit[sub],
            'size': 100,
            'before': oldest # Only 'get' submissions older than prev req
        } 
        
    # GET-request
    #   -Submits an http request from browser to server
    #   -Response contains:
    #        1. Status info about the request(response code)
    #        2. Requested content (sequence of bytes)
    # Get request-response
    req = requests.get(url+submission_url, req_params)
    
    # Keep track of No. of requests
    num_reqs += 1
    
    # Store json form of response-content as dict w/ one-key: 'data'
    data_100 = req.json()

    # Name List of 100 submissions 'subs_100'
    subs_100 = data_100['data'] # Dict key-'data': value-'subs_100' 

    # Create DataFrame from list of 100 dictionaries
    tech_100 = pd.DataFrame(subs_100)
    
    # Prepare initial request-response for concatenation
    if num_reqs == 1:
        print(num_reqs)
        submission = tech_100.copy()
        
    # Concatinate intitial response with subsequent responses
    else:
        submission = pd.concat([submission, tech_100], ignore_index=True)

    # ID oldest post in request
    oldest = subs_100[99]['created_utc']
     
    #Slow down the pace of scrapping per Reddit TOU
    time.sleep(1)
    
    # Display status of while loop
    print(f'No. of Get requests: {num_reqs}\n')
    #print(submission.title.value_counts()[:10])
    # Code of response request, as an integer
    #r.status_code

1
No. of Get requests: 1

No. of Get requests: 2



In [36]:
# Parameters of interest for NLP, defined in Reddit API Documentation
reddit_params = ['subreddit','author', 'selftext', 'title', 'num_comments', 'created_utc']

In [21]:
# Verify row-dimension of submission-data equals 100*max_reqs
submission.shape

(200, 59)

In [26]:
# Check that all submission-data is from selected subreddit
print(submission.subreddit.value_counts())

depression    200
Name: subreddit, dtype: int64


In [32]:
# Check domain(s) of all submission-data
print(submission.domain.value_counts()) # Some subreddits have domain beyond 'self'

self.depression    200
Name: domain, dtype: int64


In [34]:
# Check that all is copacetic
submission[reddit_params].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subreddit     200 non-null    object
 1   author        200 non-null    object
 2   selftext      200 non-null    object
 3   title         200 non-null    object
 4   num_comments  200 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 7.9+ KB


In [37]:
# Check that all is copacetic
submission[reddit_params]

Unnamed: 0,subreddit,author,selftext,title,num_comments,created_utc
0,depression,Gang2020Yang,So many things have gone wrong recently and I ...,I just feel numb,0,1602967887
1,depression,SadiSuto1,Born 20 years ago as a broken condom and faile...,My life in a summary... I need to rant as it i...,0,1602967807
2,depression,NF4Life44,Not 1 good reason for me to live. I'll be ment...,Fuck I just wanna die,0,1602967790
3,depression,mememememememdmd,"I feel tainted, like im now not only suicidal ...",Im going to kill myself bc of my sister that c...,0,1602967714
4,depression,jimmyneutron0212,My friends have been telling me I should see a...,Should I seek help?,0,1602967582
...,...,...,...,...,...,...
195,depression,risingpulsar,Really have been struggling with existing late...,Celexa and Wellbutrin,4,1602937260
196,depression,smolbean20,"i'm on my third year of college, and it's like...",what am i supposed to do to calm myself down???,0,1602936964
197,depression,tiiamannix,I'm posting this because I want to be honest a...,After 3 suicide attempts in a year and pills t...,28,1602936637
198,depression,mrsruffin38,[removed],Dealing With Depression,0,1602935853


In [13]:
# Save only columns with useful data for NLP classification modeling and analysis
submission[reddit_params].to_csv('./data/rename.csv')