In [11]:
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup

In [12]:
# Endpoint passed to requests.get for the submission searches below.
url = 'https://api.pushshift.io/reddit/search/submission'

### Test Run First

In [13]:
# Small test query: ten AskEngineers submissions from before the current time,
# with '>20' filters on both score and num_comments.
now = int(time.time())
params = dict(
    subreddit='AskEngineers',
    size=10,
    before=now,
    score='>20',
    num_comments='>20',
)

In [14]:
# Send the test query. A timeout is set because requests waits indefinitely by
# default, which would hang the kernel if the API stops responding.
res = requests.get(url, params=params, timeout=30)

In [15]:
# Inspect the HTTP status code of the test response.
res.status_code

200

In [7]:
# Decode the JSON body of the test response.
data = res.json()

In [8]:
# Keep only the fields of interest from the returned records and preview them.
preview_columns = ['title', 'selftext', 'subreddit', 'score', 'num_comments', 'created_utc']
posts = pd.DataFrame(data['data'])[preview_columns]
posts.head(3)

Unnamed: 0,title,selftext,subreddit,score,num_comments,created_utc
0,Careers with a BS in Mechanical Engineering,"Hi all.\n\nSo yes, I graduated this year with ...",AskEngineers,60,210,1638995117
1,Question about the lifestyle of a Field Engine...,Right now I am considering a job as a Field En...,AskEngineers,26,47,1638976319
2,Dealing with negative colleagues,One of my fellow engineers is a black hat thin...,AskEngineers,137,99,1638967311


### Full Data Pull

In [136]:

# Page backwards through each subreddit's history, collecting one DataFrame per
# request and concatenating them into `df` at the end.
subreddits = ['AskEngineers', 'MachineLearning']
post_columns = ['subreddit', 'title', 'selftext', 'score', 'num_comments', 'created_utc']

N_PULLS = 13          # requests per subreddit
PAGE_SIZE = 100       # posts requested per pull
PAUSE_SECONDS = 5     # delay between consecutive requests
REQUEST_TIMEOUT = 30  # seconds before an unanswered request is abandoned

frames = []

for sub in subreddits:

    # Start from the present and walk backwards in time.
    before = int(time.time())

    for pull in range(N_PULLS):
        # Thresholds on score and comment count, to favour posts of higher
        # general quality.
        params = {
            'subreddit': sub,
            'size': PAGE_SIZE,
            'before': before,
            'score': '>15',
            'num_comments': '>15'
        }

        res = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        # Fail with a clear HTTP error instead of an opaque JSON/KeyError later.
        res.raise_for_status()

        data = res.json()
        records = data.get('data', [])
        if not records:
            # No (more) posts match the filters for this subreddit.
            break

        # reindex keeps a fixed column layout even when a field is absent from
        # every record on this page (the column is filled with NaN rather than
        # raising a KeyError).
        posts = pd.DataFrame(records).reindex(columns=post_columns)
        frames.append(posts)

        # The next page must be older than the oldest post seen on this one;
        # using min() avoids relying on the order in which records are returned.
        before = int(posts['created_utc'].min())

        time.sleep(PAUSE_SECONDS)

# Combine all pages; fall back to an empty frame when nothing was returned,
# because pd.concat raises on an empty list.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=post_columns)

In [137]:
# (rows, columns) of the combined pull.
df.shape

(2594, 6)

In [138]:
# Posts collected per subreddit. The pull potentially ran out of qualifying
# posts, but this is more than enough data.
df['subreddit'].value_counts()

AskEngineers       1297
MachineLearning    1297
Name: subreddit, dtype: int64

### Data Cleaning & Feature Engineering

In [10]:
# Missing values per column.
df.isna().sum()

subreddit         0
title             0
selftext        585
score             0
num_comments      0
created_utc       0
dtype: int64

In [16]:
# Binarize the target variable: AskEngineers -> 0, MachineLearning -> 1.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that form mutates a temporary object, is flagged as
# chained assignment by pandas 2.x, and leaves `df` unchanged under copy-on-write.
df['subreddit'] = df['subreddit'].replace({'AskEngineers': 0, 'MachineLearning': 1})

In [17]:
# Create a total text column merging the title with the self text, for any post
# that has it. Missing self text becomes an empty string first. The filled
# column is assigned back rather than filled with inplace=True on the selected
# column, which would not update `df` under copy-on-write.
df['selftext'] = df['selftext'].fillna('')
df['total_text'] = df['title'] + ' ' + df['selftext']

In [18]:
def remove_html(post):
    '''Strip HTML markup from a post and return its text in lowercase.

    Parameters
    ----------
    post : str
        Raw post text, possibly containing HTML markup.

    Returns
    -------
    str
        The text content of `post`, lowercased.
    '''
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about guessing), so the result
    # could differ from one machine to another.
    no_html = BeautifulSoup(post, 'html.parser').text

    return no_html.lower()

# Clean every combined post with remove_html
df['total_text'] = df['total_text'].map(remove_html)

In [19]:
def length(status):
    '''Return len(status): the number of characters for a string.'''
    n_items = len(status)
    return n_items

# Character-length features for the title and the self text
for text_col in ('title', 'selftext'):
    df[text_col + '_length'] = df[text_col].map(length)

In [20]:
def words(status):
    '''Count the whitespace-separated words in `status`.

    Splitting on any run of whitespace (rather than on single spaces) means
    newlines and tabs also separate words, and repeated spaces do not produce
    empty "words". Empty or whitespace-only text counts as 0.

    Parameters
    ----------
    status : str
        Text to count words in.

    Returns
    -------
    int
        Number of words found.
    '''
    return len(status.split())
# Word-count features for the title and the self text
for text_col in ('title', 'selftext'):
    df[text_col + '_word_count'] = df[text_col].map(words)
df.tail(3)

Unnamed: 0,subreddit,title,selftext,score,num_comments,created_utc,total_text,title_length,selftext_length,title_word_count,selftext_word_count
2591,1,[D] Google's large scale GAN-Tuning paper unfa...,,69,19,1512149277,[d] google's large scale gan-tuning paper unfa...,65,0,9,0
2592,1,[D] Looking for papers on treating regression ...,I'm currently working on a paper regarding a r...,18,27,1512131072,[d] looking for papers on treating regression ...,83,1141,12,177
2593,1,[R] High-Resolution Image Synthesis and Semant...,,156,23,1512106470,[r] high-resolution image synthesis and semant...,83,0,10,0


In [21]:
#Binarize whether there was self text in first place

def fix_selftext(val):
    '''Return 0 when `val` is the empty string, otherwise 1.'''
    return 0 if val == '' else 1
# Replace the raw self text with its 0/1 presence flag; the original text is no
# longer available in this column afterwards.
selftext_flag = df['selftext'].map(fix_selftext)
df['selftext'] = selftext_flag
df.tail(3)

Unnamed: 0,subreddit,title,selftext,score,num_comments,created_utc,total_text,title_length,selftext_length,title_word_count,selftext_word_count
2591,1,[D] Google's large scale GAN-Tuning paper unfa...,0,69,19,1512149277,[d] google's large scale gan-tuning paper unfa...,65,0,9,0
2592,1,[D] Looking for papers on treating regression ...,1,18,27,1512131072,[d] looking for papers on treating regression ...,83,1141,12,177
2593,1,[R] High-Resolution Image Synthesis and Semant...,0,156,23,1512106470,[r] high-resolution image synthesis and semant...,83,0,10,0


In [22]:
# Write the cleaned, feature-engineered frame to CSV without the row index.
df.to_csv('../data/subreddit_data.csv', index = False)