In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time
import requests
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.base import TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
import datetime as dt

In [2]:

# The Pushshift API is used to search Reddit for the posts of a particular subreddit.
# Collect the subreddit's posts of a particular kind, provided as a parameter.

# Define a function that accepts (among others) the following parameters:
# 1. subreddit: name of the subreddit to search
# 2. kind: the type of post to search for; defaults to 'submission'

def query_pushshift(subreddit, kind='submission', skip=30, times=6,
                    subfield = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self'],
                    comfields = ['body', 'score', 'created_utc']):
    """Download posts of one subreddit from the Pushshift search API.

    One request is issued for every x in ``range(1, times)`` (i.e.
    ``times - 1`` requests, 5 with the defaults), each with the query
    parameter ``after={skip * x}d`` and ``size=500``. The responses are
    concatenated into a single DataFrame.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit, inserted into the request URL.
    kind : str, default 'submission'
        Pushshift search endpoint, inserted into the request URL. The column
        selection, de-duplication and ``is_self`` filter below are applied
        only when this is exactly ``'submission'``.
    skip : int, default 30
        Step, in days, between the ``after`` values of successive requests.
    times : int, default 6
        Upper bound (exclusive) of the request counter, which starts at 1.
    subfield : list of str
        Columns kept when ``kind == 'submission'``.
    comfields : list of str
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a ``timestamp`` column holding the UTC
        calendar date derived from ``created_utc``.

    Raises
    ------
    requests.HTTPError
        If any response has a status code other than 200.
    ValueError
        From ``pd.concat`` when no request is made (``times <= 1``).
    """
    # size for search is set to 500
    stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size=500".format(kind, subreddit)
    pages = []

    for x in range(1, times):
        URL = "{}&after={}d".format(stem, skip * x)
        print(URL)

        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(URL, timeout=30)

        # Explicit check instead of `assert`, which is removed under `python -O`.
        if response.status_code != 200:
            raise requests.HTTPError(
                "Pushshift returned status {} for {}".format(response.status_code, URL),
                response=response,
            )

        # The results are stored under the "data" key of the JSON payload.
        pages.append(pd.DataFrame(response.json()['data']))

        time.sleep(2)   # pause 2 seconds before the next request

    full = pd.concat(pages, sort=False)

    if kind == "submission":
        # Keep the columns of interest, then drop exact duplicate rows.
        full = full[subfield].drop_duplicates()
        # Keep only the rows flagged as self posts.
        full = full.loc[full['is_self'] == True]

    # created_utc is epoch seconds in UTC: convert in UTC so the resulting date
    # does not depend on the timezone of the machine running the notebook.
    # `assign` returns a new frame, avoiding a write into a filtered slice.
    full = full.assign(
        timestamp=pd.to_datetime(full['created_utc'], unit='s', utc=True).dt.date
    )

    print(full.shape)

    return full

In [3]:
# Fetch the submissions of the 'jokes' subreddit with query_pushshift
# and keep them in the dataframe 'jokes_df'
jokes_df = query_pushshift(subreddit='jokes')

https://api.pushshift.io/reddit/search/submission/?subreddit=jokes&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=jokes&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=jokes&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=jokes&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=jokes&size=500&after=150d
(2491, 9)


In [4]:
# Fetch the submissions of the 'datascience' subreddit with query_pushshift
# and keep them in the dataframe 'data_df'
data_df = query_pushshift(subreddit='datascience')

https://api.pushshift.io/reddit/search/submission/?subreddit=datascience&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=datascience&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=datascience&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=datascience&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=datascience&size=500&after=150d
(1828, 9)


In [5]:
# Preview the first five rows of 'jokes_df' (the 'jokes' subreddit data)
jokes_df.head(n=5)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Knock-knock,Who's there? \n**A parrot!** \nA parrot who...,Jokes,1552069053,motsanciens,0,0,True,2019-03-08
1,How did the dentist suddenly become a brain su...,A slip of the hand.,Jokes,1552069079,roastedtoperfection,0,4,True,2019-03-08
2,I hate build a bear. I took my chihauhua there...,AND the stuffed animal they gave me keeps bark...,Jokes,1552069382,RikorperationYT,0,2,True,2019-03-08
3,An English Teacher And The Pope Was Sitting Ne...,"He was reading a challenging book, and was ver...",Jokes,1552069428,GangstaKev,2,3,True,2019-03-08
4,"I looked left, then I looked right. I looked l...",Then I pulled out... she wasn’t pleased.,Jokes,1552069459,Windwaker85,0,2,True,2019-03-08


In [6]:
# Preview the first five rows of 'data_df' (the 'datascience' subreddit data)
data_df.head(n=5)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,ERD Question (Academic),[removed],datascience,1552070887,madvillaini,2,1,True,2019-03-08
1,Avoiding Apocalypse by Doubling Down on Team H...,[removed],datascience,1552071021,The_Syndicate_VC,0,1,True,2019-03-08
2,Not happy with my company (rant),Background:\nI work as a lead project manager ...,datascience,1552073551,hereiskyle,12,16,True,2019-03-08
3,datacamp coupon,[removed],datascience,1552073588,kebarulez,2,1,True,2019-03-08
5,Looking for local Data Science and or Python m...,"Hey folks,\n\nI’m a sole Data Analyst at a sma...",datascience,1552074457,Nicodemus34,6,3,True,2019-03-08


In [7]:
# Stack the rows of both subreddit dataframes into one dataframe, 'final_df',
# and display its (rows, columns) shape
frames = [jokes_df, data_df]
final_df = pd.concat(frames, axis=0)
final_df.shape



(4319, 9)

In [8]:
# Persist the combined data to reddit.csv; the dataframe index is not written
final_df.to_csv(path_or_buf='./reddit.csv', index=False)

In [9]:
# Load the saved csv back into a new dataframe
read_from_csv_df = pd.read_csv(filepath_or_buffer='./reddit.csv')

In [10]:
# Preview the first five rows of the dataframe read from the csv
read_from_csv_df.head(n=5)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Knock-knock,Who's there? \n**A parrot!** \nA parrot who...,Jokes,1552069053,motsanciens,0,0,True,2019-03-08
1,How did the dentist suddenly become a brain su...,A slip of the hand.,Jokes,1552069079,roastedtoperfection,0,4,True,2019-03-08
2,I hate build a bear. I took my chihauhua there...,AND the stuffed animal they gave me keeps bark...,Jokes,1552069382,RikorperationYT,0,2,True,2019-03-08
3,An English Teacher And The Pope Was Sitting Ne...,"He was reading a challenging book, and was ver...",Jokes,1552069428,GangstaKev,2,3,True,2019-03-08
4,"I looked left, then I looked right. I looked l...",Then I pulled out... she wasn’t pleased.,Jokes,1552069459,Windwaker85,0,2,True,2019-03-08


In [11]:
# Confirm that the csv round trip preserved the data: the shape of
# 'read_from_csv_df' must equal the shape of 'final_df'. The assertion makes
# the check explicit instead of relying on comparing two outputs by eye.
assert read_from_csv_df.shape == final_df.shape, (
    "csv round trip changed the shape: {} != {}".format(read_from_csv_df.shape, final_df.shape)
)
read_from_csv_df.shape

(4319, 9)