# Web Scraping Subreddit Data

In [1]:
# Imports
import pandas as pd
import numpy as np

# API
import requests

# Automating
import time


**Test Pull from the ArtificialInteligence Subreddit**

In [2]:
# Set base url
# The endpoint is given without a query string: every query parameter
# (including 'subreddit') is supplied through `params`, so a trailing
# '?subreddit=' here would add a second, empty 'subreddit=' to the request.
url = 'https://api.pushshift.io/reddit/search/submission/'

In [3]:
# Query parameters for the test request
params = dict(
    subreddit='ArtificialInteligence',  # subreddit to search
    size=50,                            # number of submissions to request
    lang=True,
    before=1601384439,                  # only posts created before this Unix epoch timestamp
)

In [4]:
# Send the GET request; `params` is encoded into the URL's query string
res = requests.get(url, params=params)

In [5]:
# Examine the response: show the raw body as text (JSON that has not been parsed yet)
res.text

'{\n    "data": [\n        {\n            "all_awardings": [],\n            "allow_live_comments": false,\n            "author": "hackernoon",\n            "author_flair_css_class": null,\n            "author_flair_richtext": [],\n            "author_flair_text": null,\n            "author_flair_type": "text",\n            "author_fullname": "t2_1zghuhvt",\n            "author_patreon_flair": false,\n            "author_premium": false,\n            "awarders": [],\n            "can_mod_post": false,\n            "contest_mode": false,\n            "created_utc": 1601383717,\n            "domain": "hackernoon.com",\n            "full_link": "https://www.reddit.com/r/ArtificialInteligence/comments/j1yjrq/dont_let_them_fool_you_manipulative_strategies/",\n            "gildings": {},\n            "id": "j1yjrq",\n            "is_crosspostable": false,\n            "is_meta": false,\n            "is_original_content": false,\n            "is_reddit_media_domain": false,\n            "is_ro

In [8]:
# Parse the response body as JSON; the posts are stored under the 'data' key
res.json()

{'data': [{'all_awardings': [],
   'allow_live_comments': False,
   'author': 'hackernoon',
   'author_flair_css_class': None,
   'author_flair_richtext': [],
   'author_flair_text': None,
   'author_flair_type': 'text',
   'author_fullname': 't2_1zghuhvt',
   'author_patreon_flair': False,
   'author_premium': False,
   'awarders': [],
   'can_mod_post': False,
   'contest_mode': False,
   'created_utc': 1601383717,
   'domain': 'hackernoon.com',
   'full_link': 'https://www.reddit.com/r/ArtificialInteligence/comments/j1yjrq/dont_let_them_fool_you_manipulative_strategies/',
   'gildings': {},
   'id': 'j1yjrq',
   'is_crosspostable': False,
   'is_meta': False,
   'is_original_content': False,
   'is_reddit_media_domain': False,
   'is_robot_indexable': False,
   'is_self': False,
   'is_video': False,
   'link_flair_background_color': '',
   'link_flair_richtext': [],
   'link_flair_text_color': 'dark',
   'link_flair_type': 'text',
   'locked': False,
   'media_only': False,
   'no_

In [9]:
# Load the list of posts under 'data' into a DataFrame and preview the first rows
df_ai = pd.DataFrame(data=res.json()['data'])
df_ai.head(n=3)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,url,url_overridden_by_dest,whitelist_status,wls,crosspost_parent,crosspost_parent_list,media,media_embed,secure_media,secure_media_embed
0,[],False,hackernoon,,[],,text,t2_1zghuhvt,False,False,...,https://hackernoon.com/dont-let-them-fool-you-...,https://hackernoon.com/dont-let-them-fool-you-...,all_ads,6,,,,,,
1,[],False,IoT_geek,,[],,text,t2_80k9nq2d,False,False,...,https://www.reddit.com/r/ArtificialInteligence...,,all_ads,6,,,,,,
2,[],False,hackernoon,,[],,text,t2_1zghuhvt,False,False,...,https://hackernoon.com/daniel-jeffries-is-bett...,https://hackernoon.com/daniel-jeffries-is-bett...,all_ads,6,,,,,,


In [10]:
# Show every column name returned by the API as a plain Python list
df_ai.columns.tolist()

['all_awardings',
 'allow_live_comments',
 'author',
 'author_flair_css_class',
 'author_flair_richtext',
 'author_flair_text',
 'author_flair_type',
 'author_fullname',
 'author_patreon_flair',
 'author_premium',
 'awarders',
 'can_mod_post',
 'contest_mode',
 'created_utc',
 'domain',
 'full_link',
 'gildings',
 'id',
 'is_crosspostable',
 'is_meta',
 'is_original_content',
 'is_reddit_media_domain',
 'is_robot_indexable',
 'is_self',
 'is_video',
 'link_flair_background_color',
 'link_flair_richtext',
 'link_flair_text_color',
 'link_flair_type',
 'locked',
 'media_only',
 'no_follow',
 'num_comments',
 'num_crossposts',
 'over_18',
 'parent_whitelist_status',
 'permalink',
 'pinned',
 'post_hint',
 'preview',
 'pwls',
 'removed_by_category',
 'retrieved_on',
 'score',
 'selftext',
 'send_replies',
 'spoiler',
 'stickied',
 'subreddit',
 'subreddit_id',
 'subreddit_subscribers',
 'subreddit_type',
 'thumbnail',
 'thumbnail_height',
 'thumbnail_width',
 'title',
 'total_awards_receiv

In [11]:
# Keep only the columns used in the rest of the notebook
df_ai = df_ai.loc[
    :,
    ['title', 'created_utc', 'selftext', 'subreddit', 'author', 'media_only', 'permalink'],
]

In [12]:
# Preview the first two rows of the trimmed DataFrame
df_ai.head(2)

Unnamed: 0,title,created_utc,selftext,subreddit,author,media_only,permalink
0,Don't Let Them Fool You: Manipulative Strategi...,1601383717,,ArtificialInteligence,hackernoon,False,/r/ArtificialInteligence/comments/j1yjrq/dont_...
1,Looking for someone to collaborate,1601380494,[removed],ArtificialInteligence,IoT_geek,False,/r/ArtificialInteligence/comments/j1xrso/looki...


**Get Reddit Post Function**

In [17]:
def get_posts(subreddit, n_iter, epoch_right_now, wait=10, timeout=30):
    '''
    Page backwards through a subreddit's submissions using the Pushshift API
    and return all pages combined into one DataFrame.

    Each request asks for up to 100 submissions created before `current_time`;
    after every page `current_time` is moved back to the oldest `created_utc`
    seen, so the next request continues where the previous one stopped.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to pull from.
    n_iter : int
        Maximum number of requests to make.
    epoch_right_now : int
        Unix epoch (seconds) to start from; only older posts are requested.
    wait : int or float, default 10
        Seconds to pause between consecutive requests.
    timeout : int or float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns title, subreddit, selftext,
        permalink, author, created_utc and media_only. Each page keeps its
        own row index, so index values repeat across pages. If no posts were
        retrieved, an empty DataFrame with those columns is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    '''
    # every query parameter goes through `params`, so the endpoint carries no query string
    base_url = 'https://api.pushshift.io/reddit/search/submission/'
    # columns kept for analysis
    columns = ['title',
               'subreddit',
               'selftext',
               'permalink',
               'author',
               'created_utc',
               'media_only']
    # one DataFrame per successful request
    df_list = []
    # moving cursor, used to iterate in reverse through time
    current_time = epoch_right_now

    for request_number in range(n_iter):
        # pause between requests only; no wait before the first or after the last
        if request_number and wait:
            time.sleep(wait)

        res = requests.get(
            base_url,
            params={
                # specify subreddit
                'subreddit': subreddit,
                # specify number of posts to pull
                'size': 100,
                # NOTE(review): purpose of 'lang' is unclear; kept from the
                # original request, confirm against the Pushshift docs
                'lang': True,
                # pull everything from current time backward
                'before': current_time},
            timeout=timeout)
        # fail with the HTTP status instead of an opaque JSONDecodeError
        # when the server returns an error page
        res.raise_for_status()

        posts = res.json().get('data') or []
        # nothing older is available, so stop early
        if not posts:
            break

        # reindex (rather than .loc) so a column missing from this page
        # becomes NaN instead of raising KeyError
        df = pd.DataFrame(posts).reindex(columns=columns)
        df_list.append(df)

        # set current time counter back to the oldest post in this page
        oldest = df['created_utc'].min()
        if pd.isna(oldest):
            # without a timestamp the cursor cannot move; stop rather than
            # request the same page again
            break
        current_time = int(oldest)

    # pd.concat raises on an empty list, so return an empty frame instead
    if not df_list:
        return pd.DataFrame(columns=columns)
    # return one dataframe for all requests
    return pd.concat(df_list, axis=0)
# Adapted from Tim Book

In [26]:
# Helper that returns the current time as a Unix epoch
def get_current_epoch():
    '''Return the current Unix time rounded to the nearest whole second.'''
    return round(time.time())

# Snapshot of "now", taken once so every later call starts from the same point.
# The helper has its own name so that `current_epoch` (an int) does not replace
# the function; otherwise re-running this cell would try to call an int.
current_epoch = get_current_epoch()

In [27]:
# Pull posts from the ArtificialInteligence subreddit
# Note: 'Inteligence' (single 'l') is how the subreddit itself spells its name.
ai = get_posts(subreddit='ArtificialInteligence', n_iter=150, epoch_right_now=current_epoch)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [25]:
# Check how many rows and columns were collected for the AI subreddit
ai.shape

NameError: name 'ai' is not defined

In [57]:
# Pull posts from the MachineLearning subreddit
ml = get_posts(subreddit='MachineLearning', n_iter=250, epoch_right_now=current_epoch)

In [58]:
# Check how many rows and columns were collected for the ML subreddit
ml.shape

(10000, 7)

In [83]:
# Stack the two subreddit DataFrames row-wise, then confirm the class balance
both = pd.concat(objs=[ai, ml], axis=0)
both['subreddit'].value_counts()

ArtificialInteligence    10000
MachineLearning          10000
Name: subreddit, dtype: int64

In [84]:
# Creating an all_text column
# Missing titles/selftexts are treated as empty strings so that a NaN in one
# field does not wipe out the other. The space keeps the last word of the title
# from fusing with the first word of the body; strip() removes the extra space
# when one side is empty.
both['all_text'] = (
    both['title'].fillna('') + ' ' + both['selftext'].fillna('')
).str.strip()

In [85]:
# Convert every all_text value to a string and lowercase it
both['all_text'] = both['all_text'].map(lambda text: str(text).lower())

In [86]:
# Re-check the number of posts per subreddit before encoding the label
both['subreddit'].value_counts()

ArtificialInteligence    10000
MachineLearning          10000
Name: subreddit, dtype: int64

In [87]:
# Encode the subreddit label as a binary target: MachineLearning -> 1, anything else -> 0
# Caution: this overwrites the original names, so running the cell a second time
# turns every row into 0 (no value equals 'MachineLearning' any more).
both['subreddit'] = np.where(both['subreddit'].eq('MachineLearning'), 1, 0)

In [127]:

# Save the combined, labelled posts to all_post.csv in the working directory.
# No `index` argument is passed, so the DataFrame index is written as the
# first (unnamed) column of the file.
both.to_csv('all_post.csv')