# Exploring the Reddit PRAW and Pushshift API

In [38]:
# Libraries

import requests
import praw
import configparser
import pandas as pd
import numpy as np
import nltk
import json
from datetime import datetime, timedelta, date
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [2]:
# The following cells uses an INI file to pull in credentials needed to access the PRAW API.
# This INI file is stored locally only

config = configparser.RawConfigParser()
config.read("config.txt")
reddit = praw.Reddit(client_id=config.get("reddit","client_id"),
                     client_secret=config.get("reddit","client_secret"),
                     password=config.get("reddit","password"),
                     user_agent="Political exploration",
                     username=config.get("reddit","username"))


## General Data

The API can look at a subreddit and pull information like the title, url, and body of a post (if it's not a link post).
We can also get the karma score, the number of comments, and when the post was created.

We'll generally want to look at /r/politics, /r/news, and /r/worldnews

In [18]:
# Example data pull

posts = []

for post in reddit.subreddit('politics').hot(limit=10):
    posts.append([post.title, 
                  post.score, 
                  post.id, 
                  post.subreddit, 
                  post.url, 
                  post.num_comments, 
                  post.selftext, 
                  datetime.utcfromtimestamp(post.created)
                 ])
df = pd.DataFrame(posts,
                     columns=['title', 
                              'score', 
                              'id', 
                              'subreddit', 
                              'url', 
                              'num_comments', 
                              'body', 
                              'created'
                             ])

df

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created
0,We are the majority-makers in Congress. Meet t...,1394,bugtra,politics,https://www.reddit.com/r/politics/comments/bug...,1177,The New Democrat Coalition is a group of 101 f...,2019-05-30 00:30:20
1,Conservatives Stunned by Mueller Suggesting Tr...,25462,butf0t,politics,https://nymag.com/intelligencer/2019/05/conser...,2737,,2019-05-30 21:51:12
2,Trump says Russia 'helped me get elected' for ...,63781,buscyw,politics,https://www.independent.co.uk/news/world/ameri...,6690,,2019-05-30 20:08:47
3,Trump Finally Admits Russia Helped Him Get Ele...,9102,busa98,politics,https://www.thedailybeast.com/trump-finally-ad...,430,,2019-05-30 20:00:59
4,Sen. Elizabeth Warren on The View: If Trump we...,3353,buuqoh,politics,https://abcnews.go.com/Politics/sen-elizabeth-...,268,,2019-05-30 23:46:59
5,The USS John McCain Debacle Shows the Whole Co...,4718,butiti,politics,https://www.esquire.com/news-politics/politics...,333,,2019-05-30 22:00:38
6,Fox News's Shep Smith: Mueller statement 'dire...,9584,burvdr,politics,https://thehill.com/homenews/media/446033-fox-...,403,,2019-05-30 19:12:51
7,Trump Attacks Mueller Probe - Inadvertently Co...,6820,busfjn,politics,https://www.haaretz.com/us-news/trump-attacks-...,312,,2019-05-30 20:16:01
8,Republican group to hand-deliver Mueller repor...,3277,buufm0,politics,https://thehill.com/homenews/house/446140-repu...,171,,2019-05-30 23:20:04
9,Sun-Maid Pulls Baseball Sponsorship After Team...,7207,bus1mi,politics,https://www.newsweek.com/sun-maid-alexandria-o...,778,,2019-05-30 19:33:01


## List of URLs

To grab text articles from

In [6]:
# Example data pull

urls = []

for post in reddit.subreddit('politics').hot(limit=10):
    urls.append([post.subreddit, 
                  post.url,
                  datetime.utcfromtimestamp(post.created)
                 ])
df = pd.DataFrame(urls,columns=['subreddit','url','created'])

df

Unnamed: 0,subreddit,url,created
0,politics,http://nymag.com/intelligencer/2019/05/study-t...,2019-06-02 23:53:58
1,politics,https://www.newsweek.com/support-trump-impeach...,2019-06-03 00:07:05
2,politics,https://thehill.com/homenews/administration/44...,2019-06-02 21:10:35
3,politics,https://www.cnn.com/2019/05/31/politics/elizab...,2019-06-02 20:52:29
4,politics,https://www.rollingstone.com/politics/politics...,2019-06-03 00:43:11
5,politics,https://thehill.com/homenews/sunday-talk-shows...,2019-06-02 23:50:00
6,politics,https://www.motherjones.com/politics/2019/06/p...,2019-06-03 00:59:57
7,politics,https://www.axios.com/trump-mexico-tariffs-tax...,2019-06-03 02:57:48
8,politics,https://www.independent.co.uk/news/world/ameri...,2019-06-02 20:51:38
9,politics,https://abcnews.go.com/International/wireStory...,2019-06-02 18:29:53


## Narrowing down the data

### By Subreddit

Michael Bennet, Steve Bullock, Julián Castro, Bill de Blasio, John Delaney, John Hickenlooper, Seth Moulton, Tim Ryan, Eric Swalwell, Marianne Williamson, and Andrew Yang are candidates who do not have dedicated subreddits as of May 30, 2019.

For the other candidates, here are their subscriber counts as of May 30, 2019

In [8]:
subreddits = {'Joe Biden': 'JoeBiden',
              'Cory Booker': 'corybooker',
              'Pete Buttigieg': 'Pete_Buttigieg',
              'Tulsi Gabbard': 'tulsi',
              'Kirsten Gillibrand': 'Kirsten_Gillibrand',
              'Mike Gravel': 'gravelforpresident',
              'Kamala Harris': 'Kamala',
              'Jay Inslee': 'inslee2020',
              'Amy Klobuchar': 'BaemyKlobaechar',
              'Beto O\'Rourke': 'Beto2020',
              'Bernie Sanders': 'SandersForPresident',
              'Donald Trump': 'The_Donald',
              'Elizabeth Warren': 'ElizabethWarren',
              'politics': 'politics'}

for subreddit in subreddits.values():
    print(subreddit,': ',reddit.subreddit(subreddit).subscribers)

JoeBiden :  952
corybooker :  378
Pete_Buttigieg :  22156
tulsi :  10607
Kirsten_Gillibrand :  106
gravelforpresident :  3660
Kamala :  1617
inslee2020 :  546
BaemyKlobaechar :  502
Beto2020 :  10131
SandersForPresident :  249726
The_Donald :  747188
ElizabethWarren :  11381
politics :  5120363


## By Date

To narrow down by date, we can either use the `reddit.subreddit('subreddit').search` functionality, or make the call with conditions set around the `post.created` parameter.

## Feature Engineering

Possible ideas include:

1. Number of average and cumulative posts in a subreddit, but the API doesn't make this easy to get.
2. The karma/score and number of comments for an associated headline.
3. Topic modeling on the headline
4. What candidate is addressed in the headline
5. Sentiment analysis on the headline
6. Counting if the word "donation" is referenced in the headline (better yet, the comments)

In [14]:
# Using id = buf84a as a test

sample_post = reddit.submission(id='buf84a')

sample_post.title

'Megathread: Robert Mueller to Make Public Statement About Russia Investigation'

In [None]:
# This runs too long

sample_post.comments.replace_more(limit=None)

for top_level_comment in sample_post.comments:
    #print(top_level_comment.body)

## Logistic Regression

Will be switching over to Pushshift API for this process, which allows us to get data between dates

Example call: https://api.pushshift.io/reddit/submission/search/?after=2019-06-02&before=2019-06-03&q=trump&sort_type=score&sort=desc&subreddit=politics&limit=500

Where the output is a JSON file, the after date is the date of interest, and the before date is one date in the future.

### Headline compared to donations
#### Consider calculating the cumulative karma score and comments along with this later on

The Pushshift API call used here will rely on `agg=subreddit` to get counts and will look like the following:

https://api.pushshift.io/reddit/submission/search/?subreddit=politics,news,worldnews&aggs=subreddit&q=trump&size=0&after=2019-06-02&before=2019-06-03

This data will be stored in a csv file that can be pulled in later.

In [37]:
#day_count = (date(2019, 6, 3) - date(2019, 6, 1)).days + 1
#for single_date in (start_date + timedelta(n) for n in range(day_count)):
#    print(single_date.strftime("%Y-%m-%d"))

afterDate = '2019-06-02'
beforeDate = '2019-06-03'

# Dates above means it will capture data for June 2
politicians = ['williamson', 'harris', 'buttigieg', 'klobuchar', 'yang', 'gillibrand', 'delaney', 'inslee', 
               'hickenlooper', 'orourke', 'warren', 'castro', 'sanders', 'gabbard', 'booker', 'trump', 'biden']

for candidate in politicians:
    dict1 = {}
    dict2 = {}
    dict3 = {}
    url = "https://api.pushshift.io/reddit/submission/search/?subreddit=politics&aggs=subreddit&q={0}&size=0&after={1}&before={2}".format(candidate,afterDate,beforeDate)
    response = requests.get(url).json()
    if response['aggs']['subreddit']:
        dict1.update({'date': afterDate, 'candidate': candidate, 'subreddit': 'politics', 'doc_count': response['aggs']['subreddit'][0]['doc_count']}) 
        rows_list.append(dict1)
    
    url = "https://api.pushshift.io/reddit/submission/search/?subreddit=worldnews&aggs=subreddit&q={0}&size=0&after={1}&before={2}".format(candidate,afterDate,beforeDate)
    response = requests.get(url).json()
    if response['aggs']['subreddit']:
        dict2.update({'date': afterDate, 'candidate': candidate, 'subreddit': 'worldnews', 'doc_count': response['aggs']['subreddit'][0]['doc_count']}) 
        rows_list.append(dict2)
    
    url = "https://api.pushshift.io/reddit/submission/search/?subreddit=news&aggs=subreddit&q={0}&size=0&after={1}&before={2}".format(candidate,afterDate,beforeDate)
    response = requests.get(url).json()
    if response['aggs']['subreddit']:
        dict3.update({'date': afterDate, 'candidate': candidate, 'subreddit': 'news', 'doc_count': response['aggs']['subreddit'][0]['doc_count']}) 
        rows_list.append(dict3)
    
mentions = pd.DataFrame(rows_list)

mentions

Unnamed: 0,candidate,date,doc_count,subreddit
0,harris,2019-06-02,9,politics
1,harris,2019-06-02,11,worldnews
2,harris,2019-06-02,2,news
3,buttigieg,2019-06-02,4,politics
4,yang,2019-06-02,1,worldnews
5,yang,2019-06-02,3,news
6,gillibrand,2019-06-02,1,news
7,delaney,2019-06-02,2,politics
8,hickenlooper,2019-06-02,7,politics
9,warren,2019-06-02,9,politics


## Sentiment Analysis

In [29]:
sia = SIA()
results = []

for i in range(0,len(posts)):
    line = posts[i][0]
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    #results.append(pol_score)
    print(pol_score)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0, 'headline': 'We are the majority-makers in Congress. Meet the New Dems! A(us)A'}
{'neg': 0.13, 'neu': 0.648, 'pos': 0.222, 'compound': 0.25, 'headline': 'Conservatives Stunned by Mueller Suggesting Trump Is Not Innocent'}
{'neg': 0.209, 'neu': 0.791, 'pos': 0.0, 'compound': -0.5719, 'headline': "Trump says Russia 'helped me get elected' for first time in furious outburst at Mueller"}
{'neg': 0.0, 'neu': 0.761, 'pos': 0.239, 'compound': 0.296, 'headline': 'Trump Finally Admits Russia Helped Him Get Elected'}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0, 'headline': "Sen. Elizabeth Warren on The View: If Trump weren't president 'he'd be in handcuffs'"}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0, 'headline': 'The USS John McCain Debacle Shows the Whole Country Has to Cater to President* Snowflake'}
{'neg': 0.204, 'neu': 0.796, 'pos': 0.0, 'compound': -0.3182, 'headline': "Fox News's Shep Smith: Mueller statement 'dire

In [None]:
# Save to a csv file if needed for the future

posts.to_csv('reddit_posts.csv', index=False) 