## Reddit Political Community Analyzer

In [1]:
import praw
from psaw import PushshiftAPI
from urllib.parse import urlparse
import tldextract
from bs4 import BeautifulSoup
import re

In [2]:
reddit = praw.Reddit(client_id='JgYmAnPps7utGg',
                     client_secret='g2s1IJiQPnAXSfxua-MFxTSz1xs',
                     user_agent='15388throwaway')
api = PushshiftAPI(reddit)

In [3]:
import datetime as dt

def getTopMonthlyPosts(year, month, subreddit):
    e_month = month + 1
    e_year = year
    if month == 12:
        e_month = 1
        e_year = year + 1

    start_epoch = int(dt.datetime(year, month, 1).timestamp())
    end_epoch = int(dt.datetime(e_year, e_month, 1). timestamp())
    gen = api.search_submissions(before=end_epoch, after=start_epoch,
                                subreddit=subreddit, sort='desc',
                                sort_type='score', limit=50)
    return list(gen)

In [4]:
# helper method to extract the urls from the list
def getUrls(gen):
    urls = []
    for g in gen:
        urls.append(g.url)
    return urls

In [11]:
invalid_domains = ['go']
non_news_domains = ['reddit', 'imgur', 'twitter', 'twimg', 'youtube', 'google', 'redd', 'youtu']

def parseUrl(fullUrl):
    parseObj = tldextract.extract(fullUrl)
    if parseObj.domain in non_news_domains:
        return ""
    elif parseObj.domain in invalid_domains:
        return parseObj.subdomain
    return parseObj.domain

In [12]:
def loadBiases(filename):
    least = open(filename, "r").read()
    soup = BeautifulSoup(least, 'html.parser')
    
    sources = set()
    #websites = set()
    for s in soup.findAll('td', ):
        source = s.string
        if source is not None:
            if source[-1] == ")":
                left_paren = source.find('(')
                sources.add(source[0:left_paren - 1])
                sources.add(parseUrl(source[left_paren + 1:-1]))
            else:
                sources.add(source)
    return sources

# Loading the biases to a dict mapping (source/website to bias ranking)
left = loadBiases("Left.htm")
left_center = loadBiases("Left-Center.htm")
least = loadBiases("Least.htm")
right_center = loadBiases("Right-Center.htm")
right = loadBiases("Right.htm")

biases = {}

biases.update(biases.fromkeys(left, -2))
biases.update(biases.fromkeys(left_center, -1))
biases.update(biases.fromkeys(least, 0))
biases.update(biases.fromkeys(right_center, 1))
biases.update(biases.fromkeys(right, 2))

del biases[""]

addenum_to_biases = {"huffpost" : -2, 
                     "buzzfeednews" : -1, 
                     "breitbart" : 3, 
                     "progressivestoday" : 3, 
                     "frontpagemag" : 3,
                     "dailymail" : 2,
                     "ap" : 0,
                     "sputniknews" : 2,
                     "truthinmedia" : 3,
                     "resistancereport" : -3,
                     "redstate" : 2,
                     "newcenturytimes" : -3,
                     "rare" : 1,
                     "rt" : 1,
                     "americanthinker" : 3,
                     "cnsnews" : 3
                    }

biases.update(addenum_to_biases)

In [14]:
def getMonthlyBias(year, month, subreddit):
    topMonthlyPostsTest = getTopMonthlyPosts(year,month, subreddit)
    topMonthlyPostsUrls = getUrls(topMonthlyPostsTest)
    total = 0
    total_num_parseable = 0

    for url in topMonthlyPostsUrls:
        parsed_url = parseUrl(url)
        total_num_parseable += 1
        if parsed_url != "":
            if parsed_url in biases:
                total += biases.get(parsed_url)
            else:
                total_num_parseable -= 1
                print(parsed_url + ", " + url)
    return (total, total_num_parseable)

def getAllMonthlyBiasesSince(year, subreddit):
    for i in range(2019 - year + 1):
        for month in range(12):
            print(year + i, month + 1, getMonthlyBias(year + i, month + 1, subreddit))

getAllMonthlyBiasesSince(2019, "politics")
#getAllMonthlyBiasesSince(2019, "Conservative")
#getAllMonthlyBiasesSince(2019, "Liberal")
#getAllMonthlyBiasesSince(2019, "news")

2019 1 (-47, 50)
speaker, https://www.speaker.gov/newsroom/21519-3/
2019 2 (-42, 49)
nbcsandiego, https://www.nbcsandiego.com/investigations/Source-Leaked-Documents-Show-the-US-Government-Tracking-Journalists-and-Advocates-Through-a-Secret-Database-506783231.html
2019 3 (-34, 49)
nybooks, https://www.nybooks.com/daily/2019/04/26/mueller-prosecutors-trump-did-obstruct-justice/
mystateline, https://www.mystateline.com/news/illinois-democrats-to-trump-show-tax-returns-or-be-barred-from-2020-ballot/1919139230
2019 4 (-46, 48)
justice, https://www.justice.gov/usao-sdny/pr/bank-ceo-stephen-m-calk-charged-corruptly-soliciting-presidential-administration
2019 5 (-28, 49)
2019 6 (-44, 50)
wnky, https://www.wnky.com/putins-mitch-billboard-grabs-attention-on-interstate-65/
2019 7 (-34, 49)
clarionledger, https://www.clarionledger.com/story/news/politics/2019/08/27/voting-machine-problems-video-changing-vote-bill-waller-tate-reeves-ms-election-governor-runoff/2129515001/?fbclid=IwAR2RFlFrTu--MwFHt