## Reddit Political Community Analyzer

In [38]:
import praw
from psaw import PushshiftAPI
from urllib.parse import urlparse
import tldextract
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PyQt5

In [39]:
import os

# Reddit API credentials are read from the environment when set, so secrets
# do not have to be stored in the notebook.
# NOTE(review): the literal fallbacks are readable by anyone who can see this
# file. They are kept only so existing runs keep working; they should be
# rotated and then removed.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', 'JgYmAnPps7utGg'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET',
                                 'g2s1IJiQPnAXSfxua-MFxTSz1xs'),
    user_agent=os.environ.get('REDDIT_USER_AGENT', '15388throwaway'))
# Pushshift wrapper built on top of the PRAW client.
api = PushshiftAPI(reddit)

In [3]:
import datetime as dt

def getTopMonthlyPosts(year, month, subreddit, limit=50):
    """Return the top-scoring submissions of one calendar month.

    Args:
        year: Calendar year of the month to query.
        month: Calendar month, 1-12.
        subreddit: Subreddit name passed to the Pushshift search.
        limit: Maximum number of submissions to request (default 50).

    Returns:
        A list holding the results of ``api.search_submissions`` for the
        window [first day of the month, first day of the next month),
        requested sorted by score in descending order.

    Raises:
        ValueError: Propagated from ``datetime`` when ``month`` is not a
            valid calendar month.
    """
    # The window ends on the first day of the following month; December
    # rolls over into January of the next year.
    if month == 12:
        end_year, end_month = year + 1, 1
    else:
        end_year, end_month = year, month + 1

    # Use timezone-aware UTC datetimes: a naive datetime's timestamp() is
    # interpreted in the machine's local timezone, which would make the
    # month boundaries depend on where the notebook is run.
    utc = dt.timezone.utc
    start_epoch = int(dt.datetime(year, month, 1, tzinfo=utc).timestamp())
    end_epoch = int(dt.datetime(end_year, end_month, 1, tzinfo=utc).timestamp())

    gen = api.search_submissions(before=end_epoch, after=start_epoch,
                                 subreddit=subreddit, sort='desc',
                                 sort_type='score', limit=limit)
    return list(gen)

In [4]:
# Helper: pull the linked URL out of every submission in the iterable.
def getUrls(gen):
    """Return the ``url`` attribute of each item of ``gen``, in order."""
    return [submission.url for submission in gen]

In [5]:
# Domain names for which the subdomain is used as the site key instead.
invalid_domains = ['go']
# Domain names that parseUrl maps to the empty key (not counted as news).
non_news_domains = ['reddit', 'imgur', 'twitter', 'twimg', 'youtube', 'google', 'redd', 'youtu']


def parseUrl(fullUrl):
    """Reduce a URL to the short site key used for bias lookups.

    Returns "" when the URL's domain is listed in ``non_news_domains``, the
    subdomain when the domain is listed in ``invalid_domains``, and the
    domain itself otherwise (all as reported by ``tldextract.extract``).
    """
    parts = tldextract.extract(fullUrl)
    name = parts.domain
    if name in non_news_domains:
        return ""
    return parts.subdomain if name in invalid_domains else name

In [6]:
def loadBiases(filename):
    """Collect the source names and site keys listed in a saved HTML page.

    Every ``<td>`` cell whose ``.string`` is non-empty is read. A cell of the
    form ``"Name (url)"`` contributes both the name and ``parseUrl(url)``;
    any other cell contributes its text unchanged.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A set of strings. It may contain ``""`` when ``parseUrl`` returns the
        empty key for one of the listed URLs.
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(filename, "r") as html_file:
        soup = BeautifulSoup(html_file.read(), 'html.parser')

    sources = set()
    for cell in soup.find_all('td'):
        source = cell.string
        if not source:
            # .string is None when the cell has no single text child; an
            # empty string carries nothing to record either.
            continue
        left_paren = source.find('(')
        if source.endswith(")") and left_paren != -1:
            # Name part: everything before the "(", minus the one character
            # that separates it from the parenthesis.
            sources.add(source[0:left_paren - 1])
            # URL part: the text between the parentheses, reduced to its key.
            sources.add(parseUrl(source[left_paren + 1:-1]))
        else:
            # No "(url)" suffix (or a stray ")" without "("): keep the text.
            sources.add(source)
    return sources

# Load the sources of each rating bucket from its saved HTML page. Each name
# below is the set returned by loadBiases for that page.
left = loadBiases("Left.htm")
left_center = loadBiases("Left-Center.htm")
least = loadBiases("Least.htm")
right_center = loadBiases("Right-Center.htm")
right = loadBiases("Right.htm")

# Map source name / site key -> bias score (-2 = left ... 2 = right).
# Buckets are applied in this order, so a source present in several pages
# keeps the score of the last bucket applied.
biases = {}

biases.update(dict.fromkeys(left, -2))
biases.update(dict.fromkeys(left_center, -1))
biases.update(dict.fromkeys(least, 0))
biases.update(dict.fromkeys(right_center, 1))
biases.update(dict.fromkeys(right, 2))

# parseUrl returns "" for non-news hosts, so that key may have been collected
# from the pages. pop() with a default removes it without raising KeyError
# when no page happened to produce it.
biases.pop("", None)

# Manual additions, applied last so they override the loaded pages. The
# scores -3 and 3 are only assigned here.
addenum_to_biases = {"huffpost" : -2, 
                     "buzzfeednews" : -1, 
                     "breitbart" : 3, 
                     "progressivestoday" : 3, 
                     "frontpagemag" : 3,
                     "dailymail" : 2,
                     "ap" : 0,
                     "sputniknews" : 2,
                     "truthinmedia" : 3,
                     "resistancereport" : -3,
                     "redstate" : 2,
                     "newcenturytimes" : -3,
                     "rare" : 1,
                     "rt" : 1,
                     "americanthinker" : 3,
                     "cnsnews" : 3
                    }

biases.update(addenum_to_biases)

In [74]:
def getMonthlyBias(year, month, subreddit):
    """Summarise the bias scores of one month's top posts in a subreddit.

    Returns a tuple ``(total, count, breakdown, sources)`` where ``total`` is
    the sum of the bias scores of all posts whose site key is in ``biases``,
    ``count`` is the number of posts counted, ``breakdown`` maps each score
    from -3 to 3 to the number of posts with that score, and ``sources``
    lists the site keys of the scored posts in order.

    NOTE(review): posts for which ``parseUrl`` returns "" are included in
    ``count`` although they add nothing to ``total``; only posts with a
    non-empty key missing from ``biases`` are excluded. Confirm this is
    intended.
    """
    posts = getTopMonthlyPosts(year, month, subreddit)

    breakdown = {score: 0 for score in (-3, -2, -1, 0, 1, 2, 3)}
    bias_sum = 0
    counted = 0
    rated_sources = []

    for url in getUrls(posts):
        source = parseUrl(url)
        if source == "":
            # Empty key: counted, but carries no score.
            counted += 1
            continue
        if source not in biases:
            # Unrated site: left out of the count entirely.
            continue
        score = biases.get(source)
        bias_sum += score
        breakdown[score] += 1
        rated_sources.append(source)
        counted += 1

    return (bias_sum, counted, breakdown, rated_sources)

def getAllMonthlyBiasesSince(year, subreddit, end_year=2019):
    """Run ``getMonthlyBias`` for every month from ``year`` to ``end_year``.

    Args:
        year: First calendar year to include.
        subreddit: Subreddit name passed on to ``getMonthlyBias``.
        end_year: Last calendar year to include (default 2019). All twelve
            months of every year in the range are queried.

    Returns:
        A tuple ``(years, months, totals, parseables, breakdowns,
        out_sources_overall)``. The first five are parallel lists with one
        entry per month; the last is the concatenation of every month's
        list of scored sources.
    """
    years, months, totals, parseables, breakdowns = [], [], [], [], []
    out_sources_overall = []
    for current_year in range(year, end_year + 1):
        for month in range(1, 13):
            total, parseable, breakdown, out_s = getMonthlyBias(
                current_year, month, subreddit)
            years.append(current_year)
            months.append(month)
            totals.append(total)
            parseables.append(parseable)
            breakdowns.append(breakdown)
            out_sources_overall.extend(out_s)
    return (years, months, totals, parseables, breakdowns, out_sources_overall)

# Collect the monthly statistics starting in 2016 for each subreddit.
# The year/month lists are kept only from the first call; the later calls
# discard them into `_`.
(y_politics, m_politics, tot_politics,
 num_politics, b_politics, os_politics) = getAllMonthlyBiasesSince(2016, "politics")

(_, _, tot_Conservative,
 num_Conservative, b_Conservative, os_Conservative) = getAllMonthlyBiasesSince(2016, "Conservative")

(_, _, tot_Liberal,
 num_Liberal, b_Liberal, os_Liberal) = getAllMonthlyBiasesSince(2016, "Liberal")

(_, _, tot_moderatepolitics,
 num_moderatepolitics, b_moderatepolitics, os_moderatepolitics) = getAllMonthlyBiasesSince(2016, "moderatepolitics")

(_, _, tot_news,
 num_news, b_news, os_news) = getAllMonthlyBiasesSince(2016, "news")

In [75]:
# "YYYY-MM" label for every sampled month (month zero-padded to two digits).
year_month = ["{}-{:02d}".format(y, m) for y, m in zip(y_politics, m_politics)]

dates = np.array(year_month, dtype='datetime64')


def _mean_bias(totals, counts):
    """Return totals / counts element-wise, with NaN where the count is 0.

    Months with no counted posts would otherwise trigger a 0/0 division and
    a RuntimeWarning; the NaN is written explicitly instead.
    """
    totals = np.asarray(totals, dtype=float)
    counts = np.asarray(counts, dtype=float)
    return np.divide(totals, counts,
                     out=np.full(totals.shape, np.nan),
                     where=counts != 0)


# Average bias score per month for each subreddit.
politics_score = _mean_bias(tot_politics, num_politics)
conservative_score = _mean_bias(tot_Conservative, num_Conservative)
liberal_score = _mean_bias(tot_Liberal, num_Liberal)
moderatepolitics_score = _mean_bias(tot_moderatepolitics, num_moderatepolitics)
news_score = _mean_bias(tot_news, num_news)

# Sanity check: every series must have one value per date.
print(len(politics_score))
print(len(conservative_score))
print(len(liberal_score))
print(len(moderatepolitics_score))
print(len(news_score))
print(len(dates))

df_all_subreddits = pd.DataFrame({ 'dates' : dates, 'politics' : politics_score, 'conservative' : conservative_score,
                            'liberal' : liberal_score, 'moderatepolitics' : moderatepolitics_score,
                            'news' : news_score})
#%matplotlib 
fig=plt.figure(figsize=(18, 16), dpi= 160, facecolor='w', edgecolor='k')

# One line per subreddit; the column name doubles as the legend label.
plt.plot('dates', 'politics', data=df_all_subreddits)
plt.plot('dates', 'conservative', data=df_all_subreddits)
plt.plot('dates', 'liberal', data=df_all_subreddits)
plt.plot('dates', 'moderatepolitics', data=df_all_subreddits)
plt.plot('dates', 'news', data=df_all_subreddits)
plt.legend()

48
48
48
48
48
48


<matplotlib.legend.Legend at 0x145715358>

In [72]:
pol_test = np.ndarray((48, 7), int)

%matplotlib qt

for index in range(len(b_politics)):
    for i in range(7):
        pol_test[index][i] = (b_politics[index])[i - 3]

pol_test = pol_test.T
#plt.plot(dates, pol_test[0], label="very left")
plt.plot(dates, pol_test[1], label="left")
plt.plot(dates, pol_test[2], label="left-center")
plt.plot(dates, pol_test[3], label="neutral")
plt.plot(dates, pol_test[4], label="right-center")
plt.plot(dates, pol_test[5], label="right")
#plt.plot(dates, pol_test[6], label="very right")
plt.legend()

<matplotlib.legend.Legend at 0x1163ba160>

In [107]:
from collections import Counter

# The ten most frequently scored sources in r/Conservative, as {source: count}.
top_ten = Counter(os_Conservative).most_common(10)
w = dict(top_ten)
print(w)

# Bar chart of those counts; x labels are rotated so the names do not overlap.
plt.xticks(rotation=60)
plt.bar(*zip(*w.items()))
#plt.bar(range(len(w.keys())), w.values, align='center')
#plt.xticks(range(len(w.keys()), list(w.keys())))
plt.tight_layout()
plt.show()

# Full rankings for the other collections:
#Counter(os_Conservative).most_common()
#Counter(os_Liberal).most_common()
#Counter(os_moderatepolitics).most_common()
#Counter(os_news).most_common()

{'dailywire': 87, 'foxnews': 74, 'breitbart': 55, 'dailycaller': 43, 'townhall': 27, 'thehill': 27, 'nypost': 26, 'washingtonexaminer': 22, 'redstate': 20, 'washingtontimes': 19}
