In [None]:
%%html
<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at 'In [12]'.</span>

# Creating a Reddit tracker

This notebook details:  
1. Setting up a Reddit API wrapper and querying Reddit
2. Extracting names from Reddit submission titles with SpaCy
3. Saving the results as a dataframe
4. Counting the top mentioned names and plotting them with Plotly
5. Saving the outputs

## Setup

In [1]:
# Setup
import json
import os
import pprint
import re
import time
from collections import Counter
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import praw
import spacy
from spacy import displacy

%matplotlib inline

In [2]:
from platform import python_version

# Report the library and interpreter versions this notebook was run with.
print(*(f'{lib.__name__} = {lib.__version__}' for lib in (praw, pd, spacy)), sep='\n')
print(f'python version = {python_version()}')

praw = 6.5.1
pandas = 1.0.3
spacy = 2.2.4
python version = 3.7.7


# Set up the Python Reddit API Wrapper (PRAW)
You need a user account - it's fine to create a brand new one.

PRAW is a really useful library for querying Reddit's API from Python.  
Check the documentation and follow the steps to generate your app client and find your client ID and client Secret  
https://praw.readthedocs.io/en/latest/getting_started/quick_start.html

## Save Secrets with treebeard

I want to save my reddit credentials separately from my code and this notebook.  
Create a JSON file with a dictionary of the reddit app secrets, and then upload with `treebeard secrets push secrets.json`  
Ignore the file in `.gitignore` and `treebeard.yaml` so it does not get stored with the code

In [3]:
# Don't save your creds in code! These placeholder values only show the layout of the JSON secrets file.
my_creds = dict(
    username='my_username',
    password='my_password',
    app_client_id='app_client_ID',
    app_client_secret='app_client_secret',
)

In [4]:
# Serialise the example credentials straight into the JSON file.
with open('dummy_secrets.json', 'w') as secrets_file:
    json.dump(my_creds, secrets_file)

In [5]:
!cat dummy_secrets.json

{"username": "my_username", "password": "my_password", "app_client_id": "app_client_ID", "app_client_secret": "app_client_secret"}

In [6]:
!treebeard secrets push dummy_secrets.json

[0m🌲 Pushing Secrets for project 63db2b28e1
[0m

[0m  Including dummy_secrets.json
[0m  Including secrets.json
[0m

🔐  done!
[0m

[0m

This ensures secrets are available across any cloud project.

In [7]:
# pushing my real credentials file
!treebeard secrets push secrets.json

[0m🌲 Pushing Secrets for project 63db2b28e1
[0m

[0m  Including secrets.json
[0m  Including secrets.json
[0m

🔐  done!
[0m

[0m

In [8]:
# Parse the real credentials file into the `secrets` dictionary.
with open('secrets.json', 'r') as secrets_file:
    secrets = json.load(secrets_file)

In [9]:
# Reddit credentials, taken from the loaded secrets.
# Add your own by following the steps in the quick start link above.
username, password = secrets['username'], secrets['password']
app_client_id, app_client_secret = secrets['app_client_id'], secrets['app_client_secret']
user_agent = "script:my_app:v0.1 (by u/laurence_treebeard)"

In [10]:
# Build the PRAW client from the app's client ID, client secret and user agent.
# Note that `username` and `password` are not passed to the constructor here.
reddit = praw.Reddit(client_id=app_client_id,
                     client_secret=app_client_secret,
                     user_agent=user_agent)

In [11]:
# Check we have a reddit read_only instance
# (the client was created from the app ID/secret and user agent only, without a username or password)
print(reddit.read_only)  # Output: True

True


In [12]:
# Check we can query reddit by printing the titles of 5 'hot' submissions
# options: controversial, gilded, hot, new, rising, top
# NOTE(review): the saved run of this cell failed with
# "ResponseException: received 401 HTTP response" - presumably Reddit rejected the
# client ID / client secret; verify the values in secrets.json before re-running.
for submission in reddit.subreddit('learnpython').hot(limit=5):
    print(submission.title)

ResponseException: received 401 HTTP response

You've now queried the Reddit API!

# Using SpaCy

SpaCy can recognise a range of named entities:  
https://spacy.io/api/annotation#named-entities  
- PERSON	People, including fictional.
- NORP	Nationalities or religious or political groups.
- FAC	Buildings, airports, highways, bridges, etc.
- ORG	Companies, agencies, institutions, etc.
- GPE	Countries, cities, states.
- LOC	Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT	Objects, vehicles, foods, etc. (Not services.)
- EVENT	Named hurricanes, battles, wars, sports events, etc.
- WORK_OF_ART	Titles of books, songs, etc.
- LAW	Named documents made into laws.
- LANGUAGE	Any named language.
- DATE	Absolute or relative dates or periods.
- TIME	Times smaller than a day.
- PERCENT	Percentage, including “%”.
- MONEY	Monetary values, including unit.
- QUANTITY	Measurements, as of weight or distance.
- ORDINAL	“first”, “second”, etc.
- CARDINAL	Numerals that do not fall under another type.

In [None]:
# Load SpaCy's text model; the resulting `nlp` object is reused by the entity-extraction cells below.
# NOTE(review): presumably the `en_core_web_sm` model package must already be installed
# in this environment - confirm before running on a fresh machine.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Test SpaCy and render the output - shiny!
# The sentence is parsed with the loaded model, then displaCy renders it with style="ent".
doc = nlp("Apple is looking at buying a U.K. startup for $1 billion")
displacy.render(doc, style="ent")

In [None]:
# Turn this into a helpful function for later
def spacy_extract(df, label = 'PERSON'):
    '''
    Extract the named entities of one type from every submission title.

    df: pandas DataFrame with a 'title' column of strings
    label: string, SpaCy entity label to keep, e.g. 'PERSON' or 'ORG'

    Returns a list with one inner list per row of df, in row order.
    Each inner list holds the text of every entity found in that title
    whose label equals `label`; it is empty when nothing matched.
    '''
    # Strip possessive endings so "LeBron's" is counted as "LeBron".
    # Both the straight (') and typographic (’) apostrophe are handled, and the
    # word boundary stops an opening quote followed by "s" ('some) being eaten.
    possessive = re.compile(r"['’]s\b")
    cleaned_titles = [possessive.sub("", title) for title in df['title']]

    # nlp.pipe streams the texts through the model in batches and yields the
    # docs in input order, which is faster than calling nlp() once per title.
    return [[ent.text for ent in doc.ents if ent.label_ == label]
            for doc in nlp.pipe(cleaned_titles)]

# Querying Reddit
Input types:  
- subreddit
- new or hot or top
- entity types: person, organisation, product, etc.

In [None]:
# Try out a query: up to 100 of the week's top submissions from r/nba.
# Swapping .top for .new also means dropping time_filter, because .new always returns the most recent posts.
df = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.url]
        for post in reddit.subreddit('nba').top(limit=100, time_filter='week')
    ],
    columns=['title', 'score', 'id', 'url'],
)
df.head()

In [None]:
# Test SpaCy on Reddit submission titles
# Only the first 10 titles are parsed; each one is rendered separately with style="ent".
for x in df['title'][:10]:
    doc = nlp(x)
    displacy.render(doc, style="ent")

## Create functions

In [None]:
def plot(df, top_n=20):
    '''
    Plot the leading rows of a name-count table as a horizontal bar chart.

    df: pandas dataframe with two columns, 'Names' and 'Count';
        get_subreddit passes it sorted with the most mentioned name first
    top_n: int, number of leading rows to include in the chart (default 20)

    Returns the plotly figure; nothing is displayed or written here.
    '''
    # The leading rows are reversed before plotting - presumably so that the
    # most mentioned name ends up as the top bar of the horizontal chart.
    # drop=True stops the old row index being added as an extra column.
    top_rows = df.head(top_n).iloc[::-1].reset_index(drop=True)
    return px.bar(top_rows, x="Count", y="Names", orientation='h')

In [None]:
def get_subreddit(subreddit='news', look_for='PERSON', sort='top', limit=100, time_filter='month'):
    '''
    Query a subreddit, extract named entities from the submission titles
    and count how often each one is mentioned.

    subreddit: string, name of the subreddit to query
    look_for: string, SpaCy entity label to extract, e.g. 'PERSON'
    sort: 'top', 'hot' or 'new'; any other value is treated as 'top'
    limit: int, number of submissions to request, at most 1000
    time_filter: one of 'hour', 'day', 'week', 'month', 'year', 'all';
        only used when sorting by 'top'

    Returns (df, top, chart):
        df: one row per submission with columns 'title', 'score', 'id',
            'url', 'datetime' (creation time in UTC) and 'data' (the
            entities found in the title)
        top: dataframe with columns 'Names' and 'Count', most mentioned first
        chart: the figure returned by plot(top)
    Returns None, after printing a message, if limit is above 1000.

    Raises ValueError if sorting by 'top' with an unknown time_filter.
    '''
    if limit > 1000:
        print('Limit should be less than or equal to 1000')
        return

    time_filters = ['hour', 'day', 'week', 'month', 'year', 'all']
    uses_time_filter = sort not in ('hot', 'new')
    if time_filter not in time_filters:
        message = f'Incorrect time filter. Expecting one of {time_filters}'
        if uses_time_filter:
            # Stop before any request is made instead of printing and carrying on.
            raise ValueError(message)
        print(message)  # only a warning here: hot/new do not use the time filter

    listing = reddit.subreddit(subreddit)
    if sort == 'hot':
        # PRAW's hot() listing takes no time_filter argument (top() does),
        # so forwarding it made every 'hot' query fail with a TypeError.
        submissions = listing.hot(limit=limit)
    elif sort == 'new':
        submissions = listing.new(limit=limit)
    else:
        submissions = listing.top(limit=limit, time_filter=time_filter)

    columns = ['title', 'score', 'id', 'url', 'datetime']
    rows = [[x.title, x.score, x.id, x.url, datetime.fromtimestamp(x.created_utc, timezone.utc)]
            for x in submissions]
    df = pd.DataFrame(rows, columns=columns)

    print(f'{len(df)} submissions found')
    print(f'Extracting {look_for}s')
    df['data'] = spacy_extract(df, label=look_for)

    # Count every extracted entity across all titles, most common first.
    # All counts are returned; plot() decides how many rows to draw.
    flat_list = [item for sublist in df['data'] for item in sublist]
    top = pd.DataFrame(Counter(flat_list).most_common(), columns=['Names', 'Count'])

    chart = plot(top) # plot chart
    return df, top, chart

In [None]:
# %%timeit -n 1 -r 1
# Run the whole pipeline for r/nba (up to 999 of the week's top submissions) and display the chart.
df, top, chart = get_subreddit(subreddit='nba', look_for='PERSON', sort='top', limit=999, time_filter='week')
chart

In [None]:
# Subreddits to run the name tracker over.
subreddits_to_query = [
    "soccer",
    "baseball",
    "hockey",
    "mma",
    "running",
    "snowboarding",
    "climbing",
    "nba",
    "nfl",
    "politics",
    "casualuk",
    "news",
]

In [None]:
# Ensure there is a local directory to save the charts in.
# exist_ok=True makes the cell safe to re-run and avoids the gap between
# checking for the directory and creating it.
os.makedirs('output', exist_ok=True)

In [None]:
# Save one chart per subreddit as an HTML file in the local output/ directory.
# NOTE: df, top and chart are rebound on every pass, so after the loop they
# hold the results for the last subreddit in the list.
for subreddit in subreddits_to_query:
    df, top, chart = get_subreddit(subreddit=subreddit, look_for='PERSON', sort='top', limit=999, time_filter='week')
    name = f"{subreddit}.html"
    chart.write_html(f'output/{name}') # write the chart to output/<subreddit>.html