Analyze state of the union addresses. 
Data source: https://en.wikisource.org/wiki/Portal:State_of_the_Union_Speeches_by_United_States_Presidents

Find similar or dissimilar speeches using word embeddings

https://nlp.stanford.edu/IR-book/html/htmledition/sublinear-tf-scaling-1.html 
https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html 

## Setup

In [14]:
from bs4 import BeautifulSoup
import requests
import string
import pandas as pd
import urllib.request
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import time
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

# Data source we are going to scrape for results:
# the Wikisource portal page that links to the individual speeches
data_url = 'https://en.wikisource.org/wiki/Portal:State_of_the_Union_Speeches_by_United_States_Presidents'

# hrefs of the individual speech pages; filled in by the web scraping cell
link_list = []

# Extract the text of a speech from a URL.
# The text is returned as a list of paragraphs (strings), one list per speech.
def get_speech(url):
    """Download the page at `url` and return its paragraphs as a list of strings.

    The text of every <p> element is collected with surrounding whitespace
    stripped. Paragraphs containing the public-domain licence notice are
    dropped, since they are not part of the speech.
    """
    licence_notice = ('This work is in the public domain in the United States '
                      'because it is a work of the United States federal government')
    # Close the HTTP response as soon as the page has been parsed
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response)
    # Strip each paragraph once, then filter on the stripped text
    paragraphs = (p.text.strip() for p in soup.find_all("p"))
    return [text for text in paragraphs if licence_notice not in text]

# Frequency table of the distinct values (or value combinations) of the
# column(s) listed in 'groupbyvars'.
# Returns a pandas dataframe with the counts in column 'n', largest count first.
def tidy_count(df, groupbyvars):
    group_sizes = df.groupby(groupbyvars).size()
    # size() yields an unnamed Series, so its values land in a column labelled 0
    counts = group_sizes.reset_index().rename(columns={0: "n"})
    ranked = counts.sort_values('n', ascending=False)
    return ranked.reset_index(drop=True)

## Web Scraping

In [4]:
# Fetch the portal page; the response is closed once the HTML has been parsed
with urllib.request.urlopen(data_url) as resp:
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))


def _is_speech_link(href):
    """Return True if `href` looks like a link to an individual speech page."""
    href = href.lower()
    # The parentheses matter: `and` binds tighter than `or`, so without them the
    # "portal" and "#" exclusions would only apply to the "union_speech" case.
    return (("union_address" in href or "union_speech" in href)
            and "portal" not in href and "#" not in href)


# Get all links to state of the union addresses from the portal page.
# The list is rebuilt (not appended to) so re-running this cell cannot add duplicates.
link_list = [link['href'] for link in soup.find_all('a', href=True)
             if _is_speech_link(link['href'])]

# Note that I am storing these speeches as lists of paragraphs (strings) for readability
speeches = [get_speech('https://en.wikisource.org' + link) for link in link_list]
# Extract presidents names from link text: the third '/'-separated piece of the href,
# with underscores turned into spaces. '%' is treated as a separator as well,
# presumably so that a percent-encoded possessive ends the name - TODO confirm.
presidents = [ link.replace('%','/').split('/')[2].replace('_',' ') for link in link_list ]

# Extract state of the union text entries so we can extract the date:
# list items that mention 'union' and contain an opening parenthesis
sou_entries = []
for item in soup.find_all('li'):
    entry = item.text.strip()
    if 'union' in entry.lower() and '(' in entry:
        sou_entries.append(entry)

# The year is read as the second run of digits in each entry (raw string keeps
# the regex escape valid).
# NOTE(review): presidents/speeches come from the links while years come from
# the list items; this assumes both sequences have the same order and length.
speeches_pd = pd.DataFrame({
                'president' : presidents,
                'speech' : speeches,
                'year' : [int(re.findall(r'\d+', item)[1]) for item in sou_entries ]} )

In [5]:
# Number of rows in the speeches table (one row per scraped speech)
len(speeches_pd)

232

In [7]:
#speeches_pd['speech_num'] = speeches_pd.index # for joining
# Preview five rows; the fixed random_state makes the sample reproducible
speeches_pd.sample(n=5,random_state=42)

Unnamed: 0,president,speech,year
219,George W. Bush,"[Thank you very much. And tonight, I have a hi...",2007
66,Franklin Pierce,[Fellow-Citizens of the Senate and of the Hous...,1855
9,John Adams,[Gentlemen of the Senate and Gentlemen of the ...,1798
170,Dwight D. Eisenhower,"[Mr. President, Mr. Speaker, Members of the 86...",1959
15,Thomas Jefferson,[To the Senate and House of Representatives of...,1804


## Preprocessing

Clean text (remove stop words, convert to lower case, remove non-alphabetic content)

In [11]:
# Load the large English model without the parser and the entity recogniser;
# leaving those components out makes the pipeline run faster.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

## Workaround for the stopwords bug in the en_core_web_lg model:
## flag every default stop word in the vocabulary.
## Only needed if you are using the .is_stop spacy attribute.
for stop_word in nlp.Defaults.stop_words:
    nlp.vocab[stop_word].is_stop = True

# Cleans a string by tokenizing with spacy, removing trailing and leading non-alphabetic characters,
# and then removing stop words and tokens that have no alphabetic characters

# Characters trimmed from both ends of every token; built once rather than per token
_TOKEN_EDGE_CHARS = string.punctuation + string.digits + string.whitespace


def spacy_preprocess(text):
    """Return `text` lower-cased and reduced to its cleaned tokens, joined by spaces.

    Each token has punctuation, digits and whitespace trimmed from both ends
    (characters inside the token are left alone). A trimmed token is kept only
    if it is longer than one character, is not one of spacy's default stop
    words and contains at least one alphabetic character.
    """
    stop_words = nlp.Defaults.stop_words
    kept_tokens = []
    # Only the token texts are used, so run the tokenizer alone (make_doc)
    # instead of the whole pipeline.
    for token in nlp.make_doc(text.lower()):
        # remove trailing or leading non-alphabetic text
        cleaned_token = token.text.strip(_TOKEN_EDGE_CHARS)

        # skip stop words and tokens with no alphabetic content
        if len(cleaned_token) <= 1 or cleaned_token in stop_words:
            continue
        if any(c.isalpha() for c in cleaned_token):
            kept_tokens.append(cleaned_token)
    return " ".join(kept_tokens)

# Deprecated - Preprocess pipeline with spacy. 
# def spacy_preprocess(text): 
#     text_out = []
#     for token in nlp(text.lower()):
#         # Get rid of stop words and non-alphanumeric
#         if not token.is_stop and token.is_alpha and len(token)>1:
#                 text_out.append(token.text)            
#     return(nlp(" ".join(text_out)))

In [6]:
# Print stop words
#print(nlp.Defaults.stop_words)

In [19]:
# Print the is_stop flag of each token after the workaround.
# In the output recorded below, lowercase 'the' is flagged but capitalised 'The' is not.
for token in nlp('The the weather'):
    print(token.is_stop)

False
True
False


In [20]:
# Test spacy preprocessing on a sentence with mixed case, stop words,
# a bare number and a token that ends in digits
spacy_preprocess('The dog ran into Bob beCause he saw 234 squirrels under VAU15')

'dog ran bob saw squirrels vau'

In [21]:
# Second test: tokens wrapped in punctuation and symbols.
# The literal is not a raw string, so the \r inside "te\rminal" is a carriage
# return; the printed result below shows that word split in two.
test_sentence = """d3.js  Programming ### experience$ /battleship# I\ 
    * also program #and in -python- - sometimes## te\rminal// """

print(spacy_preprocess(test_sentence))

d3.js programming experience battleship program python te minal


## Vectorize Speeches

Use spacy's inbuilt embedding model to vectorize our speeches 

In [22]:
# Every entry of speeches_pd['speech'] is one speech held as a list of
# paragraph strings; join the paragraphs so each speech is a single string.
speech_list = [" ".join(paragraphs) for paragraphs in speeches_pd['speech']]

## Clean and vectorize the speeches, timing both steps together
t0 = time.time()

# run the text cleaning over every speech
speeches_cleaned = list(map(spacy_preprocess, speech_list))

# one spacy document vector per cleaned speech, collected into a numpy array
speeches_embed = np.array([nlp(cleaned).vector for cleaned in speeches_cleaned])

elapsed = time.time() - t0
print(f'Preprocessing time elapsed: {elapsed}')

Preprocessing time elapsed: 107.23779678344727


Do a k-nearest neighbors search to find similar speeches

In [23]:
# Number of neighbours to retrieve for each speech.
# Using the number of embedded speeches means every speech in the dataset is returned.
k_search_dist = speeches_embed.shape[0]

In [24]:
t0 = time.time()

# Optional scaling of the document vectors (left disabled; maybe not necessary?)
#speechvec_scaler = StandardScaler() # initialize scaler
#speeches_scaled = speechvec_scaler.fit_transform(speeches_embed)

kn_model = NearestNeighbors().fit(speeches_embed)

# Retrieve k_search_dist neighbours for every speech.
# The queries are the same vectors the model was fitted on, so each speech is
# also returned as a neighbour of itself; those rows are filtered out later.
dist_speeches, sim_speeches = kn_model.kneighbors(speeches_embed, n_neighbors=k_search_dist)

print(f'k-nearest search time elapsed: {time.time() - t0}')

k-nearest search time elapsed: 0.037275075912475586


In [25]:
# Store the numpy arrays in pandas
def _with_speech_num(neighbor_array):
    # Wrap the array in a DataFrame and put the speech index in a leading 'speech_num' column
    frame = pd.DataFrame(neighbor_array)
    frame.insert(loc=0, column='speech_num', value=speeches_pd.index)
    return frame

dist_speeches_pd = _with_speech_num(dist_speeches)

sim_speeches_pd = _with_speech_num(sim_speeches)

In [26]:
# Reshape both tables from wide (one column per neighbour rank) to long
# (one row per speech_num / rank combination)
rank_columns = list(range(k_search_dist))

dist_matrix = dist_speeches_pd.melt(id_vars=['speech_num'], value_vars=rank_columns)
dist_matrix = dist_matrix.rename(columns={'variable': 'rank', 'value': 'distance'})

sim_matrix = sim_speeches_pd.melt(id_vars=['speech_num'], value_vars=rank_columns)
sim_matrix = sim_matrix.rename(columns={'variable': 'rank', 'value': 'speech_num_match'})

Show the most 'similar' state of the union speeches according to spacy document embeddings

In [27]:
# Keep each pair of speeches once and drop rows that match a speech to itself:
# requiring speech_num < speech_num_match does both, because a strict
# inequality already rules out equal indices.
is_forward_pair = sim_matrix['speech_num'] < sim_matrix['speech_num_match']
speaker_info = speeches_pd[['president', 'year']]

# attach the distance of every remaining (speech, rank) row
simdist_matrix = sim_matrix[is_forward_pair].merge(dist_matrix, on=['speech_num', 'rank'])
# president / year of the speech itself ...
simdist_matrix = simdist_matrix.merge(speaker_info, left_on='speech_num', right_index=True)
# ... and of its match, in columns suffixed with '_match'
simdist_matrix = simdist_matrix.merge(speaker_info, left_on='speech_num_match', right_index=True,
                                      suffixes=['', '_match'])
# closest pairs first
simdist_matrix = simdist_matrix.sort_values('distance')

The most similar speeches

In [29]:
# First five rows; the table is sorted by ascending distance, so these are the closest pairs
simdist_matrix.head(5)

Unnamed: 0,speech_num,rank,speech_num_match,distance,president,year,president_match,year_match
57,97,1,99,0.144875,Grover Cleveland,1886,Grover Cleveland,1888
40,66,1,67,0.160682,Franklin Pierce,1855,Franklin Pierce,1856
56,96,1,97,0.162154,Grover Cleveland,1885,Grover Cleveland,1886
90,166,1,167,0.164151,Dwight D. Eisenhower,1955,Dwight D. Eisenhower,1956
54,92,1,96,0.174653,Chester A. Arthur,1881,Grover Cleveland,1885


Now let's eliminate cases where the president is the same 

In [30]:
# Keep only pairs whose two speeches were given by different presidents, then show the first five
different_president = simdist_matrix['president'] != simdist_matrix['president_match']
simdist_matrix[different_president].head(5)

Unnamed: 0,speech_num,rank,speech_num_match,distance,president,year,president_match,year_match
54,92,1,96,0.174653,Chester A. Arthur,1881,Grover Cleveland,1885
185,92,2,97,0.182346,Chester A. Arthur,1881,Grover Cleveland,1886
302,92,3,99,0.183339,Chester A. Arthur,1881,Grover Cleveland,1888
53,91,1,100,0.184407,Rutherford B. Hayes,1880,Benjamin Harrison,1889
433,96,4,100,0.18822,Grover Cleveland,1885,Benjamin Harrison,1889


Most similar speeches to speeches occurring since 1950 (matches by the same president are excluded)

In [33]:
# Pairs by different presidents whose first speech (column 'year') dates from 1950 or later;
# the year of the matched speech is not restricted. The index columns are hidden for display.
different_president = simdist_matrix['president'] != simdist_matrix['president_match']
since_1950 = simdist_matrix['year'] >= 1950
simdist_matrix[different_president & since_1950].head(10).drop(columns=['speech_num', 'speech_num_match'])

Unnamed: 0,rank,distance,president,year,president_match,year_match
103,1,0.24136,Jimmy Carter,1978,Ronald Reagan,1983
102,1,0.268269,Gerald Ford,1976,Ronald Reagan,1982
109,1,0.283236,George Herbert Walker Bush,1991,George W. Bush,2006
229,2,0.295865,Gerald Ford,1976,Jimmy Carter,1978
334,3,0.296978,Dwight D. Eisenhower,1959,John F. Kennedy,1961
345,3,0.3006,Gerald Ford,1976,Ronald Reagan,1983
490,4,0.303192,Bill Clinton,1998,Barack Obama,2013
230,2,0.308841,Ronald Reagan,1984,George Herbert Walker Bush,1989
335,3,0.311126,Dwight D. Eisenhower,1960,John F. Kennedy,1961
749,6,0.311279,Bill Clinton,1998,Barack Obama,2015


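Most dissimilar speech pairs among speeches since 1900. Note that Franklin Delano Roosevelt's 1942 address appears in four of the five most distant pairs in the output below, and Harry S. Truman's 1946 address appears in two.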

In [34]:
# Restrict to pairs where both speeches date from 1900 or later, then list the
# ten largest distances with the index columns hidden for display
both_since_1900 = (simdist_matrix['year'] >= 1900) & (simdist_matrix['year_match'] >= 1900)
(simdist_matrix[both_since_1900]
    .sort_values('distance', ascending=False)
    .head(10)
    .drop(columns=['speech_num', 'speech_num_match']))

Unnamed: 0,rank,distance,president,year,president_match,year_match
26786,231,1.400909,Herbert Hoover,1931,Franklin Delano Roosevelt,1942
26771,231,1.385296,Woodrow Wilson,1916,Franklin Delano Roosevelt,1942
26772,231,1.38319,Woodrow Wilson,1917,Harry S. Truman,1946
26484,229,1.376139,Franklin Delano Roosevelt,1942,Harry S. Truman,1946
26785,231,1.374885,Herbert Hoover,1930,Franklin Delano Roosevelt,1942
26635,230,1.354937,Herbert Hoover,1931,Donald Trump,2019
26621,230,1.354232,Woodrow Wilson,1916,Donald Trump,2018
26463,229,1.352068,Woodrow Wilson,1916,Donald Trump,2019
26478,229,1.343565,Herbert Hoover,1931,Donald Trump,2018
26309,228,1.330011,Woodrow Wilson,1916,Barack Obama,2016
