## Import Libraries

In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as ss

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize import ToktokTokenizer
import pickle
import spacy
import nltk

In [4]:
## Defining Functions
# Fetch the NLTK stop-word corpus; raise_on_error=True asks NLTK to raise
# instead of silently continuing when the download fails.
nltk.download('stopwords', quiet=True, raise_on_error=True)

# Read the English stop words once and derive both module-level views from them.
english_stop_words = nltk.corpus.stopwords.words('english')
stopword_list = set(english_stop_words)  # set: used for membership tests in remove_stopwords
tokenized_stop_words = nltk.word_tokenize(' '.join(english_stop_words))

In [5]:
# Module-level Toktok tokenizer instance; remove_stopwords() reads it as a global.
tokenizer = ToktokTokenizer()

def remove_stopwords(text):
    """Strip English stop words from every document in ``text``.

    Each document is tokenized with the module-level ``tokenizer``; tokens are
    whitespace-stripped and dropped when their lowercase form is a member of
    the module-level ``stopword_list``.  Surviving tokens are re-joined with
    single spaces.

    Parameters
    ----------
    text : iterable of str
        Documents to filter (for example a pandas Series of sentences).

    Returns
    -------
    pandas.Series
        One filtered string per input document, in input order, carrying a
        fresh default index rather than the index of ``text``.
    """
    def _filter_document(document):
        # Tokenize, trim each token, then keep only the non-stop-words.
        trimmed = (tok.strip() for tok in tokenizer.tokenize(document))
        return ' '.join(tok for tok in trimmed if tok.lower() not in stopword_list)

    return pd.Series([_filter_document(document) for document in text])

In [6]:
def lemma(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    text : iterable of str
        Documents to lemmatize (for example a pandas Series of sentences).
    allowed_postags : collection of str, optional
        Coarse-grained spaCy POS tags (``token.pos_``) to keep.  Tokens with
        any other tag are dropped.  The default is an immutable tuple so the
        default value cannot be mutated between calls.

    Returns
    -------
    pandas.Series
        One string per input document, in input order, made of the
        space-joined lemmas of the kept tokens, with a fresh default index.
    """
    # Load the small English pipeline with the parser and NER components
    # disabled; every other component of the pipeline stays active.
    # NOTE: the model is loaded on every call, so call this once per corpus.
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # nlp.pipe processes the documents as a stream in batches, which avoids the
    # per-call overhead of invoking nlp() on each document separately.
    all_addresses = [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(text)
    ]
    return pd.Series(all_addresses)

## Import Data

In [7]:
# Load Pickled Dataframe from Cleaning Notebook into a DataFrame
# NOTE(review): absolute, machine-specific path; consider making the data
# directory configurable so the notebook runs on other machines.
path = r"C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Pickled_Files\cleaned_addresses.pkl"

# Open the file in a context manager so the handle is closed as soon as the
# object has been read (the bare open() call left it open).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(path, 'rb') as pickle_file:
    df = pickle.load(pickle_file)
df.head(1)

Unnamed: 0,president_number,term,pres_name,pres_det,president_x,address,party,Year,time_period
0,1,1,Washington,1 Washington,01.Washington.1.txt,AMONG the vicissitudes incident to life no eve...,Nonpartisan,1789,pre-1800


## Define Stopwords

In [8]:
# Add additional stop words
# Corpus-specific words to ignore on top of NLTK's English list
# (the duplicated 'day' entry has been removed).
newStopWords = ['thing', 'year', 'ago', 'people', 'nation', 'states', 'make', 'long', 'come',
                'day', 'know', 'way', 'fellow', 'americans', 'citizens', 'citizen', 'united',
                'america', 'shall', 'must', 'may', 'upon', 'every', 'let', 'one', 'would', 'great']

# NOTE(review): this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords`
# module imported at the top of the notebook; the name is kept because the
# CountVectorizer cell reads it.
# NOTE(review): the list is applied to lemmatized text, so plural entries such
# as 'states' presumably do not catch the lemma 'state' -- confirm intent.
stopwords = nltk.corpus.stopwords.words('english') + newStopWords

## Breakdown by Sentence

In [9]:
# Load en_core_web_sm with no components disabled; the next cell uses its
# sentence boundaries (`.sents`) to split each address into sentences.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Replace each address string with the list of its sentence texts, as segmented by spaCy.
# The isinstance guard keeps this cell re-runnable: the column is overwritten in place,
# so on a second run the values are already lists and are passed through unchanged
# instead of being fed back into nlp(), which expects a string.
df['address'] = df['address'].apply(
    lambda x: x if isinstance(x, list) else [sent.text for sent in nlp(x).sents]
)

In [11]:
# Flatten the per-address sentence lists into one row per sentence; the other
# columns are repeated for each sentence and ignore_index=True gives the result
# a fresh 0..n-1 index.
df_sentences = df.explode("address", ignore_index=True)

In [12]:
# Spot-check a single sentence by its row label (5842 is an arbitrary example row).
df_sentences.loc[5842, 'address']

'Sustained by faith, driven by conviction and devoted to one another and the country we love with all our hearts.'

## Topic Modelling

In [13]:
# Sentence-level corpus: one entry per sentence
sentences_raw = df_sentences['address']

# Drop stop words first ...
sentences_no_stop = remove_stopwords(sentences_raw)

# ... then lemmatize what remains; `text` is the corpus used by the cells below
text = lemma(sentences_no_stop)

In [14]:
# Create the vectorizer (it is fitted to the text in the next cell)
# - token_pattern keeps runs of word characters that contain no digits
# - stop_words is the extended `stopwords` list built in the "Define Stopwords" section
# - binary=True records presence/absence of a term in a document rather than its count
vectorizer = CountVectorizer(analyzer='word',token_pattern=r'\b[^\d\W]+\b',stop_words = stopwords,binary = True)

In [15]:
# Fit the vectorizer on the corpus and build the sparse document-term matrix.
doc_word = vectorizer.fit_transform(text)

# scikit-learn 1.0 deprecated CountVectorizer.get_feature_names() and 1.2 removed
# it in favour of get_feature_names_out(); prefer the new method and fall back to
# the old one so this cell runs on either side of that change.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Vocabulary as a plain list, in column order of doc_word.
words = list(np.asarray(feature_names))

In [21]:
# Anchor words, one list per topic (nine lists for the nine hidden topics).
anchor_words = [['war','invasion'],
                ['peace'],
                ['foreign','interest'],
                ['economy'],
                ['freedom'],
                ['equal'],
                ['uphold'],
                ['taxation','tax'],
                ['constitution','preserve']]

# Anchored CorEx topic model; seed=1 fixes the random initialisation.
topic_model = ct.Corex(n_hidden=9, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(doc_word, words=words, docs=text,
                anchors=anchor_words, anchor_strength=2)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the word from each entry by position rather than unpacking zip(*topic)
    # into exactly two names: newer corextopic releases return
    # (word, mutual information, sign) triples, which made the two-name unpack
    # raise, and an empty topic made it raise as well.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: war,invasion,prevent,state,control,part,permit,product,organization,partial
1: peace,world,promote,commerce,force,defense,maintain,international,army,military
2: interest,foreign,power,country,proper,policy,opinion,domestic,necessary,object
3: economy,public,government,expenditure,exercise,political,enable,health,business,sufficient
4: freedom,man,liberty,hope,human,life,woman,love,happiness,dignity
5: equal,duty,right,law,high,good,discharge,protection,protect,give
6: uphold,home,abroad,child,farm,factory,destruction,leave,old,spiritual
7: revenue,executive,tax,taxation,branch,tariff,department,legislative,money,system
8: preserve,oath,constitution,office,confidence,take,countryman,principle,express,measure


In [22]:
# Show the top 2 documents for topic 5 (the topic anchored on 'equal' in the listing above).
# NOTE(review): the scores returned alongside each document are presumably
# log-probabilities (so 0.0 would mean probability 1) -- confirm against the
# corextopic get_top_docs documentation.
topic_model.get_top_docs(topic=5, n_docs=2)

[('firm reliance goodness providence mercifully protect national infancy uphold libertie various vicissitude encourage offer ardent supplication continue make beloved country object divine care gracious benediction',
  0.0),
 ('do justice occasion favor favor lawful cherished mutual interest intercourse fair equal term',
  0.0)]