## Import Libraries

In [166]:
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy.sparse as ss
import seaborn as sns
import spacy
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [167]:
## Defining Functions
# Fetch the NLTK stop-word corpus. quiet=True suppresses the download log;
# raise_on_error=True presumably turns a failed download into an exception here.
nltk.download('stopwords', quiet=True, raise_on_error=True)

# Read the English stop words once and reuse them for both derived objects.
_english_stop_words = nltk.corpus.stopwords.words('english')

# Set form, used for membership tests by remove_stopwords().
stopword_list = set(_english_stop_words)

# The same stop words after being run through NLTK's word tokenizer.
# NOTE(review): nltk.word_tokenize presumably needs the punkt tokenizer data to be
# installed already; it is not downloaded in this notebook — confirm.
tokenized_stop_words = nltk.word_tokenize(' '.join(_english_stop_words))

In [168]:
tokenizer = ToktokTokenizer()

def remove_stopwords(text):
    """Drop stop words from every document in ``text``.

    Each document is tokenized with the module-level ``tokenizer``, every token
    is stripped of surrounding whitespace, and tokens whose lowercase form is in
    the module-level ``stopword_list`` are discarded. The surviving tokens are
    re-joined with single spaces.

    Parameters
    ----------
    text : iterable of str
        Documents to clean.

    Returns
    -------
    pandas.Series
        One cleaned string per input document, in input order, built from a
        plain list (so it carries a fresh default index).
    """
    def _without_stopwords(document):
        stripped = (raw.strip() for raw in tokenizer.tokenize(document))
        # Compare in lowercase, but keep the original casing in the output.
        return ' '.join(word for word in stripped if word.lower() not in stopword_list)

    return pd.Series([_without_stopwords(document) for document in text])

In [169]:
def lemma(text,allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize every document in ``text``, keeping only selected parts of speech.

    The spaCy ``en_core_web_sm`` model is loaded on each call with the
    ``parser`` and ``ner`` components disabled.

    Parameters
    ----------
    text : iterable of str
        Documents to lemmatize.
    allowed_postags : list of str
        A token is kept only when its ``pos_`` value is in this collection.

    Returns
    -------
    pandas.Series
        One string per input document, in input order: the lemmas of the kept
        tokens joined by single spaces. Built from a plain list, so it carries a
        fresh default index.
    """
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    def _lemmatize(address):
        kept_lemmas = [token.lemma_ for token in nlp(address) if token.pos_ in allowed_postags]
        return " ".join(kept_lemmas)

    return pd.Series([_lemmatize(address) for address in text])

## Import Data

In [170]:
# Load the pickled DataFrame produced by the cleaning notebook.
# NOTE(review): this is an absolute, machine-specific path; it has to be adjusted
# before the notebook can run anywhere else.
path = r"C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Pickled_Files\cleaned_addresses.pkl"

# The context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(path,'rb') as pickle_file:
    df = pickle.load(pickle_file)
df.head(1)

Unnamed: 0,president_number,term,pres_name,pres_det,president_x,address,party,Year,time_period
0,1,1,Washington,1 Washington,01.Washington.1.txt,AMONG the vicissitudes incident to life no eve...,Nonpartisan,1789,pre-1800


## Define Stopwords

In [171]:
# Extend NLTK's English stop words with terms specific to this corpus.
# NOTE: this rebinds the name `stopwords` (imported above as the NLTK corpus
# reader) to a plain list of words; that list is what CountVectorizer receives below.
stopwords = nltk.corpus.stopwords.words('english')
# Each extra term is listed once ('day' previously appeared twice).
newStopWords = ['thing','year','ago','people','nation','states', 'make','long','come','day','know','way','fellow'
               ,'americans','citizens','citizen','united','america','shall','must','may','upon','every','let','one','would','great']
stopwords.extend(newStopWords)

## Breakdown by Sentence

In [172]:
# Load the small English spaCy model with no components disabled;
# the sentence split in the next cell reads `.sents` from its output.
nlp = spacy.load(name="en_core_web_sm")

In [173]:
# Replace each address with the list of its sentence strings as segmented by spaCy.
# Values that are already lists are passed through unchanged, so re-running this
# cell on the already-split column does not fail.
df['address'] = df['address'].apply(
    lambda x: x if isinstance(x, list) else [sent.text for sent in nlp(x).sents]
)

In [174]:
# One row per sentence: each list element in `address` becomes its own row,
# and the result is renumbered 0..n-1 (ignore_index=True).
df_sentences = df.explode(column="address", ignore_index=True)

In [175]:
# Spot-check a single sentence by its row label.
df_sentences.loc[5842, 'address']

'Sustained by faith, driven by conviction and devoted to one another and the country we love with all our hearts.'

## Topic Modelling

In [176]:
# Build the modelling corpus in one pass:
# sentence column -> stop words removed -> lemmatized
text = lemma(remove_stopwords(df_sentences['address']))

In [177]:
# Configure the bag-of-words vectorizer (fitting happens in the next cell):
# word-level analysis, tokens matched by a pattern that excludes digits,
# the extended stop-word list removed, and binary=True.
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\b[^\d\W]+\b',
    stop_words=stopwords,
    binary=True,
)

In [178]:
# Fit the vectorizer on the corpus and build the document-term matrix.
doc_word = vectorizer.fit_transform(text)

# Vocabulary in column order. `get_feature_names()` was removed from newer
# scikit-learn releases in favour of `get_feature_names_out()`; fall back to the
# old name only when the new one is unavailable.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
words = list(np.asarray(feature_names))

In [180]:
# One list of anchor words per topic; the position in this list matches the
# topic number printed below.
anchor_words = [['war','invasion'],
                ['peace'],
                ['foreign','interest'],
                ['economy'],
                ['freedom'],
                ['equality'],
                ['constitution','preserve','uphold']]

# One hidden topic per anchor list (7 here); seed fixed for reproducibility.
topic_model = ct.Corex(n_hidden=len(anchor_words), words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(doc_word, words=words, docs=text,
                anchors=anchor_words, anchor_strength=2)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    # Take the first field (the word) of each entry instead of unpacking a
    # fixed-width tuple: this works whether entries are pairs or longer tuples
    # (NOTE(review): some corextopic releases appear to return triples — confirm),
    # and it does not fail on an empty topic.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: war,invasion,manufacture,internal,improvement,patriot,city,destruction,experience,effect
1: peace,world,establish,commerce,maintain,promote,international,harmony,force,contribute
2: interest,foreign,power,proper,country,executive,revenue,necessary,object,state
3: public,economy,law,duty,expenditure,support,administration,business,system,execute
4: freedom,man,life,human,woman,young,history,new,opportunity,mankind
5: government,political,right,equality,form,principle,opinion,institution,exercise,settle
6: preserve,oath,constitution,liberty,blessing,high,office,take,honor,uphold


In [181]:
# Show the top 2 documents for topic 5 (the topic anchored on 'equality')
topic_model.get_top_docs(topic=5, n_docs=2)

[('therefore thing stand war peace nation equally interested peace world political stability free people equally responsible maintenance essential principle peace actual equality nation matter right privilege peace securely justly rest armed balance power government derive power consent govern power support common thought purpose power family nation sea equally free safe use people rule set common agreement consent far practicable accessible equal term national armament limited necessity national order domestic safety community interest power peace henceforth depend impose nation duty seeing influence proceed citizen mean encourage assist revolution state sternly effectually suppress prevent',
  0.0),
 ('declare part long make reality name argument postulation exhaust positive declaration receive wrong provoking discontinue last appeal long delay break spirit nation destroy confidence political institution perpetuate state disgraceful suffering regain costly sacrifice severe struggle l