## Importing Classes & Libraries

In [585]:
import pickle
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer 
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk 
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import ToktokTokenizer
from nltk.tokenize import regexp_tokenize
import spacy
import gensim.corpora as corpora
from gensim import corpora, models, similarities, matutils
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings("ignore")
from __future__ import division
from sklearn.manifold import TSNE
import numpy as np
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
import os
output_notebook()

In [586]:
## Defining Functions
# Fetch the NLTK 'stopwords' corpus silently; raise_on_error=True makes a failed download raise.
nltk.download('stopwords', quiet=True, raise_on_error=True)
# English stop words held in a set; remove_stopwords() tests lower-cased tokens against it.
stopword_list = set(nltk.corpus.stopwords.words('english'))
# The same stop words joined into one string and re-split with nltk.word_tokenize.
# NOTE(review): nothing in the visible part of this notebook reads this variable, and
# word_tokenize presumably needs tokenizer data that this cell does not download — confirm.
tokenized_stop_words = nltk.word_tokenize(' '.join(nltk.corpus.stopwords.words('english')))

In [587]:
# Shared tokenizer instance used by remove_stopwords()
tokenizer = ToktokTokenizer()

def remove_stopwords(text):
    """Strip stop words from every document in ``text``.

    Each document is tokenized with the module-level ``tokenizer``; tokens whose
    lower-cased form appears in ``stopword_list`` are dropped and the remaining
    tokens are re-joined with single spaces.

    Parameters
    ----------
    text : iterable of str
        The documents to clean (e.g. a pandas Series of addresses).

    Returns
    -------
    pd.Series
        One cleaned string per input document, in input order. The Series is
        built from a plain list, so it carries a fresh default index.
    """
    cleaned_documents = []
    for document in text:
        # Trim surrounding whitespace from every token before filtering
        stripped_tokens = (piece.strip() for piece in tokenizer.tokenize(document))
        # Compare in lowercase so capitalised stop words are removed as well
        kept_tokens = [piece for piece in stripped_tokens if piece.lower() not in stopword_list]
        cleaned_documents.append(' '.join(kept_tokens))
    return pd.Series(cleaned_documents)

In [588]:
# Loaded spaCy pipelines keyed by model name, so repeated lemma() calls reuse one model
_LEMMA_MODELS = {}


def _load_lemma_model(name="en_core_web_sm"):
    """Return the spaCy pipeline used by lemma(), loading it only on first use."""
    if name not in _LEMMA_MODELS:
        # parser and NER are switched off: lemma() only reads token.pos_ and token.lemma_
        _LEMMA_MODELS[name] = spacy.load(name, disable=['parser', 'ner'])
    return _LEMMA_MODELS[name]


def lemma(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document, keeping only tokens with an allowed part of speech.

    Parameters
    ----------
    text : iterable of str
        Documents to lemmatize (e.g. a pandas Series of addresses).
    allowed_postags : iterable of str, optional
        Coarse part-of-speech tags (spaCy ``token.pos_`` values) to keep.
        Tokens with any other tag are dropped.

    Returns
    -------
    pd.Series
        One string per input document, made of the space-joined lemmas of the
        kept tokens, in input order, with a fresh default index.
    """
    keep_pos = set(allowed_postags)
    pipeline = _load_lemma_model()
    # pipe() streams the documents through the model in batches instead of one call per text
    lemmatized = [
        " ".join(token.lemma_ for token in doc if token.pos_ in keep_pos)
        for doc in pipeline.pipe(text)
    ]
    return pd.Series(lemmatized)

In [589]:
# Plotting

def plot_top_words(model, feature_names, n_top_words, title, dim_1, dim_2):
    """Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Parameters
    ----------
    model : fitted estimator exposing ``components_``
        Each row of ``components_`` is treated as one topic's term weights.
    feature_names : sequence of str
        Term for each column of ``components_``.
    n_top_words : int
        Number of top-weighted terms to plot per topic.
    title : str
        Figure-level title.
    dim_1, dim_2 : int
        Rows and columns of the subplot grid. The grid must contain at least
        as many axes as there are topics, otherwise indexing the axes fails.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # squeeze=False always returns a 2-D array of axes, so flatten() also works for a 1x1 grid
    fig, axes = plt.subplots(dim_1, dim_2, figsize=(30, 15), sharex=True, squeeze=False)
    axes = axes.flatten()
    # The figure title does not depend on the topic, so it is set once, outside the loop
    fig.suptitle(title, fontsize=40)

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice picks the n_top_words largest weights, biggest first
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        # Put the heaviest term at the top of each chart
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)

    fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    # The former fig.tight_layout() call after show() was dropped: it ran after the
    # figure had already been rendered and so never affected the displayed plot.
    plt.show()

## Import Data

In [590]:
# Load Pickled Dataframe from Cleaning Notebook into a DataFrame
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
path = r"C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Pickled_Files\cleaned_addresses.pkl"

# Open through a context manager so the file handle is closed once loading finishes.
# pickle.load can execute arbitrary code; only use it on files from a trusted source.
with open(path, 'rb') as pickle_file:
    df = pickle.load(pickle_file)
df.head(100)

Unnamed: 0,president_number,term,pres_name,pres_det,president_x,address,party,Year,time_period
0,1,1,Washington,1 Washington,01.Washington.1.txt,AMONG the vicissitudes incident to life no eve...,Nonpartisan,1789,pre-1800
1,1,2,Washington,2 Washington,01.Washington.2.txt,I AM again called upon by the voice of my coun...,Nonpartisan,1793,pre-1800
2,2,1,JAdams,1 JAdams,02.JAdams.1.txt,"WHEN it was first perceived, in early times, ...",Federalist,1797,pre-1800
3,3,1,Jefferson,1 Jefferson,03.Jefferson.1.txt,CALLED upon to undertake the duties of the fi...,Democratic-Republican,1801,1800-1850
4,3,2,Jefferson,2 Jefferson,03.Jefferson.2.txt,"PROCEEDING, fellow-citizens, to that qualific...",Democratic-Republican,1805,1800-1850
5,4,1,Madison,1 Madison,04.Madison.1.txt,UNWILLING to depart from examples of the most...,Democratic-Republican,1809,1800-1850
6,4,2,Madison,2 Madison,04.Madison.2.txt,ABOUT to add the solemnity of an oath to the o...,Democratic-Republican,1813,1800-1850
7,5,1,Monroe,1 Monroe,05.Monroe.1.txt,I SHOULD be destitute of feeling if I was not ...,Democratic-Republican,1817,1800-1850
8,5,2,Monroe,2 Monroe,05.Monroe.2.txt,I SHALL not attempt to describe the grateful...,Democratic-Republican,1821,1800-1850
9,6,1,JQAdams,1 JQAdams,06.JQAdams.1.txt,IN compliance with an usage coeval with the e...,Democratic-Republican,1825,1800-1850


## Locating Additional Stop Words

In [591]:
# Pre-process the raw addresses: drop stop words first, then lemmatize what is left
text = lemma(remove_stopwords(df['address']))

# Bag-of-words over unigrams and bigrams; the token pattern keeps runs of
# word characters that are not digits, and sklearn's English stop list is applied
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\b[^\d\W]+\b',
    ngram_range=(1,2),
    stop_words='english',
)

# Learn the vocabulary and build the document-term count matrix in one step
x = vectorizer.fit_transform(text)

In [592]:
# Convert sparse array to numpy array
# (dense copy of the document-term matrix `x`, used to build the preview DataFrame below)
x_back = x.toarray()

In [593]:
# View word Matrix

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
# One row per document, one column per vocabulary term; show the first rows only
pd.DataFrame(x_back, columns=feature_names).head()

Unnamed: 0,abandon,abandon act,abandon claim,abandon delusion,abandon enemy,abandon government,abandon great,abandon habit,abandon hope,abandon indignantly,...,zealous unceasing,zealously,zealously contend,zealously devote,zealously devoted,zealously enforce,zealously steadily,zealously unite,zone,zone extend
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [594]:
# Rank every vocabulary term by its total count across all documents

# Column totals of the document-term matrix (a single row of counts)
sum_words = x.sum(axis=0)
# Pair each term with the total stored at its column index
words_freq = [
    (word, sum_words[0, idx])
    for word, idx in vectorizer.vocabulary_.items()
]
# Most frequent terms first
top_words = sorted(words_freq, key=lambda pair: pair[1], reverse=True)
#top_words

In [595]:
# Corpus-specific words to filter out in addition to NLTK's English stop words
newStopWords = [
    'thing','year','ago','people','nation','states', 'make','long','come','day','know','day','way','fellow',
    'americans','citizens','citizen','united','america','shall','must','may','upon','every','let','one','would','great',
]
# Final stop-word list handed to the vectorizers below: NLTK's list followed by the extras.
# (This rebinds the name `stopwords`, which the import cell bound to the nltk.corpus module.)
stopwords = nltk.corpus.stopwords.words('english') + newStopWords

## Create a new dataframe where each address is broken down by sentence

In [596]:
# Load the spaCy English model with no components disabled; the next cell uses it
# to split each address into sentences via `.sents`.
nlp = spacy.load("en_core_web_sm")

In [597]:
# Replace each address string with the list of its sentences as segmented by spaCy.
# This overwrites the column in place, so the cell cannot be re-run without reloading df.
df['address'] = df['address'].apply(
    lambda address_text: [sentence.text for sentence in nlp(address_text).sents]
)

In [598]:
# Expand the per-address sentence lists into one row per sentence; the other columns
# are repeated for each sentence and ignore_index=True renumbers the rows from 0.
df_sentences = df.explode("address", ignore_index=True)

In [599]:
# Inspect the column labels of the sentence-level frame
df_sentences.columns

Index(['president_number', 'term', 'pres_name', 'pres_det', 'president_x',
       'address', 'party', 'Year', 'time_period'],
      dtype='object')

In [600]:
# NOTE(review): this cell operates on `df`, not on the sentence-level `df_sentences`.
# The column listing shown for df_sentences contains no "Unnamed: 0", so the rename
# looks like a no-op, and the "Sentence ID" index name lands on the per-address
# frame — confirm whether df_sentences was the intended target.
df.rename(columns={"Unnamed: 0": "Dialogue ID"}, inplace=True)
df.index.name = "Sentence ID"

In [601]:
# Spot-check one sentence: the row of the 'address' column labelled 5842
df_sentences['address'][5842]

'Sustained by faith, driven by conviction and devoted to one another and the country we love with all our hearts.'

### NMF Topic Modelling by Sentence

In [602]:
# Create text array
# (rebinds `text` to the sentence-level column; it previously held the whole-address text)
text = df_sentences['address']

In [603]:
# Remove stopwords
# (remove_stopwords returns a new Series of space-joined tokens, one per sentence)
text = remove_stopwords(text)

In [604]:
# Lemmatize
# (keeps only lemmas whose part of speech is in lemma()'s default allowed_postags)
text = lemma(text)

In [605]:
# Number of pre-processed sentences that will be vectorized
text.shape

(5844,)

In [606]:
# Sentence-level bag of words: unigrams and bigrams, filtered with the extended
# stop-word list built earlier instead of sklearn's built-in English list
vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\b[^\d\W]+\b',
    stop_words=stopwords,
    ngram_range=(1,2),
)
# Document-term count matrix with one row per sentence
doc_word = vectorizer.fit_transform(text)

In [607]:
# Fit and transform NMF
# Factorize the sentence-term counts into 20 topics. random_state pins the seed
# scikit-learn uses while fitting, so re-running the notebook reproduces the same topics.
nmf_model = NMF(n_components=20, random_state=42)
# Rows are sentences, columns are topic weights
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(5844, 20)

The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 20 resulting topics. We don't know yet what the topics are.

In [608]:
# Display the document-topic weights returned by nmf_model.fit_transform
doc_topic

array([[0.00029567, 0.0078253 , 0.00128346, ..., 0.00293611, 0.05178871,
        0.02270995],
       [0.10809217, 0.03739906, 0.        , ..., 0.        , 0.00185724,
        0.02357853],
       [0.0980234 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.00086909,
        0.        ],
       [0.0950004 , 0.02040175, 0.        , ..., 0.        , 0.00101792,
        0.        ],
       [0.00069198, 0.        , 0.        , ..., 0.        , 0.00258726,
        0.00123769]])

The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 20 resulting topics. We don't know yet what the topics are.

In [609]:
# components_ of the fitted NMF: one row per topic, one column per vocabulary term
topic_word = nmf_model.components_
topic_word.shape 

(20, 49952)

The **topic_word** matrix shows us the resulting topics, and the terms that are associated with each topic. By looking at the words below, we can figure out what the topics are.

In [610]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
words = list(vectorizer.get_feature_names_out()
             if hasattr(vectorizer, 'get_feature_names_out')
             else vectorizer.get_feature_names())
# Column indices of the 6 highest-weighted terms of every topic, strongest first
# (argsort is ascending, so the slice walks backwards from the last column)
t = nmf_model.components_.argsort(axis=1)[:, -1:-7:-1]
# Translate the indices into the terms themselves: one list of 6 words per topic
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]
topic_words

[['country', 'industry', 'foreign', 'whole', 'part', 'section'],
 ['spirit', 'honor', 'form', 'love', 'preserve', 'wish'],
 ['world', 'new', 'old', 'hope', 'freedom', 'new world'],
 ['peace', 'world', 'peace world', 'policy', 'equally', 'prosperity'],
 ['government', 'self', 'self government', 'local', 'form', 'establish'],
 ['power', 'grant', 'exercise', 'give', 'executive', 'authority'],
 ['law', 'enforce', 'equal', 'pass', 'respect', 'execute'],
 ['man', 'woman', 'man woman', 'life', 'hope', 'hand'],
 ['liberty', 'right', 'foreign', 'stand', 'authority', 'public'],
 ['interest', 'national', 'policy', 'regard', 'revenue', 'american'],
 ['public', 'service', 'expenditure', 'debt', 'money', 'opinion'],
 ['time', 'history', 'first', 'change', 'need', 'first time'],
 ['well', 'give', 'never', 'opportunity', 'system', 'hope'],
 ['good', 'effort', 'common', 'future', 'office', 'secure'],
 ['duty', 'call', 'take', 'high', 'office', 'principle'],
 ['right', 'respect', 'equal', 'constitutiona

## Topic Modelling by Paragraph

In [611]:
# Load Pickled Dataframe from Cleaning Notebook into a DataFrame
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
path = r"C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Pickled_Files\cleaned_addresses.pkl"

# Open through a context manager so the file handle is closed once loading finishes.
# pickle.load can execute arbitrary code; only use it on files from a trusted source.
with open(path, 'rb') as pickle_file:
    df_par = pickle.load(pickle_file)
df_par.head(100)

Unnamed: 0,president_number,term,pres_name,pres_det,president_x,address,party,Year,time_period
0,1,1,Washington,1 Washington,01.Washington.1.txt,AMONG the vicissitudes incident to life no eve...,Nonpartisan,1789,pre-1800
1,1,2,Washington,2 Washington,01.Washington.2.txt,I AM again called upon by the voice of my coun...,Nonpartisan,1793,pre-1800
2,2,1,JAdams,1 JAdams,02.JAdams.1.txt,"WHEN it was first perceived, in early times, ...",Federalist,1797,pre-1800
3,3,1,Jefferson,1 Jefferson,03.Jefferson.1.txt,CALLED upon to undertake the duties of the fi...,Democratic-Republican,1801,1800-1850
4,3,2,Jefferson,2 Jefferson,03.Jefferson.2.txt,"PROCEEDING, fellow-citizens, to that qualific...",Democratic-Republican,1805,1800-1850
5,4,1,Madison,1 Madison,04.Madison.1.txt,UNWILLING to depart from examples of the most...,Democratic-Republican,1809,1800-1850
6,4,2,Madison,2 Madison,04.Madison.2.txt,ABOUT to add the solemnity of an oath to the o...,Democratic-Republican,1813,1800-1850
7,5,1,Monroe,1 Monroe,05.Monroe.1.txt,I SHOULD be destitute of feeling if I was not ...,Democratic-Republican,1817,1800-1850
8,5,2,Monroe,2 Monroe,05.Monroe.2.txt,I SHALL not attempt to describe the grateful...,Democratic-Republican,1821,1800-1850
9,6,1,JQAdams,1 JQAdams,06.JQAdams.1.txt,IN compliance with an usage coeval with the e...,Democratic-Republican,1825,1800-1850


In [612]:
# Split every address on newline characters, replacing the string with a list of pieces.
# The column is overwritten in place, so df_par must be reloaded before re-running this cell.
df_par['address'] = [
    address_text.split("\n")
    for address_text in df_par['address']
]

In [613]:
# One row per newline-separated piece of an address; the other columns are repeated
# and ignore_index=True renumbers the rows from 0.
df_par_tokenized = df_par.explode("address", ignore_index=True)

In [614]:
# Create text array
# (the paragraph-level text column of the exploded frame)
text_par = df_par_tokenized['address']

In [615]:
# Remove stopwords
# (remove_stopwords returns a new Series of space-joined tokens, one per paragraph)
text_par = remove_stopwords(text_par)

In [616]:
# Lemmatize
# (keeps only lemmas whose part of speech is in lemma()'s default allowed_postags)
text_par = lemma(text_par)

In [617]:
# Paragraph-level bag of words: unigrams and bigrams, filtered with the extended
# stop-word list built earlier
vectorizer_par = CountVectorizer(
    analyzer='word',
    token_pattern=r'\b[^\d\W]+\b',
    stop_words=stopwords,
    ngram_range=(1,2),
)
# Document-term count matrix with one row per paragraph
doc_word_par = vectorizer_par.fit_transform(text_par)

In [618]:
# Fit and transform NMF
# Factorize the paragraph-term counts into 20 topics. random_state pins the seed
# scikit-learn uses while fitting, so re-running the notebook reproduces the same topics.
nmf_model_par = NMF(n_components=20, random_state=42)
# Rows are paragraphs, columns are topic weights
doc_topic_par = nmf_model_par.fit_transform(doc_word_par)
doc_topic_par.shape

(5820, 20)

In [619]:
# components_ of the paragraph-level NMF: one row per topic, one column per vocabulary term
topic_word_par = nmf_model_par.components_
topic_word_par.shape 

(20, 52456)

In [620]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
words_par = list(vectorizer_par.get_feature_names_out()
                 if hasattr(vectorizer_par, 'get_feature_names_out')
                 else vectorizer_par.get_feature_names())
# Column indices of the 6 highest-weighted terms of every topic, strongest first
# (argsort is ascending, so the slice walks backwards from the last column)
t_par = nmf_model_par.components_.argsort(axis=1)[:, -1:-7:-1]
# Translate the indices into the terms themselves: one list of 6 words per topic
topic_words_par = [[words_par[term_idx] for term_idx in topic_row] for topic_row in t_par]
topic_words_par

[['country', 'find', 'much', 'high', 'part', 'many'],
 ['power', 'grant', 'sovereignty', 'grant power', 'right', 'possess'],
 ['spirit', 'liberty', 'power', 'character', 'government', 'free'],
 ['world', 'new', 'freedom', 'american', 'work', 'time'],
 ['power', 'control', 'state', 'executive', 'government', 'officer'],
 ['law', 'man', 'enforce', 'pass', 'amendment', 'support'],
 ['party', 'political', 'peace', 'time', 'government', 'war'],
 ['revenue', 'interest', 'protection', 'duty', 'home', 'equally'],
 ['spirit', 'honor', 'interest', 'preserve', 'love', 'wish'],
 ['peace', 'policy', 'war', 'world', 'foreign', 'treaty'],
 ['public', 'duty', 'good', 'service', 'office', 'economy'],
 ['member', 'power', 'however', 'institution', 'separate', 'principle'],
 ['government', 'island', 'already', 'order', 'inhabitant', 'self'],
 ['war', 'force', 'invasion', 'power', 'time', 'naval'],
 ['party', 'whole', 'liberty', 'interest', 'spirit', 'country'],
 ['institution', 'interest', 'never', 'subj