In [16]:
# import libraries
import numpy as np
import pandas as pd
import sys
import csv

In [17]:
csv.field_size_limit(1000000000)
df = pd.read_csv("./data/state-of-the-union.csv", names=["year", "speech"], header=None)
df.head()

Unnamed: 0,year,speech
0,1790,"George Washington\nJanuary 8, 1790\n\nFellow-C..."
1,1790,\nState of the Union Address\nGeorge Washingto...
2,1791,\nState of the Union Address\nGeorge Washingto...
3,1792,\nState of the Union Address\nGeorge Washingto...
4,1793,\nState of the Union Address\nGeorge Washingto...


In [18]:
df.shape

(226, 2)

In [19]:
# check for duplicate rows
df.duplicated().sum()

np.int64(0)

In [20]:
# check for missing values
df.isnull().sum()

year      0
speech    0
dtype: int64

In [21]:
# read stop words
custom_stopwords = pd.read_csv("./data/stopwords-en.csv", header=None)

custom_stopwords.shape

# convert to a set of unique words
custom_stopwords = set(custom_stopwords[0])

In [22]:
# Stop words
STOP_WORDS = set(
    """
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at

back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by

call can cannot ca could

did do does doing done down due during

each eight either eleven else elsewhere empty enough even ever every
everyone everything everywhere except

few fifteen fifty first five for former formerly forty four from front full
further

get give go

had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred

i if in indeed into is it its itself

keep

last latter latterly least less

just

made make many may me meanwhile might mine more moreover most mostly move much
must my myself

name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere

of off often on once one only onto or other others otherwise our ours ourselves
out over own

part per perhaps please put

quite

rather re really regarding

same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such

take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two

under until up unless upon us used using

various very very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would

yet you your yours yourself yourselves
""".split()
)

contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
STOP_WORDS.update(contractions)

for apostrophe in ["‘", "’"]:
    for stopword in contractions:
        STOP_WORDS.add(stopword.replace("'", apostrophe))

In [50]:
us_presidents_stopwords = {
    "george",
    "washington",
    "john",
    "adams",
    "thomas",
    "jefferson",
    "james",
    "madison",
    "monroe",
    "quincy",
    "jackson",
    "martin",
    "van",
    "buren",
    "william",
    "henry",
    "harrison",
    "tyler",
    "knox",
    "polk",
    "zachary",
    "taylor",
    "millard",
    "fillmore",
    "franklin",
    "pierce",
    "buchanan",
    "abraham",
    "lincoln",
    "andrew",
    "johnson",
    "ulysses",
    "grant",
    "rutherford",
    "hayes",
    "garfield",
    "chester",
    "arthur",
    "grover",
    "cleveland",
    "benjamin",
    "mcKinley",
    "theodore",
    "roosevelt",
    "taft",
    "woodrow",
    "wilson",
    "warren",
    "harding",
    "calvin",
    "coolidge",
    "herbert",
    "hoover",
    "truman",
    "eisenhower",
    "kennedy",
    "lyndon",
    "nixon",
    "gerald",
    "ford",
    "jimmy",
    "carter",
    "ronald",
    "reagan",
    "bill",
    "clinton",
    "obama",
    "donald",
    "trump",
    "joe",
    "biden",
}


# other stopwords for this specific dataset
other_stopwords = {
    "state",
    "union",
    "mr",
    "speaker",
    "vice",
    "members",
    "senate",
    "house",
    "representatives",
    "fellow",
    "citizens",
    "united",
    "states",
    "america",
    "american",
    "americans",
    "congress",
    "congressmen",
    "congresswomen",
    "congressman",
    "congresswoman",
    "president",
    "address",
}

months = {
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
}

In [51]:
# spacy stopwords as set
spacy_stopwords = STOP_WORDS

# combine spacy and custom stopwords
all_stopwords = spacy_stopwords.union(custom_stopwords)

# UPDATE STOPWORDS
# add US presidents stopwords
all_stopwords.update(us_presidents_stopwords)
# add other stopwords
all_stopwords.update(other_stopwords)
# add months
all_stopwords.update(months)

# # shape of the combined stopwords
len(all_stopwords)

485

In [52]:
from gensim.parsing.preprocessing import (
    preprocess_string,
    strip_tags,
    strip_punctuation,
    strip_numeric,
    remove_stopwords,
)


# custom function to remove stopwords using the custom stop words
def remove_stopwords_custom(text):
    return remove_stopwords(text, stopwords=all_stopwords)


# base function to clean the speech using gensim package
def preprocess_text(text):
    filters = [
        strip_tags,
        strip_punctuation,
        strip_numeric,
        remove_stopwords_custom,
        # stem_text,
    ]
    text = preprocess_string(text, filters)
    return text

In [53]:
# convert to lowercase
df["speech"] = df["speech"].str.lower()

# apply the function to the speech column
df["clean_speech"] = df["speech"].apply(preprocess_text)

In [54]:
df.head()

Unnamed: 0,year,speech,clean_speech,lemmatized_speech
0,1790,"george washington\njanuary 8, 1790\n\nfellow-c...","[embrace, great, satisfaction, opportunity, pr...","[George, Washington, January, Fellow, Citizens..."
1,1790,\nstate of the union address\ngeorge washingto...,"[meeting, feel, satisfaction, able, repeat, co...","[State, Union, Address, George, Washington, De..."
2,1791,\nstate of the union address\ngeorge washingto...,"[vain, expect, peace, indians, frontiers, long...","[State, Union, Address, George, Washington, Oc..."
3,1792,\nstate of the union address\ngeorge washingto...,"[abatement, satisfaction, meet, present, occas...","[State, Union, Address, George, Washington, No..."
4,1793,\nstate of the union address\ngeorge washingto...,"[commencement, term, called, office, fit, occa...","[State, Union, Address, George, Washington, De..."


In [55]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()


# Function to lemmatize words
def lemmatize_list(word_list):
    return [lemmatizer.lemmatize(word, pos="v") for word in word_list]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ajult\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [56]:
# Apply the lemmatize function to the clean_speech column
df["lemmatized_speech"] = df["clean_speech"].apply(lemmatize_list)

In [57]:
df.head()

Unnamed: 0,year,speech,clean_speech,lemmatized_speech
0,1790,"george washington\njanuary 8, 1790\n\nfellow-c...","[embrace, great, satisfaction, opportunity, pr...","[embrace, great, satisfaction, opportunity, pr..."
1,1790,\nstate of the union address\ngeorge washingto...,"[meeting, feel, satisfaction, able, repeat, co...","[meet, feel, satisfaction, able, repeat, congr..."
2,1791,\nstate of the union address\ngeorge washingto...,"[vain, expect, peace, indians, frontiers, long...","[vain, expect, peace, indians, frontiers, long..."
3,1792,\nstate of the union address\ngeorge washingto...,"[abatement, satisfaction, meet, present, occas...","[abatement, satisfaction, meet, present, occas..."
4,1793,\nstate of the union address\ngeorge washingto...,"[commencement, term, called, office, fit, occa...","[commencement, term, call, office, fit, occasi..."


In [58]:
# saved the lemmatized speech to a different df
df_lemmatized = df[["year", "lemmatized_speech"]].copy()

df_lemmatized.head()

Unnamed: 0,year,lemmatized_speech
0,1790,"[embrace, great, satisfaction, opportunity, pr..."
1,1790,"[meet, feel, satisfaction, able, repeat, congr..."
2,1791,"[vain, expect, peace, indians, frontiers, long..."
3,1792,"[abatement, satisfaction, meet, present, occas..."
4,1793,"[commencement, term, call, office, fit, occasi..."


In [59]:
# function to get word frequency

from collections import defaultdict


def get_word_freq(df):
    word_freq = defaultdict(int)
    for index, row in df.iterrows():
        for word in row["lemmatized_speech"]:
            word_freq[word] += 1
    return word_freq


word_freq = get_word_freq(df_lemmatized)

word_freq

defaultdict(int,
            {'embrace': 95,
             'great': 2031,
             'satisfaction': 192,
             'opportunity': 386,
             'present': 1314,
             'congratulate': 53,
             'favorable': 175,
             'prospect': 111,
             'public': 1596,
             'affairs': 325,
             'recent': 240,
             'accession': 19,
             'important': 639,
             'north': 203,
             'carolina': 24,
             'constitution': 502,
             'official': 127,
             'information': 266,
             'receive': 636,
             'rise': 226,
             'credit': 289,
             'respectability': 4,
             'country': 1972,
             'general': 807,
             'increase': 1310,
             'good': 899,
             'government': 4247,
             'concord': 9,
             'peace': 1385,
             'plenty': 24,
             'bless': 100,
             'circumstances': 243,
             'auspicious':

In [62]:
# function to filter out words from df that have a frequency less than a certain threshold
threshold = 10
df_lemmatized["lemmatized_speech_filtered"] = df_lemmatized["lemmatized_speech"].apply(
    lambda x: [word for word in x if word_freq[word] >= threshold]
)

In [63]:
df_lemmatized.head()

Unnamed: 0,year,lemmatized_speech,lemmatized_speech_filtered
0,1790,"[embrace, great, satisfaction, opportunity, pr...","[embrace, great, satisfaction, opportunity, pr..."
1,1790,"[meet, feel, satisfaction, able, repeat, congr...","[meet, feel, satisfaction, able, repeat, congr..."
2,1791,"[vain, expect, peace, indians, frontiers, long...","[vain, expect, peace, indians, frontiers, long..."
3,1792,"[abatement, satisfaction, meet, present, occas...","[satisfaction, meet, present, occasion, contin..."
4,1793,"[commencement, term, call, office, fit, occasi...","[commencement, term, call, office, fit, occasi..."
