In [1]:
import os
import gensim
import spacy
from nltk.tokenize import PunktSentenceTokenizer
from collections import Counter

# None of this is neeeded as the inaugural library does it all already

def read_file(file_name):
  with open(file_name, 'r+', encoding='utf-8') as file:
    file_text = file.read()
  return file_text

def process_speeches(speeches):
  word_tokenized_speeches = list()
  for speech in speeches:
    sentence_tokenizer = PunktSentenceTokenizer()
    sentence_tokenized_speech = sentence_tokenizer.tokenize(speech)
    word_tokenized_sentences = list()
    for sentence in sentence_tokenized_speech:
      word_tokenized_sentence = [word.lower().strip('.').strip('?').strip('!') for word in sentence.replace(",","").replace("-"," ").replace(":","").split()]
      word_tokenized_sentences.append(word_tokenized_sentence)
    word_tokenized_speeches.append(word_tokenized_sentences)
  return word_tokenized_speeches

def merge_speeches(speeches):
  all_sentences = list()
  for speech in speeches:
    for sentence in speech:
      all_sentences.append(sentence)
  return all_sentences

def get_president_sentences(president):
  files = sorted([file for file in os.listdir() if president.lower() in file.lower()])
  speeches = [read_file(file) for file in files]
  processed_speeches = process_speeches(speeches)
  all_sentences = merge_speeches(processed_speeches)
  return all_sentences

def get_presidents_sentences(presidents):
  all_sentences = list()
  for president in presidents:
    files = sorted([file for file in os.listdir() if president.lower() in file.lower()])
    speeches = [read_file(file) for file in files]
    processed_speeches = process_speeches(speeches)
    all_prez_sentences = merge_speeches(processed_speeches)
    all_sentences.extend(all_prez_sentences)
  return all_sentences

def most_frequent_words(list_of_sentences):
  all_words = [word for sentence in list_of_sentences for word in sentence]
  return Counter(all_words).most_common()

In order to create word embeddings on the corpus of all the presidents’ speeches, we need to read the text data from each file, separate the files into sentences on a word by word basis, and then merge all the sentences across the speeches into one big list of lists.

Let’s start by finding all the file names for the .txt files we will be analyzing

In [2]:
files = sorted([file for file in os.listdir() if file[-4:] == '.txt'])

In [3]:
from nltk.corpus import inaugural
files = inaugural.fileids()

speeches = []

for file in files:
    line = inaugural.sents(file)
    speeches.append(line)

print(speeches[0][0][0])
print(speeches[0][0])
print(speeches[0])
# processed_speeches = process_speeches(speeches)

Fellow
['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':']
[['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':'], ['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ...]


In [4]:
# Create list of sentences for all presidents
sentences_all_presidents = [' '.join(speech) for speech in inaugural.sents()]

# Function to tokenize all sentences and remove punctuation, put in consistent case, etc
def tokenize_sentences(sentences_to_tokenize):
    word_tokenized_sentences = []
    for sentence in sentences_to_tokenize:
        for word in sentence:
            word_tokenized_sentence = [word.lower().strip('.').strip('?').strip('!') for word in sentence.replace(",","").replace("-"," ").replace(":","").split()]
            word_tokenized_sentences.append(word_tokenized_sentence)
    return word_tokenized_sentences

# Tokenize sentences from all presidents
tokenized_sentences_all_presidents = tokenize_sentences(sentences_all_presidents)




In [5]:
# Function to find the most frequent words
def most_freq_words(tokenized_sentences):
    sent_in_speeches = [word for sentence in tokenized_sentences for word in sentence]
    return Counter(sent_in_speeches).most_common()

# Find most frequent words for all presidents
most_freq_words_all_presidents = most_freq_words(tokenized_sentences_all_presidents)

In [11]:
# Create a word embedding model with gensim
# Train it on the sentences in speeches
# Iniitally was vector_size=96, window=5, min_count=1, workers=2, sg=1. Reduced to get it to go faster
all_prez_embeddings = gensim.models.Word2Vec(tokenized_sentences_all_presidents, vector_size=96, window=2, min_count=10, workers=2, sg=1)

# Find words most similar to 'freedom'
similar_to_freedom = all_prez_embeddings.wv.most_similar('freedom')
print(similar_to_freedom)

[('inextricably', 0.5242308974266052), ('a', 0.4748968780040741), ('baggage', 0.45908448100090027), ('deathless', 0.45783382654190063), ('the', 0.45011886954307556), ('speech', 0.44889965653419495), ('in', 0.44363024830818176), ('reborn', 0.43920230865478516), ('defects', 0.4300992488861084), ('values', 0.42966604232788086)]


In [None]:
# Find words most similar to 'spirit'
similar_to_spirit = all_prez_embeddings.wv.most_similar('spirit')
print(similar_to_spirit)

[('story', 0.9695708155632019), ('policy', 0.9658669233322144), ('land', 0.9640661478042603), ('unity', 0.964009165763855), ('condition', 0.9630237817764282), ('very', 0.9629682898521423), ('basis', 0.9629378914833069), ('party', 0.9629285931587219), ('feeling', 0.9620798826217651), ('sound', 0.9620466232299805)]


A fun aspect of word embeddings is to see how different corpora result in different word embeddings, alluding to differences in how words are used between writers/authors/speakers.

Let’s train a word embedding model on a single president and see how their word embeddings differ from the collection of all presidents.

In [15]:
# sentences_roosevelt = inaugural.sents(['1945-Roosevelt.txt', '1941-Roosevelt.txt', '1937-Roosevelt.txt', '1933-Roosevelt.txt'])

sentences_roosevelt = [' '.join(sublist) for sublist in inaugural.sents(['1945-Roosevelt.txt', '1941-Roosevelt.txt', '1937-Roosevelt.txt', '1933-Roosevelt.txt'])]

tokenized_sentences_roosevelt = tokenize_sentences(sentences_roosevelt)

most_freq_words_roosevelt = most_freq_words(tokenized_sentences_roosevelt)

roosevelt_embeddings = gensim.models.Word2Vec(tokenized_sentences_roosevelt, vector_size=96, window=5, min_count=1, workers=2, sg=1)

# Find words most similar to 'freedom'
similar_to_freedom = roosevelt_embeddings.wv.most_similar('freedom')
print(similar_to_freedom)

[('privilege', 0.7726589441299438), ('ebbing', 0.6769092082977295), ('tide', 0.6420332789421082), ('voices', 0.59122234582901), ('wave', 0.5841450691223145), ('rises', 0.551307737827301), ('story', 0.542332649230957), ('should', 0.539277195930481), ('surging', 0.5379772782325745), ('heed', 0.5236226916313171)]


In [None]:
# sentences_roosevelt = inaugural.sents(['1945-Roosevelt.txt', '1941-Roosevelt.txt', '1937-Roosevelt.txt', '1933-Roosevelt.txt'])

sentences_roosevelt = [' '.join(sublist) for sublist in inaugural.sents(['1945-Roosevelt.txt', '1941-Roosevelt.txt', '1937-Roosevelt.txt', '1933-Roosevelt.txt'])]

tokenized_sentences_roosevelt = tokenize_sentences(sentences_roosevelt)

most_freq_words_roosevelt = most_freq_words(tokenized_sentences_roosevelt)

roosevelt_embeddings = gensim.models.Word2Vec(tokenized_sentences_roosevelt, vector_size=96, window=5, min_count=1, workers=2, sg=1)

# Find words most similar to 'freedom'
similar_to_freedom = roosevelt_embeddings.wv.most_similar('freedom')
print(similar_to_freedom)