In [None]:
import json
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import os
import csv
from collections import Counter

In [2]:
# grab all speeches
path = "cleaned_presidential_speeches.json"
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying on the
# platform default: later cells search the speech bodies for en/em dashes, which a
# non-UTF-8 locale encoding would mis-decode.
with open(path, encoding="utf-8") as f:
    cleaned_speeches = json.load(f)

In [None]:
# Presidents we want to analyze with their years served.
# Each value is [first_year, last_year], both inclusive calendar years; a transition
# year (e.g. 1877) therefore appears in both the outgoing and incoming president's range.
PRESIDENTS = {
    'Ulysses S. Grant': [1869, 1877],
    'Rutherford B. Hayes': [1877, 1881],
    'James A. Garfield': [1881, 1881],
    'Chester A. Arthur': [1881, 1885],
    # Cleveland served two non-consecutive terms (1885-1889 and 1893-1897). A dict cannot
    # hold the same key twice -- the later entry silently replaces the earlier one -- so
    # both terms are merged into a single span to keep his first-term speeches.
    # NOTE(review): this span also admits 1890-1892 (Harrison's term); confirm the
    # "Presidential" category filter excludes any Cleveland speeches from those years.
    'Grover Cleveland': [1885, 1897],
    'Benjamin Harrison': [1889, 1893],
    'William McKinley': [1897, 1901],
    "Theodore Roosevelt": [1901, 1909],
    "William Howard Taft": [1909, 1913],
    "Woodrow Wilson": [1913, 1921],
    "Warren G. Harding": [1921, 1923],
    "Calvin Coolidge": [1923, 1929],
    "Herbert Hoover": [1929, 1933],
    "Franklin D. Roosevelt": [1933, 1945],
    "Harry S. Truman": [1945, 1953],
    "Dwight D. Eisenhower": [1953, 1961],
    "John F. Kennedy": [1961, 1963],
    "Lyndon B. Johnson": [1963, 1969],
    "Richard Nixon": [1969, 1974],
    "Gerald R. Ford": [1974, 1977],
    "Jimmy Carter": [1977, 1981],
    "Ronald Reagan": [1981, 1989],
    "George Bush": [1989, 1993],
    "William J. Clinton": [1993, 2001],
    "George W. Bush": [2001, 2009],
    "Barack Obama": [2009, 2017],
    'Donald J. Trump': [2017, 2021],
}

Cleaning the speeches

In [None]:
# Keep only speeches dated 1869 or later, i.e. after Andrew Johnson's presidency.
# The year is taken as the last space-separated piece of the "date" field.
# NOTE(review): the cells visible below iterate over `cleaned_speeches`, not `speeches`,
# so this pre-filter does not appear to feed the later steps -- confirm whether that is intended.
speeches = list(
    filter(
        lambda entry: int(entry.get("date").split(" ")[-1]) >= 1869,
        cleaned_speeches,
    )
)

In [None]:
# Get rid of all speeches not by a president
# Build the lowercase name set once instead of rebuilding a list for every speech;
# membership is case-insensitive on the speaker name.
president_names = {name.lower() for name in PRESIDENTS}

# Keep a speech only when its speaker is one of the presidents under study and its
# primary category contains "Presidential".
cleaned_presidential = [
    speech
    for speech in cleaned_speeches
    if speech['speaker'].lower() in president_names
    and "Presidential" in speech['categories']['primary']
]

In [None]:
# Make sure all speeches were given during presidential years
# PRESIDENTS is keyed by mixed-case names, so indexing it with a lowercased speaker
# raises KeyError. Build a lowercase-keyed view for the case-insensitive lookup.
years_by_speaker = {name.lower(): years for name, years in PRESIDENTS.items()}

filtered_speeches = []
for i in cleaned_presidential:
    # Speakers absent from PRESIDENTS (Andrew Johnson included) have no entry and are skipped.
    years_served = years_by_speaker.get(i['speaker'].lower())
    if years_served is None or "Presidential" not in i['categories']['primary']:
        continue
    start_year, end_year = int(years_served[0]), int(years_served[1])
    # The year is read from the last four characters of the date string.
    speech_year = int(i['date'][-4:])
    # Both bounds are inclusive.
    if start_year <= speech_year <= end_year:
        filtered_speeches.append(i)

Tokenizing and matching

In [None]:
#tokenize all speeches by president to be able to get proportions later on
# Counters are keyed by the lowercased president name, matching the lowercased speaker
# used to index them below (mixed-case keys would raise KeyError on that lookup).
proportion_tokens = {president.lower(): Counter() for president in PRESIDENTS}

for speech in filtered_speeches:
    # Pad en and em dashes with spaces so the tokenizer splits the words they join.
    # Work on a local copy: rewriting speech["body"] in place would make this cell
    # non-idempotent and alter the text seen by later cells.
    body = speech["body"].replace("\u2013", " \u2013 ").replace("\u2014", " \u2014 ")
    tokens = [token.lower() for token in nltk.word_tokenize(body)]

    # Add this speech's token counts to the speaker's running total.
    proportion_tokens[speech["speaker"].lower()].update(tokens)

In [None]:
#tokenize as well as match words to the hand-curated dictionary
path_to_json = "ensemble_weighted/json_dicts"

# Load every period dictionary once, up front, instead of re-listing the directory and
# re-parsing the JSON for each speech. Files are expected to be named
# "<start_year>-<end_year>.<ext>"; anything else (hidden files, sub-directories) is skipped
# rather than crashing the int() conversion.
period_dictionaries = []
for file in os.listdir(path_to_json):
    full_path = os.path.join(path_to_json, file)
    year_range = file.split(".")[0].split("-")
    if (
        not os.path.isfile(full_path)
        or len(year_range) != 2
        or not all(part.isdigit() for part in year_range)
    ):
        continue
    with open(full_path, 'r', encoding='utf-8') as f:
        period_dictionaries.append((int(year_range[0]), int(year_range[1]), json.load(f)))

# results[speaker][list_name] -> total occurrences of that list's words in the speaker's speeches
results = {}

for speech in filtered_speeches:
    # The year is whatever follows the last comma in the date string.
    speech_year = int(speech['date'].split(',')[-1].strip())

    # Grab the dictionaries whose (inclusive) year range covers the speech; every
    # matching file contributes, as before.
    matching_dictionaries = [
        word_lists
        for first_year, last_year, word_lists in period_dictionaries
        if first_year <= speech_year <= last_year
    ]
    if not matching_dictionaries:
        continue

    # Tokenize once per speech: pad en/em dashes so joined words split, then lowercase.
    text = speech['body'].replace("\u2013", " \u2013 ").replace("\u2014", " \u2014 ")
    token_counter = Counter(token.lower() for token in nltk.word_tokenize(text))

    # Match the tokens to the words in the dictionary
    for json_data in matching_dictionaries:
        for list_name, word_list in json_data.items():
            # Counter returns 0 for absent words, so no membership test is needed.
            count = sum(token_counter[word] for word in word_list)
            speaker_totals = results.setdefault(speech['speaker'], {})
            speaker_totals[list_name] = speaker_totals.get(list_name, 0) + count

Write results to a CSV

In [None]:
file_name = "unprocessed_dynamic_exact.csv"

# Case-insensitive lookup of each president's first year in office, so the sort does
# not depend on the speaker's capitalization matching the PRESIDENTS keys exactly.
first_year_by_name = {name.lower(): years[0] for name, years in PRESIDENTS.items()}

# Order presidents by years they served
sorted_keys = sorted(results.keys(), key=lambda x: first_year_by_name[x.lower()])

# Row labels: every word-list name seen for any president, in first-seen order.
# Building the union (rather than reading only the first president's keys) copes with
# empty results and with lists that exist for some presidents but not others.
value_keys = list(dict.fromkeys(
    list_name for key in sorted_keys for list_name in results[key]
))

# Everything is computed before the file is opened, so a failure above cannot leave a
# truncated CSV behind.
with open(file_name, 'w', newline='') as f:
    writer = csv.writer(f)

    # One column per president, one row per word list.
    writer.writerow(['Presidents'] + sorted_keys)

    for value_key in value_keys:
        # A list with no recorded matches for a president counts as 0.
        writer.writerow([value_key] + [results[key].get(value_key, 0) for key in sorted_keys])

In [None]:
# Grab total tokens for each president from prior tokenization
# Keys are lowercased so the lookup below works whatever case proportion_tokens uses.
total_words = {}
for president, counter in sorted(proportion_tokens.items()):
    total_words[president.lower()] = sum(counter.values())

file_name = "unprocessed_dynamic_proportions.csv"

# Case-insensitive lookup of each president's first year in office, used for column order.
first_year_by_name = {name.lower(): years[0] for name, years in PRESIDENTS.items()}
sorted_keys = sorted(results.keys(), key=lambda x: first_year_by_name[x.lower()])

# Row labels: every word-list name seen for any president, in first-seen order, so
# empty results or lists missing for some presidents do not raise.
value_keys = list(dict.fromkeys(
    list_name for key in sorted_keys for list_name in results[key]
))

with open(file_name, 'w', newline='') as f:
    writer = csv.writer(f)

    # One column per president, one row per word list.
    writer.writerow(['Presidents'] + sorted_keys)

    # Divide matching tokens for the dictionary by the total amount of tokens by president
    for value_key in value_keys:
        new_row = [value_key]
        for key in sorted_keys:
            president_total = total_words[key.lower()]
            matched = results[key].get(value_key, 0)
            # Guard against a president with no counted tokens instead of dividing by zero.
            proportion = matched / president_total if president_total else 0.0
            new_row.append(proportion)
        writer.writerow(new_row)