# Appendix 16 - Timestamped Text of the 2018 State of the Union Address

In [1]:
import pandas as pd
from datetime import datetime
from datetime import timedelta

# Three parallel lists, one entry per transcript record
speechdict = {field: [] for field in ("time", "content", "linebreak")}

# The full transcript and video/audio (with subtitles) of the speech were published by hundreds of online news outlets,
# which is how we obtained a timestamped version of the speech text.
# Walk through the text file of the speech and append its content to the dictionary.
with open("speech.txt") as speech:
    # Every record spans four lines: the loop itself consumes the first one and
    # discards it (presumably a subtitle sequence number -- TODO confirm against
    # speech.txt); the following three are stored in order.
    for _leading_line in speech:
        for field in ("time", "content", "linebreak"):
            speechdict[field].append(speech.readline())

In [2]:
# Display the populated dictionary to check what was read from the file
speechdict

{'content': ['Mr.. Speaker\n',
  'Mr.. Vice president members\n',
  'of Congress\n',
  'the first lady of the United States and my fellow Americans\n',
  'Less than one year has passed since I first stood at this podium\n',
  'in this majestic chamber\n',
  'To speak on behalf of the American people and to address their concerns their hopes and their dreams\n',
  'That night our new administration had already taken very swift action a\n',
  'New tide of optimism was already sweeping across our land\n',
  'Each day since we have gone forward with a clear vision and a righteous mission\n',
  'to make America great again for all Americans\n',
  'Over the last year we have made incredible progress and achieved\n',
  'extraordinary success\n',
  'We are faced challenges. We expected and others we could never have imagined\n',
  'We have shared in the heights of victory and the pains of hardship\n',
  'We have endured floods and fires and storms\n',
  "But through it all we have seen the bea

In [4]:
# Build the dataframe from the dictionary and discard the 'linebreak' column
# in one chained step; only 'time' and 'content' are kept.
speechframe = pd.DataFrame(speechdict).drop(columns=['linebreak'])

# Moment that transcript time 00:00:00 maps to on the tweet-timestamp clock.
# This is exactly what the former offset (43129 days, 2 h, 10 min, 14 s added
# to strptime's default date of 1900-01-01) evaluated to.
# NOTE(review): presumably UTC, matching the tweet data -- confirm.
SPEECH_START = datetime(2018, 1, 31, 2, 10, 14)


def clean(x, start=SPEECH_START):
    """Convert a raw transcript time string into an absolute timestamp.

    Parameters
    ----------
    x : str
        Raw value from the 'time' column. Only the first eight characters are
        used, and they must be in ``HH:MM:SS`` form.
    start : datetime, optional
        Moment that corresponds to ``00:00:00`` in the transcript. Defaults to
        ``SPEECH_START``.

    Returns
    -------
    datetime
        ``start`` plus the elapsed time encoded at the beginning of ``x``.

    Raises
    ------
    ValueError
        If the first eight characters of ``x`` are not a valid ``HH:MM:SS``
        time.
    """
    # Everything after the first eight characters is extraneous and ignored
    stamp = datetime.strptime(x[:8], '%H:%M:%S')
    # Keep only the time-of-day part; the date strptime fills in is irrelevant
    elapsed = timedelta(hours=stamp.hour, minutes=stamp.minute, seconds=stamp.second)
    return start + elapsed

# Derive an absolute timestamp for every row from its raw 'time' string
speechframe['tweettime'] = speechframe['time'].map(clean)

In [5]:
# Write the timestamped transcript to an Excel workbook.
# The writer is used as a context manager so the file is saved and closed on
# exit, even if the export raises; ExcelWriter.save() no longer exists in
# pandas 2.0+. The sheet name is passed by keyword because passing it
# positionally is deprecated.
with pd.ExcelWriter('speechoutput.xlsx') as writer:
    speechframe.to_excel(writer, sheet_name='Sheet1')