## Importing Data and Initial Cleaning

In [6]:
import os
import pandas as pd
import pickle

In [7]:
# Start with an empty frame: 'president' will hold the source file name and
# 'address' the text read from that file.

initial_columns = ['president', 'address']
df = pd.DataFrame(columns=initial_columns)

In [8]:
# Folder that contains the inaugural address text files

path = r'C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Data\InauguralTexts'

In [9]:
# Read every file in `path` into one dataframe: the file name goes in
# 'president' and the file's full text in 'address' (one row per file).
#
# - The listing is sorted so row positions are deterministic; os.listdir makes
#   no ordering guarantee.
# - Rows are collected in a list and the frame is built once, because
#   DataFrame.append was removed in pandas 2.0 (and copying the frame on every
#   iteration is quadratic).
# - The frame is rebuilt from scratch, so re-running the cell does not
#   duplicate rows.

records = []
for filename in sorted(os.listdir(path)):
    # NOTE(review): files are decoded with the platform default encoding, as
    # before -- confirm the files' encoding and pass encoding= explicitly.
    with open(os.path.join(path, filename)) as f:
        records.append({'president': filename, 'address': f.read()})

df = pd.DataFrame(records, columns=['president', 'address'])

In [10]:
# Disambiguate the two Roosevelts in the file names.
# Rows are addressed by their integer index labels; this assumes the default
# 0..n-1 index with files loaded in file-name order -- TODO confirm.
# Values are written with .loc[row, column]: chained assignment
# (df['president'][i] = ...) may write to a temporary copy, emits
# SettingWithCopyWarning, and has no effect under pandas copy-on-write.

# Rows 36-39: "Roosevelt" -> "FRoosevelt"
for i in range(36, 40):
    df.loc[i, 'president'] = df.loc[i, 'president'].replace("Roosevelt", "FRoosevelt")

# Row 29: "Roosevelt" -> "TRoosevelt"
df.loc[29, 'president'] = df.loc[29, 'president'].replace("Roosevelt", "TRoosevelt")

In [11]:
# Disambiguate presidents who share a surname by adding initials to the name
# part of the file name.
# Keys are integer row labels (assumes the default 0..n-1 index with files
# loaded in file-name order -- TODO confirm); values are (old, new) substrings.
# Values are written with .loc[row, column]: chained assignment
# (df['president'][n] = ...) may write to a temporary copy, emits
# SettingWithCopyWarning, and has no effect under pandas copy-on-write.
surname_fixes = {
    2: ("Adams", "JAdams"),         # "Adams" -> "JAdams"
    9: ("Adams", "JQAdams"),        # "Adams" -> "JQAdams"
    50: ("Bush", "HWBush"),         # "Bush" -> "HWBush"
    53: ("Bush", "GWBush"),         # "Bush" -> "GWBush"
    54: ("Bush", "GWBush"),
    13: ("Harrison", "WHarrison"),  # "Harrison" -> "WHarrison"
    26: ("Harrison", "BHarrison"),  # "Harrison" -> "BHarrison"
}

for row_label, (old_name, new_name) in surname_fixes.items():
    df.loc[row_label, 'president'] = df.loc[row_label, 'president'].replace(old_name, new_name)

In [12]:
# President number: the first two characters of the file name (after removing
# any leading/trailing dots), placed as the first column.

number_prefix = df['president'].str.strip(".").str[0:2]
df['president_number'] = number_prefix
df = df[['president_number', 'president', 'address']]

In [13]:
# Split each file name on "." once and reuse the pieces.
name_parts = [file_name.split(".") for file_name in df['president']]

# Second piece: the president's name
df['pres_name'] = [parts[1] for parts in name_parts]

# Third piece: the term number
df['term'] = [parts[2] for parts in name_parts]

In [14]:
# Label combining term number and president name, e.g. "1 Washington"

term_with_space = df['term'].add(" ")
df['pres_det'] = term_with_space.add(df['pres_name'])

In [15]:
# Put the derived identifier columns first, then the raw file name and text
column_order = ['president_number', 'term', 'pres_name', 'pres_det', 'president', 'address']
df = df.loc[:, column_order]

In [16]:
# Load the per-president details table used for the party merge.
# The file name is joined with os.path.join: "\Pres_Details.csv" in a normal
# (non-raw) string literal contains the invalid escape sequence "\P", which is
# a SyntaxWarning on Python 3.12+.
path = r"C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Data\PresidentInfo"
df_party = pd.read_csv(os.path.join(path, "Pres_Details.csv"))

In [17]:
# Attach party information:
#   1. cast president_number to int so it can be matched against 'S.No.',
#   2. merge with the party table on president_number == S.No.,
#   3. drop the party-table columns that are not needed afterwards.
unused_columns = ['start', 'end', 'president_y', 'vice', 'prior', 'S.No.']

df = (
    df.astype({'president_number': int})
      .merge(df_party, left_on="president_number", right_on='S.No.')
      .drop(columns=unused_columns)
)

In [18]:
# Load the inauguration dates table (source of the 'Year' column).
# The file name is joined with os.path.join: "\Inaugural_Dates.csv" in a normal
# (non-raw) string literal contains the invalid escape sequence "\I", which is
# a SyntaxWarning on Python 3.12+.
path = r"C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Data\PresidentInfo"
df_dates = pd.read_csv(os.path.join(path, "Inaugural_Dates.csv"))

In [19]:
# Attach the 'Year' column by placing the dates table side by side with df.
# NOTE(review): concat(axis=1) aligns on the row index, not on president or
# term -- this presumably relies on df_dates having one row per address in the
# same order as df; confirm.
year_and_pres = df_dates[['Year', 'Pres']]
df = pd.concat([df, year_and_pres], axis=1).drop(columns='Pres')
df.head()

Unnamed: 0,president_number,term,pres_name,pres_det,president_x,address,party,Year
0,1,1,Washington,1 Washington,01.Washington.1.txt,AMONG the vicissitudes incident to life no eve...,Nonpartisan,1789
1,1,2,Washington,2 Washington,01.Washington.2.txt,I AM again called upon by the voice of my coun...,Nonpartisan,1793
2,2,1,JAdams,1 JAdams,02.JAdams.1.txt,"WHEN it was first perceived, in early times, ...",Federalist,1797
3,3,1,Jefferson,1 Jefferson,03.Jefferson.1.txt,CALLED upon to undertake the duties of the fi...,Democratic-Republican,1801
4,3,2,Jefferson,2 Jefferson,03.Jefferson.2.txt,"PROCEEDING, fellow-citizens, to that qualific...",Democratic-Republican,1805


In [20]:
# Create helper function to group each address into 50 year blocks

def time_period(x):
    """Return the label of the 50-year block that the year ``x`` falls in.

    Parameters
    ----------
    x : number
        A year (applied to the values of the 'Year' column).

    Returns
    -------
    str
        'pre-1800'  for x <= 1800,
        '1800-1850' for 1800 < x <= 1850,
        '1851-1900' for 1850 < x <= 1900,
        '1901-1950' for 1900 < x <= 1950,
        '1951-2000' for 1950 < x <= 2000,
        'post-2000' for x > 2000.

    Raises
    ------
    ValueError
        If ``x`` does not fall in any block (e.g. NaN), instead of failing
        with an unbound local variable.
    """
    # Every bound is tested against x itself. The upper bounds used to read an
    # unrelated global `i`, so the function only worked when leftover notebook
    # state happened to define a small `i`.
    # Checking from the latest block downwards keeps the labels unchanged.
    if x > 2000:
        return 'post-2000'
    if x > 1950:
        return '1951-2000'
    if x > 1900:
        return '1901-1950'
    if x > 1850:
        return '1851-1900'
    if x > 1800:
        return '1800-1850'
    if x < 1801:
        return 'pre-1800'
    raise ValueError("cannot assign a time period to year {!r}".format(x))

In [21]:
# Label every address with the time period of its inauguration year

df['time_period'] = df['Year'].apply(time_period)
df.head()

Unnamed: 0,president_number,term,pres_name,pres_det,president_x,address,party,Year,time_period
0,1,1,Washington,1 Washington,01.Washington.1.txt,AMONG the vicissitudes incident to life no eve...,Nonpartisan,1789,pre-1800
1,1,2,Washington,2 Washington,01.Washington.2.txt,I AM again called upon by the voice of my coun...,Nonpartisan,1793,pre-1800
2,2,1,JAdams,1 JAdams,02.JAdams.1.txt,"WHEN it was first perceived, in early times, ...",Federalist,1797,pre-1800
3,3,1,Jefferson,1 Jefferson,03.Jefferson.1.txt,CALLED upon to undertake the duties of the fi...,Democratic-Republican,1801,1800-1850
4,3,2,Jefferson,2 Jefferson,03.Jefferson.2.txt,"PROCEEDING, fellow-citizens, to that qualific...",Democratic-Republican,1805,1800-1850


In [22]:
# Clean party labels ahead of grouping presidents by party
# (pre-Lincoln Democrats are to be kept separate)

# Strip surrounding whitespace, e.g. the extra space in the Republican label
df['party'] = df['party'].apply(str.strip)

In [23]:
# Serialise the cleaned dataframe to disk so it can be loaded later

path = r"C:\Users\Andrew\Documents\Metis\NLP_Inaugural_Addresses\Pickled_Files\cleaned_addresses.pkl"
with open(path, mode='wb') as pickle_file:
    pickle_file.write(pickle.dumps(df))