# Text Extraction from Speech PDFs

In [2]:
from PyPDF2 import PdfReader
import pandas as pd
import os
import re

In [3]:
# Directory (relative to the working directory) that holds the speech PDFs;
# extract_data_from_pdf joins each file name onto this path.
folder = 'pdfs'

The following function extracts information from two main formats, or ignores certain files if they don't match a given format. I deal with non-matching documents later on.

In [272]:
def extract_data_from_pdf(fname):
    """Extract metadata and cleaned body text from one speech PDF.

    The file is read from the module-level ``folder``. Three layouts are
    distinguished:

    * layout 1 -- the first page contains 'For release on delivery' or
      'For release at'. Date and author are parsed from that cover page,
      the title comes from the '/Title' metadata entry (if present), and
      the cover page is dropped from the body text.
    * layout 2 -- the '/Creator' metadata entry contains 'PScript5'. These
      files are meant to be extracted manually, so title, date and author
      are set to None and the page text is concatenated without cleaning.
    * layout 0 -- everything else. Author and title are taken from the
      '/Title' metadata entry ('author: title') and the date from
      '/Subject'.

    Args:
        fname: File name of the PDF inside ``folder``.

    Returns:
        A dict with the keys 'filename', 'date', 'author', 'title' and
        'text'.

    Raises:
        AttributeError: If a date or author pattern does not match
            (``re.search`` returns None).
        KeyError: If a layout-0 file lacks '/Title' or '/Subject' metadata.
        IndexError: If a layout-0 '/Title' contains no ':'.
    """
    record = {'filename': fname}

    reader = PdfReader(os.path.join(folder, fname))
    # PyPDF2 documents metadata as optional; fall back to an empty mapping
    # so the key checks below do not fail on None.
    meta = reader.metadata or {}
    pages = reader.pages

    # 0: metadata-driven layout (default), 1: 'For release ...' cover page,
    # 2: PScript5-generated file left for manual extraction.
    doc_format = 0

    # identify which format a document belongs to
    first_page = pages[0].extract_text()
    if 'For release on delivery' in first_page or 'For release at' in first_page:
        doc_format = 1
        # The cover page only carries release info, not speech text.
        pages = pages[1:]
        first_page = first_page.replace('\n', '')

        record['date'] = re.search(r'\w* [0-3]{0,1}[0-9]{1} ?, [0-9]{4}', first_page).group()
        record['author'] = re.search(r'(?:[rR]emarks|Statement)?[ ]{1,2}by[ ]{1,3}([\w]+ [\w]. [\w]+|[\w]+ [\w]+)[ ]{1,3}', first_page).groups()[0]

        if '/Title' in meta:
            raw_title = meta['/Title']
            # Metadata titles look like 'prefix: title' or 'prefix - title';
            # keep only the part after the separator.
            if ':' in raw_title:
                record['title'] = raw_title.split(':')[1].strip()
            elif ' -' in raw_title:
                record['title'] = raw_title.split('-')[1].strip()
            else:
                record['title'] = raw_title
        else:
            record['title'] = None
    elif '/Creator' in meta and 'PScript5' in meta['/Creator']:
        # Skip documents of this format, which have to be manually extracted.
        # The key must be '/Creator' (with the slash): the previous check for
        # 'Creator' never matched the key that is read here, so this branch
        # could not be reached.
        doc_format = 2
        record['title'] = None
        record['date'] = None
        # Keep the record's keys consistent with the other layouts.
        record['author'] = None
    else:
        # The remaining examples fit into the following format
        split_title = meta['/Title'].split(':')
        record['author'] = split_title[0]
        record['date'] = re.search(r'[0-3]{0,1}[0-9]{1} \w*[,]{0,1} [0-9]{3,4}|\w+ [0-9]{1,2},? [0-9]{4}', meta['/Subject']).group()
        record['title'] = split_title[1].strip()

    combined_text = ""
    for i, page in enumerate(pages):
        text = page.extract_text()
        if doc_format == 0:
            # remove preamble, only on first page
            if i == 0:
                text = text.split('*')[-1]
            text = text.strip()
            # remove page number and footer (the last line of the page)
            # NOTE(review): if the page has no newline, rfind returns -1 and
            # only the final character is dropped -- confirm this is intended.
            text = text[:text.rfind('\n')]
            # remove footnotes, which start after a run of eight spaces;
            # only cut when the marker exists, otherwise rfind's -1 would
            # silently chop the last character of the page.
            footnote_start = text.rfind('        ')
            if footnote_start != -1:
                text = text[:footnote_start]
            text = text.replace('\n', '')
        elif doc_format == 1:
            # remove page numbers
            text = re.sub(r'- [0-9]{1,2} -', '', text)
            # remove footnotes
            text = re.sub(r'^[0-9]+[ ]{1}\w.*', '', text, flags=re.M)
            # remove footnote references
            text = re.sub(r'^[1-9]+[ ].', '', text, flags=re.M)
            # remove line breaks
            text = text.replace('\n', '')
        # Strip after every page so empty pages do not leave stray spaces.
        combined_text = (combined_text + ' ' + text).strip()

    record['text'] = combined_text

    return record

I identified a few anomalies. Because there are so few of them, I exclude them from this process and extract their information manually.

In [None]:

# File names that the extraction loop skips.
problem = ['r220509b.pdf', 'r220615g.pdf', 'r210114b.pdf', 'r211128e.pdf', 'r161003a.pdf', 'r221128l.pdf']

I then create an array of entries to form a dataframe.

In [None]:
folder = 'pdfs'
records = []

# Build one record per PDF in the folder, leaving out the files listed in
# `problem`. Directory order is kept exactly as os.listdir returns it.
for file in os.listdir(folder):
    if file not in problem:
        record = extract_data_from_pdf(file)
        records.append(record)

# Show the collected records as the cell output.
records

Note: I had to manually re-download r150416a.pdf because, although it showed up in the folder, the file was empty.

In [291]:
# One row per record dict collected in `records`; dict keys become columns.
df = pd.DataFrame.from_records(records)

Here I fix a few typos that messed up date parsing, and then parse the date and save the file.

In [292]:
# Quick look at the extracted author names (notebook inspection only).
df['author'].value_counts()

# Replace missing dates with '' so the string tests below return plain
# booleans. Assign the result back rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# is deprecated and does not update df under pandas Copy-on-Write.
df['date'] = df['date'].fillna('')

# Rows whose date is missing or contains the 'Mach' typo (inspection only).
df[(df['date'] == '') | (df['date'].str.contains('Mach'))]

# fixing typos
df.loc[df['date'].str.contains('Mach'), 'date'] = '15 March 2016' #mistyped date
df.loc[df['filename'] == 'r171020g.pdf', 'date'] = '18 October 2017' #mistyped date
# NOTE(review): 1177 is a row label that depends on the order in which the
# files were listed; if that label does not exist, .loc will append a new
# row. Consider selecting by filename instead -- confirm which file this is.
df.loc[1177, 'date'] = 'June 17, 2022'

# The date strings come in several layouts, hence format='mixed'.
df['date'] = pd.to_datetime(df['date'], format='mixed')

df.to_csv('speeches.csv')

Finally, I combine the documents that I systematically extracted information from with those that I had to manually extract information from, and then save the file.

In [38]:
import pandas as pd

# Load the automatically extracted speeches (first CSV column is the saved
# index) and the manually extracted ones.
extracted = pd.read_csv('speeches.csv', index_col=0)
manual = pd.read_csv('manual.csv')

# Stack both tables, automatic rows first, under a fresh 0..n-1 index.
df = pd.concat([extracted, manual], ignore_index=True)

# CSV round-tripping leaves dates as strings; convert them back.
df['date'] = pd.to_datetime(df['date'])

# Write the combined table.
df.to_csv('all_speeches.csv')
