# Scraping PDFs

In [1]:
# load libraries
import pandas as pd 
import re 
from tika import parser

## Create State Dataframe

In [2]:
# One (state, governor, party) tuple per state, all lower-case and in
# alphabetical order by state
_GOVERNOR_ROWS = [
    ("alabama", "kay ivey", "republican"),
    ("alaska", "mike dunleavy", "republican"),
    ("arizona", "katie hobbs", "democratic"),
    ("arkansas", "sarah huckabee sanders", "republican"),
    ("california", "gavin newsom", "democratic"),
    ("colorado", "jared polis", "democratic"),
    ("connecticut", "ned lamont", "democratic"),
    ("delaware", "john carney", "democratic"),
    ("florida", "ron desantis", "republican"),
    ("georgia", "brian kemp", "republican"),
    ("hawaii", "josh green", "democratic"),
    ("idaho", "brad little", "republican"),
    ("illinois", "jb pritzker", "democratic"),
    ("indiana", "eric holcomb", "republican"),
    ("iowa", "kim reynolds", "republican"),
    ("kansas", "laura kelly", "democratic"),
    ("kentucky", "andy beshear", "democratic"),
    ("louisiana", "john bel edwards", "democratic"),
    ("maine", "janet mills", "democratic"),
    ("maryland", "wes moore", "democratic"),
    ("massachusetts", "maura healey", "democratic"),
    ("michigan", "gretchen whitmer", "democratic"),
    ("minnesota", "tim walz", "democratic"),
    ("mississippi", "tate reeves", "republican"),
    ("missouri", "mike parson", "republican"),
    ("montana", "greg gianforte", "republican"),
    ("nebraska", "jim pillen", "republican"),
    ("nevada", "joe lombardo", "republican"),
    ("new hampshire", "chris sununu", "republican"),
    ("new jersey", "phil murphy", "democratic"),
    ("new mexico", "michelle lujan grisham", "democratic"),
    ("new york", "kathy hochul", "democratic"),
    ("north carolina", "roy cooper", "democratic"),
    ("north dakota", "doug burgum", "republican"),
    ("ohio", "mike dewine", "republican"),
    ("oklahoma", "kevin stitt", "republican"),
    ("oregon", "tina kotek", "democratic"),
    ("pennsylvania", "josh shapiro", "democratic"),
    ("rhode island", "dan mckee", "democratic"),
    ("south carolina", "henry mcmaster", "republican"),
    ("south dakota", "kristi noem", "republican"),
    ("tennessee", "bill lee", "republican"),
    ("texas", "greg abbott", "republican"),
    ("utah", "spencer cox", "republican"),
    ("vermont", "phil scott", "republican"),
    ("virginia", "glenn youngkin", "republican"),
    ("washington", "jay inslee", "democratic"),
    ("west virginia", "jim justice", "republican"),
    ("wisconsin", "tony evers", "democratic"),
    ("wyoming", "mark gordon", "republican"),
]

# List of per-state records (one dict per state) with the governor's name and party
state_gov = [
    {"state": state, "gov_name": gov_name, "party": party}
    for state, gov_name, party in _GOVERNOR_ROWS
]

In [3]:
# build the state dataframe: one row per record in state_gov
df = pd.DataFrame.from_records(state_gov)

In [4]:
# every state starts with an empty 'type' (kind of address) and 'text' (speech) column
for blank_col in ('type', 'text'):
    df[blank_col] = ''

## Scrape PDFs

### Test on One PDF

In [5]:
# parse a single pdf with tika and keep the value stored under the result's 'content' key
raw = parser.from_file('sos_pdfs/alaska.pdf')
text = raw['content']

2024-04-10 15:09:16,204 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /var/folders/5_/9k6tj5157bx2n0nwswpfzhhw0000gn/T/tika-server.jar.
2024-04-10 15:09:19,006 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /var/folders/5_/9k6tj5157bx2n0nwswpfzhhw0000gn/T/tika-server.jar.md5.
2024-04-10 15:09:19,281 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [6]:
# display the raw extracted text to see what needs cleaning (e.g. the '\n' escape sequences)
text

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGovernor Dunleavy’s 2023 State of the State Address \n\n1 \n\nThank you, Lieutenant Governor Dahlstrom, Senate President Stevens, Speaker Tilton, \nand members of the 33rd Legislative Session. \n \nI want to thank you for the invitation to speak to our fellow Alaskans tonight.  \n \nFirst, I want to recognize my wife of 35 years, First Lady Rose Dunleavy. \n \nIt would be virtually impossible to do this job without your support, and so, thank you, \nRose, for putting up with me. \n \nI’d also like to recognize the members of my cabinet in attendance tonight. Could you \nplease stand and be recognized? \n \nThank you for everything you’re doing for the great State of Alaska. \n \nTo the members of the Legislature: Welcome, and thank you for wanting to serve your \nconstituents.  \n \n\nTonight, as I stand before you and the great people of Alaska regarding the state of our \n\nstate, there is much to be thankfu

In [7]:
# strip every newline character out of the extracted text
clean_text = text.replace('\n', '')

In [8]:
# display the text again to confirm the newline characters are gone
clean_text

'Governor Dunleavy’s 2023 State of the State Address 1 Thank you, Lieutenant Governor Dahlstrom, Senate President Stevens, Speaker Tilton, and members of the 33rd Legislative Session.  I want to thank you for the invitation to speak to our fellow Alaskans tonight.   First, I want to recognize my wife of 35 years, First Lady Rose Dunleavy.  It would be virtually impossible to do this job without your support, and so, thank you, Rose, for putting up with me.  I’d also like to recognize the members of my cabinet in attendance tonight. Could you please stand and be recognized?  Thank you for everything you’re doing for the great State of Alaska.  To the members of the Legislature: Welcome, and thank you for wanting to serve your constituents.   Tonight, as I stand before you and the great people of Alaska regarding the state of our state, there is much to be thankful for.  I’m also optimistic about the start of this session because I believe what we do now … in the next four months, will s

In [9]:
# write the cleaned speech and its document type into alaska's row
alaska_rows = df['state'].eq('alaska')
df.loc[alaska_rows, 'text'] = clean_text
df.loc[alaska_rows, 'type'] = 'sos'

In [11]:
# preview the first rows to confirm alaska's 'type' and 'text' columns were filled in
df.head()

Unnamed: 0,state,gov_name,party,type,text
0,alabama,kay ivey,republican,,
1,alaska,mike dunleavy,republican,sos,Governor Dunleavy’s 2023 State of the State Ad...
2,arizona,katie hobbs,democratic,,
3,arkansas,sarah huckabee sanders,republican,,
4,california,gavin newsom,democratic,,


### Create Functions

In [12]:
def scrape_pdf(state):
    '''Extract the text of a locally saved state speech pdf, with newlines removed.

       inputs: the name of a state (string); spaces are replaced with underscores
               to build the file name, e.g. 'north dakota' -> sos_pdfs/north_dakota.pdf
       outputs: the cleaned text (string) from a state speech pdf (saved locally)
       raises: ValueError if the parser returns no text content for the pdf'''
    # str.replace leaves names without spaces untouched, so a single code path
    # covers both one-word and multi-word state names
    state_name = state.replace(' ', '_')
    pdf_path = f'sos_pdfs/{state_name}.pdf'
    text = parser.from_file(pdf_path)['content']
    # the parser can hand back None as the content when it extracts no text;
    # fail with the file name instead of an opaque TypeError further down
    if text is None:
        raise ValueError(f'no text could be extracted from {pdf_path}')
    return text.replace('\n', '')

In [13]:
def add_to_df(STATE, doc_type):
    '''Scrape one state's pdf and record the result in the state dataframe.

       inputs: the name of the state (string) and the type of document (string)
       outputs: the state dataframe, with the scraped pdf text in 'text' and the
                document type in 'type' for the rows whose 'state' equals STATE'''
    scraped_text = scrape_pdf(state=STATE)
    # build the row selector once and reuse it for both column writes
    state_rows = df['state'].eq(STATE)
    df.loc[state_rows, 'text'] = scraped_text
    df.loc[state_rows, 'type'] = doc_type
    return df

In [14]:
# try the helper on a single state before looping over the rest
df = add_to_df(STATE='alabama', doc_type='sos')

### Scrape All States with PDF Speeches

In [15]:
# State of the State addresses (or their equivalents) that were available as PDFs
pdf_states_sos = [
    'alabama', 'alaska', 'colorado', 'delaware', 'hawaii', 'idaho',
    'kansas', 'kentucky', 'missouri', 'north dakota', 'utah',
]

# scrape each pdf in turn and tag its row with the 'sos' document type
for state_name in pdf_states_sos:
    df = add_to_df(state_name, 'sos')

In [16]:
# display the dataframe to confirm the scraped states now have 'type' and 'text' filled in
df

Unnamed: 0,state,gov_name,party,type,text
0,alabama,kay ivey,republican,sos,Page 1 of 8 2023 STATE OF THE STATE ADDRES...
1,alaska,mike dunleavy,republican,sos,Governor Dunleavy’s 2023 State of the State Ad...
2,arizona,katie hobbs,democratic,,
3,arkansas,sarah huckabee sanders,republican,,
4,california,gavin newsom,democratic,,
5,colorado,jared polis,democratic,sos,ForMedia-2023-StateOfState-GovernorPolis-AsPre...
6,connecticut,ned lamont,democratic,,
7,delaware,john carney,democratic,sos,"January 19, 2023 As prepared for delivery #DE..."
8,florida,ron desantis,republican,,
9,georgia,brian kemp,republican,,


## Export 

In [17]:
# write the dataframe to csv, leaving out the row index
df.to_csv(path_or_buf='sos_withpdf.csv', index=False)