# Scraping SOS from Websites

In [1]:
# set up 
import requests 
import bs4
import pandas as pd 
import numpy as np
import time 
import re
from tika import parser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from io import BytesIO
from urllib.request import urlopen

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

%xmode Minimal

Exception reporting mode: Minimal


In [2]:
# set headers
header_list = {'User-Agent': 'REDACTED',
              'Accept-Language': 'en-US,en;q=0.9'}

In [14]:
# define a customized BeautifulSoup function (tweaked after scraping)
def scrape_content(link, state, release_type, html1, html2, id_info=None, **kwargs):
    '''purpose: use BeautifulSoup to scrape content from the website and add it to the dataframe

       input: the website link (string); state (string); type of document (string);
              the html tag for the content block (string); the html tag for the paragraphs (string);
              the id of the content block, if relevant (string);
              other keyword arguments, passed through to BeautifulSoup's find (e.g. class_).
       output: the global df with the cleaned text (string) and the type of document (string, e.g. 'sos'
               or 'inaug') written to the 'text' and 'type' columns of the row(s) for the state.
               If anything fails, the error is printed and df is returned unchanged, so
               `df = scrape_content(...)` never replaces df with None.'''
    try:
        # a timeout keeps an unresponsive server from hanging the notebook forever
        server_response = requests.get(link, headers=header_list, timeout=60)
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # NOTE(review): when id_info is None the filter {"id": None} is still applied; BeautifulSoup
        # appears to treat that as "tag has no id attribute" -- confirm this is intended. It is kept
        # as-is because the per-state calls were tuned against this behavior.
        try:
            result = soup_link.find(html1, {"id": id_info}, **kwargs)
        except Exception:
            # e.g. kwargs already carries its own attrs filter; retry without the id filter
            result = soup_link.find(html1, **kwargs)

        if result is None:
            raise LookupError(f'no <{html1}> content block found at {link}')

        # take the text of every html2 tag, swapping non-breaking spaces and newlines for plain spaces
        cleaned = [tag.text.replace(u'\xa0', u' ').replace(u'\n', u' ')
                   for tag in result.find_all(html2)]
        result_merged = " ".join(cleaned)

        # add to df
        state_rows = df.state == state
        df.loc[state_rows, 'text'] = result_merged
        df.loc[state_rows, 'type'] = release_type

    except Exception as e:
        # best-effort: report the problem and fall through so the caller still gets df back
        print(f'error due to {e}')

    return df

In [4]:
# define a customized Selenium function (tweaked after testing individual states)
def scrape_selenium(link, STATE, xpath, ptag, release_type):
    '''purpose: uses Selenium to dynamically scrape the webpage and add it to the dataframe

       input: website link (string), state name (string), xpath of content on website (string), html tag for the
              text (string), and release type (string)
       output: the global df with the scraped text (string) and the type of document (string, e.g. 'sos'
               or 'inaug') written to the 'text' and 'type' columns of the row(s) for the state.
               Selenium errors (e.g. the xpath is not found) propagate to the caller; the browser
               session is closed either way.'''

    # initialize the session
    driver = webdriver.Chrome()
    try:
        driver.get(link)
        # fixed pause to give the page time to render before looking for the content
        time.sleep(3)

        # find the content block
        results = driver.find_element(By.XPATH, xpath)

        # find the text; it must be read while the session is still open
        p_results = results.find_elements(By.TAG_NAME, ptag)
        p_text = [i.text for i in p_results]
    finally:
        # close the session even when a lookup fails, so no Chrome process is left running
        driver.quit()

    result_merged = " ".join(p_text)

    # add to df
    state_rows = df.state == STATE
    df.loc[state_rows, 'text'] = result_merged
    df.loc[state_rows, 'type'] = release_type

    return df

In [5]:
# copy pdf scraping function developed in the other notebook
def scrape_pdf(state):
    '''inputs: the name of a state (string); spaces are replaced with underscores to build the file name
       outputs: the text (string) of the locally saved pdf sos_pdfs/<state>.pdf with every newline
                character removed
       raises: ValueError if Tika returns no text content for the pdf'''
    # 'new york' -> 'new_york'; names without spaces are left as they are
    state_name = state.replace(' ', '_')
    pdf_path = f'sos_pdfs/{state_name}.pdf'

    raw = parser.from_file(pdf_path)
    text = raw['content']
    if text is None:
        # fail with a clear message instead of an opaque TypeError from the string clean-up below
        raise ValueError(f'no text could be extracted from {pdf_path}')

    return text.replace('\n', '')

In [6]:
# copy pdf function developed in the other notebook
def add_pdf_to_df(STATE, doc_type):
    '''inputs: the name of the state (string) and the type of document (string)
       outputs: the global df, with the text returned by scrape_pdf written to the 'text' column and
                doc_type written to the 'type' column of the row(s) whose state equals STATE'''
    # extract the pdf text first; if this fails, df is left untouched
    pdf_text = scrape_pdf(state=STATE)

    # select the row(s) for this state once and fill both columns
    state_rows = df['state'] == STATE
    df.loc[state_rows, 'text'] = pdf_text
    df.loc[state_rows, 'type'] = doc_type
    return df

In [7]:
# read in df
# originally used sos_withpdf.csv. The file sos_df_23.csv is a more recent version of the file
df = pd.read_csv('sos_df_23.csv')

In [8]:
df

Unnamed: 0,state,gov_name,party,type,text
0,alabama,kay ivey,republican,sos,Page 1 of 8 2023 STATE OF THE STATE ADDRES...
1,alaska,mike dunleavy,republican,sos,Governor Dunleavy’s 2023 State of the State Ad...
2,arizona,katie hobbs,democratic,sos,"Good afternoon, Chief Justice Brutinel, Secret..."
3,arkansas,sarah huckabee sanders,republican,inaug,"LITTLE ROCK, Ark.— Today, Governor Sarah Hucka..."
4,california,gavin newsom,democratic,inaug,Watch Governor Newsom’s swearing-in and inaugu...
5,colorado,jared polis,democratic,sos,ForMedia-2023-StateOfState-GovernorPolis-AsPre...
6,connecticut,ned lamont,democratic,sos,\r\n01/03/2023 2023 State of the ...
7,delaware,john carney,democratic,sos,"January 19, 2023 As prepared for delivery #DE..."
8,florida,ron desantis,republican,sos,"From the Space Coast to the Suncoast, from St...."
9,georgia,brian kemp,republican,sos,"Lt. Governor Jones, Speaker Burns, President P..."


## Arizona
[Website](https://azgovernor.gov/office-arizona-governor/news/2023/01/transcript-governor-hobbs-2023-state-state-address)

In [79]:
# create function for scraping the Arizona website
def scrape_arizona(link, state, release_type):
    '''purpose: scrape the speech text from the Arizona Governor's website and add it to the dataframe

       input: the website link (string); state (string); type of document (string)
       output: the global df with the cleaned text and the type of document written to the 'text' and
               'type' columns of the row(s) for the state. If anything fails, the error is printed and
               df is returned unchanged, so `df = scrape_arizona(...)` never replaces df with None.'''
    try:
        # a timeout keeps an unresponsive server from hanging the notebook forever
        server_response = requests.get(link, headers=header_list, timeout=60)
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # the speech sits in the <p> tags of this content block
        content = soup_link.find('div', class_ = 'margin-top20 clearfix field-name-body margin-bottom20')
        if content is None:
            raise LookupError(f'no content block found at {link}')

        # clean: swap non-breaking spaces for plain spaces and merge the paragraphs
        cleaned = [tag.text.replace(u'\xa0', u' ') for tag in content.find_all('p')]
        result_merged = " ".join(cleaned)

        # add to df
        state_rows = df.state == state
        df.loc[state_rows, 'text'] = result_merged
        df.loc[state_rows, 'type'] = release_type

    except Exception as e:
        # best-effort: report the problem and fall through so the caller still gets df back
        print(f'error due to {e}')

    return df

In [80]:
# scrape and add to df 
df = scrape_arizona(link = 'https://azgovernor.gov/office-arizona-governor/news/2023/01/transcript-governor-hobbs-2023-state-state-address',
              state = 'arizona',
              release_type = 'sos')

## Arkansas
[Website](https://governor.arkansas.gov/news_post/sanders-delivers-inaugural-address/)

In [81]:
# create function to scrape the Arkansas website 
def scrape_arkansas(link, state, release_type):
    '''purpose: scrape the speech text from the Arkansas Governor's website and add it to the dataframe

       input: the website link (string); state (string); type of document (string)
       output: the global df with the cleaned text and the type of document written to the 'text' and
               'type' columns of the row(s) for the state. If anything fails, the error is printed and
               df is returned unchanged, so `df = scrape_arkansas(...)` never replaces df with None.'''
    try:
        # a timeout keeps an unresponsive server from hanging the notebook forever
        server_response = requests.get(link, headers=header_list, timeout=60)
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # the speech sits in the <p> tags of the post-content widget
        content = soup_link.find('div', class_ = 'elementor-element elementor-element-95261b0 elementor-widget elementor-widget-theme-post-content')
        if content is None:
            raise LookupError(f'no content block found at {link}')

        # clean: swap non-breaking spaces for plain spaces and merge the paragraphs
        cleaned = [tag.text.replace(u'\xa0', u' ') for tag in content.find_all('p')]
        result_merged = " ".join(cleaned)

        # add to df
        state_rows = df.state == state
        df.loc[state_rows, 'text'] = result_merged
        df.loc[state_rows, 'type'] = release_type

    except Exception as e:
        # best-effort: report the problem and fall through so the caller still gets df back
        print(f'error due to {e}')

    return df

In [82]:
# scrape website and add to the df 
df = scrape_arkansas(link = 'https://governor.arkansas.gov/news_post/sanders-delivers-inaugural-address/', 
                     state = 'arkansas',
                     release_type= 'inaug')

## California
[Website](https://www.gov.ca.gov/2023/01/06/governor-newsom-inaugurated-to-second-term-in-celebration-of-californias-values-diverse-communities/)

In [83]:
# create a function to scrape California's website 
def scrape_california(link, state, release_type):
    '''purpose: scrape the speech text from the California Governor's website and add it to the dataframe

       input: the website link (string); state (string); type of document (string)
       output: the global df with the cleaned text and the type of document written to the 'text' and
               'type' columns of the row(s) for the state. If anything fails, the error is printed and
               df is returned unchanged, so `df = scrape_california(...)` never replaces df with None.'''
    try:
        # a timeout keeps an unresponsive server from hanging the notebook forever
        server_response = requests.get(link, headers=header_list, timeout=60)
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # the speech sits in the <p> tags of the entry-content block
        content = soup_link.find('div', class_ = 'entry-content')
        if content is None:
            raise LookupError(f'no content block found at {link}')

        # clean: swap non-breaking spaces for plain spaces and merge the paragraphs
        cleaned = [tag.text.replace(u'\xa0', u' ') for tag in content.find_all('p')]
        result_merged = " ".join(cleaned)

        # add to df
        state_rows = df.state == state
        df.loc[state_rows, 'text'] = result_merged
        df.loc[state_rows, 'type'] = release_type

    except Exception as e:
        # best-effort: report the problem and fall through so the caller still gets df back
        print(f'error due to {e}')

    return df

In [84]:
# scrape website and add to df 
df = scrape_california(link = 'https://www.gov.ca.gov/2023/01/06/governor-newsom-inaugurated-to-second-term-in-celebration-of-californias-values-diverse-communities/',
                 state = 'california', 
                 release_type = 'inaug')

## Connecticut
[Website](https://portal.ct.gov/Office-of-the-Governor/News/Speeches/Governor-Lamont-2023-State-of-the-State-Address)

In [85]:
# create a function to scrape Connecticut's website 
def scrape_connecticut(link, state, release_type, **kwargs):
    '''purpose: scrape the speech text from the Connecticut Governor's website and add it to the dataframe

       input: the website link (string); state (string); type of document (string);
              other keyword arguments, passed through to BeautifulSoup's find to pick the content
              <div> (e.g. class_).
       output: the global df with the cleaned text and the type of document written to the 'text' and
               'type' columns of the row(s) for the state. If anything fails, the error is printed and
               df is returned unchanged, so `df = scrape_connecticut(...)` never replaces df with None.'''
    try:
        # a timeout keeps an unresponsive server from hanging the notebook forever
        server_response = requests.get(link, headers=header_list, timeout=60)
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # the caller's keyword filters select the <div> that holds the speech paragraphs
        content = soup_link.find('div', **kwargs)
        if content is None:
            raise LookupError(f'no content block found at {link}')

        # clean: swap non-breaking spaces for plain spaces and merge the paragraphs
        cleaned = [tag.text.replace(u'\xa0', u' ') for tag in content.find_all('p')]
        result_merged = " ".join(cleaned)

        # add to df
        state_rows = df.state == state
        df.loc[state_rows, 'text'] = result_merged
        df.loc[state_rows, 'type'] = release_type

    except Exception as e:
        # best-effort: report the problem and fall through so the caller still gets df back
        print(f'error due to {e}')

    return df

In [86]:
# scrape and add to the df 
df = scrape_connecticut(link = 'https://portal.ct.gov/Office-of-the-Governor/News/Speeches/Governor-Lamont-2023-State-of-the-State-Address',
                 state = 'connecticut', 
                 release_type = 'sos',
                 class_ = 'content')
# df

Unnamed: 0,state,gov_name,party,type,text
0,alabama,kay ivey,republican,sos,Page 1 of 8 2023 STATE OF THE STATE ADDRES...
1,alaska,mike dunleavy,republican,sos,Governor Dunleavy’s 2023 State of the State Ad...
2,arizona,katie hobbs,democratic,sos,"Good afternoon, Chief Justice Brutinel, Secret..."
3,arkansas,sarah huckabee sanders,republican,inaug,"LITTLE ROCK, Ark.— Today, Governor Sarah Hucka..."
4,california,gavin newsom,democratic,inaug,Watch Governor Newsom’s swearing-in and inaugu...
5,colorado,jared polis,democratic,sos,ForMedia-2023-StateOfState-GovernorPolis-AsPre...
6,connecticut,ned lamont,democratic,sos,\r\n01/03/2023 2023 State of the ...
7,delaware,john carney,democratic,sos,"January 19, 2023 As prepared for delivery #DE..."
8,florida,ron desantis,republican,,
9,georgia,brian kemp,republican,,


## Florida 
[Website](https://www.flgov.com/2023/01/03/governor-desantis-delivers-inaugural-address-sets-priorities-for-second-term/)

In [98]:
# use the customized BeautifulSoup function to scrape the Florida website 
# the linked page is the inaugural address, so it is labelled 'inaug' like the other inaugural speeches
df = scrape_content(link = 'https://www.flgov.com/2023/01/03/governor-desantis-delivers-inaugural-address-sets-priorities-for-second-term/',
                 state = 'florida', 
                 release_type = 'inaug',
                 html1 = 'tbody',
                 html2= 'p')

## Georgia
[Website](https://gov.georgia.gov/press-releases/2023-01-25/governor-brian-p-kemps-2023-state-state-address)

In [99]:
# use the customized BeautifulSoup function to scrape the Georgia website 
df = scrape_content(link = 'https://gov.georgia.gov/press-releases/2023-01-25/governor-brian-p-kemps-2023-state-state-address',
                 state = 'georgia', 
                 release_type = 'sos',
                 html1 = 'main',
                 html2= 'p', 
                 class_ = 'content-page__main')

## Illinois
[Website](https://www.illinois.gov/news/press-release.26065.html)

In [142]:
# use the customized BeautifulSoup function to scrape the Illinois website 
df = scrape_content(link = 'https://www.illinois.gov/news/press-release.26065.html',
                 state = 'illinois', 
                 release_type = 'sos',
                 html1 = 'div', 
                 html2 = 'div',
                 class_ = 'press-release aem-GridColumn--default--none aem-GridColumn--phone--12 aem-GridColumn--default--9 aem-GridColumn--phone--newline aem-GridColumn aem-GridColumn--offset--phone--0 aem-GridColumn--offset--default--0')

## Indiana
[Website](https://www.in.gov/gov/newsroom/2023-state-of-the-state-address/)

In [33]:
# use the customized BeautifulSoup function to scrape the Indiana website 
df = scrape_content(link = 'https://www.in.gov/gov/newsroom/2023-state-of-the-state-address/',
                 state = 'indiana', 
                 release_type = 'sos',
                 html1 = 'section', 
                 id_info = 'content_container_649017',
                 html2 = 'p')

## Iowa
[Website](https://governor.iowa.gov/press-release/2023-01-10/gov-reynolds-delivers-2023-condition-state)

In [39]:
# use the customized BeautifulSoup function to scrape the Iowa website 
df = scrape_content(link = 'https://governor.iowa.gov/press-release/2023-01-10/gov-reynolds-delivers-2023-condition-state',
                 state = 'iowa', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'clearfix text-formatted field field--name-field-news__body field--type-text-long field--label-hidden field__item',
                 html2 = 'p') 

## Louisiana
[Website (archived)](https://web.archive.org/web/20230903094025/https://gov.louisiana.gov/index.cfm/newsroom/detail/4057)

In [109]:
# accessed archived site using web.archive.org since the Governor is new in 2024
# use the customized BeautifulSoup function to scrape the archived site
df = scrape_content(link = 'https://web.archive.org/web/20230903094025/https://gov.louisiana.gov/index.cfm/newsroom/detail/4057',
                 state = 'louisiana', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'col-md-8 main-content',
                 html2 = 'p') 

## Maine
PDF

In [None]:
# Added from PDF form (not completed in previous notebook)
df = add_pdf_to_df('maine', 'inaug')

## Maryland
[Website](https://governor.maryland.gov/news/press/pages/Governor-Wes-Moore-Delivers-His-First-State-of-the-State-Address.aspx)

In [111]:
# use the customized BeautifulSoup function to scrape the Maryland website 
df = scrape_content(link = 'https://governor.maryland.gov/news/press/pages/Governor-Wes-Moore-Delivers-His-First-State-of-the-State-Address.aspx',
                 state = 'maryland', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'mdg-pressRelease-content',
                 html2 = 'p') 

## Massachusetts
[Website](https://www.wbur.org/news/2023/01/05/read-healey-inauguration-remarks-transcript)

In [112]:
# Note that this version was published via local news and not the Governor's website 
# use the customized BeautifulSoup function to scrape 
df = scrape_content(link = 'https://www.wbur.org/news/2023/01/05/read-healey-inauguration-remarks-transcript',
                 state = 'massachusetts', 
                 release_type = 'inaug',
                 html1 = 'section', 
                 class_ = 'article-section--content hang-punctuation article-section--first article-section--centered article-sections',
                 html2 = 'p') 

## Michigan
[Website](https://www.michigan.gov/whitmer/news/press-releases/2023/01/25/governor-whitmers-2023-state-of-the-state-address-as-prepared-for-delivery)

In [113]:
# use the customized BeautifulSoup function to scrape the Michigan website 
df = scrape_content(link = 'https://www.michigan.gov/whitmer/news/press-releases/2023/01/25/governor-whitmers-2023-state-of-the-state-address-as-prepared-for-delivery',
                 state = 'michigan', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'news-item__section-content',
                 html2 = 'p') 

## Minnesota
PDF

In [9]:
# Added from PDF form (not completed in previous notebook)
df = add_pdf_to_df('minnesota', 'sos')

## Mississippi
[Website](https://mississippitoday.org/2023/01/30/tate-reeves-2023-state-of-the-state/)

In [38]:
# Accessed via a news site rather than from the Governor's website 
# try Selenium package to dynamically scrape the site 

# create session
driver = webdriver.Chrome()
try:
    driver.get("https://mississippitoday.org/2023/01/30/tate-reeves-2023-state-of-the-state/")
    # fixed pause to give the page time to render before looking for the content
    time.sleep(3)

    # find content block
    results = driver.find_element(By.XPATH, '//*[@id="post-1089678"]/div')

    # find text; it must be read while the session is still open
    p_results = results.find_elements(By.TAG_NAME, 'p')
    p_text = [i.text for i in p_results]
finally:
    # end session even if a lookup above fails, so no Chrome process is left running
    driver.quit()

<selenium.webdriver.remote.webelement.WebElement (session="3db9e42a9de6e04375d73f21aef4204f", element="787A619122493F11AD8ABA7CA2F93A81_element_11")>

['Gov. Tate Reeves, a first-term Republican, delivered his annual State of the State address on Jan. 30, 2023.',
 'Below is the transcript of Reeves’ speech, which aired live on Mississippi Public Broadcasting.',
 'Editor’s note: This transcript was submitted by Reeves’ staff and has not been formatted to match Mississippi Today’s style.',
 'WATCH: Gov. Tate Reeves’ full State of the State address.',
 'Thank you, Lieutenant Governor Hosemann and Speaker Gunn.',
 'To the members of the legislature and other elected officials here tonight, thank you. Thank you for your continued partnership and thank you for the tireless work you do on behalf of our great state and her people.',
 'I also have to take a moment to thank my beautiful wife and Mississippi’s outstanding First Lady, Elee. She’s an incredible wife, an awesome mom, and a wonderful representative for our state. I’m amazed daily by your grace and your kindness, and I’m so thankful to have you in my life every single day.',
 'Final

In [40]:
# merge the text strings
result_merged = " ".join(p_text)
# result_merged

'Gov. Tate Reeves, a first-term Republican, delivered his annual State of the State address on Jan. 30, 2023. Below is the transcript of Reeves’ speech, which aired live on Mississippi Public Broadcasting. Editor’s note: This transcript was submitted by Reeves’ staff and has not been formatted to match Mississippi Today’s style. WATCH: Gov. Tate Reeves’ full State of the State address. Thank you, Lieutenant Governor Hosemann and Speaker Gunn. To the members of the legislature and other elected officials here tonight, thank you. Thank you for your continued partnership and thank you for the tireless work you do on behalf of our great state and her people. I also have to take a moment to thank my beautiful wife and Mississippi’s outstanding First Lady, Elee. She’s an incredible wife, an awesome mom, and a wonderful representative for our state. I’m amazed daily by your grace and your kindness, and I’m so thankful to have you in my life every single day. Finally and most importantly, I ha

In [43]:
# add the cleaned text to the df
df.loc[df.state=='mississippi', 'text'] = result_merged

In [45]:
# update the document type column 
df.loc[df.state=='mississippi', 'type'] = 'sos'

## Montana
[Website](https://news.mt.gov/Governors-Office/Governor_Gianforte_The_American_Dream_Is_Alive_and_Well_Here_in_Montana)

In [115]:
# use the customized BeautifulSoup function to scrape the Montana website 
df = scrape_content(link = 'https://news.mt.gov/Governors-Office/Governor_Gianforte_The_American_Dream_Is_Alive_and_Well_Here_in_Montana',
                 state = 'montana', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'col-lg-8 order-1 order-lg-0',
                 html2 = 'p') 

## Nebraska
[Website](https://governor.nebraska.gov/press/governor-jim-pillen-2023-state-state-address)

In [201]:
# definition for scraping the Nebraska website (tweaked after scraping)
def scrape_nebraska(link, state, release_type, html1, **kwargs):
    '''purpose: scrape the speech text from the Nebraska Governor's website and add it to the dataframe

       input: the website link (string); state (string); type of document (string);
              the html tag that holds the speech text (string);
              other keyword arguments, passed through to BeautifulSoup's find_all (e.g. class_).
       output: the global df with the cleaned text and the type of document written to the 'text' and
               'type' columns of the row(s) for the state. df (not the merged text) is returned because
               the notebook calls this as `df = scrape_nebraska(...)`; if anything fails, the error is
               printed and df is returned unchanged.'''
    try:
        # a timeout keeps an unresponsive server from hanging the notebook forever
        server_response = requests.get(link, headers=header_list, timeout=60)
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # here every matching tag is used directly; there is no enclosing content block to search in
        p_result = soup_link.find_all(html1, **kwargs)

        # clean: swap non-breaking spaces and newlines for plain spaces and merge the pieces
        cleaned = [tag.text.replace(u'\xa0', u' ').replace(u'\n', u' ') for tag in p_result]
        result_merged = " ".join(cleaned)

        # add to df
        state_rows = df.state == state
        df.loc[state_rows, 'text'] = result_merged
        df.loc[state_rows, 'type'] = release_type

    except Exception as e:
        # best-effort: report the problem and fall through so the caller still gets df back
        print(f'error due to {e}')

    return df

In [193]:
# scrape the Nebraska website 
df = scrape_nebraska(link = 'https://governor.nebraska.gov/press/governor-jim-pillen-2023-state-state-address',
                 state = 'nebraska', 
                 release_type = 'sos',
                 html1 = 'p', 
                 class_ = 'rtecenter') 

## Nevada 
PDF

In [194]:
# Added from PDF form (not completed in previous notebook)
df = add_pdf_to_df('nevada', 'sos')

## New Hampshire
[Website](https://www.governor.nh.gov/news-and-media/governor-chris-sununus-inaugural-address)

In [195]:
# use the customized BeautifulSoup function to scrape the New Hampshire website 
df = scrape_content(link = 'https://www.governor.nh.gov/news-and-media/governor-chris-sununus-inaugural-address',
                 state = 'new hampshire', 
                 release_type = 'inaug',
                 html1 = 'div', 
                 class_ = 'field field--name-body field--type-text-with-summary field--label-hidden field__item',
                 html2 = 'p') 

## New Jersey
[Website](https://nj.gov/governor/news/news/562023/approved/20230110b.shtml)

In [196]:
# use the customized BeautifulSoup function to scrape the New Jersey website 
df = scrape_content(link = 'https://nj.gov/governor/news/news/562023/approved/20230110b.shtml',
                 state = 'new jersey', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'col-sm-12',
                 html2 = 'p') 

## New Mexico
[Website](https://losalamosreporter.com/2023/01/17/full-text-of-gov-lujan-grishams-2023-state-of-the-state-address/)

In [197]:
# Accessed via a local news site for ease of scraping 
# use the customized BeautifulSoup function to scrape the website 
df = scrape_content(link = 'https://losalamosreporter.com/2023/01/17/full-text-of-gov-lujan-grishams-2023-state-of-the-state-address/',
                 state = 'new mexico', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'entry-content',
                 html2 = 'p') 

## New York
[Website](https://www.governor.ny.gov/news/remarks-prepared-governor-hochul-delivers-2023-state-state)

In [198]:
# use the customized BeautifulSoup function to scrape the New York website 
df = scrape_content(link = 'https://www.governor.ny.gov/news/remarks-prepared-governor-hochul-delivers-2023-state-state',
                 state = 'new york', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'a-text__html o-jazzed-release__wysiwyg1',
                 html2 = 'p') 

## North Carolina
[Website](https://nc-governor.medium.com/governor-cooper-state-of-the-state-2023-a82ebebd8e17)

In [47]:
# try Selenium since attempt with general definition (using BeautifulSoup) wasn't working
# Website is Governor's blog post on Medium rather than the Governor website 

# open session
driver = webdriver.Chrome()
try:
    driver.get("https://nc-governor.medium.com/governor-cooper-state-of-the-state-2023-a82ebebd8e17")
    # fixed pause to give the page time to render before looking for the content
    time.sleep(3)

    # access content
    results = driver.find_element(By.XPATH,'//*[@id="root"]/div/div[3]/div[2]/div[2]/article/div/div/section/div/div[2]/div/div')

    # access individual text blocks; they must be read while the session is still open
    p_results = results.find_elements(By.TAG_NAME, 'p')
    p_text = [i.text for i in p_results]
finally:
    # end session even if a lookup above fails, so no Chrome process is left running
    driver.quit()

<selenium.webdriver.remote.webelement.WebElement (session="6ff2e0c966cfc98c26c28df6b32c1fcf", element="501A1FCD495F6E712CE53ECDA8DC9CC5_element_10")>

['Governor Roy Cooper',
 'Follow',
 '1',
 '',
 '',
 'Mr. President Pro Tempore, Mr. Speaker, Lieutenant Governor, Members of the General Assembly, Council of State, Mr. Chief Justice and members of the Supreme Court, Madame Chief Judge and members of the Court of Appeals, Cabinet Secretaries, and my fellow North Carolinians: I am honored to join you to report on the state of our great state.',
 'With me tonight, is my remarkable wife, our First Lady, Kristin, our three wonderful daughters, Hilary, Natalie and Claire and my son-in-law Zack and my brother Pell, all of whom I’m thankful for each and every day.',
 'Each generation has but so many chances to leave an indelible mark on history that benefits the generations to come. And so often, our greatest advancements come after our greatest upheavals. War, protests, strife, disasters, pandemic. To find ourselves as state leaders at a time like this, is to bear a tremendous responsibility. A responsibility to learn from adversity and make

In [48]:
# merge text results together
result_merged = " ".join(p_text)
# result_merged

# add to df
df.loc[df.state == 'north carolina', 'text'] = result_merged
df.loc[df.state=='north carolina', 'type'] = 'sos'

## Ohio
[Website](https://governor.ohio.gov/media/news-and-media/governor-dewines-2023-state-of-the-state-address-01312023)

In [51]:
# try Selenium since attempt with general definition (using BeautifulSoup) wasn't working

# initiate session
driver = webdriver.Chrome()
try:
    driver.get("https://governor.ohio.gov/media/news-and-media/governor-dewines-2023-state-of-the-state-address-01312023")
    # fixed pause to give the page time to render before looking for the content
    time.sleep(3)

    # access content
    results = driver.find_element(By.XPATH,'//*[@id="js-odx-content__body"]')

    # access text results; they must be read while the session is still open
    p_results = results.find_elements(By.TAG_NAME, 'p')
    p_text = [i.text for i in p_results]
finally:
    # end session even if a lookup above fails, so no Chrome process is left running
    driver.quit()

<selenium.webdriver.remote.webelement.WebElement (session="fcd31a0d66cf5420a8893d2f3b847122", element="5684617F09C74B1BFA7546D392997004_element_43")>

['(COLUMBUS, Ohio)—Ohio Governor Mike DeWine today delivered the 2023 State of the State address today in the House Chamber of the Ohio Statehouse. The remarks, as prepared, are as follows:',
 'Speaker Stephens, President Huffman, Leader Antonio and Leader Russo, Members of the General Assembly, Chief Justice Kennedy and Justices of the Ohio Supreme Court, Elected State Officials, Lieutenant Governor Husted, My Fellow Citizens of Ohio….',
 'We meet at a time of great opportunity for Ohio and its citizens.  ',
 'Yet, it is also a time of great challenges.',
 'Our future is bright -- but that future will be defined by how well we educate all our children and how we tear down the barriers to their success.  We are challenged as never before, because at no time in our history has the full education of all our children been more important.',
 'The budget that I will present to you later today reflects the moral imperative we have to see that ALL Ohioans are fully educated, and therefore, ha

In [54]:
# merge text results 
result_merged = " ".join(p_text)
# result_merged

# add to df
df.loc[df.state == 'ohio', 'text'] = result_merged
df.loc[df.state=='ohio', 'type'] = 'sos'

## Oklahoma
[Website](https://oklahoma.gov/governor/newsroom/newsroom/2023/february2023/governor-stitt-delivers-2023-state-of-the-state-address.html)

In [199]:
# use the customized BeautifulSoup function to scrape the Oklahoma website 
df = scrape_content(link = 'https://oklahoma.gov/governor/newsroom/newsroom/2023/february2023/governor-stitt-delivers-2023-state-of-the-state-address.html',
                 state = 'oklahoma', 
                 release_type = 'sos',
                 html1 = 'div', 
                 id_info='text-684b40b6c6', 
                 html2 = 'p') 

## Oregon
[Website](https://www.oregon.gov/gov/speeches/Pages/2023-01-09-inauguration.aspx)

In [208]:
# use the customized BeautifulSoup function to scrape the Oregon website 
df = scrape_content(link = 'https://www.oregon.gov/gov/speeches/Pages/2023-01-09-inauguration.aspx',
                 state = 'oregon', 
                 release_type = 'inaug',
                 html1 = 'div', 
                 id_info= 'ctl00_ctl00_MainContentPlaceHolder_PageContentPlaceHolder_PageContentPlaceHolder_RichHtmlField1__ControlWrapper_OregonRichHtmlField',
                 html2 = 'p') 

## Pennsylvania
[Website](https://www.governor.pa.gov/newsroom/governor-shapiro-inaugural-address-as-prepared/)

In [15]:
# use the customized BeautifulSoup function to scrape the Pennsylvania website 
df = scrape_content(link = 'https://www.governor.pa.gov/newsroom/governor-shapiro-inaugural-address-as-prepared/',
              state = 'pennsylvania', 
              release_type = 'inaug',
              html1 = 'article',
              id_info = 'post-6351636',
              html2 = 'p')

## Rhode Island
[Website](https://governor.ri.gov/press-releases/governor-daniel-j-mckees-2023-state-state-address)

In [102]:
# use the customized BeautifulSoup function to scrape the Rhode Island website 
df = scrape_content(link = 'https://governor.ri.gov/press-releases/governor-daniel-j-mckees-2023-state-state-address',
                 state = 'rhode island', 
                 release_type = 'sos',
                 html1 = 'article', 
                 class_ = 'node node--type-press-release node--view-mode-full',
                 html2 = 'p') 

## South Carolina
[Website](https://governor.sc.gov/news/2023-01/2023-state-state-address-governor-henry-mcmaster)

In [103]:
# use the customized BeautifulSoup function to scrape the South Carolina website 
df = scrape_content(link = 'https://governor.sc.gov/news/2023-01/2023-state-state-address-governor-henry-mcmaster',
                 state = 'south carolina', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'mainContent group',
                 html2 = 'p') 

## South Dakota 
[Website](https://news.sd.gov/news?id=news_kb_article_view&sys_id=0160a6f91b1029901de443bae54bcb0f)

In [60]:
# use the customized Selenium function to scrape the South Dakota website (wouldn't work with BeautifulSoup)
df = scrape_selenium(link = 'https://news.sd.gov/news?id=news_kb_article_view&sys_id=0160a6f91b1029901de443bae54bcb0f',
               STATE = 'south dakota',
               xpath= '//*[@id="x_g_sdbo_news_u_kb_article_body"]',
               ptag = 'p', 
               release_type='sos')

## Tennessee
[Website](https://www.tn.gov/governor/sots/2023-state-of-the-state-address.html)

In [104]:
# use the customized BeautifulSoup function to scrape the Tennessee website 
df = scrape_content(link = 'https://www.tn.gov/governor/sots/2023-state-of-the-state-address.html',
                 state = 'tennessee', 
                 release_type = 'sos',
                 html1 = 'div', 
                 class_ = 'tn-rte',
                 html2 = 'p') 

## Texas
[Website](https://gov.texas.gov/news/post/governor-abbott-delivers-2023-texas-inaugural-address-)

In [105]:
# use the customized BeautifulSoup function to scrape the Texas website 
df = scrape_content(link = 'https://gov.texas.gov/news/post/governor-abbott-delivers-2023-texas-inaugural-address-',
                 state = 'texas', 
                 release_type = 'inaug',
                 html1 = 'section', 
                 class_ = 'l-content columns small-12',
                 html2 = 'p') 

## Vermont 
[Website](https://governor.vermont.gov/press-release/governor-phil-scott-delivers-fourth-inaugural-address)

In [66]:
# use the customized Selenium function to scrape the Vermont website (BeautifulSoup didn't work)
df = scrape_selenium(link = 'https://governor.vermont.gov/press-release/governor-phil-scott-delivers-fourth-inaugural-address',
               STATE = 'vermont',
               xpath= '//*[@id="block-governor-governor-system-main"]/article/div/div[3]',
               ptag = 'p', 
               release_type='inaug')

## Virginia
[Website](https://www.governor.virginia.gov/newsroom/news-releases/2023/january/name-979187-en.html)

In [106]:
# definition (tweaked after scraping)
def scrape_virginia(link, state, release_type, html1, html2, id_info=None, **kwargs):
    '''purpose: use BeautifulSoup to scrape the Virginia page, where the text sits in the SECOND
                block matching html1, and add the cleaned text to the global dataframe df

       input: the website link (string); state (string); type of document (string);
              the html tag for the content block; the html tag for the paragraph;
              the id info (if relevant)(string); other keyword arguments forwarded to find_all
              (e.g. class_).
       output: the global df, with the cleaned text (string) and the type of document (string)
               written to the rows where df.state == state. If anything fails, the error is
               printed and df is returned UNCHANGED (never None), so that
               `df = scrape_virginia(...)` cannot wipe out the results gathered so far.'''
    try:
        # a timeout keeps a stalled server from hanging the notebook; raise_for_status stops
        # an HTTP error page (404, 500, ...) from being stored as if it were the speech
        server_response = requests.get(link, headers=header_list, timeout=30)
        server_response.raise_for_status()
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # NOTE(review): the id filter is passed even when id_info is None; BeautifulSoup
        # appears to treat {"id": None} as "tag has no id attribute" -- confirm. Kept as-is
        # so the set of matched blocks (and therefore index 1 below) does not change.
        try:
            result = soup_link.find_all(html1, {"id": id_info}, **kwargs)
        except Exception:
            # fall back to the search without the id filter (was a bare except, which also
            # swallowed KeyboardInterrupt / SystemExit)
            result = soup_link.find_all(html1, **kwargs)

        # the text is read from the second matching block; fail with a readable message
        # instead of a bare "list index out of range"
        if len(result) < 2:
            raise IndexError(f'expected at least 2 <{html1}> blocks, found {len(result)}')
        p_result = result[1].find_all(html2)

        # clean: replace non-breaking spaces and newlines with plain spaces, then merge
        result_merged = " ".join(
            str(i.text).replace(u'\xa0', u' ').replace(u'\n', u' ') for i in p_result
        )

        # add to df
        df.loc[df.state == state, 'text'] = result_merged
        df.loc[df.state == state, 'type'] = release_type

    except Exception as e:
        print(f'error due to {e}')

    return df

In [107]:
# Virginia: content blocks are <table> tags with class 'layout layout--1-column';
# the text is read from <p> tags
df = scrape_virginia(
    link='https://www.governor.virginia.gov/newsroom/news-releases/2023/january/name-979187-en.html',
    state='virginia',
    release_type='sos',
    html1='table',
    html2='p',
    class_='layout layout--1-column',
)

## Washington
[Website](https://medium.com/wagovernor/inslee-delivers-state-of-the-state-bold-actions-for-building-a-stronger-washington-5541a39977cb)

In [108]:
# definition (tweaked after scraping)
def scrape_washington(link, state, release_type, html1, id_info=None, **kwargs):
    '''purpose: use BeautifulSoup to collect the text of EVERY html1 tag on the page (no enclosing
                content block is selected first) and add the cleaned text to the global dataframe df

       input: the website link (string); state (string); type of document (string);
              the html tag holding the text; the id info (if relevant)(string) -- when given,
              only tags with that id are kept; other keyword arguments forwarded to find_all
              (e.g. class_). id_info and the keyword arguments used to be silently ignored.
       output: the global df, with the cleaned text (string) and the type of document (string)
               written to the rows where df.state == state. If anything fails, the error is
               printed and df is returned UNCHANGED (never None), so that
               `df = scrape_washington(...)` cannot wipe out the results gathered so far.'''
    try:
        # a timeout keeps a stalled server from hanging the notebook; raise_for_status stops
        # an HTTP error page (404, 500, ...) from being stored as if it were the speech
        server_response = requests.get(link, headers=header_list, timeout=30)
        server_response.raise_for_status()
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # build the optional filters; an explicit id= keyword wins over id_info
        filters = dict(kwargs)
        if id_info is not None:
            filters.setdefault('id', id_info)
        p_result = soup_link.find_all(html1, **filters)

        # clean: replace non-breaking spaces and newlines with plain spaces, then merge
        result_merged = " ".join(
            str(i.text).replace(u'\xa0', u' ').replace(u'\n', u' ') for i in p_result
        )

        # add to df
        df.loc[df.state == state, 'text'] = result_merged
        df.loc[df.state == state, 'type'] = release_type

    except Exception as e:
        print(f'error due to {e}')

    return df

In [109]:
# Washington: note, the address was published by the Governor's office on Medium;
# the text is read from the page's <p> tags
df = scrape_washington(
    link='https://medium.com/wagovernor/inslee-delivers-state-of-the-state-bold-actions-for-building-a-stronger-washington-5541a39977cb',
    state='washington',
    release_type='sos',
    html1='p',
)

## West Virginia
[Website](https://governor.wv.gov/News/press-releases/2023/Pages/Gov.-Justice-delivers-his-2023-West-Virginia-State-of-the-State-Address.aspx)

In [110]:
# definition (tweaked after scraping)
def scrape_west_virginia(link, state, release_type, html1, id_info=None, **kwargs):
    '''purpose: use BeautifulSoup to collect the text of EVERY html1 tag on the page that matches
                the given filters (no enclosing content block is selected first) and add the
                cleaned text to the global dataframe df

       input: the website link (string); state (string); type of document (string);
              the html tag holding the text; the id info (if relevant)(string) -- when given,
              only tags with that id are kept (it used to be silently ignored);
              other keyword arguments forwarded to find_all (e.g. class_).
       output: the global df, with the cleaned text (string) and the type of document (string)
               written to the rows where df.state == state. If anything fails, the error is
               printed and df is returned UNCHANGED (never None), so that
               `df = scrape_west_virginia(...)` cannot wipe out the results gathered so far.'''
    try:
        # a timeout keeps a stalled server from hanging the notebook; raise_for_status stops
        # an HTTP error page (404, 500, ...) from being stored as if it were the speech
        server_response = requests.get(link, headers=header_list, timeout=30)
        server_response.raise_for_status()
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")

        # build the optional filters; an explicit id= keyword wins over id_info
        filters = dict(kwargs)
        if id_info is not None:
            filters.setdefault('id', id_info)
        p_result = soup_link.find_all(html1, **filters)

        # clean: replace non-breaking spaces and newlines with plain spaces, then merge
        result_merged = " ".join(
            str(i.text).replace(u'\xa0', u' ').replace(u'\n', u' ') for i in p_result
        )

        # add to df
        df.loc[df.state == state, 'text'] = result_merged
        df.loc[df.state == state, 'type'] = release_type

    except Exception as e:
        print(f'error due to {e}')

    return df

In [111]:
# West Virginia: the text is read from the page's <td> tags
df = scrape_west_virginia(
    link='https://governor.wv.gov/News/press-releases/2023/Pages/Gov.-Justice-delivers-his-2023-West-Virginia-State-of-the-State-Address.aspx',
    state='west virginia',
    release_type='sos',
    html1='td',
)

## Wisconsin 
[Website](https://content.govdelivery.com/accounts/WIGOV/bulletins/343fc2b)

In [74]:
# Wisconsin: scraped with the customized Selenium function because the
# BeautifulSoup function wouldn't work on this website.
# note, this website was a newsletter issued by the Governor's office through
# GovDelivery; the text is read from <em> tags rather than <p> tags
df = scrape_selenium(
    link='https://content.govdelivery.com/accounts/WIGOV/bulletins/343fc2b',
    STATE='wisconsin',
    release_type='inaug',
    xpath='//*[@id="bulletin_body"]/table/tbody/tr[7]/td',
    ptag='em',
)

## Wyoming
[Website](https://governor.wyo.gov/news-releases/governor-gordon-offers-a-blueprint-for-wyoming-to-lead-the-nation-)

In [79]:
# Wyoming: scraped with the customized Selenium function because the
# BeautifulSoup function wouldn't work on this website
df = scrape_selenium(
    link='https://governor.wyo.gov/news-releases/governor-gordon-offers-a-blueprint-for-wyoming-to-lead-the-nation-',
    STATE='wyoming',
    release_type='sos',
    xpath='//*[@id="app"]/div/div/div/main/main/div/div',
    ptag='p',
)

## Export

In [17]:
# write the dataframe to the revised CSV without the index column
# (the disabled line below targets the earlier file name 'sos_df_23.csv')
#df.to_csv('sos_df_23.csv', index = False)
df.to_csv('sos_df_23_revised.csv', index=False)