# Utah Scraping 

In [1]:
# set up 
import pandas as pd 
import requests 
import bs4
import numpy as np
import time 
import re
from tika import parser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from io import BytesIO
from urllib.request import urlopen

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

%xmode Minimal

Exception reporting mode: Minimal


In [1]:
# # set headers
# header_list = {'User-Agent': REDACTED,
#               'Accept-Language': 'en-US,en;q=0.9'}

In [3]:
# state - UPDATE EACH TIME
state = 'utah'

In [None]:
# review of robots.txt - no restrictions

In [None]:
# test = pd.read_csv(f'{state}_2023.csv')
# test.head()

## Scrape Links 
`Beautiful Soup`

### Test on one page

In [4]:
link = 'https://governor.utah.gov/news/page/2/'
server_response = requests.get(link, header_list)
server_response

<Response [200]>

In [13]:
soup = bs4.BeautifulSoup(server_response.content, features="html.parser")
results = soup.find('div', class_ = 'posts newsPosts').find_all('div', class_='post newsPost')
# results

title = [i.find('h2').text for i in results]
link = [i.find('a')['href'] for i in results]
date = [i.find('span', class_='postDate').text for i in results]

['Gov. Spencer Cox signs six bills in the 2024 General Legislative Session',
 'Gov. Cox, President Adams, Speaker Schultz issue statement on NHL',
 'Gov. Spencer J. Cox and Lt. Gov. Deidre Henderson: Schedule for Jan. 29- Feb. 2, 2024',
 'Gov. Spencer J. Cox and Lt. Gov. Deidre Henderson: Schedule for Jan. 22-26, 2024',
 'Gov. Spencer Cox signs first bill of the 2024 General Legislative Session',
 'Gov. Cox delivers his 2024 State of the State address',
 'Gov. Spencer J. Cox and Lt. Gov. Deidre Henderson: Schedule for Jan. 15-19, 2024',
 'Gov. Cox encourages educators to remove cell phones during class time',
 'Gov. Spencer J. Cox and Lt. Gov. Deidre Henderson: Schedule for Jan. 8-12, 2024',
 'Gov. Spencer J. Cox and Lt. Gov. Deidre Henderson: Schedule for Jan. 1-5, 2024']

['https://governor.utah.gov/2024/01/30/gov-spencer-cox-signs-six-bills-in-the-2024-general-legislative-session/',
 'https://governor.utah.gov/2024/01/29/gov-cox-president-adams-speaker-schultz-issue-statement-on-nhl/',
 'https://governor.utah.gov/2024/01/29/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-29-feb-2-2024/',
 'https://governor.utah.gov/2024/01/22/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-22-26-2024/',
 'https://governor.utah.gov/2024/01/19/gov-spencer-cox-signs-first-bill-of-the-2024-general-legislative-session/',
 'https://governor.utah.gov/2024/01/18/gov-cox-delivers-his-2024-state-of-the-state-address/',
 'https://governor.utah.gov/2024/01/15/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-15-19-2024/',
 'https://governor.utah.gov/2024/01/10/gov-cox-encourages-educators-to-remove-cell-phones-during-class-time/',
 'https://governor.utah.gov/2024/01/08/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-8-12-20

['January 30, 2024',
 'January 29, 2024',
 'January 29, 2024',
 'January 22, 2024',
 'January 19, 2024',
 'January 18, 2024',
 'January 15, 2024',
 'January 10, 2024',
 'January 8, 2024',
 'December 31, 2023']

In [14]:
# define webpage scraping 
def scrape_utah_links(website, headers):
    server_response = requests.get(website, headers)
    soup = bs4.BeautifulSoup(server_response.content, features="html.parser")
    results = soup.find('div', class_ = 'posts newsPosts').find_all('div', class_='post newsPost')
    results_list = [{'title': i.find('h2').text,
                 'date': i.find('span', class_='postDate').text, 
                'link': i.find('a')['href']} for i in results]
    df = pd.DataFrame.from_dict(results_list)
    df['content'] = np.nan
    return df

### Scrape all links

In [15]:
# try to scrape the first set of pages
# manual review for 2023 - I want pages 2-16, give or take a few links on first and last pages
df_list=[]

for i in range(2,17):
    try: 
        df = scrape_utah_links(f'https://governor.utah.gov/news/page/{i}/', 
                                  header_list)
        df['page_scraped'] = i 
        df_list.append(df)
    except Exception as e:
        print(f'Error: {e} at page {i}')

In [18]:
# combine into one df
combined = pd.concat(df_list, ignore_index=True)
combined.head()
combined.info()

Unnamed: 0,title,date,link,content,page_scraped
0,Gov. Spencer Cox signs six bills in the 2024 G...,"January 30, 2024",https://governor.utah.gov/2024/01/30/gov-spenc...,,2
1,"Gov. Cox, President Adams, Speaker Schultz iss...","January 29, 2024",https://governor.utah.gov/2024/01/29/gov-cox-p...,,2
2,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,"January 29, 2024",https://governor.utah.gov/2024/01/29/gov-spenc...,,2
3,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,"January 22, 2024",https://governor.utah.gov/2024/01/22/gov-spenc...,,2
4,Gov. Spencer Cox signs first bill of the 2024 ...,"January 19, 2024",https://governor.utah.gov/2024/01/19/gov-spenc...,,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         150 non-null    object 
 1   date          150 non-null    object 
 2   link          150 non-null    object 
 3   content       0 non-null      float64
 4   page_scraped  150 non-null    int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 6.0+ KB


In [19]:
# export
combined.to_csv(f'{state}_links.csv', index = False)

In [20]:
test = pd.read_csv(f'{state}_links.csv')
test.head()

Unnamed: 0,title,date,link,content,page_scraped
0,Gov. Spencer Cox signs six bills in the 2024 G...,"January 30, 2024",https://governor.utah.gov/2024/01/30/gov-spenc...,,2
1,"Gov. Cox, President Adams, Speaker Schultz iss...","January 29, 2024",https://governor.utah.gov/2024/01/29/gov-cox-p...,,2
2,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,"January 29, 2024",https://governor.utah.gov/2024/01/29/gov-spenc...,,2
3,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,"January 22, 2024",https://governor.utah.gov/2024/01/22/gov-spenc...,,2
4,Gov. Spencer Cox signs first bill of the 2024 ...,"January 19, 2024",https://governor.utah.gov/2024/01/19/gov-spenc...,,2


## Scrape content from links
`Beautiful Soup`

### Test with one link

In [21]:
# test with one
link = 'https://governor.utah.gov/2023/01/17/news-release-gov-spencer-cox-appoints-jason-nelson-to-2nd-district-court/'
server_response = requests.get(link, headers = header_list)
server_response

<Response [200]>

In [23]:
# test with one contd 
soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
result = soup_link.find('article', class_ = 'post wrapper').find_all('p')
string_result = [str(i.text) for i in result]
string_result_clean = [i.replace(u'\xa0', u'') for i in string_result]
string_result_clean = [i.replace(u'\n', u'') for i in string_result_clean]
result_merged = " ".join(string_result_clean)
result_merged

'NEWS RELEASE Jan. 17, 2023 Contact: Emma Williams Office of the Governor (385) 303-4383, ewilliams@utah.gov Gov. Spencer Cox appoints Jason Nelson to 2nd District Court SALT LAKE CITY (Jan. 17, 2023) – Utah Gov. Spencer J. Cox has appointed Jason Nelson to Utah’s 2nd District Court, filling the vacancy left by Judge David Connors’ retirement. Judicial appointments are subject to confirmation by the Utah Senate. “I am pleased that Jason Nelson is willing to serve Utahns in this new capacity,” Gov. Cox said. “His professionalism and love of our state will serve the community well.” Nelson has served as the Deputy Davis County Attorney since 2008. He has also served as a prosecutor for West Bountiful City and North Salt Lake City. Prior to that, he served as a law clerk in the Utah Second Judicial District in Davis County as well as the Utah Attorney General’s Office Criminal Justice Division. “I am grateful for and humbled by the confidence Gov. Cox has placed in me with this appointmen

In [None]:
# define link scraping 
def get_utah_content(link, df, header_list): 
    try: 
        # check if content has already been pulled
        result = df.loc[df.link == link, 'content'].notnull().all()
        if result:
            return f'content already existing for {link}' 
        else: 
            try: 
                server_response = requests.get(link, headers = header_list)
                soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
                result = soup_link.find('article', class_ = 'post wrapper').find_all('p')
                string_result = [str(i.text) for i in result]
                string_result_clean = [i.replace(u'\xa0', u'') for i in string_result]
                string_result_clean = [i.replace(u'\n', u'') for i in string_result_clean]
                result_merged = " ".join(string_result_clean)
                
                df.loc[df.link == link, 'content'] = result_merged
                return f"success: content added from {link}"
            except: 
                pass
    except Exception as e: 
        return f"unable to gather content from {link} due to {e}"
    #return df_updated 

In [25]:
# test on 5 to see if it works
# links_2023.content = np.nan

for i in combined.link[0:6]:
     get_utah_content(i, combined, header_list)
    
combined.head()

'success: content added from https://governor.utah.gov/2024/01/30/gov-spencer-cox-signs-six-bills-in-the-2024-general-legislative-session/'

'success: content added from https://governor.utah.gov/2024/01/29/gov-cox-president-adams-speaker-schultz-issue-statement-on-nhl/'

'success: content added from https://governor.utah.gov/2024/01/29/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-29-feb-2-2024/'

'success: content added from https://governor.utah.gov/2024/01/22/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-22-26-2024/'

'success: content added from https://governor.utah.gov/2024/01/19/gov-spencer-cox-signs-first-bill-of-the-2024-general-legislative-session/'

'success: content added from https://governor.utah.gov/2024/01/18/gov-cox-delivers-his-2024-state-of-the-state-address/'

Unnamed: 0,title,date,link,content,page_scraped
0,Gov. Spencer Cox signs six bills in the 2024 G...,"January 30, 2024",https://governor.utah.gov/2024/01/30/gov-spenc...,"NEWS RELEASE Jan. 30, 2024 Contact: Emma Willi...",2
1,"Gov. Cox, President Adams, Speaker Schultz iss...","January 29, 2024",https://governor.utah.gov/2024/01/29/gov-cox-p...,"NEWS RELEASE Jan. 29, 2024 Contacts: Emma Will...",2
2,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,"January 29, 2024",https://governor.utah.gov/2024/01/29/gov-spenc...,**Events labeled Media Access indicate that an...,2
3,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,"January 22, 2024",https://governor.utah.gov/2024/01/22/gov-spenc...,**Events labeled Media Access indicate that an...,2
4,Gov. Spencer Cox signs first bill of the 2024 ...,"January 19, 2024",https://governor.utah.gov/2024/01/19/gov-spenc...,"NEWS RELEASE Jan. 19, 2024 Contact: Emma Willi...",2


### Scrape content from all links

In [26]:
for i in combined.link:
     get_utah_content(i, combined, header_list)

'content already existing for https://governor.utah.gov/2024/01/30/gov-spencer-cox-signs-six-bills-in-the-2024-general-legislative-session/'

'content already existing for https://governor.utah.gov/2024/01/29/gov-cox-president-adams-speaker-schultz-issue-statement-on-nhl/'

'content already existing for https://governor.utah.gov/2024/01/29/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-29-feb-2-2024/'

'content already existing for https://governor.utah.gov/2024/01/22/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-22-26-2024/'

'content already existing for https://governor.utah.gov/2024/01/19/gov-spencer-cox-signs-first-bill-of-the-2024-general-legislative-session/'

'content already existing for https://governor.utah.gov/2024/01/18/gov-cox-delivers-his-2024-state-of-the-state-address/'

'success: content added from https://governor.utah.gov/2024/01/15/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-15-19-2024/'

'success: content added from https://governor.utah.gov/2024/01/10/gov-cox-encourages-educators-to-remove-cell-phones-during-class-time/'

'success: content added from https://governor.utah.gov/2024/01/08/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-8-12-2024/'

'success: content added from https://governor.utah.gov/2023/12/31/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-1-5-2024/'

'success: content added from https://governor.utah.gov/2023/12/24/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-dec-25-29-2023/'

'success: content added from https://governor.utah.gov/2023/12/18/gov-cox-orders-flags-to-half-staff-in-recognition-of-the-passing-of-u-s-supreme-court-justice-sandra-day-oconnor/'

'success: content added from https://governor.utah.gov/2023/12/18/gov-cox-names-donna-law-as-new-head-of-department-of-cultural-and-community-engagement-2/'

'success: content added from https://governor.utah.gov/2023/12/18/gov-cox-names-donna-law-as-new-head-of-department-of-cultural-and-community-engagement/'

'success: content added from https://governor.utah.gov/2023/12/18/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-dec-18-22-2023/'

'success: content added from https://governor.utah.gov/2023/12/18/utah-public-service-commission-vacancy/'

'success: content added from https://governor.utah.gov/2023/12/11/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-dec-11-15-2023/'

'success: content added from https://governor.utah.gov/2023/12/06/gov-cox-orders-flags-to-half-staff-in-recognition-of-pearl-harbor-remembrance-day/'

'success: content added from https://governor.utah.gov/2023/12/05/gov-cox-meets-with-israel-bachar-consul-general-of-israel/'

'success: content added from https://governor.utah.gov/2023/12/05/gov-cox-appoints-steve-waldrip-as-new-senior-advisor-for-housing-strategy-and-innovation/'

'success: content added from https://governor.utah.gov/2023/12/05/gov-cox-and-lt-gov-henderson-announce-150-million-utah-first-homes-starter-home-program-in-budget-announcement/'

'success: content added from https://governor.utah.gov/2023/12/04/gov-cox-announces-agreement-with-local-governments-comprehensive-approach-to-homeless-services-as-key-part-of-fy25-budget/'

'success: content added from https://governor.utah.gov/2023/12/03/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-dec-4-8-2023/'

'success: content added from https://governor.utah.gov/2023/11/30/gov-cox-and-first-lady-cox-honor-utah-artists-at-governors-mansion-artist-awards-event-3/'

'success: content added from https://governor.utah.gov/2023/11/28/gov-cox-appoints-brian-bolinder-to-seventh-district-court/'

'success: content added from https://governor.utah.gov/2023/11/27/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-nov-27-dec-1-2023/'

'success: content added from https://governor.utah.gov/2023/11/20/gov-cox-appoints-stephen-nelson-to-third-district-court/'

'success: content added from https://governor.utah.gov/2023/11/20/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-nov-20-24-2023/'

'success: content added from https://governor.utah.gov/2023/11/15/great-salt-lake-airboat-named-in-honor-of-utah-speaker-of-the-house/'

'success: content added from https://governor.utah.gov/2023/11/14/govs-cox-and-polis-call-for-healthy-political-debate-at-colorado-nga-event/'

'success: content added from https://governor.utah.gov/2023/11/13/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-nov-13-17-2023/'

'success: content added from https://governor.utah.gov/2023/11/06/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-nov-6-10-2023/'

'success: content added from https://governor.utah.gov/2023/10/30/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-oct-30-nov-3-2023/'

'success: content added from https://governor.utah.gov/2023/10/26/gov-cox-orders-flags-to-half-staff/'

'success: content added from https://governor.utah.gov/2023/10/25/gov-cox-unveils-new-strategic-partnership-plan-with-mexico/'

'success: content added from https://governor.utah.gov/2023/10/24/utah-sues-meta-for-child-addiction-harm-and-deceiving-parents-about-dangers-of-facebook-and-instagram/'

'success: content added from https://governor.utah.gov/2023/10/19/guiding-our-growth-survey-shows-utahns-want-a-range-of-housing-options-more-water-conservation-transportation-options-and-open-spaces/'

'success: content added from https://governor.utah.gov/2023/10/17/gov-cox-announces-executive-orders-about-service-and-volunteerism/'

'success: content added from https://governor.utah.gov/2023/10/17/gov-cox-appoints-ryan-peters-to-fourth-district-juvenile-court/'

'success: content added from https://governor.utah.gov/2023/10/16/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-oct-16-20-2023/'

'success: content added from https://governor.utah.gov/2023/10/13/gov-spencer-cox-appoints-scott-stephenson-as-new-board-of-pardons-and-parole-chair/'

'success: content added from https://governor.utah.gov/2023/10/12/gov-spencer-cox-presents-governors-award-for-excellence-to-state-employees/'

'success: content added from https://governor.utah.gov/2023/10/11/gov-spencer-cox-condemns-violence-promotes-peace-at-utah-stands-with-israel-rally/'

'success: content added from https://governor.utah.gov/2023/10/10/utah-sues-tiktok-over-child-addiction-harm-targets-enmeshment-with-its-china-based-parent-company/'

'success: content added from https://governor.utah.gov/2023/10/09/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-oct-9-13-2023/'

'success: content added from https://governor.utah.gov/2023/10/09/gov-cox-orders-flags-to-be-lowered-to-half-staff-position/'

'success: content added from https://governor.utah.gov/2023/10/06/nominees-announced-for-3rd-district-court-vacancy-4/'

'success: content added from https://governor.utah.gov/2023/10/06/nominees-announced-for-7th-district-court-vacancy/'

'success: content added from https://governor.utah.gov/2023/10/06/utah-leaders-respond-to-ice-office-memo/'

'success: content added from https://governor.utah.gov/2023/10/05/office-of-the-governor-issues-quarterly-rural-affairs-report/'

'success: content added from https://governor.utah.gov/2023/10/02/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-oct-2-6-2023/'

'success: content added from https://governor.utah.gov/2023/09/29/gov-cox-names-scott-stephenson-to-board-of-pardons-and-parole/'

'success: content added from https://governor.utah.gov/2023/09/29/flags-to-be-lowered-in-remembrance-of-sen-dianne-feinstein/'

'success: content added from https://governor.utah.gov/2023/09/28/gov-spencer-cox-pledges-to-keep-wic-national-parks-open-if-federal-government-shuts-down/'

'success: content added from https://governor.utah.gov/2023/09/25/gov-spencer-j-cox-to-welcome-president-of-hungary-to-utah/'

'success: content added from https://governor.utah.gov/2023/09/25/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-sept-25-29-2023/'

'success: content added from https://governor.utah.gov/2023/09/21/gov-cox-and-first-lady-cox-honor-utah-artists-at-governors-mansion-artist-awards-event-2/'

'success: content added from https://governor.utah.gov/2023/09/18/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-sept-18-22-2023/'

'success: content added from https://governor.utah.gov/2023/09/12/govs-cox-and-sununu-urge-americans-to-disagree-better-at-new-hampshire-event/'

'success: content added from https://governor.utah.gov/2023/09/11/nominees-announced-for-fourth-district-juvenile-court-vacancy/'

'success: content added from https://governor.utah.gov/2023/09/11/gov-cox-orders-flags-to-be-lowered-in-remembrance-of-9-11/'

'success: content added from https://governor.utah.gov/2023/09/11/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-sept-11-15-2023/'

'success: content added from https://governor.utah.gov/2023/09/05/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-sept-4-8-2023/'

'success: content added from https://governor.utah.gov/2023/08/31/gov-cox-orders-flags-to-be-lowered-in-recognition-of-overdose-awareness-day/'

'success: content added from https://governor.utah.gov/2023/08/28/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-aug-28-sept-1-2023/'

'success: content added from https://governor.utah.gov/2023/08/21/gov-spencer-cox-names-brig-gen-daniel-boyack-as-adjutant-general-of-the-utah-national-guard/'

'success: content added from https://governor.utah.gov/2023/08/21/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-aug-21-25-2023/'

'success: content added from https://governor.utah.gov/2023/08/17/gov-spencer-cox-issues-statement-on-utah-national-guard-adjutant-general/'

'success: content added from https://governor.utah.gov/2023/08/14/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-aug-14-18-2023/'

'success: content added from https://governor.utah.gov/2023/08/11/gov-spencer-cox-issues-statement-on-national-monuments-ruling/'

'success: content added from https://governor.utah.gov/2023/08/08/gov-spencer-cox-expresses-frustration-about-new-grand-canyon-national-monument/'

'success: content added from https://governor.utah.gov/2023/08/07/gov-spencer-cox-and-lt-gov-deidre-hendersons-public-schedule-week-of-aug-7-11-2023/'

'success: content added from https://governor.utah.gov/2023/08/03/gov-cox-launches-new-harms-of-social-media-public-awareness-campaign/'

'success: content added from https://governor.utah.gov/2023/07/31/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-july-31-aug-4-2023/'

'success: content added from https://governor.utah.gov/2023/07/27/gov-spencer-cox-recognizes-recipients-of-second-annual-governors-spirit-of-service-award/'

'success: content added from https://governor.utah.gov/2023/07/25/news-release-gov-spencer-cox-and-attorney-general-sean-reyes-press-tiktok-to-respond-to-subpoenas/'

'success: content added from https://governor.utah.gov/2023/07/24/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-july-24-28-2023/'

'success: content added from https://governor.utah.gov/2023/07/17/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-july-17-21-2023/'

'success: content added from https://governor.utah.gov/2023/07/14/utah-gov-spencer-j-cox-elected-chair-of-national-governors-association-launches-disagree-better-initiative/'

'success: content added from https://governor.utah.gov/2023/07/12/gov-cox-appoints-eric-gentry-to-fifth-district-court/'

'success: content added from https://governor.utah.gov/2023/07/10/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-july-10-14-2023/'

'success: content added from https://governor.utah.gov/2023/07/03/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-july-3-7-2023/'

'success: content added from https://governor.utah.gov/2023/06/29/gov-cox-declares-july-2-as-a-day-of-prayer-and-thanksgiving/'

'success: content added from https://governor.utah.gov/2023/06/25/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-june-26-30-2023/'

'success: content added from https://governor.utah.gov/2023/06/22/gov-cox-signs-trade-pacts-with-uk-france-during-european-trade-mission/'

'success: content added from https://governor.utah.gov/2023/06/17/gov-cox-signs-bill-setting-2023-election-dates/'

'success: content added from https://governor.utah.gov/2023/06/14/gov-cox-appoints-charles-stormont-to-third-district-court/'

'success: content added from https://governor.utah.gov/2023/06/11/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-june-12-16-2023/'

'success: content added from https://governor.utah.gov/2023/06/07/executive-legislative-branches-set-new-election-schedule-to-fill-congressman-stewarts-vacancy/'

'success: content added from https://governor.utah.gov/2023/06/04/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-june-5-9-2023/'

'success: content added from https://governor.utah.gov/2023/05/28/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-may-29-june-2-2023/'

'success: content added from https://governor.utah.gov/2023/05/27/gov-cox-orders-flags-to-be-lowered-in-honor-of-memorial-day/'

'success: content added from https://governor.utah.gov/2023/05/25/nominees-announced-for-5th-district-court-vacancy-2/'

'success: content added from https://governor.utah.gov/2023/05/25/gov-cox-names-new-members-to-utah-board-of-higher-education/'

'success: content added from https://governor.utah.gov/2023/05/25/gov-cox-and-first-lady-cox-honor-utah-artists-at-governors-mansion-artist-awards-event/'

'success: content added from https://governor.utah.gov/2023/05/21/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-may-22-26-2023/'

'success: content added from https://governor.utah.gov/2023/05/19/nominees-announced-for-3rd-district-court-vacancy-3/'

'success: content added from https://governor.utah.gov/2023/05/18/gov-spencer-j-cox-signs-three-bills-from-special-session/'

'success: content added from https://governor.utah.gov/2023/05/15/gov-cox-appoints-brian-steed-as-new-great-salt-lake-commissioner/'

'success: content added from https://governor.utah.gov/2023/05/15/gov-cox-adds-two-additional-items-of-business-to-call-for-special-session/'

'success: content added from https://governor.utah.gov/2023/05/14/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-may-15-19-2023/'

'success: content added from https://governor.utah.gov/2023/05/11/news-release-gov-cox-issues-executive-order-to-require-water-conservation-at-state-facilities-and-increase-utahs-drought-resiliency/'

'success: content added from https://governor.utah.gov/2023/05/09/news-release-gov-cox-names-brian-redd-as-new-executive-director-of-the-department-of-corrections/'

'success: content added from https://governor.utah.gov/2023/05/08/news-release-gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-may-8-12-2023/'

'success: content added from https://governor.utah.gov/2023/05/07/news-release-gov-cox-orders-flags-to-be-lowered/'

'success: content added from https://governor.utah.gov/2023/05/07/news-release-gov-cox-orders-flags-lowered-for-national-fallen-firefighters-memorial-service/'

'success: content added from https://governor.utah.gov/2023/05/03/gov-cox-ceremonially-signs-bills-focused-on-education/'

'success: content added from https://governor.utah.gov/2023/05/01/news-release-public-schedule-for-gov-spencer-cox-and-lt-gov-deidre-henderson-week-of-may-1-5-2023/'

'success: content added from https://governor.utah.gov/2023/04/25/gov-cox-ceremonially-signs-bills-focused-on-water/'

'success: content added from https://governor.utah.gov/2023/04/24/release-public-schedule-for-gov-cox-and-lt-gov-henderson-week-of-april-24-2023/'

'success: content added from https://governor.utah.gov/2023/04/21/release-gov-cox-orders-new-coordinated-effort-to-protect-personal-data/'

'success: content added from https://governor.utah.gov/2023/04/21/release-gov-cox-resumes-powers-and-duties-of-governor/'

'success: content added from https://governor.utah.gov/2023/04/21/release-gov-cox-designates-lt-gov-henderson-as-acting-governor-during-medical-procedure/'

'success: content added from https://governor.utah.gov/2023/04/18/advisory-gov-cox-declares-state-of-emergency-due-to-flooding-and-flood-risks/'

'success: content added from https://governor.utah.gov/2023/04/18/gov-cox-ceremonially-signs-bills-focused-on-smart-growth/'

'success: content added from https://governor.utah.gov/2023/04/18/release-gov-spencer-cox-and-lt-gov-hendersons-schedule-april-17-21-2023/'

'success: content added from https://governor.utah.gov/2023/04/14/news-release-gov-cox-names-marvin-dodge-as-new-executive-director-of-department-of-government-operations/'

'success: content added from https://governor.utah.gov/2023/04/14/news-release-utah-named-state-with-best-economic-outlook-for-16th-year-in-a-row/'

'success: content added from https://governor.utah.gov/2023/04/11/gov-cox-ceremonially-signs-bills-focused-on-families/'

'success: content added from https://governor.utah.gov/2023/04/10/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-april-10-14-2023/'

'success: content added from https://governor.utah.gov/2023/04/03/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-april-3-7-2023/'

'success: content added from https://governor.utah.gov/2023/03/27/gov-cox-orders-flags-to-be-lowered-in-honor-of-the-victims-in-nashville-tennessee/'

'success: content added from https://governor.utah.gov/2023/03/27/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-march-27-31-2023/'

'success: content added from https://governor.utah.gov/2023/03/23/gov-cox-signs-bills-focused-on-social-media-in-utah/'

'success: content added from https://governor.utah.gov/2023/03/21/gov-cox-issues-executive-order-outlining-how-to-display-the-historic-state-flag-and-new-state-flag/'

'success: content added from https://governor.utah.gov/2023/02/27/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-feb-27-march-3-2023/'

'success: content added from https://governor.utah.gov/2023/02/21/news-release-legislature-and-governors-office-release-updated-budget-estimates/'

'success: content added from https://governor.utah.gov/2023/02/21/media-release-gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-feb-20-feb-24-2023/'

'success: content added from https://governor.utah.gov/2023/02/16/gov-cox-signs-three-bills-in-the-2023-general-legislative-session/'

'success: content added from https://governor.utah.gov/2023/02/15/media-release-texas-instruments-to-build-new-chip-factory-in-lehi-bringing-hundreds-of-jobs-to-the-state/'

'success: content added from https://governor.utah.gov/2023/02/13/public-schedule-for-gov-cox-and-lt-gov-henderson-feb-13-17-2023/'

'success: content added from https://governor.utah.gov/2023/02/06/gov-cox-issues-statement-on-gas-prices-resources-for-low-income-and-larger-families/'

'success: content added from https://governor.utah.gov/2023/02/06/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-feb-6-feb-10-2023/'

'success: content added from https://governor.utah.gov/2023/02/03/gov-cox-issues-executive-order-to-raise-the-great-salt-lake-causeway-berm/'

'success: content added from https://governor.utah.gov/2023/02/02/gov-cox-signs-eight-bills-in-the-2023-general-legislative-session/'

'success: content added from https://governor.utah.gov/2023/02/01/applications-open-to-serve-on-the-utah-public-service-commission/'

'success: content added from https://governor.utah.gov/2023/02/01/gov-cox-issues-executive-order-to-ensure-data-sharing-between-state-agencies/'

'success: content added from https://governor.utah.gov/2023/01/30/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-30-feb-3-2023/'

'success: content added from https://governor.utah.gov/2023/01/28/gov-spencer-cox-signs-education-funding-transgender-medical-bills/'

'success: content added from https://governor.utah.gov/2023/01/25/utah-governors-spirit-of-service-award/'

'success: content added from https://governor.utah.gov/2023/01/23/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-23-jan-27-2023/'

'success: content added from https://governor.utah.gov/2023/01/19/watch-gov-cox-speaks-to-the-future-of-utah-in-his-2023-state-of-the-state-address/'

'success: content added from https://governor.utah.gov/2023/01/17/news-release-gov-spencer-cox-appoints-jason-nelson-to-2nd-district-court/'

'success: content added from https://governor.utah.gov/2023/01/17/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-16-jan-20-2023/'

'success: content added from https://governor.utah.gov/2023/01/12/news-release-gov-spencer-cox-names-ryan-starks-brad-bonham-to-leadership-roles-in-the-governors-office-of-economic-opportunity/'

'success: content added from https://governor.utah.gov/2023/01/11/gov-spencer-cox-appoints-judge-amy-j-oliver-to-utah-court-of-appeals/'

'success: content added from https://governor.utah.gov/2023/01/10/media-release-gov-cox-hosts-social-media-and-youth-mental-health-symposium/'

'success: content added from https://governor.utah.gov/2023/01/09/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-9-jan-13-2023/'

'success: content added from https://governor.utah.gov/2023/01/05/icymi-what-does-it-mean-to-be-fiscally-responsible-in-utah/'

'success: content added from https://governor.utah.gov/2023/01/04/gov-spencer-j-cox-and-lt-gov-deidre-henderson-schedule-for-jan-2-jan-6-2023/'

## Validate and Clean

In [27]:
#check that content was pulled accurately
combined[combined.content.isna()]

Unnamed: 0,title,date,link,content,page_scraped


In [28]:
# convert to date-time format
combined['date_clean'] = pd.to_datetime(combined['date'])

combined_2023 = combined.drop(columns=['date']).copy()

In [29]:
combined_2023.head()
combined_2023.info()

Unnamed: 0,title,link,content,page_scraped,date_clean
0,Gov. Spencer Cox signs six bills in the 2024 G...,https://governor.utah.gov/2024/01/30/gov-spenc...,"NEWS RELEASE Jan. 30, 2024 Contact: Emma Willi...",2,2024-01-30
1,"Gov. Cox, President Adams, Speaker Schultz iss...",https://governor.utah.gov/2024/01/29/gov-cox-p...,"NEWS RELEASE Jan. 29, 2024 Contacts: Emma Will...",2,2024-01-29
2,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,https://governor.utah.gov/2024/01/29/gov-spenc...,**Events labeled Media Access indicate that an...,2,2024-01-29
3,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,https://governor.utah.gov/2024/01/22/gov-spenc...,**Events labeled Media Access indicate that an...,2,2024-01-22
4,Gov. Spencer Cox signs first bill of the 2024 ...,https://governor.utah.gov/2024/01/19/gov-spenc...,"NEWS RELEASE Jan. 19, 2024 Contact: Emma Willi...",2,2024-01-19


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   title         150 non-null    object        
 1   link          150 non-null    object        
 2   content       150 non-null    object        
 3   page_scraped  150 non-null    int64         
 4   date_clean    150 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 6.0+ KB


In [30]:
print(f"pulled {str(len(combined_2023))} documents from {state} for 2023")

pulled 150 documents from utah for 2023


## Export

In [31]:
combined_2023.to_csv(f'{state}_2023.csv', index = False)

In [32]:
test = pd.read_csv(f'{state}_2023.csv')
test.head()

Unnamed: 0,title,link,content,page_scraped,date_clean
0,Gov. Spencer Cox signs six bills in the 2024 G...,https://governor.utah.gov/2024/01/30/gov-spenc...,"NEWS RELEASE Jan. 30, 2024 Contact: Emma Willi...",2,2024-01-30
1,"Gov. Cox, President Adams, Speaker Schultz iss...",https://governor.utah.gov/2024/01/29/gov-cox-p...,"NEWS RELEASE Jan. 29, 2024 Contacts: Emma Will...",2,2024-01-29
2,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,https://governor.utah.gov/2024/01/29/gov-spenc...,**Events labeled Media Access indicate that an...,2,2024-01-29
3,Gov. Spencer J. Cox and Lt. Gov. Deidre Hender...,https://governor.utah.gov/2024/01/22/gov-spenc...,**Events labeled Media Access indicate that an...,2,2024-01-22
4,Gov. Spencer Cox signs first bill of the 2024 ...,https://governor.utah.gov/2024/01/19/gov-spenc...,"NEWS RELEASE Jan. 19, 2024 Contact: Emma Willi...",2,2024-01-19
