# Oregon Scraping 

In [1]:
# set up 
import pandas as pd 
import requests 
import bs4
import numpy as np
import time 
import re
from tika import parser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from io import BytesIO
from urllib.request import urlopen

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

%xmode Minimal

Exception reporting mode: Minimal


In [2]:
# # set headers
# header_list = {'User-Agent': REDACTED,
#               'Accept-Language': 'en-US,en;q=0.9'}

In [3]:
# state - UPDATE EACH TIME
state = 'oregon'

In [None]:
# review of robots.txt - no restrictions

In [None]:
# test = pd.read_csv(f'{state}_2023.csv')
# test.head()

## Scrape links 
`Selenium`

In [31]:
driver = webdriver.Chrome()
driver.get('https://www.oregon.gov/newsroom/Pages/Agency.aspx?page=0&pageSize=10&agency=GOV&year=2023')

wait = WebDriverWait(driver, 3)

df_list = []

def scrape_page():
    result = wait.until(EC.presence_of_all_elements_located((By.XPATH, 
                                                              '//*[@id="main-content"]')))
    articles = driver.find_elements(By.TAG_NAME, 'article')

    results_list = [{'title': i.find_element(By.TAG_NAME, 'a').text,
                'date': i.find_element(By.CLASS_NAME, 'or-newsroom-article-date').text, 
                'link': i.find_element(By.TAG_NAME, 'a').get_attribute('href')} for i in articles]

    df = pd.DataFrame.from_dict(results_list)
    df_list.append(df)

scrape_page()

# pagination
pagination = driver.find_element(By.XPATH, '//ul[contains(@class, "pagination")]')
pages = pagination.find_elements(By.TAG_NAME, 'li')
last_page = int(pages[-2].text)

current_page = 1

while current_page <= last_page:
    time.sleep(2)
    scrape_page()
    current_page = current_page + 1
    try:
        next_page = driver.find_element(By.XPATH, '//*[@id="next_span"]')
        next_page.click()
    except:
        pass
    
driver.close()

In [32]:
len(df_list)

7

In [33]:
# concat
combined = pd.concat(df_list)
combined

Unnamed: 0,title,date,link
0,Governor Kotek Appoints Klamath County Distric...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
1,Governor Kotek Issues Statement on Passage of ...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
2,Governor Kotek Issues Statement on Siletz Cons...,"December 20, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
3,Governor Kotek Orders Flags Lowered in Memory ...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
4,Governor Kotek and First Lady Reflect on One O...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
...,...,...,...
18,Governor Tina Kotek Officially Declares Homele...,"January 10, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
19,Governor Tina Kotek Announces Her First Action...,"January 09, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
20,Governor-Elect Tina Kotek Announces Senior Sta...,"January 09, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...
21,Governor-Elect Tina Kotek Announces New Educat...,"January 09, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...


In [53]:
# add empty content column 
combined['content'] = np.nan
combined.head()

Unnamed: 0,title,date,link,content
0,Governor Kotek Appoints Klamath County Distric...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
1,Governor Kotek Issues Statement on Passage of ...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
2,Governor Kotek Issues Statement on Siletz Cons...,"December 20, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
3,Governor Kotek Orders Flags Lowered in Memory ...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
4,Governor Kotek and First Lady Reflect on One O...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,


In [54]:
# export
combined.to_csv(f'{state}_links.csv', index = False)

In [55]:
test = pd.read_csv(f'{state}_links.csv')
test.head()

Unnamed: 0,title,date,link,content
0,Governor Kotek Appoints Klamath County Distric...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
1,Governor Kotek Issues Statement on Passage of ...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
2,Governor Kotek Issues Statement on Siletz Cons...,"December 20, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
3,Governor Kotek Orders Flags Lowered in Memory ...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,
4,Governor Kotek and First Lady Reflect on One O...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,


## Scrape content from links
`Beautiful Soup`

### Test with one link

In [37]:
# test with one
link = 'https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=207105'
server_response = requests.get(link, headers = header_list)
server_response

<Response [200]>

In [46]:
# test with one contd 
soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
result = soup_link.find('div', class_ = 'col-md-9 or-newsroom-article-main-content')
string_result = [str(i.text) for i in result]
string_result_clean = [i.replace(u'\xa0', u'') for i in string_result]
string_result_clean = [i.replace(u'\n', u'') for i in string_result_clean]
result_merged = " ".join(string_result_clean)
result_merged

'   Salem, OR—Media availability to be held on Tuesday, Kotek’s first full day in office[Salem, OR] - Tina Kotek will be sworn in as the 39th Governor of the State of Oregon on Monday, January 9, 2023.Chief Justice Meagan A. Flynn will administer the Oath of Office, and then the new Governor will give her inaugural address before a Joint Session of the Oregon Legislative Assembly.MONDAY: Tina Kotek to be sworn in as Oregon GovernorWhat: Joint Session to receive the Inaugural Address of the Honorable Tina Kotek, Governor-Elect of the State of OregonWhen: Monday, January 9, 1:00 pmWhere: Oregon State Capitol, House ChamberMedia members should plan to enter through the center doors on State Street on the south side of the Capitol. This is the same entrance for the general public. All who enter the Capitol must pass through a metal detector and bags will be scanned by on-site security.For the Joint Session in the House chamber, the only space available for media members are the six spots i

In [None]:
# define link scraping 
def get_oregon_content(link, df, header_list): 
    try: 
        # check if content has already been pulled
        result = df.loc[df.link == link, 'content'].notnull().all()
        if result:
            return f'content already existing for {link}' 
        else: 
            try: 
                server_response = requests.get(link, headers = header_list)
                soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
                soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
                result = soup_link.find('div', class_ = 'col-md-9 or-newsroom-article-main-content')
                string_result = [str(i.text) for i in result]
                string_result_clean = [i.replace(u'\xa0', u'') for i in string_result]
                string_result_clean = [i.replace(u'\n', u'') for i in string_result_clean]
                result_merged = " ".join(string_result_clean)
                
                df.loc[df.link == link, 'content'] = result_merged
                return f"success: content added from {link}"
            except: 
                pass
    except Exception as e: 
        return f"unable to gather content from {link} due to {e}"
    #return df_updated 

In [57]:
# test on 5 to see if it works
# links_2023.content = np.nan

for i in combined.link[0:6]:
     get_oregon_content(i, combined, header_list)
    
combined.head()

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215081'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215074'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215071'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203173'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203171'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203169'

Unnamed: 0,title,date,link,content
0,Governor Kotek Appoints Klamath County Distric...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek announ..."
1,Governor Kotek Issues Statement on Passage of ...,"December 21, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek made t..."
2,Governor Kotek Issues Statement on Siletz Cons...,"December 20, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek made t..."
3,Governor Kotek Orders Flags Lowered in Memory ...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek ordere..."
4,Governor Kotek and First Lady Reflect on One O...,"December 18, 2023",https://www.oregon.gov/newsroom/Pages/NewsDeta...,Governor delivers on promise to visit all 36 ...


### Scrape content from all links

In [58]:
for i in combined.link:
     get_oregon_content(i, combined, header_list)

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215081'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215074'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215071'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203173'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203171'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203169'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203167'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203159'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203157'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203153'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203183'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203991'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204665'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203925'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204651'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203921'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204631'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204625'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203331'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203325'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204581'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204579'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204563'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204573'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204561'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204553'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204551'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204539'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203621'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204517'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215081'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215074'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215071'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203173'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203171'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203169'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203167'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203159'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203157'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203153'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203183'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203991'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204665'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203925'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204651'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203921'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204631'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204625'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203331'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203325'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204581'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204579'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204563'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204573'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204561'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204553'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204551'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204539'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203621'

'content already existing for https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204517'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204509'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203377'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204489'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204487'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204479'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204469'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204467'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204465'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204455'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204417'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204411'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204413'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204299'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204277'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204279'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204271'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204263'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204259'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204239'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204235'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204199'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204181'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203179'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204453'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204445'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204441'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204431'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204429'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204425'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204407'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203735'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203733'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203729'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203727'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203725'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204401'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204395'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204393'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204363'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204361'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204347'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204323'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204315'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204307'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204319'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203665'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203659'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204251'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203595'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204243'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204183'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204163'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204161'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203575'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203573'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203569'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203567'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203565'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204151'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204133'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204127'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204029'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203207'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203175'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204045'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204541'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204529'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204527'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204507'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204115'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203545'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203529'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204107'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204103'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203473'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204097'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204091'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204081'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203443'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204077'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204075'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=204073'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206243'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206201'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206181'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206157'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206123'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206121'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206119'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206083'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206075'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206043'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203279'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205975'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205953'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205959'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205957'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205945'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205937'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205927'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205917'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205815'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205887'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205881'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205879'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205889'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205853'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205843'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205841'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205831'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203675'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203639'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203637'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205793'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205771'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205761'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205765'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205749'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205737'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205731'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205729'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205389'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205459'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205723'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205721'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205709'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205689'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205687'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205647'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205679'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205651'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205383'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205257'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205639'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205635'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=215061'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=205949'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=206455'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=209005'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=209019'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203511'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=203681'

'success: content added from https://www.oregon.gov/newsroom/Pages/NewsDetail.aspx?newsid=207105'

## Validate and Clean

In [59]:
#check that content was pulled accurately
combined[combined.content.isna()]

Unnamed: 0,title,date,link,content


In [60]:
# convert to date-time format
combined['date_clean'] = pd.to_datetime(combined['date'])

combined_2023 = combined.drop(columns=['date']).copy()

In [62]:
combined_2023.head()
combined_2023.info()

Unnamed: 0,title,link,content,date_clean
0,Governor Kotek Appoints Klamath County Distric...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek announ...",2023-12-21
1,Governor Kotek Issues Statement on Passage of ...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek made t...",2023-12-21
2,Governor Kotek Issues Statement on Siletz Cons...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek made t...",2023-12-20
3,Governor Kotek Orders Flags Lowered in Memory ...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek ordere...",2023-12-18
4,Governor Kotek and First Lady Reflect on One O...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,Governor delivers on promise to visit all 36 ...,2023-12-18


<class 'pandas.core.frame.DataFrame'>
Int64Index: 203 entries, 0 to 22
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       203 non-null    object        
 1   link        203 non-null    object        
 2   content     203 non-null    object        
 3   date_clean  203 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 7.9+ KB


In [63]:
print(f"pulled {str(len(combined_2023))} documents from {state} for 2023")

pulled 203 documents from oregon for 2023


## Export

In [64]:
combined_2023.to_csv(f'{state}_2023.csv', index = False)

In [65]:
test = pd.read_csv(f'{state}_2023.csv')
test.head()

Unnamed: 0,title,link,content,date_clean
0,Governor Kotek Appoints Klamath County Distric...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek announ...",2023-12-21
1,Governor Kotek Issues Statement on Passage of ...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek made t...",2023-12-21
2,Governor Kotek Issues Statement on Siletz Cons...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek made t...",2023-12-20
3,Governor Kotek Orders Flags Lowered in Memory ...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,"Salem, OR—Today, Governor Tina Kotek ordere...",2023-12-18
4,Governor Kotek and First Lady Reflect on One O...,https://www.oregon.gov/newsroom/Pages/NewsDeta...,Governor delivers on promise to visit all 36 ...,2023-12-18
