# Idaho Scraping 

In [1]:
# set up 
import pandas as pd 
import requests 
import bs4
import numpy as np
import time 
import re
from tika import parser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from io import BytesIO
from urllib.request import urlopen

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

%xmode Minimal

Exception reporting mode: Minimal


In [2]:
# set headers
# `header_list` is passed to every request in this notebook, so it has to be
# defined here for Restart & Run All to work (it was previously commented out).
# NOTE(review): the original User-Agent value was redacted; the string below is
# a placeholder - replace it with the identifier you want the site to see.
header_list = {'User-Agent': 'Mozilla/5.0 (compatible; press-release-research-scraper)',
               'Accept-Language': 'en-US,en;q=0.9'}

In [3]:
# state - UPDATE EACH TIME
# Used as the filename prefix of the CSV exports (`{state}_links.csv`,
# `{state}_2023.csv`) and in the final summary message.
state = 'idaho'

In [33]:
# review of robots.txt - no restrictions

In [34]:
# test = pd.read_csv(f'{state}_2023.csv')
# test.head()

## Scrape Links 
`Beautiful Soup`

### Try with one link

In [4]:
# access webpage
# The headers must be passed by keyword: the second positional argument of
# requests.get is `params`, which would append them to the URL as a query
# string instead of sending them as HTTP headers.
link = 'https://gov.idaho.gov/pressrelease/'
server_response = requests.get(link, headers=header_list)
server_response

<Response [200]>

In [9]:
# Parse the response and isolate the list of press releases
soup = bs4.BeautifulSoup(server_response.content, features="html.parser")
main_section = soup.find('main', class_ = 'site-main col-sm-12 col-md-9')
results = main_section.find_all('li')

# Pull the three fields of interest out of every list item:
# the anchor carries the title and URL, the <em> tag carries the date
title = [entry.find('a').text for entry in results]
link = [entry.find('a')['href'] for entry in results]
date = [entry.find('em').text for entry in results]

['February 9, 2024',
 'February 9, 2024',
 'February 7, 2024',
 'February 3, 2024',
 'February 1, 2024',
 'January 29, 2024',
 'January 26, 2024',
 'January 25, 2024',
 'January 12, 2024',
 'January 10, 2024',
 'January 8, 2024',
 'January 5, 2024',
 'January 4, 2024',
 'January 2, 2024',
 'December 28, 2023',
 'December 18, 2023',
 'December 15, 2023',
 'December 14, 2023',
 'December 1, 2023',
 'November 29, 2023',
 'November 17, 2023',
 'November 17, 2023',
 'November 15, 2023',
 'November 6, 2023',
 'November 1, 2023',
 'October 3, 2023',
 'September 27, 2023',
 'September 25, 2023',
 'September 22, 2023',
 'September 21, 2023',
 'September 20, 2023',
 'September 7, 2023',
 'September 6, 2023',
 'August 18, 2023',
 'August 16, 2023',
 'August 11, 2023',
 'August 11, 2023',
 'August 10, 2023',
 'August 10, 2023',
 'August 1, 2023',
 'July 19, 2023',
 'July 18, 2023',
 'July 18, 2023',
 'July 6, 2023',
 'June 15, 2023',
 'June 15, 2023',
 'June 8, 2023',
 'June 5, 2023',
 'May 25, 20

### Scrape all links

In [11]:
# define webpage scraping 
def scrape_idaho_links(website, headers):
    """Scrape the press-release index page into a DataFrame of links.

    Parameters
    ----------
    website : str
        URL of the index page that lists the press releases.
    headers : dict
        HTTP headers to send with the request.

    Returns
    -------
    pandas.DataFrame
        One row per list item with the columns ``title``, ``date`` and
        ``link`` (taken from the item's ``<a>`` and ``<em>`` tags), plus a
        ``content`` column initialised to NaN for later filling.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    ValueError
        If the page does not contain the expected ``<main>`` container.
    """
    # Headers must be passed by keyword: the second positional argument of
    # requests.get is `params` (query string), not `headers`.
    server_response = requests.get(website, headers=headers, timeout=30)
    server_response.raise_for_status()

    soup = bs4.BeautifulSoup(server_response.content, features="html.parser")
    main_section = soup.find('main', class_='site-main col-sm-12 col-md-9')
    if main_section is None:
        raise ValueError(f"no press-release listing found at {website}")

    results_list = []
    for item in main_section.find_all('li'):
        anchor = item.find('a')
        date_tag = item.find('em')
        # Skip list items that lack a link or a date (not a press-release entry)
        if anchor is None or date_tag is None:
            continue
        results_list.append({'title': anchor.text,
                             'date': date_tag.text,
                             'link': anchor['href']})

    # Fixing the columns keeps the frame well-formed even when nothing matched
    df = pd.DataFrame(results_list, columns=['title', 'date', 'link'])
    df['content'] = np.nan
    return df

In [16]:
# Scrape the press-release index page into a DataFrame of title/date/link rows.
link_df = scrape_idaho_links('https://gov.idaho.gov/pressrelease/', header_list)

In [17]:
# Sanity-check the scrape: dtypes and non-null counts, then the first rows.
link_df.info()
link_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552 entries, 0 to 551
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   title    552 non-null    object 
 1   date     552 non-null    object 
 2   link     552 non-null    object 
 3   content  0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 17.4+ KB


Unnamed: 0,title,date,link,content
0,Gov. Little applauds introduction of school fa...,"February 9, 2024",https://gov.idaho.gov/pressrelease/gov-little-...,
1,Gov. Little orders lowering of flags to honor ...,"February 9, 2024",https://gov.idaho.gov/pressrelease/gov-little-...,
2,Gov. Little highlights recent trip to Texas-Me...,"February 7, 2024",https://gov.idaho.gov/pressrelease/gov-little-...,
3,WATCH: Governors’ press conference at Texas-Me...,"February 3, 2024",https://gov.idaho.gov/pressrelease/watch-gover...,
4,"Gov. Little to visit border, join 14 governors...","February 1, 2024",https://gov.idaho.gov/pressrelease/gov-little-...,


In [18]:
# Save the scraped link table (all years, before filtering) without the index.
link_df.to_csv(f'{state}_links.csv', index = False)

In [19]:
# subset to 2023 only since this has multiple years already
# Built as a single non-mutating chain so the cell can be re-run safely: the
# previous version dropped `date` from `link_df` in place, so running the cell
# a second time raised KeyError: 'date'. `link_df` is now left untouched.
links_2023 = (
    link_df
    .assign(date_clean=pd.to_datetime(link_df['date']))  # parse 'February 9, 2024'-style strings
    .drop(columns=['date'])
    .loc[lambda frame: frame['date_clean'].between('2023-01-01', '2023-12-31')]  # inclusive bounds
    .copy()
)

In [22]:
# Confirm the 2023 subset: row count, columns and dtypes.
links_2023.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 14 to 91
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       78 non-null     object        
 1   link        78 non-null     object        
 2   content     0 non-null      float64       
 3   date_clean  78 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 3.0+ KB


## Scrape content from links
`Beautiful Soup`

### Test with one link

In [94]:
# define link scraping 
def get_idaho_content(link, df, header_list):
    """Fetch one press release and store its text in ``df`` (modified in place).

    Parameters
    ----------
    link : str
        URL of the press release; must match a value in ``df['link']``.
    df : pandas.DataFrame
        Table with ``link`` and ``content`` columns. The ``content`` cell(s)
        of the matching row(s) are filled with the extracted text.
    header_list : dict
        HTTP headers to send with the request.

    Returns
    -------
    str
        A status message. Rows that already hold content are not fetched
        again. Any failure (unknown link, HTTP/network error, nothing
        extracted) is reported as ``"unable to gather content from ..."``
        instead of being silently swallowed.
    """
    try:
        row_mask = df['link'] == link
        # An empty selection would make `.all()` below vacuously True and be
        # misreported as "already existing", so handle it explicitly.
        if not row_mask.any():
            return f"unable to gather content from {link} due to link not found in df"

        # check if content has already been pulled
        if df.loc[row_mask, 'content'].notnull().all():
            return f'content already existing for {link}'

        server_response = requests.get(link, headers=header_list, timeout=30)
        server_response.raise_for_status()
        soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
        blocks = soup_link.select('div.col-12:not([aria-label="social-media"]):has(p)')

        # Collapse each run of whitespace (newlines, non-breaking spaces, ...)
        # to a single space. Deleting them outright fused the last word of one
        # paragraph with the first word of the next ("Tuesday.Flags").
        cleaned = [" ".join(str(block.text).split()) for block in blocks]
        result_merged = " ".join(text for text in cleaned if text)
        if not result_merged:
            return f"unable to gather content from {link} due to no matching content on page"

        # The column may still be all-NaN float64; cast it before storing text,
        # since pandas deprecates writing strings into a float column.
        if df['content'].dtype != object:
            df['content'] = df['content'].astype(object)
        df.loc[row_mask, 'content'] = result_merged
        return f"success: content added from {link}"
    except Exception as e:
        return f"unable to gather content from {link} due to {e}"

In [1]:
# access content
# Single hard-coded press release used to develop the parsing logic below.
# Requires the imports cell and `header_list` to have been run first.
link = 'https://gov.idaho.gov/pressrelease/gov-little-orders-lowering-of-flags-to-honor-capt-benjamin-moulton/'
server_response = requests.get(link, headers = header_list)
server_response

NameError: name 'requests' is not defined

In [93]:
# Parse the fetched page and select the blocks holding the release body:
# div.col-12 elements that contain a <p> and are not the social-media block
soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
result = soup_link.select('div.col-12:not([aria-label="social-media"]):has(p)')

# Flatten each block to plain text, then strip non-breaking spaces and newlines
string_result = [str(block.text) for block in result]
string_result_clean = [
    text.replace(u'\xa0', u'').replace(u'\n', u'')
    for text in string_result
]

# Join all blocks into one string and display it
result_merged = " ".join(string_result_clean)
result_merged

'Boise, Idaho - Governor Brad Little has ordered U.S. flags and State of Idaho flags to be lowered to half-staff to honor 27-year-old Capt. Benjamin Moulton of Emmett, one of the five U.S. Marines killed in a helicopter crash in southern California during a training exercise Tuesday.Flags will be lowered immediately until sunset on Tuesday, Feb. 13, 2024.The flags flown over the Capitol building in honor of Capt. Moulton will be presented to his parents."Capt. Benjamin Moulton and the four other U.S. Marines killed Tuesday gave everything for our country and our freedom. The tragic news of their deaths while they were training to fight our enemies weighs heavy on our hearts. Capt. Moulton was an Idaho native with roots dating back to the 1880s. Throughout his life, Capt. Moulton made his family, community, and state proud with his many achievements and his devotion to the U.S. Marine Corps and our great country. Idahoans, please pray for the loved ones of these heroes, and join me in h

### Scrape content from all links

In [95]:
# test on the first 6 links (slice 0:6) to see if it works
# uncomment the next line to reset previously scraped content before re-testing
# links_2023.content = np.nan

for i in links_2023.link[0:6]:
     get_idaho_content(i, links_2023, header_list)
    
links_2023.head()

'success: content added from https://gov.idaho.gov/pressrelease/idaho-ranks-first-for-income-growth/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-superintendent-critchfield-reflect-on-empowering-parents-success/'

'success: content added from https://gov.idaho.gov/pressrelease/opinion-ditch-the-secret-backroom-dam-breaching-deals-lets-focus-on-common-ground/'

'success: content added from https://gov.idaho.gov/pressrelease/100-million-for-local-roads-rolls-out-as-part-of-historic-transportation-funding-solution/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-littles-mexico-trade-mission-strengthens-trade-opportunities-for-idaho-businesses/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-homeowners-receive-average-18-percent-property-tax-reduction/'

Unnamed: 0,title,link,content,date_clean
14,Idaho ranks first for income growth,https://gov.idaho.gov/pressrelease/idaho-ranks...,"Boise, Idaho – Governor Brad Little commented ...",2023-12-28
15,"Gov. Little, Superintendent Critchfield reflec...",https://gov.idaho.gov/pressrelease/gov-little-...,"Boise, Idaho – Governor Brad Little and Superi...",2023-12-18
16,OPINION: Ditch the secret backroom dam breachi...,https://gov.idaho.gov/pressrelease/opinion-dit...,By Governor Brad Little and Lt. Governor Scott...,2023-12-15
17,$100 million for local roads rolls out as part...,https://gov.idaho.gov/pressrelease/100-million...,"Boise, Idaho – The historic transportation fun...",2023-12-14
18,Gov. Little’s Mexico trade mission strengthens...,https://gov.idaho.gov/pressrelease/gov-littles...,"Boise, Idaho – Governor Brad Little completed ...",2023-12-01


In [96]:
# do for all
# Links already filled by the test cell above are not fetched again: the
# function returns its "content already existing" message for them.
for i in links_2023.link:
     get_idaho_content(i, links_2023, header_list)

'content already existing for https://gov.idaho.gov/pressrelease/idaho-ranks-first-for-income-growth/'

'content already existing for https://gov.idaho.gov/pressrelease/gov-little-superintendent-critchfield-reflect-on-empowering-parents-success/'

'content already existing for https://gov.idaho.gov/pressrelease/opinion-ditch-the-secret-backroom-dam-breaching-deals-lets-focus-on-common-ground/'

'content already existing for https://gov.idaho.gov/pressrelease/100-million-for-local-roads-rolls-out-as-part-of-historic-transportation-funding-solution/'

'content already existing for https://gov.idaho.gov/pressrelease/gov-littles-mexico-trade-mission-strengthens-trade-opportunities-for-idaho-businesses/'

'content already existing for https://gov.idaho.gov/pressrelease/idaho-homeowners-receive-average-18-percent-property-tax-reduction/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-capitol-christmas-tree-lighting-takes-place-nov-30/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-to-answer-policy-questions-during-monday-tele-townhall/'

'success: content added from https://gov.idaho.gov/pressrelease/health-and-welfare-director-dave-jeppesen-to-retire-dean-cameron-named-interim-director/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-appoints-district-judge-cynthia-meyer-to-the-idaho-supreme-court/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-highlights-apprenticeships-and-launch-proclaims-november-idaho-apprenticeship-month/'

'success: content added from https://gov.idaho.gov/pressrelease/opinion-launch-is-live/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-appoints-cody-brower-to-sixth-judicial-district/'

'success: content added from https://gov.idaho.gov/pressrelease/american-red-cross-blood-donations-needed-2/'

'success: content added from https://gov.idaho.gov/pressrelease/carmen-to-host-governor-littles-capital-for-a-day-event/'

'success: content added from https://gov.idaho.gov/pressrelease/launch-grants-go-live-oct-3-sign-up-now-for-alerts/'

'success: content added from https://gov.idaho.gov/pressrelease/eric-fredericksen-to-lead-new-office-of-the-state-public-defender/'

'success: content added from https://gov.idaho.gov/pressrelease/state-of-idaho-starts-to-deploy-property-tax-relief-with-funds-to-local-school-districts/'

'success: content added from https://gov.idaho.gov/pressrelease/steven-bailey-to-lead-idaho-department-of-administration/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-announces-senior-level-staff-changes/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-appoints-annie-mcdevitt-to-fourth-judicial-district/'

'success: content added from https://gov.idaho.gov/pressrelease/st-maries-to-host-governor-littles-capital-for-a-day-event/'

'success: content added from https://gov.idaho.gov/pressrelease/fentanyl-takes-all-campaign-improving-awareness-on-dangers-of-fentanyl/'

'success: content added from https://gov.idaho.gov/pressrelease/governors-water-summit-reveals-success-stories-challenges-in-looking-at-surface-and-ground-water-statewide/'

'success: content added from https://gov.idaho.gov/pressrelease/mathew-weaver-to-lead-idaho-department-of-water-resources/'

'success: content added from https://gov.idaho.gov/pressrelease/governors-water-summit-to-discuss-trends-investments-in-statewide-water-supply/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-ends-fiscal-year-with-budget-surplus-idahoans-to-receive-total-of-300-million-in-property-tax-relief/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-panel-accepting-applications-for-new-state-public-defender-position/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-appoints-paul-woods-to-state-tax-commission/'

'success: content added from https://gov.idaho.gov/pressrelease/lawmaker-roundtable-focuses-on-lessons-learned-from-state-police-deployment-to-u-s-mexico-border/'

'success: content added from https://gov.idaho.gov/pressrelease/governors-cup-scholarship-recipients-announced/'

'success: content added from https://gov.idaho.gov/pressrelease/little-critchfield-roll-out-the-story-of-america-history-curriculum-for-idaho-schools/'

'success: content added from https://gov.idaho.gov/pressrelease/st-anthony-to-host-governor-littles-capital-for-a-day-event/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-highlights-border-trip-commits-more-action-to-fight-fentanyl/'

'success: content added from https://gov.idaho.gov/pressrelease/parent-advisory-council-members-chosen-for-empowering-parents-grant-program/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-details-idaho-state-police-deployment-to-support-texas-in-fight-against-fentanyl/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-sues-feds-on-grizzly-delisting/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-marks-fentanyl-awareness-day-readies-for-texas-border-mission/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-stops-millions-in-tax-fraud/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-is-training-loggers-to-fight-forest-fires/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-marks-100-days-of-achievement/'

'success: content added from https://gov.idaho.gov/pressrelease/malad-city-to-host-governor-littles-capital-for-a-day-event/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-applauds-productive-legislative-session/'

'success: content added from https://gov.idaho.gov/pressrelease/opinion-working-together-we-got-simple-property-tax-relief-done-right/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-further-advances-education-freedom/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-champions-simpler-property-tax-relief-for-idahoans/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-lets-get-property-tax-relief-done-right/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-applauds-senate-passage-of-launch-to-train-students-for-in-demand-careers/'

'success: content added from https://gov.idaho.gov/pressrelease/jfac-advances-governors-recommended-rainy-day-fund-transfers-new-reserve-funds-for-fire-suppression/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-signs-idaho-first-broadband-investments-into-law/'

'success: content added from https://gov.idaho.gov/pressrelease/jfac-advances-governors-recommendation-for-improved-water-infrastructure/'

'success: content added from https://gov.idaho.gov/pressrelease/jfac-advances-governors-transportation-budget-recommendation/'

'success: content added from https://gov.idaho.gov/pressrelease/jfac-advances-governors-recommendation-for-outdoor-recreation/'

'success: content added from https://gov.idaho.gov/pressrelease/jfac-advances-governors-building-maintenance-budget-recommendation/'

'success: content added from https://gov.idaho.gov/pressrelease/jfac-advances-governors-public-schools-budget-recommendation/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-speaks-at-cpac-on-restoring-americas-energy-dominance/'

'success: content added from https://gov.idaho.gov/pressrelease/gooding-to-host-governor-littles-capital-for-a-day-event/'

'success: content added from https://gov.idaho.gov/pressrelease/public-events-set-to-honor-the-life-of-governor-phil-batt/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-announces-appointments-to-idaho-park-and-recreation-board/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-orders-lowering-of-flags-to-honor-former-idaho-gov-phil-batt/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-leaders-announce-parent-advisory-council-for-empowering-parents-grant-program/'

'success: content added from https://gov.idaho.gov/pressrelease/opinion-defending-state-sovereignty-over-land-water-wildlife-is-in-idahos-dna/'

'success: content added from https://gov.idaho.gov/pressrelease/governors-president-biden-owes-the-american-people-answers/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-to-visit-with-high-schoolers-about-proposed-launch-workforce-training-program/'

'success: content added from https://gov.idaho.gov/pressrelease/bruneau-to-host-governor-littles-capital-for-a-day-event/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-republican-governors-praise-gov-sanders-solutions-based-message-to-nation/'

'success: content added from https://gov.idaho.gov/pressrelease/little-bedke-crapo-risch-simpson-raise-concerns-about-lava-ridge-wind-farm-proposal-on-federal-land/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-notifies-biden-administration-of-idahos-intent-to-sue-on-grizzly-delisting/'

'success: content added from https://gov.idaho.gov/pressrelease/wall-street-journal-highlights-idahos-historic-tax-cuts/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-leads-governors-in-opposing-bidens-epa-waters-rule/'

'success: content added from https://gov.idaho.gov/pressrelease/idaho-wins-lawsuit-protects-life-and-property-from-illegal-encampments/'

'success: content added from https://gov.idaho.gov/pressrelease/weiser-to-host-gov-littles-capital-for-a-day-event/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-appoints-edward-lodge-to-public-utilities-commission/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-announces-staff-changes-2/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-appoints-new-agriculture-director-following-retirement-of-celia-gould/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-rolls-out-idaho-first-plan-to-support-schools-provide-property-tax-relief-fight-fentanyl-in-2023-state-of-the-state-and-budget-address/'

'success: content added from https://gov.idaho.gov/pressrelease/gov-little-publicly-sworn-in-for-second-term-as-idahos-33rd-governor/'

'success: content added from https://gov.idaho.gov/pressrelease/idahoans-invited-to-watch-gov-little-inauguration-and-2023-state-of-the-state-address/'

## Validate and Clean

In [97]:
#check that content was pulled accurately
# Shows the rows whose content is still missing; an empty result means every
# link has some content (this does not verify the text itself).
links_2023[links_2023.content.isna()]

Unnamed: 0,title,link,content,date_clean


In [101]:
# Spot-check the first and last rows of the scraped table.
links_2023.head()
links_2023.tail()

Unnamed: 0,title,link,content,date_clean
14,Idaho ranks first for income growth,https://gov.idaho.gov/pressrelease/idaho-ranks...,"Boise, Idaho – Governor Brad Little commented ...",2023-12-28
15,"Gov. Little, Superintendent Critchfield reflec...",https://gov.idaho.gov/pressrelease/gov-little-...,"Boise, Idaho – Governor Brad Little and Superi...",2023-12-18
16,OPINION: Ditch the secret backroom dam breachi...,https://gov.idaho.gov/pressrelease/opinion-dit...,By Governor Brad Little and Lt. Governor Scott...,2023-12-15
17,$100 million for local roads rolls out as part...,https://gov.idaho.gov/pressrelease/100-million...,"Boise, Idaho – The historic transportation fun...",2023-12-14
18,Gov. Little’s Mexico trade mission strengthens...,https://gov.idaho.gov/pressrelease/gov-littles...,"Boise, Idaho – Governor Brad Little completed ...",2023-12-01


Unnamed: 0,title,link,content,date_clean
87,Gov. Little announces staff changes,https://gov.idaho.gov/pressrelease/gov-little-...,"Boise, Idaho – Governor Brad Little announced ...",2023-01-11
88,Gov. Little appoints new Agriculture Director ...,https://gov.idaho.gov/pressrelease/gov-little-...,"Boise, Idaho – Governor Brad Little announced ...",2023-01-10
89,Gov. Little rolls out ‘Idaho First’ plan to su...,https://gov.idaho.gov/pressrelease/gov-little-...,"Boise, Idaho – Governor Brad Little delivered ...",2023-01-09
90,Gov. Little publicly sworn in for second term ...,https://gov.idaho.gov/pressrelease/gov-little-...,"Boise, Idaho – Governor Brad Little was public...",2023-01-06
91,Idahoans invited to watch Gov. Little Inaugura...,https://gov.idaho.gov/pressrelease/idahoans-in...,"Boise, Idaho – Governor Little invites Idahoan...",2023-01-03


In [102]:
# Report how many documents ended up in the 2023 table
print(f"pulled {len(links_2023)} documents from {state} for 2023")

pulled 78 documents from idaho for 2023


## Export

In [103]:
# export
# Write the 2023 table to `{state}_2023.csv` without the DataFrame index.
links_2023.to_csv(f'{state}_2023.csv', index = False)

In [104]:
# test = pd.read_csv(f'{state}_2023.csv')
# test.head()

Unnamed: 0,title,link,content,date_clean
0,Idaho ranks first for income growth,https://gov.idaho.gov/pressrelease/idaho-ranks...,"Boise, Idaho – Governor Brad Little commented ...",2023-12-28
1,"Gov. Little, Superintendent Critchfield reflec...",https://gov.idaho.gov/pressrelease/gov-little-...,"Boise, Idaho – Governor Brad Little and Superi...",2023-12-18
2,OPINION: Ditch the secret backroom dam breachi...,https://gov.idaho.gov/pressrelease/opinion-dit...,By Governor Brad Little and Lt. Governor Scott...,2023-12-15
3,$100 million for local roads rolls out as part...,https://gov.idaho.gov/pressrelease/100-million...,"Boise, Idaho – The historic transportation fun...",2023-12-14
4,Gov. Little’s Mexico trade mission strengthens...,https://gov.idaho.gov/pressrelease/gov-littles...,"Boise, Idaho – Governor Brad Little completed ...",2023-12-01
