# Minnesota Scraping 

In [5]:
# set up 
import pandas as pd 
import requests 
import bs4
import numpy as np
import time 
import re
from tika import parser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from io import BytesIO
from urllib.request import urlopen
from selenium.webdriver.chrome.options import Options

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

%xmode Minimal

Exception reporting mode: Minimal


In [15]:
# # set headers
# header_list = {'User-Agent': REDACTED,
#               'Accept-Language': 'en-US,en;q=0.9'}

In [7]:
# state - UPDATE EACH TIME
state = 'minnesota'

In [None]:
# review of robots.txt - Request-rate: 10

In [None]:
# test = pd.read_csv(f'{state}_2023.csv')
# test.head()

## Scrape Links
`Selenium`

### Test with one link

`Beautiful Soup` **Test**

In [8]:
# access webpage
link = 'https://mn.gov/governor/newsroom/press-releases/#/list/appId/1/filterType//filterValue//page/5/sort//order/'
server_response = requests.get(link, header_list)
server_response

<Response [200]>

In [9]:
# Parse and isolate content - DIDN'T WORK 
soup = bs4.BeautifulSoup(server_response.content, features="html.parser")
results = soup.find('div', class_ = 'row ng-scope')
#find_all('li', class_='ng-scope')
results

# title = [i.find('h2').text for i in results]
# link = [i.find('a')['href'] for i in results]
# date = [i.find('span', class_='postDate').text for i in results]

### Scrape all links

In [None]:
# manual review of pages need 5-35 for 2023
df_list = []

for page in range(5, 36):
    driver = webdriver.Chrome()
    driver.get(f'https://mn.gov/governor/newsroom/press-releases/#/list/appId/1/filterType//filterValue//page/{page}/sort//order/')

    wait = WebDriverWait(driver, 10)

    wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="dynamic-item-list-1"]/div[1]/ul')))

    section = driver.find_element(By.XPATH, '//*[@id="dynamic-item-list-1"]/div[1]/ul')

    results = section.find_elements(By.TAG_NAME, 'li')

    results_list = [{'title': i.find_element(By.TAG_NAME, 'a').text,
                    'date': i.find_element(By.CLASS_NAME, 'meta').text, 
                    'link': i.find_element(By.TAG_NAME, 'a').get_attribute('href')} for i in results]

    df = pd.DataFrame.from_dict(results_list)
    df_list.append(df)
    driver.close()

In [None]:
len(df_list)

In [None]:
# combine df list into one df 
combined = pd.concat(df_list)
combined

In [None]:
# create blank column for use later 
combined['content'] = np.nan
combined.head()

In [None]:
# safety save
combined.to_csv(f'{state}_links.csv', index = False)

## Scrape content from  links 
`Selenium`

### Try with one link

`Beautiful Soup` **Test**

In [10]:
# access webpage
link = 'https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/559399'
server_response = requests.get(link, headers = header_list)
server_response

<Response [200]>

In [13]:
# parse and access content - DIDN'T WORK
soup_link = bs4.BeautifulSoup(server_response.content, features="html.parser")
result = soup_link.find('div', class_ = 'ng-binding')
result

In [7]:
# try selenium 
def get_minnesota_content(link, df): 
    try: 
        # check if content has already been pulled
        result = df.loc[df.link == link, 'content'].notnull().all()
        
        if result:
            return f'content already existing for {link}' 
        else: 
            try: 
                driver = webdriver.Chrome()
                driver.get(link)
                time.sleep(3)
                
                wait = WebDriverWait(driver, 10)

                wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="dynamic-item-list-1"]/div[1]/div')))

                results = driver.find_element(By.XPATH,'//*[@id="dynamic-item-list-1"]/div[1]/div')

                p_results = results.find_elements(By.TAG_NAME, 'p') 

                p_text = [i.text for i in p_results]

                li_results = results.find_elements(By.TAG_NAME, 'li') 

                if li_results is not None: 
                    li_text = ["--" + i.text for i in li_results]
                    p_text.extend(li_text)
                else: 
                    pass

                result_merged = " ".join(p_text)
                result_merged
                driver.quit()
                df.loc[df.link == link, 'content'] = result_merged
                return f"success: content added from {link}"
            except: 
                pass
    except Exception as e: 
        return f"unable to gather content from {link} due to {e}"

In [14]:
# test on one link again 
driver = webdriver.Chrome()
driver.get(combined.link[0])
time.sleep(3)

### Scrape content from all links (in chunks)

In [None]:
# first chunk 
for i in combined.link[0:100]:
     get_minnesota_content(i, combined) 

In [None]:
# safety save first chunk 
combined.to_csv('minnesota_0_100.csv', index = False)

In [9]:
# second chunk 
for i in combined.link[100:200]:
     get_minnesota_content(i, combined) 

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/590741'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/590739'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/590738'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/589989'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/589902'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/589904'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/589903'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/589905'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/587743'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/587744'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/587192'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/587021'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586876'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/587191'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586649'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586648'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586647'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586646'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586645'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586053'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586050'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586049'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/586048'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585802'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585411'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585410'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585409'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585408'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585405'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585403'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585402'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585129'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/585130'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584711'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584710'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584433'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584432'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584395'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584392'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584386'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584394'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/584372'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583830'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583487'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583406'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583409'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583402'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583348'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583176'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583175'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583174'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583173'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/582242'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/582241'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/581580'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/581582'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/581584'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/581586'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/580680'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/580533'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/580532'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/580531'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579790'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579789'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579572'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579538'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579413'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579332'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579333'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579334'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579304'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579287'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579303'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579275'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579302'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579291'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579296'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/579288'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/578396'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/578397'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/578333'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/578087'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/578078'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577816'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577802'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577793'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577792'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577783'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577642'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577266'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577568'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/576954'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/576675'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/576686'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/578586'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/576617'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577225'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/576616'

In [12]:
# safety save
combined.to_csv('minnesota_100_200.csv', index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    310 non-null    object
 1   date     310 non-null    object
 2   link     310 non-null    object
 3   content  198 non-null    object
dtypes: object(4)
memory usage: 9.8+ KB


In [13]:
# combined = pd.read_csv('minnesota_100_200.csv')
# combined.head()

Unnamed: 0,title,date,link,content
0,Governor Walz Announces New Preston Veterans H...,"January 3, 2024",https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz announced t..."
1,Governor Walz Announces Over $17 Million for T...,"January 3, 2024",https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz and the Min..."
2,"Governor Walz, Lieutenant Governor Flanagan An...","January 3, 2024",https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz and Lieuten..."
3,Governor Walz Announces $300 Million in Public...,"December 21, 2023",https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz announced t..."
4,Governor Walz Selects Brad Lindsay to Serve as...,"December 20, 2023",https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz today annou..."


In [15]:
# chunk 3 
for i in combined.link[200:250]:
     get_minnesota_content(i, combined) 

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/576638'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/577489'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/576642'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/575303'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/575270'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/575189'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/575185'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/575115'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/575125'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/575123'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/574367'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/574369'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573986'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/574041'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/574046'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573983'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573779'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573669'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573670'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573210'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573494'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573201'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/573084'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/572826'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/572824'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570260'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570081'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/572022'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/572012'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/572007'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/572042'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/572041'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/571965'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/571666'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/571649'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/571642'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/571979'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/571053'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570926'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570924'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570911'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570583'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570249'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570080'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570082'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/569987'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/569962'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/569951'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/569108'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/569995'

In [16]:
# chunk 4
for i in combined.link[250:]:
     get_minnesota_content(i, combined) 

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/569998'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/568998'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/568986'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/568965'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/568209'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/567891'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/568961'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/567631'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/567763'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/567606'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/567624'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/566470'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/566469'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/566063'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/566466'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/565817'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/565700'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/565173'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/565410'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/541394'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/565327'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/564997'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/563453'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/563448'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/563445'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/563196'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/563187'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/565055'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562954'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562955'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/567627'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562854'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562847'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562822'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562799'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562864'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562506'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562504'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/562476'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/561786'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/561860'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/561500'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/561484'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/560832'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/560959'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/560905'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/560964'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/560468'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/560144'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/560142'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/559978'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/559982'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/559354'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/559353'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/590818'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/559399'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/559351'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/557820'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/557499'

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/556844'

In [None]:
combined.info()

In [23]:
# safety save 
combined.to_csv('minnesota_full_pull.csv', index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    310 non-null    object
 1   date     310 non-null    object
 2   link     310 non-null    object
 3   content  310 non-null    object
dtypes: object(4)
memory usage: 9.8+ KB


## Validate and Clean

In [22]:
#check that content was pulled accurately
combined[combined.content.isna()]

Unnamed: 0,title,date,link,content


In [19]:
# retry
get_minnesota_content(combined.link[152], combined)

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/583181'

In [21]:
# retry
get_minnesota_content(combined.link[183], combined)

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/578081'

In [29]:
# reset 
combined.at[225, 'content'] = np.nan 

In [32]:
# retry
get_minnesota_content(combined.link[225], combined)

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570260'

In [8]:
combined[combined.content.isna()]

Unnamed: 0,title,link,content,date_clean
222,Governor Walz Signs Bill Providing Funding for...,https://mn.gov/governor/newsroom/press-release...,,2023-04-05


In [9]:
# retry
get_minnesota_content(combined.link[222], combined)

'success: content added from https://mn.gov/governor/newsroom/press-releases/#/detail/appId/1/id/570260'

In [10]:
# retry 
combined.content.iloc[222] # didn't work so let's just manually do this one `

''

In [12]:
# didn't work so let's just manually do this one by copying from the webpage
combined.content.iloc[222] = '[ST. PAUL, MN] – Governor Tim Walz today signed a bill into law providing deficiency funding for the Office of Administrative Hearings (OAH) to continue providing hearings for campaign and data practice matters. Chapter 23, Senate File 1816 provides $196,000 in fiscal year 2023 for the Office of Administrative Hearings. The funds will be used to maintain fair, timely, and impartial hearings in campaign and data practices matters. The purpose of these funds is to accommodate an increase in data practices and campaign cases received by the OAH.'

In [13]:
combined.content.iloc[222]

'[ST. PAUL, MN] – Governor Tim Walz today signed a bill into law providing deficiency funding for the Office of Administrative Hearings (OAH) to continue providing hearings for campaign and data practice matters. Chapter 23, Senate File 1816 provides $196,000 in fiscal year 2023 for the Office of Administrative Hearings. The funds will be used to maintain fair, timely, and impartial hearings in campaign and data practices matters. The purpose of these funds is to accommodate an increase in data practices and campaign cases received by the OAH.'

In [14]:
# subset to only 2023 dates
combined['date_clean'] = pd.to_datetime(combined['date'])

combined_2023 = combined[(combined['date_clean'] >= '2023-01-01')
                     & (combined['date_clean'] <= '2023-12-31')].copy()

combined_2023 = combined.drop(columns=['date']).copy()

In [15]:
combined_2023.head()
combined_2023.info()

Unnamed: 0,title,link,content,date_clean
0,Governor Walz Announces $300 Million in Public...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz announced t...",2023-12-21
1,Governor Walz Selects Brad Lindsay to Serve as...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz today annou...",2023-12-20
2,Governor Walz Orders Flags at Half-Staff in Ho...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – In accordance with a proclama...",2023-12-18
3,Governor’s Merit Selection Panel Recommends Co...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Earlier this year, Governor W...",2023-12-18
4,Governor Walz Appoints Krista Marks to Fill Fi...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz and Lieuten...",2023-12-15


<class 'pandas.core.frame.DataFrame'>
Int64Index: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       306 non-null    object
 1   link        306 non-null    object
 2   content     306 non-null    object
 3   date_clean  306 non-null    object
dtypes: object(4)
memory usage: 12.0+ KB


In [16]:
print(f"pulled {str(len(combined_2023))} documents from {state} for 2023")

pulled 306 documents from minnesota for 2023


## Export

In [17]:
# export
combined_2023.to_csv(f'{state}_2023.csv', index = False)

In [18]:
test = pd.read_csv(f'{state}_2023.csv')
test.head()

Unnamed: 0,title,link,content,date_clean
0,Governor Walz Announces $300 Million in Public...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz announced t...",2023-12-21
1,Governor Walz Selects Brad Lindsay to Serve as...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz today annou...",2023-12-20
2,Governor Walz Orders Flags at Half-Staff in Ho...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – In accordance with a proclama...",2023-12-18
3,Governor’s Merit Selection Panel Recommends Co...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Earlier this year, Governor W...",2023-12-18
4,Governor Walz Appoints Krista Marks to Fill Fi...,https://mn.gov/governor/newsroom/press-release...,"[ST. PAUL, MN] – Governor Tim Walz and Lieuten...",2023-12-15
