## Libraries

In [1]:
from pathlib import Path

import pandas as pd
import requests

from bs4 import BeautifulSoup

In [2]:
# Helper for cleaning the text of a scraped table cell
def striptext(s):
    """Return ``s`` trimmed of surrounding whitespace, with every newline and
    non-breaking space (``\\xa0``) deleted.

    Interior ordinary spaces are kept. The deleted characters are removed
    outright rather than replaced by a space.
    """
    # Mapping a code point to None makes str.translate drop that character
    dropped = {ord('\n'): None, ord(u'\xa0'): None}
    trimmed = s.strip()
    return trimmed.translate(dropped)

## Scrape contents from a webpage into a dataframe

In [3]:
# Page to scrape: the ucop.edu legislative-reports listing for the
# 2021-22 legislative session (as named in the URL path)
url = 'https://www.ucop.edu/operating-budget/budgets-and-reports/legislative-reports/2021-22-legislative-session.html'

In [4]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# (4xx/5xx) instead of letting an error page be parsed as the report listing.
result = requests.get(url, timeout=30)
result.raise_for_status()
c = result.content

# Parse the raw response bytes into a BeautifulSoup tree. The parser is named
# explicitly so the parse does not depend on which optional parsers happen to
# be installed, and so bs4 does not warn that no parser was specified.
soup = BeautifulSoup(c, 'html.parser')

In [5]:
# Locate the page section that holds the content to scrape:
# the <div> with id="content" and class="list-land"
summary = soup.find('div', {'class':'list-land', 'id':'content'})
if summary is None:
    raise ValueError(
        "No <div id='content' class='list-land'> found on the page; "
        "the page layout may have changed"
    )

# Collect every table inside that section
tables = summary.find_all('table')
if not tables:
    raise ValueError("The content section contains no <table> elements")

# Read the caption of the first table (kept as the raw caption text)
caption_tag = tables[0].find('caption')
if caption_tag is None:
    raise ValueError("The first table has no <caption> element")
table_cap = caption_tag.text

In [6]:
# Build the Date/Report dataframe from the first table on the page.
#
# HTML reminder:
# 'caption' refers to a table caption
# 'td' refers to a standard cell in an HTML table
# 'tr' refers to a row in an HTML table

first_table = tables[0]

# Read rows from the <tbody> when the parsed tree has one; otherwise fall back
# to the table element itself so a table without <tbody> still yields its rows.
row_container = first_table.tbody if first_table.tbody is not None else first_table

# Collect plain dict records and build the dataframe once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
records = []
for row in row_container.find_all('tr'):
    # Find all data cells of the row
    cols = row.find_all('td')
    # Skip rows without both a date cell and a report cell
    # (e.g. header rows, which have no <td> cells)
    if len(cols) < 2:
        continue
    records.append({
        'Date': striptext(cols[0].text),
        'Report': striptext(cols[1].text),
    })

# Passing columns= keeps the Date/Report layout even when no rows were found
df_legislative = pd.DataFrame(records, columns=['Date', 'Report'])

df_legislative

Unnamed: 0,Date,Report
0,09/01/21,2022-23 (EDU 92493 - 92496-2017) Capital Expen...
1,09/01/21,13th Amended List of Proposed Energy (SEP) Pro...
2,09/01/21,UCLA Asian American Research Center-Preliminar...
3,11/01/21,UCPath (pdf)
4,11/01/21,CalFresh (pdf)
5,11/01/21,Instruction and Research Space Summary & Analy...
6,11/30/21,Admission and Enrollment of Students in LCFF H...
7,11/30/21,Five Year Capital Outlay Plan for State Funds ...
8,12/01/21,Project Savings Funded from Capital Outlay Bon...
9,12/01/21,Streamlined Capital Projects Funded from Capit...


In [7]:
# Save to a csv file named after the table caption.
# The target directory is created first so the cell also works on a fresh
# checkout where 'dataset/' does not exist yet.
csv_path = Path(f'dataset/{table_cap}.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
df_legislative.to_csv(csv_path, index=False)