# Web Scraping Case Info from ca9 website

https://www.ca9.uscourts.gov/media/


In [1]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd


### Part I: Web Scraping Tool
Creating a function that web scrapes from multiple links, parses the HTML, and stores in a pandas DataFrame.

In [2]:
def web_scrape_page(url):
    resp = requests.get(url)
    
    # All info from webpage scraped
    soup = BeautifulSoup(resp.text,'html.parser')
    
    # Scrape coa_dg_table table for all string elements
    data = []
    table = soup.find('table', attrs={'class':'coa_dg_table'})
    table_body = table.find('tbody')

    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele]) # Get rid of empty values
        
    final_df = pd.DataFrame(data)
    final_df = final_df.rename(columns={0: "Case Name", 
                                        1: "Case No.",
                                        2: "Case Panel",
                                        3: "Hearing Location",
                                        4: "Hearing Date",
                                        5: "Audio",
                                        6: "Video"})
    
    # Scrape coa_dg_table table for links to audio and video files
    audio = []
    video = []

    for a in table_body.find_all('a', href=True):
        if 'video' in a['href']:
            video.append(a['href'])
        else:
            audio.append(a['href'])
    
    # Add parent website to beinning of string
    audio = ['https://www.ca9.uscourts.gov/media/' + x for x in audio]
    video = ['https://www.ca9.uscourts.gov/media/' + x for x in video]
    
    # Append to final dataframe
    final_df['Audio'] = audio
    final_df['Video'] = video
    
    
    return final_df

In [3]:
data = web_scrape_page('https://www.ca9.uscourts.gov/media/')

In [4]:
pd.set_option('display.max_colwidth', -1)

In [21]:
data

Unnamed: 0,Case Name,Case No.,Case Panel,Hearing Location,Hearing Date,Audio,Video
0,USA v. Justin Nekeferoff,19-30015,"CHRISTEN, WATFORD, BADE",Anchorage AK,09/25/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035813,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000018000
1,"George Young, Jr. v. State of Hawaii",12-17808,"O'SCANNLAIN, THOMAS, McKEOWN, WARDLAW, W. FLETCHER, CLIFTON, BYBEE, CALLAHAN, IKUTA, FRIEDLAND, NELSON",San Francisc...,09/24/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035812,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017999
2,Maria Medina Tovar v. Laura Zuchowski,18-35072,"THOMAS, McKEOWN, GRABER, RAWLINSON, CALLAHAN, MURGUIA, WATFORD, BENNETT, COLLINS, BRESS, BUMATAY",San Francisc...,09/23/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035811,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017998
3,Jorge Rojas v. FAA,17-55036,"THOMAS, WARDLAW, GRABER, RAWLINSON, CALLAHAN, M. SMITH, IKUTA, WATFORD, HURWITZ, COLLINS, BUMATAY",San Francisc...,09/22/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035810,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017997
4,"Harvest Rock Church, Inc. v. Gavin Newsom",20-55907,"O'SCANNLAIN, RAWLINSON, CHRISTEN",San Francisc...,09/21/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035809,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017996
5,USA v. Isaac Bautista,19-10448,"SCHROEDER, W. FLETCHER, HUNSAKER",San Francisc...,09/18/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035805,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017995
6,Alice Brown v. County of Del Norte,18-16689,"SCHROEDER, W. FLETCHER, HUNSAKER",San Francisc...,09/18/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035806,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017993
7,Jeffrey Green v. City of Phoenix,19-16682,"SCHROEDER, W. FLETCHER, HUNSAKER",San Francisc...,09/18/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035807,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017994
8,Yes on Prop. B v. City and County of S.F.,20-15456,"SCHROEDER, W. FLETCHER, HUNSAKER",San Francisc...,09/18/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035808,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017992
9,Alex Hernandez v. William Barr,17-73332,"SCHROEDER, W. FLETCHER, HUNSAKER",San Francisc...,09/17/2020,https://www.ca9.uscourts.gov/media/view.php?pk_id=0000035798,https://www.ca9.uscourts.gov/media/view_video.php?pk_vid=0000017986


In [10]:
data.iloc[-1]['Hearing Date'][-4:]

'2020'

The movies are spread across 100s of different webpages. We use our function to scrape each page (automated with selenium python) and concatenate the resulting DataFrames, reindexing the final DataFrame so each case has it's own artificial identifier. 

In [12]:
!pip install selenium

[33mYou are using pip version 9.0.1, however version 20.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [14]:
from selenium.common.exceptions import NoSuchElementException
import time

In [16]:
from selenium import webdriver

In [19]:
from selenium.webdriver.common.keys import Keys

driver = webdriver.ChromeOptions()
driver.get("http://www.python.org")


AttributeError: 'Options' object has no attribute 'get'

In [18]:
url = "https://www.ca9.uscourts.gov/media/"

browser.get(url)

page_count = 1

while page_count < 3: # Testing with 3. Put an absurdly large number here as it's going to break once the year is 1999 anyway
    # Increase page_count value on each iteration on +1
    page_count += 1
    
    pg_current = web_scrape_page()
    pg = pd.concat([pg,pg_current])
    
    year = pg_current.iloc[-1]['Hearing Date'][-4:]
    
    if year == '1990':
        break

    try:
        # Clicking on "2" on pagination on first iteration, "3" on second...
        browser.find_element_by_link_text(str(page_count)).click()
    except NoSuchElementException:
        # Stop loop if no more page available
        break

NameError: name 'browser' is not defined

In [None]:
#Initializing pg DataFrame that will eventually contain all scraped information
pg = pd.DataFrame()

#Using selenium's webdriver to create basic Chrome Options when opening browser
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument("--test-type")

#Using selenium's webdriver to open Google Chrome with stated options 
#Goes directly to url which is the first page of IMDB Top 250 list  
driver = webdriver.Chrome(options=options)
driver.get("https://www.ca9.uscourts.gov/media/")

#Scrape page until 1999 hearing date
#Merge previous pages (pg) to current page (pg_current) until no pages are remaining
year = 2020

while year > 1999:

    pg_current = web_scrape_page()
    pg = pd.concat([pg,pg_current])
    
    year = pg_current.iloc[-1]['Hearing Date'][-4:]

    try:
        next_link = driver.find_element_by_xpath('//a[contains(.,"Next »")]')
        next_link.click()
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(30)

    except NoSuchElementException:
        pages_remaining = False

#Close browser
driver.close()