In [1]:
from bs4 import BeautifulSoup

import json
# import requests

from urllib.parse import urlparse, unquote

from clean import utils
from clean.cache import Cache


In [2]:
# Page the crawl starts from.
first_url = "https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/"

# Crawl bookkeeping shared by the cells below.
detail_urls = {}            # nextrequest URL -> list of {"page_title", "page_url"} dicts
indexes_scraped = {}        # page URL -> {"subindexes": [...], "details": int}
indexes_todo = {first_url}  # pages still waiting to be fetched
index_passes = 0            # number of passes over the queue so far
scraping_complete = False   # set to True once a pass leaves the queue empty

In [3]:
# Substitutions for hrefs that are known to be wrong as published.
# Each pair is (href exactly as it appears, URL to use instead).
bad_urls = dict(
    [
        # Link that still carries a "risk-management-division__trashed" path segment.
        (
            "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/",
            "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/sustained-complaints-of-unlawful-arrest-unlawful-search/",
        ),
        # href whose value is a label rather than a URL.
        (
            "F118-04 November 22, 2004",
            "https://lacity.nextrequest.com/documents?folder_filter=F118-04",
        ),
    ]
)

In [4]:
def url_to_filename(url):
    """Turn the path component of ``url`` into a flat ``.html`` filename.

    At most one leading and one trailing slash are dropped, every remaining
    slash becomes an underscore, and ".html" is appended. Only the path is
    used; the host, query string and fragment do not affect the result.
    """
    # We really really really need a slugify thing
    raw_path = urlparse(url).path
    start = 1 if raw_path.startswith("/") else 0
    trimmed = raw_path[start:]
    if trimmed.endswith("/"):
        trimmed = trimmed[:-1]
    return "_".join(trimmed.split("/")) + ".html"

In [5]:
def clean_url(page_url, local_url):
    """Return an absolute URL for a link found on ``page_url``.

    Args:
        page_url: URL of the page the link was found on; used as the base
            when ``local_url`` is relative.
        local_url: The href as found. If it is a key of ``bad_urls`` it is
            replaced by the mapped value before anything else happens.

    Returns:
        ``local_url`` unchanged when it already has both a scheme and a host;
        otherwise the href resolved against ``page_url``. If the resolved
        value has a host but still no scheme, "https:" is prepended. A value
        that cannot be resolved (no host available from either argument) is
        returned as-is.
    """
    # Imported here because only urlparse and unquote are imported at file level.
    from urllib.parse import urljoin

    local_url = bad_urls.get(local_url, local_url)

    parsed = urlparse(local_url)
    if parsed.scheme and parsed.netloc:
        # Already absolute: hand it back byte-for-byte.
        return local_url

    # urljoin handles host-relative ("/x"), path-relative ("x") and
    # scheme-relative ("//host/x") hrefs. Plain concatenation produced
    # strings such as "httpswww.lapdonline.org/x".
    resolved = urljoin(page_url, local_url)

    resolved_parts = urlparse(resolved)
    if not resolved_parts.scheme and resolved_parts.netloc:
        # Base page was itself scheme-relative; default to https.
        resolved = "https:" + resolved
    return resolved

In [6]:
# Need to add sleep between calls

# Crawl the index pages in passes. Each pass fetches every URL queued in
# indexes_todo; links containing "nextrequest.com" are filed under
# detail_urls and every other link is queued for a later pass. The loop ends
# when a pass leaves the queue empty.
while not scraping_complete:
    index_passes += 1
    # Iterate over a snapshot: links found during this pass are added to
    # indexes_todo and picked up on the next pass.
    for page_url in list(indexes_todo):
        # Not used yet; see "Need to write the page" below.
        filename = url_to_filename(page_url)
        indexes_scraped[page_url] = {
            "subindexes": [],
            "details": 0,
        }
        # The queue holds hrefs as found, so known-bad ones are swapped here.
        cleaned_page_url = clean_url(page_url, page_url)
        print(f"Trying {cleaned_page_url}")
        r = utils.get_url(cleaned_page_url)

        # Need to write the page
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; consider pinning one for reproducible results.
        soup = BeautifulSoup(r.content)

        page_title = soup.title
        if page_title:
            page_title = unquote(page_title.text.strip())

        content_divs = soup.find_all("div", {"class": "grid-content"})
        content_divs.extend(soup.find_all("div", {"class": "link-box"}))
        for content_div in content_divs:
            links = content_div.find_all("a")
            for link in links:
                # Anchors without an href (or with an empty one) are not
                # links to follow; link['href'] would raise KeyError on them.
                original_href = link.get("href")
                if not original_href:
                    continue
                href = clean_url(page_url, original_href)
                if "nextrequest.com" in href:
                    # One entry per occurrence, so a link repeated on a page
                    # is recorded (and counted) more than once.
                    detail_urls.setdefault(href, []).append(
                        {"page_title": page_title, "page_url": page_url}
                    )
                    indexes_scraped[page_url]['details'] += 1
                else:
                    # NOTE(review): the href as found (not the cleaned one) is
                    # queued and later used as the indexes_scraped key, so a
                    # relative href no longer has its base page when fetched.
                    # Confirm the site only emits absolute links.
                    if original_href not in indexes_scraped:
                        indexes_todo.add(original_href)
                    indexes_scraped[page_url]["subindexes"].append(original_href)

    # Drop everything that has been fetched from the queue.
    indexes_todo.difference_update(indexes_scraped)
    if len(indexes_todo) == 0:
        print(f"Index scraping complete, after {len(indexes_scraped):,} indexes reviewed.")
        print(f"{len(detail_urls):,} case URLs found.")
        # print(f"Index pages parsed: {' ... '.join(indexes_scraped)}")
        scraping_complete = True
    else:
        print(f"Index scraping pass {index_passes:,}: {len(indexes_scraped):,} indexes scraped, {len(detail_urls):,} case URLs found")

Trying https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/
Index scraping pass 1: 1 indexes scraped, 0 case URLs found
Trying https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/sustained-complaints-of-unlawful-arrest-unlawful-search/
Trying https://www.lapdonline.org/sb-1421-sustained-complaints-of-sexual-assault-dishonesty/
Trying https://www.lapdonline.org/sb-1421-ois-main-page/
Trying https://www.lapdonline.org/sustained-complaints-of-prejudice-or-discrimination/
Trying https://www.lapdonline.org/sb-1421-great-bodily-injury-death/
Trying https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/sustained-complaints-of-unreasonable-excessive-force-failure-to-intervene-in-excessive-force/
Index scraping pass 2: 7 indexes scraped, 443 case URLs found
Trying https://www.lapdonline.org/sb-1421-ois-2019/
Trying https://www.lapdonline.org/sb-1421-ois-2011/
Trying https://www.lapdonline.org/sb-1421-ois-2024/
Trying https://w

In [7]:
# Save the case-URL mapping as JSON indented with four spaces.
with open("lapd-detail_urls.json", mode="w", encoding="utf-8") as outfile:
    payload = json.dumps(detail_urls, indent="    ")
    outfile.write(payload)

In [8]:
# Echo the full mapping of case URLs to the index pages that linked to them.
detail_urls

{'https://lacity.nextrequest.com/documents?folder_filter=CF04-3235': [{'page_url': 'https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/',
   'page_title': 'Sustained Complaints of Unlawful Arrest/Unlawful Search - LAPD Online'},
  {'page_url': 'https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/',
   'page_title': 'Sustained Complaints of Unlawful Arrest/Unlawful Search - LAPD Online'}],
 'https://lacity.nextrequest.com/documents?folder_filter=CF20-003095': [{'page_url': 'https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/',
   'page_title': 'Sustained Complaints of Unlawful Arrest/Unlawful Search - LAPD Online'}],
 'https://lacity.

In [9]:
# Number of distinct case URLs collected (one key per URL in detail_urls).
len(detail_urls)

1065

In [10]:
# Save the per-page crawl summary as JSON indented with four spaces.
with open("indexes-scraped.json", "w", encoding="utf-8") as outfile:
    # The serialized text has to be written explicitly; calling json.dumps
    # on its own discards the result and leaves the file empty.
    outfile.write(json.dumps(indexes_scraped, indent=4 * ' '))

In [11]:
# Echo the per-page crawl summary: each scraped page URL maps to its
# "subindexes" (non-nextrequest hrefs found on it) and a "details" count.
indexes_scraped

{'https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/': {'subindexes': ['https://www.lapdonline.org/sb-1421-ois-main-page/',
   'https://www.lapdonline.org/sb-1421-great-bodily-injury-death/',
   'https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/sustained-complaints-of-unreasonable-excessive-force-failure-to-intervene-in-excessive-force/',
   'https://www.lapdonline.org/sb-1421-sustained-complaints-of-sexual-assault-dishonesty/',
   'https://www.lapdonline.org/sustained-complaints-of-prejudice-or-discrimination/',
   'https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/'],
  'details': 0},
 'https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/': {'subindexes': [],
  'details': 18},
 'https://www.lapdonline.