In [1]:
from bs4 import BeautifulSoup
import requests
import sys
import time
import random
import re
import pandas as pd
from selenium import webdriver
import scrapy

In [2]:
# Berkeley IGS library page for California local government codes and charters;
# passed to collect_city_links as the page to scan for code links.
berkeley_code_links = "https://igs.berkeley.edu/library/california-local-government-documents/codes-and-charters"
# ABAG "our members" page (about-abag/what-we-do/our-members).
mtc_website_links = "https://abag.ca.gov/about-abag/what-we-do/our-members"

In [3]:
def fetch(url, delay=(1, 3), timeout=30):
    """
    Pause for a random whole number of seconds, then download `url`.

    Parameters
    ----------
    url : str
        Address of the page to download.
    delay : sequence of two ints
        Inclusive lower and upper bounds, in seconds, of the random pause
        taken before the request is sent (default: 1 to 3 seconds).
    timeout : float or tuple
        Forwarded to `requests.get`. Without a timeout, requests waits
        indefinitely on a server that never answers, which would hang the
        notebook; 30 seconds is used unless the caller says otherwise.

    Returns
    -------
    tuple
        `(html, soup)`: the prettified page source and the BeautifulSoup
        object it was produced from.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within `timeout`.
    """
    time.sleep(random.randint(delay[0], delay[1]))  # wait a random number of seconds
    # A custom User-Agent header is sent with every request.
    r = requests.get(url, headers={'User-Agent': "Meow"}, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")
    html = soup.prettify()
    return (html, soup)

In [4]:
def collect_city_names(url):
    """
    Collect link texts from the table rows of the page at `url`.

    Each `<tr>` that has an `<h5>` containing a link yields one list: the
    text of that heading link first, followed by the text of the link inside
    every `<li>` of the same row.

    Rows without an `<h5>` link (for example a header or spacer row) and list
    items without a link are skipped; previously they raised AttributeError
    because `.a` was read from a missing element.

    Parameters
    ----------
    url : str
        Page to download (via `fetch`) and scan.

    Returns
    -------
    list of list of str
        One inner list per qualifying table row, heading text first.
    """
    _, soup = fetch(url)
    links = []
    for county in soup.find_all("tr"):
        heading = county.find("h5")
        if heading is None or heading.a is None:
            # Not a row of the expected shape; nothing to collect from it.
            continue
        my_county = [heading.a.text]
        for link in county.find_all("li"):
            if link.a is not None:
                my_county.append(link.a.text)
        links.append(my_county)
    return links

In [5]:
# Use the shared URL constant instead of repeating the address here.
my_cities = collect_city_names(mtc_website_links)
# flatten the per-row lists into one list of names
my_cities = [item for sublist in my_cities for item in sublist]

# Clean every name in a single pass. The phrases are removed in this exact
# order, then anything from the first "(" onward is cut, and finally the name
# is stripped of surrounding whitespace and lower-cased.
cleaned_cities = []
for place in my_cities:
    for phrase in ("City of", "Town of", "City and County of", "County of", "County"):
        place = place.replace(phrase, "")
    if '(' in place:
        place = place[:place.find('(')]
    cleaned_cities.append(place.strip().lower())
my_cities = cleaned_cities

# lets take a look at what we got
print(len(my_cities))

108


In [6]:
def collect_city_links(url, known_cities):
    """
    Collect the links on the page at `url` whose text mentions a known city.

    For every `<a>` that has an `href`, the link text is cut at the first
    "(" (if any), lower-cased and stripped. The link is kept when that text
    is longer than three characters and contains one of `known_cities` as a
    substring (compared lower-cased and stripped).

    Changes from the earlier version:
    * the text before "(" is no longer shortened by one extra character, so a
      label such as "Name(note)" keeps its last letter and a label that
      starts with "(" becomes empty instead of losing its final character;
    * a link is added once even when several known cities match its text;
    * blank entries in `known_cities` are ignored, since an empty string is a
      substring of every text and would match every link.

    Parameters
    ----------
    url : str
        Page to download (via `fetch`) and scan.
    known_cities : iterable of str
        City names to look for inside the link texts.

    Returns
    -------
    list of [str, str]
        `[normalized link text, href]` pairs in page order.
    """
    _, soup = fetch(url)
    # Normalize the known names once rather than once per anchor.
    wanted = [name for name in (known.lower().strip() for known in known_cities) if name]
    links = []
    for a in soup.find_all("a"):
        if not a.has_attr('href'):
            continue
        city_name = a.text
        if '(' in city_name:
            city_name = city_name[:city_name.find('(')]
        city_name = city_name.lower().strip()
        # Very short texts (three characters or fewer) are dropped.
        if len(city_name) > 3 and any(known in city_name for known in wanted):
            links.append([city_name, a.attrs["href"]])
    return links

In [7]:
# Scan the Berkeley page for links whose text contains one of the names in my_cities.
my_links = collect_city_links(berkeley_code_links, my_cities)

In [8]:
# Link texts that are left out of the lookup table
# (presumably false hits of the substring match -- verify before editing).
skipped_names = ('marina', 'portola', 'san marino', 'berkeley igs poll', 'walnut',
                 'san francisco bay area city financial documents', 'san francisco - all sections')

# Order the [name, href] pairs by name, then group the hrefs under each name.
sort_links = sorted(my_links, key=lambda pair: pair[0])
link_dict = {}
for place, href in sort_links:
    if place not in skipped_names:
        link_dict.setdefault(place, []).append(href)

# Hand-entered links; each assignment replaces whatever was collected for that name.
link_dict["mill valley"] = ["http://qcode.us/codes/millvalley/"]
link_dict["san jose"] = ["https://library.municode.com/ca/san_jose/codes/code_of_ordinances"]
link_dict["saint helena"] = ["https://www.codepublishing.com/CA/StHelena/"]
link_dict["berkeley"] = ["https://www.codepublishing.com/CA/Berkeley/"]

In [9]:
municode = []
codepub = []
qcode = []
amlegal = []
other = []

# (substring looked for in a city's first link, list that collects the city).
# Checked in this order; the first hit wins and the substring doubles as the
# label stored at the end of the city's link list.
host_buckets = [
    ('municode', municode),
    ('codepub', codepub),
    ('qcode', qcode),
    ('amlegal', amlegal),
]
# Every label this cell can append, used to recognise labels left behind by
# an earlier run of the cell.
host_labels = {'municode', 'codepub', 'qcode', 'amlegal', 'other'}

for city in link_dict.keys():
    # dict.fromkeys removes duplicates but keeps first-seen order. The old
    # list(set(...)) shuffled the links arbitrarily, so the "first" link --
    # and therefore the label -- could change from one kernel to the next.
    # Dropping stale labels keeps the cell safe to re-run.
    city_links = [link for link in dict.fromkeys(link_dict[city]) if link not in host_labels]
    first_link = city_links[0] if city_links else ""
    for marker, bucket in host_buckets:
        if marker in first_link:
            bucket.append(city)
            label = marker
            break
    else:
        other.append(city)
        label = "other"
    # The label goes last; the DataFrame cell splits it off with .str[-1].
    link_dict[city] = city_links + [label]


print(f"scrape total: {len(link_dict)}")
print(f"municode: {len(municode)}")
print(f"codepublishing: {len(codepub)}")
print(f"qcode: {len(qcode)}")
print(f"amlegal: {len(amlegal)}")
print(f"everything else: {len(other)}")

scrape total: 108
municode: 44
codepublishing: 27
qcode: 15
amlegal: 8
everything else: 14


In [10]:
# Show the cities whose first link matched none of municode / codepub / qcode / amlegal.
print(other)

['albany', 'belvedere', 'colma', 'dixon', 'fremont', 'gilroy', 'hayward', 'healdsburg', 'piedmont', 'ross', 'san mateo', 'sausalito', 'sebastopol', 'solano county']


In [11]:
# One row per city; "stuff" temporarily holds the link list with its type label at the end.
df = pd.DataFrame(list(link_dict.items()))
df.columns = ["city", "stuff"]
# df["city"] = [x.replace(" ","") for x in df["city"]]

# Split "stuff": the last element is the link type, everything before it is the links.
df = df.assign(
    link_type=df["stuff"].str[-1],
    links=df["stuff"].str[:-1],
)
# Keep only the final columns, in display order; this also discards "stuff".
df = df[["city", "links", "link_type"]]

df.to_csv("my_links.csv")
df.head(10)

Unnamed: 0,city,links,link_type
0,alameda,[http://library.municode.com/index.aspx?client...,municode
1,alameda county,[http://www.municode.com/Resources/gateway.asp...,municode
2,albany,[http://clerkshq.com/default.ashx?clientsite=a...,other
3,american canyon,[http://qcode.us/codes/americancanyon/],qcode
4,antioch,[http://www.amlegal.com/antioch_ca/],amlegal
5,atherton,[http://www.codepublishing.com/CA/atherton.html],codepub
6,belmont,[http://www.municode.com/resources/gateway.asp...,municode
7,belvedere,[http://www.cityofbelvedere.org/index.aspx?NID...,other
8,benicia,[http://www.codepublishing.com/ca/benicia/],codepub
9,berkeley,[https://www.codepublishing.com/CA/Berkeley/],codepub
