# Zoning code searching
- This document searches zoning ordinances for zoning codes (i.e. it checks whether a given "code" appears in a document)
- Part 1: codelibrary 
    - 1.1 scraping: fake api calls
    - 1.2 load existing zoning names/codes and find them in the documents
- Part 2: municode

In [2]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pickle
import string
printable_set = set(list(string.printable))

# 1. codelibrary

In [5]:
# Browser-like User-Agent string used for the codelibrary requests below.
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27'
# url = 'https://codelibrary.amlegal.com/codes/templecityca/latest/templecity_ca/0-0-0-34203'

# Header dict passed as `headers=` to every requests.get call in this section.
headers = {
    'User-Agent': user_agent
}

In [259]:
# temple city
# LOCKPORT, il
# use zoning list instead of the whole zoning (for parsing)

## 1.1 Scraping

In [3]:
# Manual probe: request one section index from the render-section API and
# check whether the server answers with 404 (parse_codelibrary below treats a
# 404 as "no more sections").
idx = 190
url = 'https://codelibrary.amlegal.com/api/render-section/winthropharbor/latest/winthropharbor_il/0-0-0-39885/{}/'.format(idx)   
url
res = requests.get(url, headers = headers)
soup = BeautifulSoup(res.text, features="lxml")
# Last expression of the cell, so this boolean is what the notebook displays.
res.status_code == 404

False

In [6]:
def clean_content(sent):
    """Normalize scraped text to single-spaced, printable ASCII.

    Every run of characters that is not a visible ASCII character (code
    points 0x21-0x7E) is replaced by exactly one space. That covers spaces,
    tabs, newlines, carriage returns, vertical tabs, form feeds and every
    character outside ``string.printable``.

    The earlier step-by-step version collapsed repeated spaces *before*
    turning tabs into spaces and never handled carriage returns, vertical
    tabs or form feeds, so mixed whitespace (a newline followed by a tab,
    for example) survived as repeated spaces or stray control characters.

    Args:
        sent: Text to clean.

    Returns:
        The cleaned string. Leading and trailing whitespace is reduced to a
        single space, not stripped.
    """
    # string.printable is exactly 0x20-0x7E plus the five ASCII whitespace
    # control characters, so "non-printable or whitespace" is the complement
    # of the visible range 0x21-0x7E.
    return re.sub(r'[^\x21-\x7e]+', ' ', sent)

def change_url(url):
    """Convert a codelibrary page URL into its render-section API URL.

    Any ``#fragment`` is dropped, then the first ``/codes`` path segment is
    replaced by ``/api/render-section``. Only a whole path segment is
    rewritten, so a municipality slug that merely contains the letters
    "codes" is left intact (the previous pattern rewrote every occurrence).

    Args:
        url: Page URL such as
            ``https://codelibrary.amlegal.com/codes/<city>/latest/<code>/<node>``.

    Returns:
        The corresponding API URL, without a fragment. URLs that have no
        ``/codes`` segment are returned unchanged apart from the fragment.
    """
    page_url, _, _ = url.partition('#')
    return re.sub(r'/codes(?=/|$)', '/api/render-section', page_url, count=1)

def process_soup_text(soup_text):
    """Return the part of an API response that follows the ``"html":"`` marker.

    Args:
        soup_text: Text of a render-section response.

    Returns:
        Everything after the first ``"html":"`` marker. When the marker is
        absent the text is returned unchanged, so one unexpected response no
        longer aborts a whole scrape with an IndexError.
    """
    marker = '"html":"'
    marker_at = soup_text.find(marker)
    if marker_at == -1:
        # Best effort: keep whatever text there is rather than raising.
        return soup_text
    return soup_text[marker_at + len(marker):]

def parse_table(soup):
    """Flatten every ``<table>`` found in *soup* into text, one row per line.

    For each table, the children of its ``<tbody>`` are treated as rows and
    the ``.text`` of each row's children is joined with single spaces.

    Args:
        soup: Parsed document exposing ``find_all('table')``.

    Returns:
        The rows of all tables joined with newlines, or ``''`` when there are
        no tables or no rows.
    """
    table_text = []
    for table in soup.find_all('table'):
        # Markup is not guaranteed to contain <tbody>; fall back to the
        # table's own children instead of failing on `None.children`.
        body = table.tbody if table.tbody is not None else table
        for row in body.children:
            cells = getattr(row, 'children', None)
            if cells is None:
                # Not an element (e.g. a text node between rows): nothing to read.
                continue
            table_text.append(' '.join(cell.text for cell in cells))
    # Joining an empty list already yields '', so no separate empty check.
    return '\n'.join(table_text)

def parse_codelibrary(url, max_sections=10, timeout=30):
    """Collect the text of a codelibrary chapter, section by section.

    The page at *url* is fetched and read in three steps:

    1. The first child of ``div#curr-section`` whose id parses as
       ``section-<int>`` supplies the starting section.
    2. Following sections (``div#section-<n>``) already present in the page
       are read directly from it.
    3. Once the page runs out, further sections are requested one at a time
       from the render-section API until a request fails, the server returns
       an error status, or the section cap is exceeded.

    Args:
        url: codelibrary page URL of the chapter.
        max_sections: Scraping stops once the total number of collected
            sections exceeds this value. The check runs only after each
            API-fetched section, matching the previous hard-coded limit of
            10. ``None`` disables the cap.
        timeout: Seconds to wait for each HTTP request, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A ``(text, section_count)`` tuple: the cleaned section texts joined
        with newlines, and the number of sections collected. When the page
        has no ``div#curr-section`` the result is ``('', 0)``. This used to
        be a bare ``''``, which broke every caller that unpacks two values.

    Raises:
        requests.RequestException: If the initial page request fails.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(res.text, features="lxml")

    curr_section = soup.find('div', attrs={'id': 'curr-section'})
    if curr_section is None:
        print('[ERROR!] No current_section found! URL:', url)
        return '', 0

    all_text = []
    section_count = 0

    # Step 1: locate the starting section. The id is parsed into a temporary
    # so a non-numeric id can never leave `starting_id` holding a string.
    starting_id = 0
    for child in curr_section.children:
        try:
            candidate_id = int(child['id'].replace('section-', '').strip())
        except (KeyError, TypeError, ValueError, AttributeError):
            # Text node, element without an id, or an id that is not numeric.
            continue
        starting_id = candidate_id
        all_text.append(child.text)
        section_count += 1
        break

    # Step 2: consume the sections that are already embedded in the page.
    i = starting_id + 1
    while True:
        section_i = soup.find('div', attrs={'id': 'section-{}'.format(i)})
        if section_i is None:
            break
        all_text.append(section_i.text)
        section_count += 1
        i += 1

    # Step 3: fetch the remaining sections through the render-section API.
    # Stripping a trailing slash avoids building '...//<i>/'.
    api_base = change_url(url).rstrip('/')
    while True:
        section_url = '{}/{}/'.format(api_base, i)
        try:
            res = requests.get(section_url, headers=headers, timeout=timeout)
        except requests.RequestException:
            break
        if not res.ok:
            # 404 marks the end of the chapter; any other error status has no
            # section payload to parse either.
            break
        section_soup = BeautifulSoup(res.text, features="lxml")
        table_text = parse_table(section_soup)
        all_text.append(process_soup_text(section_soup.text) + '\n' + table_text)
        section_count += 1
        if max_sections is not None and section_count > max_sections:
            break
        i += 1

    return '\n'.join(clean_content(doc) for doc in all_text), section_count
    #return all_text, section_count

In [4]:
# def clean_content(sent):
#     sent = ''.join([c if c in printable_set else ' ' for c in sent])
#     sent = re.sub('\n', ' ', sent)
#     sent = re.sub(' +', ' ', sent)
#     sent = re.sub('\t+', ' ', sent)
#     return sent

# def change_url(url):
#     if '#' in url:
#         url = url.split('#')[0]
#     url = re.sub('codes', 'api/render-section', url)
#     url = url
#     return url

# def process_soup_text(soup_text):
#     #print(soup_text)
#     #print('"html":"' in soup_text)
#     #print(list(re.finditer('"html":"', soup_text)))
#     starting_idx = list(re.finditer('"html":"', soup_text))[0].span()[1]
#     return soup_text[starting_idx:]

# def parse_codelibrary(url):
#     res = requests.get(url, headers = headers)
#     soup = BeautifulSoup(res.text, features="lxml")

#     i = 0
#     start_scraping = False
#     section_count = 0
#     all_text = []
    
#     curr_section = soup.find_all('div', attrs = {'id': 'curr-section'})
#     if len(curr_section) == 0:
#         print('[ERROR!] No current_section found! URL:', url)
#         return ''
    
#     curr_section_found = False
#     starting_id = 0
#     for child in curr_section[0].children:
#         try:
#             starting_id = child['id'].replace('section-', '').strip()
#             starting_id = int(starting_id)
#             curr_section_found = True
#         except:
#             continue
#         if curr_section_found:
#             all_text.append(child.text)
#             section_count += 1
#             break
    
#     i = starting_id + 1
#     while True:
#         if not start_scraping:
#             section_i = soup.find_all('div', attrs = {'id': 'section-{}'.format(i)})
#             #print(i, len(section_i))
#             if len(section_i) == 0:
#                 start_scraping = True
#             else:
#                 all_text.append(section_i[0].text)
#                 section_count += 1
#                 i += 1
#         else:
#             section_url = change_url(url) + '/{}/'.format(i)
#             try:
#                 res = requests.get(section_url, headers = headers)
#             except:
#                 break
#             #print(section_url)
#             #print(i, res.status_code)
#             if res.status_code == 404:
#                 break
#             soup = BeautifulSoup(res.text, features="lxml")
#             all_text.append(process_soup_text(soup.text))
#             section_count += 1
#             i += 1
#     return '\n'.join([clean_content(doc) for doc in all_text]), section_count

- **testing scraper**

In [7]:
# url = 'https://codelibrary.amlegal.com/codes/lockport/latest/lockport_il/0-0-0-77949'
# lockport, section_count = parse_codelibrary(url)

# url = 'https://codelibrary.amlegal.com/codes/danvilleca/latest/danville_ca/0-0-0-7018'
# danville_ca, section_count = parse_codelibrary(url)

url = 'https://codelibrary.amlegal.com/codes/minookail/latest/minooka_il/0-0-0-5147'
minoka, section_count = parse_codelibrary(url)

In [8]:
section_count

4

In [9]:
minoka

'CHAPTER 4ZONING DISTRICTS AND MAPSECTION:5-4-1 Establishment Of Districts5-4-2 Zoning District Map; Boundaries Of Districts5-4-3 Annexed Territory \n5-4-1: ESTABLISHMENT OF DISTRICTS:In order to carry out the purposes and intent of this title, the village is hereby divided into the following districts:AAgricultural districtR1Single-family detached residence districtR1ASingle-family detached residence districtR2Single-family detached residence districtR3Single-family attached and multiple-family residence districtR4Two-family (duplex) residence districtR4ATwo-family (duplex) residence districtR5Single-family attached residence districtR6Multiple-family residence districtB1Business districtB2Commercial districtM1Manufacturing districtM2Manufacturing districtLowland conservancy overlay district (Ord. 2003-34, 8-26-2003)\\n"} A Agricultural district R1 Single-family detached residence district R1A Single-family detached residence district R2 Single-family detached residence district R3 Si

In [363]:
# https://codelibrary.amlegal.com/api/render-section/lockport/latest/lockport_il/0-0-0-77949/2/

In [364]:
# res = requests.get(url, headers = headers)
# soup = BeautifulSoup(res.text, features="lxml")

In [365]:
# res.status_code

## 1.2 Matching existing ordinances

In [366]:
# Load the spreadsheet of ordinance links and preview the first two rows.
# The scraping loop below reads its 'zoningchapter' and
# 'List of Zoning Districts' columns.
za_ordinates_links = pd.read_csv('../data/za_ordinates_links.csv')
za_ordinates_links.head(2)

Unnamed: 0,assignedto,ID,ID_notunique,GEOID,state,county,place,muncipalcode,site,priority1129,...,Use for model,complex?,zoningchapter,List of Zoning Districts,Residential_density_section,Table or text,Zoning chapter or density table as PDF? (new field),Other district density section,Notes,Unnamed: 20
0,Jacob,,30,151696,Alabama,Jefferson County,MOUNTAIN BROOK,https://library.municode.com/al/mountain_brook,Municode,0.0,...,1.0,,https://library.municode.com/al/mountain_brook...,https://library.municode.com/al/mountain_brook...,,text,,,different density section for each district,
1,Jacob,,16,100820,Alabama,Shelby County,ALABASTER,https://library.municode.com/al/alabaster,Municode,0.0,...,1.0,,https://library.municode.com/al/alabaster/code...,https://library.municode.com/al/alabaster/code...,https://library.municode.com/al/alabaster/code...,multiple tables,,,,


In [392]:
# za_ordinates_links.isna().sum()
len(all_results), i

(36, 112)

In [390]:
%run ../src/tree_spider.py
%run ../src/utils.py

In [397]:
# Accumulators for the scraping loop in the next cell.
# all_results: one dict per scraped municipality.
all_results = []
# counter: incremented once per row of za_ordinates_links and stored as 'idx'.
counter = 0
# starting = 112
# weird_county: row indices whose fallback scrape raised an exception.
weird_county = []

In [398]:
# Scrape the zoning text for every row of za_ordinates_links whose
# 'zoningchapter' link points at codelibrary or ecode360; other rows are
# skipped. Failures are recorded per row instead of aborting the whole run,
# and `except Exception` (not a bare except) keeps Ctrl-C working.
for i in tqdm(range(za_ordinates_links.shape[0])):
#     if i < starting:
#         continue
    curr_row = za_ordinates_links.iloc[i]
    url = curr_row['zoningchapter']
    source = None

    if isinstance(url, str) and 'codelibrary' in url:
        source = 'codelibrary'
        try:
            all_text, section_count = parse_codelibrary(url)
            primary_ok = True
        except Exception:
            # Covers network errors as well as a result that cannot be
            # unpacked into (text, section_count).
            all_text, section_count = '', 0
            primary_ok = False

        # A failed scrape, or one that found a single section, falls back to
        # the 'List of Zoning Districts' link.
        if not primary_ok or section_count == 1:
            url = curr_row['List of Zoning Districts']
            #print(curr_row)
            try:
                all_text, section_count = parse_codelibrary(url)
            except Exception:
                weird_county.append(i)

    elif isinstance(url, str) and 'ecode360' in url:
        source = 'ecode360'
        try:
            # TreeSpider is presumably provided by the %run of
            # ../src/tree_spider.py above; it is given the last URL segment.
            ts = TreeSpider(url.split('/')[-1])
            ts.run()
            all_text = ts.find_all_leaf_text()
        except Exception:
            all_text = 'ecode_360_nothing_found'
        # ecode360 results carry no section count.
        section_count = -1

    if source is not None:
        all_results.append({
            'idx': counter,
            'state': curr_row.state,
            'county': curr_row.county,
            'muni': curr_row.place,
            'all_text': all_text,
            'section_count': section_count,
            'source': source
        })

    counter += 1

100%|███████████████████████████████| 193/193 [02:14<00:00,  1.43it/s]


In [446]:
# ts.root_node.children[0]

[]

In [399]:
# Turn the list of per-municipality result dicts into a DataFrame and preview it.
# Despite the name, the rows include both 'codelibrary' and 'ecode360' sources.
codelibrary_results = pd.DataFrame(all_results)
codelibrary_results.head(2)

Unnamed: 0,idx,state,county,muni,all_text,section_count
0,3,California,Contra Costa County,DANVILLE,CHAPTER XXXIIPLANNING AND LAND USE \t\t\t\t* \...,582
1,6,California,Los Angeles County,TEMPLE CITY,TITLE 9ZONING REGULATIONSCHAPTER 1ZONING CODE\...,27


In [400]:
codelibrary_results.query('muni == "LAKE ZURICH"')

Unnamed: 0,idx,state,county,muni,all_text,section_count
13,47,Illinois,Lake County,LAKE ZURICH,CHAPTER 2ZONING DISTRICTS AND OFFICIAL DOCUMEN...,6


In [401]:
# Load the table of known zoning codes; its 'zoning' column supplies the
# codes that are searched for in the scraped text below.
zoning_code_julia = pd.read_csv('../data/zoning_codes_julia.csv')
zoning_code_julia.head(2)
# count_zoning >2

Unnamed: 0,ID,state,muni,county,zoning,zoning_name,likelyres,count_zoning
0,115.0,MD,Baltimore City,Baltimore,R-2,Detached and Semi-Detached Residential Zoning ...,1.0,99.0
1,168.0,MD,Oxford,Talbot,R-2,Historic Residential District,1.0,99.0


In [402]:
# All non-missing zoning codes (duplicates kept), followed by how many there are.
all_codes = zoning_code_julia.zoning.dropna().values
len(all_codes)

3213

In [403]:
# Drop single-character codes: they are too ambiguous to match as tokens.
all_codes = [code for code in all_codes if len(code) > 1] #ignore R, C, etc.

In [404]:
# top_n = 200

# Map each distinct zoning code to the number of times it occurs in all_codes.
unique_codes, counts = np.unique(all_codes, return_counts = True) #counts >2
all_codes_w_counts = dict(zip(unique_codes, counts))
# all_codes_w_counts = dict(sorted(all_codes_w_counts.items(), key = lambda x: x[1], reverse = True))

# The matching loop below iterates `top_n_codes`, but its definition had been
# commented out, so a fresh kernel raised NameError. Restored here as the set
# of all distinct codes.
# NOTE(review): if the earlier run used only a top-N subset, apply that
# filter here instead.
top_n_codes = set(all_codes_w_counts)

In [405]:
# all_codes_w_counts
from nltk.tokenize import word_tokenize

In [406]:
from collections import Counter

# For every scraped municipality, record which known zoning codes appear as
# tokens in its text and how often.
all_counts_results = []
for row_idx in tqdm(range(codelibrary_results.shape[0])):
    curr_row = codelibrary_results.iloc[row_idx]
    all_text = curr_row.all_text
    # Tally the tokens once per document so each code is a dictionary lookup
    # instead of a full scan of the token array.
    token_counts = Counter(word_tokenize(all_text))
    # str(k) so NumPy string keys look up exactly like the plain-str tokens.
    matches = [k for k in top_n_codes if str(k) in token_counts]
    unique_counts = len(matches)
    all_counts = sum(token_counts[str(k)] for k in matches)
    all_counts_results.append({
        'idx': curr_row.idx,
        'state': curr_row.state,
        'county': curr_row.county,
        'muni': curr_row.muni,
        'unique_matches': matches,
        'unique_counts': unique_counts,
        'all_counts': all_counts
    })

100%|█████████████████████████████████| 61/61 [02:24<00:00,  2.36s/it]


In [245]:
# type(word_tokenize(all_text))

list

In [407]:
# Collect the per-municipality match counts into a DataFrame and save it.
# The same path is written again further down, after more columns are added.
all_counts_results_df = pd.DataFrame(all_counts_results)
all_counts_results_df.to_csv('../data/all_counts_results_df.csv', index = False)

In [436]:
# all_counts_results_df

In [432]:
# all_counts_results_df.muni.isna().sum()
# Positional row number, used as the join key for the merge in the next cell.
za_ordinates_links['idx'] = range(za_ordinates_links.shape[0])

In [434]:
# Attach each municipality's 'muncipalcode' link by joining on the row number 'idx'.
all_counts_results_df = all_counts_results_df.merge(za_ordinates_links[['idx', 'muncipalcode']], how = 'left', left_on = 'idx', right_on = 'idx')
def find_source(url):
    """Classify a municipal-code URL by the site that hosts it.

    Args:
        url: URL string. Non-string values, such as the NaN that pandas
            uses for a missing cell, are accepted.

    Returns:
        ``'codelibrary'`` or ``'ecode360'`` when that name occurs in the URL,
        otherwise ``'NOT_FOUND'``. Non-string input also gives
        ``'NOT_FOUND'`` instead of raising TypeError.
    """
    if not isinstance(url, str):
        return 'NOT_FOUND'
    for source in ('codelibrary', 'ecode360'):
        if source in url:
            return source
    return 'NOT_FOUND'

# Label every row with the hosting site derived from its 'muncipalcode' link.
all_counts_results_df['source'] = all_counts_results_df.muncipalcode.apply(find_source)

In [435]:
all_counts_results_df.query('source == "NOT_FOUND"')

Unnamed: 0,idx,state,county,muni,unique_matches,unique_counts,all_counts,muncipalcode,source


In [437]:
# Overwrite the CSV saved earlier, now including the 'muncipalcode' and 'source' columns.
all_counts_results_df.to_csv('../data/all_counts_results_df.csv', index = False)

# 2. municode - not finished
- Hard to parse

In [160]:
# Rebinds `user_agent` and `headers` for the municode experiments; once this
# cell has run, the codelibrary functions above also pick up these headers.
user_agent = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
headers = {
    'User-Agent': user_agent
}

In [161]:
# headers

In [260]:
# Fetch one municode page and parse it. The commented-out URLs are
# alternative api.municode.com endpoints that were tried for the same node.
url = 'https://library.municode.com/il/sauk_village/codes/code_of_ordinances?nodeId=MUCO_CH82ZOPLDE_ARTVIIZODI_DIV2REDI_S82-222BUYARE'
# url = 'https://api.municode.com/codesToc/breadcrumb?jobId=381269&nodeId=MUCO_CH82ZOPLDE_ARTVIIZODI_DIV2REDI_S82-222BUYARE&productId=15156'
# url = 'https://api.municode.com/codesToc?jobId=381269&nodeId=MUCO_CH82ZOPLDE_ARTIVZOAPAP&productId=15156'
# url = 'https://api.municode.com/CodesContent?jobId=381269&nodeId=MUCO_CH82ZOPLDE_ARTVIIZODI_DIV2REDI_S82-222BUYARE&productId=15156'
res = requests.get(url, headers = headers)
soup = BeautifulSoup(res.text, features="lxml")

In [261]:
res

<Response [200]>

In [179]:
# clean_content(soup.text)

In [1]:
# soup