# Zoning code searching
- This document handles zoning code searching in the zoning ordinates (finding out if a "code" appeared in a document)
- Part 1: codelibrary 
    - 1.1 scraping: fake api calls
    - 1.2 load existing zoning names/codes and find them in the documents
- Part 2: municode

In [1]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pickle
import string
printable_set = set(list(string.printable))

# 1. codelibrary

In [2]:
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27'
# url = 'https://codelibrary.amlegal.com/codes/templecityca/latest/templecity_ca/0-0-0-34203'

headers = {
    'User-Agent': user_agent
}

In [3]:
# temple city
# LOCKPORT, il
# use zoning list instead of the whole zoning (for parsing)

## 1.1 Scraping

In [4]:
idx = 190
# url = 'https://codelibrary.amlegal.com/api/render-section/winthropharbor/latest/winthropharbor_il/0-0-0-39885/{}/'.format(idx)   
# url
url = 'https://codelibrary.amlegal.com/codes/mentor/latest/mentor_oh/0-0-0-1805'
res = requests.get(url, headers = headers)
soup = BeautifulSoup(res.text, features="lxml")
res.status_code == 404

False

In [5]:
text = "Multi-Family ResidentialRMH Mobile Home,C-1 Conservation.C-2 Commercial RecreationOV Old Village District;B-1 Community ServiceB-2 General BusinessB-3"
new_text = re.sub(r'(?<=[a-z\.\,\;])(?=[A-Z])', ' ', text)
new_text

'Multi-Family Residential RMH Mobile Home, C-1 Conservation. C-2 Commercial Recreation OV Old Village District; B-1 Community Service B-2 General Business B-3'

In [6]:
def clean_content(sent):
    sent = ''.join([c if c in printable_set else ' ' for c in sent])
    sent = re.sub('\n', ' ', sent)
    sent = re.sub(' +', ' ', sent)
    sent = re.sub('\t+', ' ', sent)
    return sent

def change_url(url):
    if '#' in url:
        url = url.split('#')[0]
    url = re.sub('codes', 'api/render-section', url)
    url = url
    return url

def process_soup_text(soup_text):
    starting_idx = list(re.finditer('"html":"', soup_text))[0].span()[1]
    return soup_text[starting_idx:]

def parse_table(soup):
    table_text = []
    for t in soup.find_all('table'):
        for child in t.tbody.children:
            table_text.append(' '.join([i.text for i in list(child.children)]))
    if len(table_text) == 0:
        return ''
    return '\n'.join(table_text)

def parse_codelibrary(url):
    res = requests.get(url, headers = headers)
    soup = BeautifulSoup(res.text, features="lxml")

    i = 0
    start_scraping = False
    section_count = 0
    all_text = []
    
    curr_section = soup.find_all('div', attrs = {'id': 'curr-section'})
    if len(curr_section) == 0:
        print('[ERROR!] No current_section found! URL:', url)
        return ''
    
    curr_section_found = False
    starting_id = 0
    for child in curr_section[0].children:
        try:
            starting_id = child['id'].replace('section-', '').strip()
            starting_id = int(starting_id)
            curr_section_found = True
        except:
            continue
        if curr_section_found:
            all_text.append(child.text)
            section_count += 1
            break
    
    i = starting_id + 1
    while True:
        if not start_scraping:
            section_i = soup.find_all('div', attrs = {'id': 'section-{}'.format(i)})
            #print(i, len(section_i))
            if len(section_i) == 0:
                start_scraping = True
            else:
                all_text.append(section_i[0].text)
                section_count += 1
                i += 1
        else:
            section_url = change_url(url) + '/{}/'.format(i)
            try:
                res = requests.get(section_url, headers = headers)
            except:
                break
            #print(section_url)
            #print(i, res.status_code)
            if res.status_code == 404:
                break
            soup = BeautifulSoup(res.text, features="lxml")
            table_text = parse_table(soup)
            tmp_text_result = process_soup_text(soup.text) + '\n' + table_text
            all_text.append(tmp_text_result)
            section_count += 1
            if section_count > 80:
                break
            i += 1
    return '\n'.join([clean_content(doc) for doc in all_text]), section_count
    #return all_text, section_count

In [7]:
# def clean_content(sent):
#     sent = ''.join([c if c in printable_set else ' ' for c in sent])
#     sent = re.sub('\n', ' ', sent)
#     sent = re.sub(' +', ' ', sent)
#     sent = re.sub('\t+', ' ', sent)
#     return sent

# def change_url(url):
#     if '#' in url:
#         url = url.split('#')[0]
#     url = re.sub('codes', 'api/render-section', url)
#     url = url
#     return url

# def process_soup_text(soup_text):
#     #print(soup_text)
#     #print('"html":"' in soup_text)
#     #print(list(re.finditer('"html":"', soup_text)))
#     starting_idx = list(re.finditer('"html":"', soup_text))[0].span()[1]
#     return soup_text[starting_idx:]

# def parse_codelibrary2(url):
#     res = requests.get(url, headers = headers)
#     soup = BeautifulSoup(res.text, features="lxml")

#     i = 0
#     start_scraping = False
#     section_count = 0
#     all_text = []
    
#     curr_section = soup.find_all('div', attrs = {'id': 'curr-section'})
#     if len(curr_section) == 0:
#         print('[ERROR!] No current_section found! URL:', url)
#         return ''
    
#     curr_section_found = False
#     starting_id = 0
#     for child in curr_section[0].children:
#         try:
#             starting_id = child['id'].replace('section-', '').strip()
#             starting_id = int(starting_id)
#             curr_section_found = True
#         except:
#             continue
#         if curr_section_found:
#             all_text.append(child.text)
#             section_count += 1
#             break
    
#     i = starting_id + 1
#     while True:
#         if not start_scraping:
#             section_i = soup.find_all('div', attrs = {'id': 'section-{}'.format(i)})
#             #print(i, len(section_i))
#             if len(section_i) == 0:
#                 start_scraping = True
#             else:
#                 all_text.append(section_i[0].text)
#                 section_count += 1
#                 i += 1
#         else:
#             section_url = change_url(url) + '/{}/'.format(i)
#             try:
#                 res = requests.get(section_url, headers = headers)
#             except:
#                 break
#             #print(section_url)
#             #print(i, res.status_code)
#             if res.status_code == 404:
#                 break
#             soup = BeautifulSoup(res.text, features="lxml")
#             all_text.append(process_soup_text(soup.text))
#             section_count += 1
#             i += 1
#             if section_count > 100:
#                 break
#     return '\n'.join([clean_content(doc) for doc in all_text]), section_count

- **testing scraper**

In [8]:
# url = 'https://codelibrary.amlegal.com/codes/lockport/latest/lockport_il/0-0-0-77949'
# lockport, section_count = parse_codelibrary(url)

# url = 'https://codelibrary.amlegal.com/codes/danvilleca/latest/danville_ca/0-0-0-7018'
# danville_ca, section_count = parse_codelibrary(url)

# url = 'https://codelibrary.amlegal.com/codes/minookail/latest/minooka_il/0-0-0-5147'
# minoka, section_count = parse_codelibrary(url)

url = 'https://codelibrary.amlegal.com/codes/mentor/latest/mentor_oh/0-0-0-18059'
MENTOR, section_count = parse_codelibrary(url)


In [9]:
section_count

1

In [10]:
new_MENTOR = re.sub(r'(?<=[a-z\.\,\;])(?=[A-Z])', ' ',MENTOR)
new_MENTOR

'Share Download Bookmark Print1153.01 DISTRICTS ESTABLISHED. (a) The City is hereby divided into the following zones or districts:R-1 Single Family Residential R-2 Single Family Residential R-3 Single Family Residential R-4 Single Family Residential R-5 Estate Zoning R-10 Multi-Family Residential RMH Mobile Home C-1 Conservation C-2 Commercial Recreation OV Old Village District B-1 Community Service B-2 General Business B-3 Interchange Service District PUD Planned Unit Development M-1 Light Manufacturing M-2 Heavy Manufacturing MRD Research & Development MIP Industrial Park (b) Land may also be classified in the following special districts: FH Flood Hazard(1969 Code 150.101; Ord. 07-0-42. Passed 4-17-07; Ord. 20-0-062. Passed 8-18-20.) (c) Old Village (OV) District. (1) Purpose and intent. The purpose of the Old Village District (OV), hereinafter sometimes referred to as the Old Village, is to: A. Preserve, stabilize, restore and enhance pedestrian-oriented small- scale commercial and 

In [11]:
# https://codelibrary.amlegal.com/api/render-section/lockport/latest/lockport_il/0-0-0-77949/2/

In [12]:
# res = requests.get(url, headers = headers)
# soup = BeautifulSoup(res.text, features="lxml")

In [13]:
# res.status_code

## 1.2 Matching existing oridinates

In [14]:
za_ordinates_links = pd.read_csv('../data/za_ordinates_links.csv')
za_ordinates_links.head(3)

Unnamed: 0,assignedto,ID,ID_notunique,GEOID,state,county,place,muncipalcode,site,priority1129,...,Use for model,complex?,zoningchapter,List of Zoning Districts,Residential_density_section,Table or text,Zoning chapter or density table as PDF? (new field),Other district density section,Notes,Unnamed: 20
0,Jacob,,30,151696,Alabama,Jefferson County,MOUNTAIN BROOK,https://library.municode.com/al/mountain_brook,Municode,0.0,...,1.0,,https://library.municode.com/al/mountain_brook...,https://library.municode.com/al/mountain_brook...,,text,,,different density section for each district,
1,Jacob,,16,100820,Alabama,Shelby County,ALABASTER,https://library.municode.com/al/alabaster,Municode,0.0,...,1.0,,https://library.municode.com/al/alabaster/code...,https://library.municode.com/al/alabaster/code...,https://library.municode.com/al/alabaster/code...,multiple tables,,,,
2,Jacob,,135,444410,Arizona,Pinal County,MARICOPA,https://maricopa.municipal.codes/,Codebook,1.0,...,1.0,,https://maricopa.municipal.codes/MCC/18,https://maricopa.municipal.codes/MCC/18.10.010,https://maricopa.municipal.codes/MCC/18.35.030,table,,,"different density table for rural, residential...",


In [15]:
za_ordinates_links.isna().sum()
# len(all_results), i

assignedto                                               1
ID                                                     193
ID_notunique                                             1
GEOID                                                    1
state                                                    1
county                                                   1
place                                                    1
muncipalcode                                             1
site                                                     1
priority1129                                             1
priority1130                                             1
Use for model                                           22
complex?                                               110
zoningchapter                                           34
List of Zoning Districts                                51
Residential_density_section                             53
Table or text                                           

In [16]:
%run ../src/tree_spider.py
%run ../src/utils.py

In [17]:
all_results = []
counter = 0
# starting = 112
weird_county = []

In [18]:
for i in tqdm(range(za_ordinates_links.shape[0])):
    curr_row = za_ordinates_links.iloc[i]
    url = curr_row['List of Zoning Districts']
    url2 = curr_row['zoningchapter']
    if isinstance(url, str) and 'codelibrary' in url:
        try:
            all_text1, section_count1 = parse_codelibrary(url)
            all_text2, section_count2 = parse_codelibrary(url2)
            all_text1_new = re.sub(r'(?<=[a-z\.\,\;])(?=[A-Z])', ' ',all_text1)
            all_text2_new = re.sub(r'(?<=[a-z\.\,\;])(?=[A-Z])', ' ',all_text2)
            all_text = "List of Zoning Districts: " +all_text1_new +'\n'+"zoning chapter: "+all_text2_new
            all_results.append({
                'idx': counter,
                'state': curr_row.state,
                'county': curr_row.county,
                'muni': curr_row.place,
                'all_text': all_text,
                'section_count_zoningchapter': section_count2,
                'section_count_zoningdistricts': section_count1,
                'source': 'codelibrary'
            })
        except:
            weird_county.append(i)
    elif isinstance(url, str) and 'ecode360' in url:
        try:
            ts = TreeSpider(url.split('/')[-1])
            ts.run()
            all_text = ts.find_all_leaf_text()
        except:
            all_text = 'ecode_360_nothing_found'
        section_count = -1
        all_results.append({
            'idx': counter,
            'state': curr_row.state,
            'county': curr_row.county,
            'muni': curr_row.place,
            'all_text': all_text,
            'section_count': section_count,
            'source': 'ecode360'
        })
        
    counter += 1

100%|██████████| 193/193 [05:46<00:00,  1.80s/it]


In [19]:
weird_county

[53, 54, 67]

In [20]:
# for i in tqdm(range(za_ordinates_links.shape[0])):
# #     if i < starting:
# #         continue
#     curr_row = za_ordinates_links.iloc[i]
#     url = curr_row['List of Zoning Districts']
#     if isinstance(url, str) and 'codelibrary' in url:
#         all_text, section_count = parse_codelibrary(url)
# #         if section_count == 1:
# #             url = curr_row['List of Zoning Districts']
# #             #print(curr_row)
# #             try:
# #                 all_text, section_count = parse_codelibrary(url)
# #             except:
# #                 weird_county.append(i)
                
#         all_results.append({
#             'idx': counter,
#             'state': curr_row.state,
#             'county': curr_row.county,
#             'muni': curr_row.place,
#             'all_text': all_text,
#             'section_count': section_count,
#             'source': 'codelibrary'
#         })
#     elif isinstance(url, str) and 'ecode360' in url:
#         try:
#             ts = TreeSpider(url.split('/')[-1])
#             ts.run()
#             all_text = ts.find_all_leaf_text()
#         except:
#             all_text = 'ecode_360_nothing_found'
#         section_count = -1
#         all_results.append({
#             'idx': counter,
#             'state': curr_row.state,
#             'county': curr_row.county,
#             'muni': curr_row.place,
#             'all_text': all_text,
#             'section_count': section_count,
#             'source': 'ecode360'
#         })
        
#     counter += 1

In [21]:
# for i in tqdm(range(za_ordinates_links.shape[0])):
# #     if i < starting:
# #         continue
#     curr_row = za_ordinates_links.iloc[i]
#     url = curr_row['zoningchapter']
#     if isinstance(url, str) and 'codelibrary' in url:
#         all_text, section_count = parse_codelibrary(url)
#         if section_count == 1:
#             url = curr_row['List of Zoning Districts']
#             #print(curr_row)
#             try:
#                 all_text, section_count = parse_codelibrary(url)
#             except:
#                 weird_county.append(i)
                
#         all_results.append({
#             'idx': counter,
#             'state': curr_row.state,
#             'county': curr_row.county,
#             'muni': curr_row.place,
#             'all_text': all_text,
#             'section_count': section_count,
#             'source': 'codelibrary'
#         })
#     elif isinstance(url, str) and 'ecode360' in url:
#         try:
#             ts = TreeSpider(url.split('/')[-1])
#             ts.run()
#             all_text = ts.find_all_leaf_text()
#         except:
#             all_text = 'ecode_360_nothing_found'
#         section_count = -1
#         all_results.append({
#             'idx': counter,
#             'state': curr_row.state,
#             'county': curr_row.county,
#             'muni': curr_row.place,
#             'all_text': all_text,
#             'section_count': section_count,
#             'source': 'ecode360'
#         })
        
#     counter += 1

In [22]:
# ts.root_node.children[0]

In [23]:
codelibrary_results = pd.DataFrame(all_results)
codelibrary_results.head(7)

Unnamed: 0,idx,state,county,muni,all_text,section_count_zoningchapter,section_count_zoningdistricts,source,section_count
0,3,California,Contra Costa County,DANVILLE,List of Zoning Districts: Share Download Bookm...,81.0,1.0,codelibrary,
1,6,California,Los Angeles County,TEMPLE CITY,List of Zoning Districts: Share Download Bookm...,81.0,1.0,codelibrary,
2,22,Florida,Broward County,COOPER CITY,List of Zoning Districts: Chapter 23ZONING DIS...,81.0,81.0,codelibrary,
3,31,Idaho,Canyon County,MIDDLETON,List of Zoning Districts: Share Download Bookm...,6.0,1.0,codelibrary,
4,33,Illinois,Cook County,WESTERN SPRINGS,List of Zoning Districts: CHAPTER 5ZONING DIST...,1.0,5.0,codelibrary,
5,34,Illinois,DuPage County,BARTLETT,List of Zoning Districts: Share Download Bookm...,14.0,1.0,codelibrary,
6,35,Illinois,DuPage County,HINSDALE,List of Zoning Districts: Share Download Bookm...,81.0,1.0,codelibrary,


In [24]:
codelibrary_results.query('muni == "DANVILLE"').all_text

0    List of Zoning Districts: Share Download Bookm...
Name: all_text, dtype: object

In [25]:
codelibrary_results.to_csv('../data/codelibrary_results_1209.csv', index = False)

### 1.2.1 Clean Zoning Code

In [26]:
zoning_code_julia = pd.read_csv('../data/zoning_codes_julia.csv')
zoning_code_julia.head(3)

Unnamed: 0,ID,state,muni,county,zoning,zoning_name,likelyres,count_zoning
0,115.0,MD,Baltimore City,Baltimore,R-2,Detached and Semi-Detached Residential Zoning ...,1.0,99.0
1,168.0,MD,Oxford,Talbot,R-2,Historic Residential District,1.0,99.0
2,188.0,MD,Frederick,Frederick,R-2,Low Density Residential,1.0,99.0


In [27]:
len(zoning_code_julia.zoning.values)

3735

In [28]:
##drop na value
all_codes = zoning_code_julia.zoning.dropna().values
len(all_codes)

3213

In [29]:
all_codes = [code for code in all_codes if len(code) > 1] #no need to ignore R, C? 
all_codes
# match = re.findall(r'^\d',all_codes[2])
# if match:
#     print(match.group())
# else:
#     print("match not found")


['R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'C-1',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-2',
 'R-1',
 'R-1',
 'C-2',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',
 'R-1',


In [30]:
# remove all the zoning code which are pure number or start from a number
all_codes_w_num = []
all_codes_wo_num = []
for i in tqdm(range(len(all_codes))):
        match = re.findall(r'^\d',all_codes[i])
        if match:
            all_codes_w_num.append(all_codes[i])
        else:
            all_codes_wo_num.append(all_codes[i])
            
        

100%|██████████| 3103/3103 [00:00<00:00, 569132.64it/s]


In [31]:
len(all_codes_wo_num)
range(len(all_codes_wo_num))

range(0, 3092)

In [32]:
# r"\([^()]*\)"
#.*?

In [34]:
# remove parenthese and content inside
all_codes_wo_num_par = []
for i in tqdm(range(len(all_codes_wo_num))):
        match = re.findall(r"\((?s).*\)",all_codes_wo_num[i])
        if match:
            new_codes = re.sub(r"\((?s).*\)", "", all_codes_wo_num[i])
            new_codes = new_codes.strip()
            all_codes_wo_num_par.append(new_codes)
        else:
            all_codes_wo_num_par.append(all_codes_wo_num[i])

100%|██████████| 3092/3092 [00:00<00:00, 575776.41it/s]


In [35]:
# remove "" from zoning codes
while("" in all_codes_wo_num_par) :
    all_codes_wo_num_par.remove("")

In [36]:
print(all_codes_wo_num_par)

['R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'C-1', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-2', 'R-1', 'R-1', 'C-2', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-1', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3', 'R-3'

In [37]:
all_codes_wo_num_par = [code for code in all_codes_wo_num_par if len(code) > 1] 

In [38]:
# top_n = 200

unique_codes, counts = np.unique(all_codes_wo_num_par, return_counts = True) #counts >2
all_codes_w_counts = dict(zip(unique_codes, counts))
# all_codes_w_counts = dict(sorted(all_codes_w_counts.items(), key = lambda x: x[1], reverse = True))

# top_n_codes = set(list(all_codes_w_counts.keys()))

In [39]:
print(all_codes_w_counts)

{'A-1': 4, 'A-1\xa0': 1, 'A-2': 2, 'A-5': 1, 'A-6': 1, 'A-C': 1, 'A-G': 1, 'A-O-B': 1, 'A-R': 2, 'A.S.': 1, 'A/LC': 1, 'AA-1': 1, 'AA-2': 1, 'AAH': 1, 'AAR': 1, 'AARC': 1, 'AB': 1, 'ABC': 1, 'AC': 3, 'AC-RCA': 1, 'AET': 1, 'AET ': 1, 'AF': 1, 'AF-1': 1, 'AF-2': 1, 'AG': 2, 'AH': 3, 'AH and RCA': 1, 'AH-1': 7, 'AH-2': 6, 'AH-3': 7, 'AH-3A': 1, 'AH-3B': 1, 'AH-4': 3, 'AH-5': 2, 'AH-6': 2, 'AH-7': 2, 'AH-8': 1, 'AH-C': 1, 'AHA': 1, 'AHD-1': 1, 'AHD-2': 1, 'AHD-3A': 1, 'AHD-3B': 1, 'AHD-4': 1, 'AHD-5': 1, 'AHD-6': 1, 'AHD-7': 1, 'AHD-MU': 1, 'AHO-A': 1, 'AHOZ': 2, 'AHOZ-2': 2, 'AHOZ-3': 3, 'AHOZ-4': 3, 'AHZ': 1, 'AI-10': 1, 'AL': 1, 'ALO': 1, 'ALR': 1, 'AP': 2, 'AP-1': 1, 'AP-2': 1, 'APT': 1, 'APT/TH': 1, 'AR': 10, 'AR-1': 2, 'AR-2': 1, 'AR-200': 1, 'AR-250': 1, 'AR-3': 1, 'AR-300': 1, 'AR-4': 1, 'AR-5/2': 1, 'AR-500': 1, 'AR-6': 1, 'AR-7': 1, 'ARD': 1, 'ARE-1': 1, 'ARE-2': 2, 'ARE-3': 1, 'ARE-4': 1, 'ARE-6': 1, 'ARE-C': 1, 'ARE-NRW': 1, 'ARH': 1, 'ARU': 1, 'ASCH': 2, 'AT': 1, 'AZD': 1, 'A

In [40]:
all_codes_highfreq = []
for(key,value) in all_codes_w_counts.items():
    if value>=1 :
        all_codes_highfreq.append(key)

In [41]:
all_codes_highfreq
len(all_codes_highfreq)


1682

In [42]:
# all_codes_w_counts
from nltk.tokenize import word_tokenize

In [43]:
all_counts_results = []
for row_idx in tqdm(range(codelibrary_results.shape[0])):
    curr_row = codelibrary_results.iloc[row_idx]
    all_text = curr_row.all_text
    all_tokens = np.array(word_tokenize(all_text))
    unique_counts = 0
    all_counts = 0
    matches = []
    for k in all_codes_highfreq:
        if k in all_tokens:
            unique_counts += 1
            all_counts += sum(all_tokens == k)
            matches.append(k)
    all_counts_results.append({
        'idx': curr_row.idx,
        'state': curr_row.state,
        'county': curr_row.county,
        'muni': curr_row.muni,
        'unique_matches': matches,
        'unique_counts': unique_counts,
        'all_counts': all_counts
    })

100%|██████████| 51/51 [01:12<00:00,  1.42s/it]


In [44]:
# type(word_tokenize(all_text))

In [45]:
all_counts_results_df = pd.DataFrame(all_counts_results)


In [46]:
all_counts_results_df

Unnamed: 0,idx,state,county,muni,unique_matches,unique_counts,all_counts
0,3,California,Contra Costa County,DANVILLE,"[A-1, A-2, CC, D-1, FAR, G-1, IV, L-I, O-1, OR...",24,165
1,6,California,Los Angeles County,TEMPLE CITY,"[ABC, CC, FAR, IL, MU, NC, OR, OS, PARK, PC, P...",18,88
2,22,Florida,Broward County,COOPER CITY,"[A-1, A-2, B-1, B-2, B-3, B1, B2, B3, C-1, C-2...",38,1044
3,31,Idaho,Canyon County,MIDDLETON,"[A-R, C-1, C-2, C-3, M-1, M-2, R-1, R-2, R-3, ...",11,31
4,33,Illinois,Cook County,WESTERN SPRINGS,"[C1, C2, DT, MXD, R1, R2, R3, R4, R5, RB]",10,23
5,34,Illinois,DuPage County,BARTLETT,[PUD],1,4
6,35,Illinois,DuPage County,HINSDALE,"[B-1, B-2, B-3, B1, B2, DR, HS, IV, O-1, O-2, ...",27,228
7,36,Illinois,Grundy County,MINOOKA,"[B1, B2, R1, R1A, R2, R3, R4, R5, R6]",9,11
8,39,Illinois,Lake County,LINDENHURST,"[BA, CB, IN, NB, OSR, PP, PUD, R-1, R-2, R-3, ...",15,143
9,41,Illinois,Will County,MANHATTAN,"[A-1, BP, C-1, C-2, C-3, CBD, CR, ER, I-1, I-2...",18,60


In [47]:
# all_counts_results_df.muni.isna().sum()
za_ordinates_links['idx'] = range(za_ordinates_links.shape[0])

In [48]:
all_counts_results_df = all_counts_results_df.merge(za_ordinates_links[['idx', 'muncipalcode']], how = 'left', left_on = 'idx', right_on = 'idx')
def find_source(url):
    if 'codelibrary' in url:
        return 'codelibrary'
    if 'ecode360' in url:
        return 'ecode360'
    return 'NOT_FOUND'

all_counts_results_df['source'] = all_counts_results_df.muncipalcode.apply(find_source)


In [49]:
all_counts_results_df.query('source == "NOT_FOUND"')

Unnamed: 0,idx,state,county,muni,unique_matches,unique_counts,all_counts,muncipalcode,source


In [50]:
all_counts_results_df.to_csv('../data/all_counts_results_df__nonumbercode_nopar_freqall_both_1209_5.csv', index = False)

In [160]:
user_agent = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
headers = {
    'User-Agent': user_agent
}

In [161]:
# headers

In [260]:
url = 'https://library.municode.com/il/sauk_village/codes/code_of_ordinances?nodeId=MUCO_CH82ZOPLDE_ARTVIIZODI_DIV2REDI_S82-222BUYARE'
# url = 'https://api.municode.com/codesToc/breadcrumb?jobId=381269&nodeId=MUCO_CH82ZOPLDE_ARTVIIZODI_DIV2REDI_S82-222BUYARE&productId=15156'
# url = 'https://api.municode.com/codesToc?jobId=381269&nodeId=MUCO_CH82ZOPLDE_ARTIVZOAPAP&productId=15156'
# url = 'https://api.municode.com/CodesContent?jobId=381269&nodeId=MUCO_CH82ZOPLDE_ARTVIIZODI_DIV2REDI_S82-222BUYARE&productId=15156'
res = requests.get(url, headers = headers)
soup = BeautifulSoup(res.text, features="lxml")

In [261]:
res

<Response [200]>

In [179]:
# clean_content(soup.text)

In [1]:
# soup