# Web Scraping
- This notebook contains code to scrape zoning ordinance websites and save the results to pickle files
- Part 1: ecode360 scraper (tree spider)
- Part 2: n/a

In [2]:
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pickle

# 1. Tree-based scraping

In [2]:
# Load the metadata table that lists which URLs need to be scraped.
# The CSV uses '*' as its field separator.
all_meta_data = pd.read_csv('../data/ordinates_meta_data.csv', sep='*')

# Keep only the rows whose site is "General Code" (the ecode360 subset)
# and renumber them from 0.
all_meta_data_ecode_360 = (
    all_meta_data
    .query('Site == "General Code"')
    .reset_index(drop=True)
)

In [116]:
# Inspect the (rows, columns) shape of the General Code subset.
all_meta_data_ecode_360.shape

(2703, 6)

In [94]:
# Add two helper columns to the General Code subset:
#   href      - the last path segment of the ordinance URL (e.g. 'LI3811')
#   unique_id - a running integer 0..n-1, one per row
all_meta_data_ecode_360['href'] = (
    all_meta_data_ecode_360['Ordinance']
    .apply(lambda url: url.rsplit('/', 1)[-1])
    .values
)
all_meta_data_ecode_360['unique_id'] = range(len(all_meta_data_ecode_360))

# `meta_data` is a second name for the same frame (not a copy).
meta_data = all_meta_data_ecode_360
meta_data.head(2)

Unnamed: 0,State,City/County,Ordinance,Site,href,unique_id
0,Arkansas,City of Lincoln (Washington County),https://ecode360.com/LI3811,General Code,LI3811,0
1,California,City of Albany (Alameda County),https://ecode360.com/AL4074,General Code,AL4074,1


- **\[IMPORTANT\]: In the file `src/tree_spider.py`, I defined a "spider" to help me scrape the ecode360 websites. The overall idea is to treat each zoning ordinance as a tree and each page as a node. If a page contains only links and no content, it is not a leaf node, so we need to scrape every link on that page; if a page contains content, it is a leaf node, so we scrape only its content (text) and stop there.**

In [95]:
# Execute the spider script and load its top-level names into this notebook's
# namespace. The cells below use `TreeSpider`, presumably defined in that
# script -- verify against src/tree_spider.py.
%run ../src/tree_spider.py 

In [94]:
# Scrape the first 100 ordinances listed in `meta_data` and pickle one spider
# object per ordinance, named after the row's unique_id.
for row_idx in tqdm(range(100)):
    curr_row = meta_data.iloc[row_idx]
    # The spider is started from the site-relative path, e.g. '/LI3811'.
    ts = TreeSpider('/' + curr_row.href)
    ts.run()
    # Use a context manager so the file handle is flushed and closed even if
    # pickling fails part-way through.
    with open('../data/scrapped/' + str(curr_row.unique_id) + '.pkl', 'wb') as pkl_file:
        pickle.dump(ts, pkl_file)

100%|█████████████████████████████████| 33/33 [15:00<00:00, 27.29s/it]


In [72]:
# pickle.dump(ts, open('../data/scrapped/' + str(curr_row.unique_id) + '.pkl', 'wb'))

In [73]:
# tmp = pickle.load(open('../data/scrapped/' + str(curr_row.unique_id) + '.pkl', 'rb'))

In [None]:
import string

# A line that ends with one of these characters is treated as possibly
# continuing onto the next line (lowercase letter or a comma).
valid_ending_letters = set(string.ascii_lowercase + ',')
# A line that starts with one of these characters is treated as possibly
# continuing the previous line.
lower_case_letters = set(string.ascii_lowercase)

def post_processing(doc, show_progress=True):
    '''
        Re-join lines of a scraped document that were broken mid-sentence.

        The document is split on newlines and every line is stripped of
        surrounding whitespace. Two consecutive non-empty lines are merged
        (joined by a single space) when the first ends with a lowercase
        letter or a comma and the second starts with a lowercase letter.
        Every other line is kept on its own and terminated with a newline.

        Parameters
        ----------
        doc : str
            Text produced by the web-scraping step.
        show_progress : bool, default True
            When True, show a tqdm progress bar and print the number of
            steps. Pass False to run silently.

        Returns
        -------
        str
            The processed text. It always ends with a newline.
    '''
    # The appended newline yields a trailing empty entry, so every real line
    # has a "next" line to be compared with.
    lines = [line.strip() for line in (doc + '\n').split('\n')]
    n_steps = len(lines) - 1

    pbar = None
    if show_progress:
        pbar = tqdm(total = n_steps)
        print(' => Post processing,', n_steps, 'steps in total.')

    pieces = []
    for curr_str, next_str in zip(lines, lines[1:]):
        continues_on_next_line = bool(
            curr_str
            and next_str
            and curr_str[-1] in valid_ending_letters
            and next_str[0] in lower_case_letters
        )
        pieces.append(curr_str)
        # Both lines were stripped, so a space is needed between them when
        # merging; otherwise the two boundary words would be fused together.
        pieces.append(' ' if continues_on_next_line else '\n')
        # tqdm objects define their own truthiness, so compare against None.
        if pbar is not None:
            pbar.update(1)

    if pbar is not None:
        pbar.close()
    return ''.join(pieces)

In [120]:
# Try the spider on a single example ordinance (www.ecode360.com/BR4037).

# Site-relative path of the example; the spider is started from it.
href = '/BR4037'
ts = TreeSpider(href)
ts.run()