In [44]:
import re
import string
import pickle
import pandas as pd
import numpy as np
from numpy.linalg import norm

from tqdm import tqdm

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [45]:
%run ../src/tree_spider.py
%run ../src/utils.py

In [11]:
# Load the word-embedding lookup once; `relavance` below reads `glove_dict` as a global.
# NOTE(review): `load_glove_embeddings` is not defined in this notebook -- presumably it
# comes from one of the `%run` scripts above; confirm.
glove_dict = load_glove_embeddings()

100%|███████████████████████| 400000/400000 [01:16<00:00, 5254.88it/s]


In [12]:
def flatten_table(t):
    """Flatten a (possibly nested) table of strings into a flat sequence.

    Parameters
    ----------
    t : sequence
        A sequence whose items are either strings (cells) or nested
        sequences of the same kind. Nesting depth may vary between items.

    Returns
    -------
    sequence
        ``[]`` when ``t`` is empty; ``t`` itself, unchanged, when every item
        is already a string; otherwise a new list holding every string cell
        in depth-first, left-to-right order.
    """
    if len(t) == 0:
        return []
    # Already flat: hand the input back untouched (no copy is made).
    if all(isinstance(cell, str) for cell in t):
        return t
    result = []
    for item in t:
        if isinstance(item, str):
            # A string is a leaf cell. Recursing into it and using `+=`
            # would splice its characters into the result one by one.
            result.append(item)
        else:
            result += flatten_table(item)
    return result

def clean_text(text):
    """Lower-case ``text`` and blank out everything that is not an ASCII letter.

    Each non-letter character is replaced by exactly one space, so runs of
    punctuation or digits become runs of spaces (they are not collapsed).
    """
    return re.sub('[^a-zA-Z]', ' ', text.lower())

In [26]:
def clean_title_for_table(title):
    """Normalise a section title before it is compared with topic words.

    The title is lower-cased and trimmed, tabs and newlines become spaces,
    every occurrence of the substrings ``'chapter'`` and ``'article'`` is
    removed, all remaining non-letter characters are replaced by spaces,
    and the result is trimmed again.
    """
    cleaned = title.lower().strip()
    for whitespace_char in ('\t', '\n'):
        cleaned = cleaned.replace(whitespace_char, ' ')
    # Plain substring removal: the words are dropped wherever they occur,
    # including inside longer words.
    for heading_word in ('chapter', 'article'):
        cleaned = cleaned.replace(heading_word, '')
    letters_only = re.sub('[^a-zA-Z]', ' ', cleaned)
    return letters_only.strip()

def gather_tables(ts):
    """Collect every rectangular table stored on a scraped-site object.

    Parameters
    ----------
    ts : object
        Must expose ``tables``: a mapping from a section title to an
        iterable of ``(table, href)`` pairs.

    Returns
    -------
    list of tuple
        One ``(cleaned_title, table_array, table_text, href)`` entry for
        each table that converts to a non-empty 2-D array. ``table_text``
        is the table's cells joined with single spaces in row-major order.
        Tables that are not rectangular are skipped.
    """
    table_lst = []
    for title, table_pairs in ts.tables.items():
        for raw_table, href in table_pairs:
            try:
                table = np.array(raw_table)
            except ValueError:
                # Rows of unequal length: NumPy >= 1.24 raises here, while
                # older releases only warn and build a 1-D object array
                # (rejected by the ndim check below). Skipping keeps the
                # result the same on every NumPy version.
                continue
            if table.ndim == 2 and len(table) > 0:
                table_text = ' '.join(table.flatten())
                table_lst.append((clean_title_for_table(title), table, table_text, href))
    return table_lst

In [27]:
i = 0
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own scraper.
# `with` closes the file handle as soon as the object has been loaded.
with open('../data/scrapped/' + str(i) + '.pkl', 'rb') as f:
    ts = pickle.load(f)

In [28]:
# Number of title keys in `ts.tables` (each key maps to a list of tables).
len(ts.tables.items())

185

In [31]:
# list(ts.tables.items())[:40]

In [32]:
# Collect the usable 2-D tables of this scrape into one list and count them.
table_lst = gather_tables(ts)
len(table_lst)

  t = np.array(t)


121

In [33]:
# Peek at one entry: (cleaned title, table array, joined table text, href).
table_lst[0]

('fire department',
 array([['',
         'In this situation, the persons under 16 years of age should\nbe removed once other responders arrive or the condition in Subsection\nD1 above is met.']],
       dtype='<U147'),
 ' In this situation, the persons under 16 years of age should\nbe removed once other responders arrive or the condition in Subsection\nD1 above is met.',
 '/32099519')

In [38]:
# Topic words compared against a table's cleaned *content* in `find_most_relevance_table`.
table_tokens = ['agricultural', 'residential', 'commercial', 'business', 'industrial', 'district', 'housing']
# Topic words compared against a table's cleaned *title* in `find_most_relevance_table`.
zoning_tokens = ['zoning', 'zone']

In [42]:
def embedding(w, embed_dict):
    """Look up the embedding of a word after lemmatizing it.

    Parameters
    ----------
    w : str
        Word to look up. It is lemmatized with the notebook-level
        ``lemmatizer`` before the lookup.
    embed_dict : mapping
        Maps a word to its embedding vector.

    Returns
    -------
    The stored vector for the lemmatized word, or ``None`` when that word
    is not a key of ``embed_dict``.
    """
    w = lemmatizer.lemmatize(w)
    try:
        return embed_dict[w]
    except KeyError:
        # Only a missing key means "no embedding". Any other failure
        # (including a keyboard interrupt) must surface, not be swallowed.
        return None

def cos_similarity(v1, v2):
    """Cosine similarity of two vectors (intended for 1-D inputs).

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero norm
        the cosine is undefined; ``0.0`` is returned instead of dividing by
        zero, which would yield ``nan`` and make scores unsortable.
    """
    denom = norm(v1) * norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom
    
def _mean_embedding(tokens):
    """Average the ``glove_dict`` vectors of ``tokens``; ``None`` if no token has one."""
    vectors = []
    for token in tokens:
        vec = embedding(token, glove_dict)
        # Test for None explicitly: truth-testing a multi-element NumPy
        # array raises ValueError. Empty vectors are still skipped.
        if vec is not None and len(vec) > 0:
            vectors.append(vec)
    if not vectors:
        return None
    return np.array(vectors).mean(axis=0)


def relavance(text, topic_tokens):
    """Score how close ``text`` is to a topic described by a few words.

    Both sides are represented by the mean embedding of their words (looked
    up in the global ``glove_dict``); the score is the cosine similarity of
    the two means.

    Parameters
    ----------
    text : str
        Text to score; it is split with ``word_tokenize``.
    topic_tokens : iterable of str
        Words describing the topic.

    Returns
    -------
    The cosine similarity of the two mean vectors, or ``0`` when either the
    text or the topic contains no word with a known embedding.
    """
    text_rep = _mean_embedding(word_tokenize(text))
    if text_rep is None:
        return 0

    topic_rep = _mean_embedding(topic_tokens)
    if topic_rep is None:
        # Without this guard the mean of an empty list is nan, and the
        # resulting nan score would poison any later ranking.
        return 0

    return cos_similarity(text_rep, topic_rep)

def find_most_relevance_table(ts):
    """Rank the tables of one scraped site by how zoning-related they look.

    Each table gathered from ``ts`` gets the score
    ``0.4 * content_similarity + 0.6 * title_similarity``, where the cleaned
    table text is compared with ``table_tokens`` and the title with
    ``zoning_tokens``. The table count and the three best-scoring raw
    entries are printed.

    Returns
    -------
    list of tuple
        ``(index, (score, title, table, cleaned_text, href))`` pairs sorted
        by descending score; ``index`` is the table's position in the
        gathered list.
    """
    table_lst = gather_tables(ts)
    print('=> # of tables found:', len(table_lst))

    scored = []
    for idx, (title, table, raw_text, href) in enumerate(table_lst):
        cleaned_text = clean_text(raw_text)
        content_sim = relavance(cleaned_text, table_tokens)
        title_sim = relavance(title, zoning_tokens)
        combined = 0.4 * content_sim + 0.6 * title_sim
        scored.append((idx, (combined, title, table, cleaned_text, href)))

    top_tables = sorted(scored, key=lambda pair: pair[1][0], reverse=True)

    print('=> Top 3 tables in terms of similarity:')
    print()
    for idx, _ in top_tables[:3]:
        # Show the raw gathered entry, not the cleaned text kept in the score tuple.
        print(table_lst[idx])
        print()
        print()
    return top_tables

In [47]:
all_top_tables = []
for i in tqdm(range(100)):
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project's own scraper.
    # `with` closes each file right after loading, so the loop does not
    # leave 100 open handles behind.
    with open('../data/scrapped/' + str(i) + '.pkl', 'rb') as f:
        ts = pickle.load(f)
    top_tables = find_most_relevance_table(ts)
    all_top_tables.append(top_tables)

  t = np.array(t)


=> # of tables found: 121


  1%|▎                                | 1/100 [00:00<00:24,  4.10it/s]

=> Top 3 tables in terms of similarity:

('annexing  vacating and re zoning', array([['Ord. No. 97-16', 'Alley in Block 1 of Norwood Addition'],
       ['Ord. No. 99-7', 'Alley in Block 22'],
       ['Ord. No. 99-8', 'Part of Lee St. South of Park Street'],
       ['Ord. No. 99-13',
        'Alley West of Lots 13 & 14 of Block 1 of Norwood Addition'],
       ['Ord. No. 2000-05', 'NE Corner of Lot 12 of Block 3']],
      dtype='<U57'), 'Ord. No. 97-16 Alley in Block 1 of Norwood Addition Ord. No. 99-7 Alley in Block 22 Ord. No. 99-8 Part of Lee St. South of Park Street Ord. No. 99-13 Alley West of Lots 13 & 14 of Block 1 of Norwood Addition Ord. No. 2000-05 NE Corner of Lot 12 of Block 3', '/32101151')


('annexing  vacating and re zoning', array([['Ord. No. 97-12', 'From R-1 to R-3',
        'Part of NE 1/4 of S 30, Twp 15 N, Range 32 W'],
       ['Ord. No. 97-14', 'From R-1 to R-3',
        'Part of NW 14/ of S 31, Twp 15 N, Range 32 W'],
       ['Ord. No. 98-5', 'From A-1 to R-1',
  

  2%|▋                                | 2/100 [00:00<00:24,  4.01it/s]

=> Top 3 tables in terms of similarity:

('planning and zoning', array([['Shopping Center and Industrial', "12'", "45'", "14'"],
       ['Commercial', "12'", "25'", "14'"]], dtype='<U30'), "Shopping Center and Industrial 12' 45' 14' Commercial 12' 25' 14'", '/34858251')


('planning and zoning', array([['The term "condominium," as used in this section is intended\nto include all three (3) types of community housing.']],
      dtype='<U112'), 'The term "condominium," as used in this section is intended\nto include all three (3) types of community housing.', '/34858251')


('planning and zoning', array([['Single-family dwelling1', '2 spaces per dwelling unit'],
       ['Two-family dwelling2', '1.5 spaces per dwelling unit'],
       ['Multifamily dwelling3', '1 space per unit'],
       ['Live/work space', '1 space per unit'],
       ['Senior citizen housing development', '0.5 space per unit'],
       ['Residential mixed-use development4', '1 space per unit'],
       ['Affordable dwelling 

  3%|▉                                | 3/100 [00:00<00:32,  3.01it/s]

=> Top 3 tables in terms of similarity:

('establishment of loading  parking  and standing zones  curb markings', array([['In no event shall more than 1/2 of the total curb length in\nany block be reserved for loading zone purposes.']],
      dtype='<U108'), 'In no event shall more than 1/2 of the total curb length in\nany block be reserved for loading zone purposes.', '/35893512')


('coastal development procedures', array([['All repair and maintenance activities governed by the above\nprovisions shall be subject to the permit regulations promulgated\npursuant to the California Coastal Act of 1976, including, but not\nlimited to, the regulations governing administrative and emergency\npermits. The provisions of this section shall not be applicable to\nmethods of repair and maintenance undertaken by the ports listed in\nPublic Resources Code §\xa030700.']],
      dtype='<U425'), 'All repair and maintenance activities governed by the above\nprovisions shall be subject to the permit regu

  4%|█▎                               | 4/100 [00:01<00:36,  2.63it/s]

=> Top 3 tables in terms of similarity:

('zoning', array([['Very Low Income', '5%', '20%', '2.5%', '11%'],
       ['Low Income', '10%', '20%', '1.5%', '20%'],
       ['Moderate Income', '10%', '5%', '1%', '40%'],
       ['Senior Housing/Mobile Home Park', '100%', '20%', '—', '—']],
      dtype='<U31'), 'Very Low Income 5% 20% 2.5% 11% Low Income 10% 20% 1.5% 20% Moderate Income 10% 5% 1% 40% Senior Housing/Mobile Home Park 100% 20% — —', '/35622209')


('zoning', array([['R-A', 'Residential Agriculture'],
       ['R-L', 'Low-Density Residential'],
       ['R-M', 'Medium-Density Residential'],
       ['R-H', 'High-Density Residential'],
       ['C-O', 'Professional Office'],
       ['C-L', 'Limited Commercial'],
       ['C-M', 'Commercial Manufacturing'],
       ['M-L', 'Limited Manufacturing'],
       ['M-H', 'Heavy Manufacturing'],
       ['B', 'Buffer'],
       ['P', 'Automobile Parking'],
       ['D', 'Planned Development'],
       ['B-O', 'Billboard Overlay Zone']], dtype='<U26'),

  5%|█▋                               | 5/100 [00:01<00:33,  2.82it/s]

=> Top 3 tables in terms of similarity:

('zoning', array([['UR', 'Urban Reserve'],
       ['RA', 'Rural Residential'],
       ['R-1',
        'Single-Family Residential, minimum lot size of 6,500 square\nfeet'],
       ['R-1-5',
        'Single-Family Residential, minimum lot size of 5,000 square\nfeet'],
       ['R-1-4.25',
        'Single-Family Residential, minimum lot size of 4,250 square\nfeet'],
       ['R-1 (TN)',
        'Single-Family Residential (Traditional Neighborhood Development),\nminimum lot size of 6,500 square feet'],
       ['R-1-5 (TN)',
        'Single-Family Residential (Traditional Neighborhood Development),\nminimum lot size of 5,000 square feet'],
       ['R-1-4.25 (TN)',
        'Single-Family Residential (Traditional Neighborhood Development),\nminimum lot size of 4,250 square feet'],
       ['R-2',
        'Low-Density Multiple-Family Residential, one unit per 3,250\nsquare feet of lot area'],
       ['R-3',
        'Medium-High-Density Multiple-Family Resi

  8%|██▋                              | 8/100 [00:02<00:17,  5.12it/s]

=> Top 3 tables in terms of similarity:

('zoning', array([['Very Low Income', '5%', '20%', '2.5%', '11%'],
       ['Low Income', '10%', '20%', '1.5%', '20%'],
       ['Moderate Income (Condo or PD only)', '10%', '5%', '1%', '40%'],
       ['Senior Citizen Housing Development', '100%', '20%', '—', '—']],
      dtype='<U34'), 'Very Low Income 5% 20% 2.5% 11% Low Income 10% 20% 1.5% 20% Moderate Income (Condo or PD only) 10% 5% 1% 40% Senior Citizen Housing Development 100% 20% — —', '/35224404')


('zoning', array([['REAL ESTATE TRANSFER DISCLOSURE STATEMENT'],
       ['THIS DISCLOSURE STATEMENT CONCERNS THE REAL PROPERTY LOCATED\nIN THE CITY OF OAKDALE, COUNTY OF STANISLAUS, STATE OF CALIFORNIA,\nDESCRIBED AS ___________________________________________. THIS DISCLOSURE\nOF THE CONDITION OF THE ABOVE DESCRIBED PROPERTY IN COMPLIANCE WITH\nORDINANCE NUMBER ___________ OF THE CITY CODE AS OF__________________________________.\nIT IS NOT A WARRANTY OF ANY KIND BY THE SELLER(S) OR ANY AGENT

=> # of tables found: 36


 13%|████▏                           | 13/100 [00:02<00:11,  7.70it/s]

=> Top 3 tables in terms of similarity:

('a         town meeting rules', array([['', '1', 'Building Inspection and Enforcement'],
       ['', '2', 'Conservation and Environment'],
       ['', '3', 'Economic Development'],
       ['', '4', 'Inland Wetlands and Natural Resources'],
       ['', '5', 'Planning and Zoning'],
       ['', '6', 'Shellfish Commission'],
       ['', '7', 'Water Pollution Control Authority'],
       ['', '8', 'Zoning Board of Appeals']], dtype='<U37'), ' 1 Building Inspection and Enforcement  2 Conservation and Environment  3 Economic Development  4 Inland Wetlands and Natural Resources  5 Planning and Zoning  6 Shellfish Commission  7 Water Pollution Control Authority  8 Zoning Board of Appeals', '/8893608')


('a         town meeting rules', array([['', '1', 'Board of Finance'],
       ['', '2', 'Branford Cable Television'],
       ['', '3', 'Capital Projects that require Bonding'],
       ['', '4', 'Contingency'],
       ['', '5', 'Employee Group Insurance'],

=> Top 3 tables in terms of similarity:

('i      canton center', array([['',
        'The southern boundary of the district on the\nwesterly side of Route 179 is defined by the southerly boundary of\nthe Lyle Crowley property, and on the east side of Route 179 by a\nlinear extension of that line. The district extends northerly from\nthis boundary along both sides of Route 179 to Meadow Road, then westerly\nalong both sides of Meadow Road to West Road, then southerly along\nboth sides of West Road and Barbourtown Road to Route 179. The northern\nboundary of the district on the east side of Route 179 is a line extending\nperpendicular to Route 179 on the north side of the Samuel Richardson\nhouse. On the west side of West Road the northerly boundary is defined\nby the northerly boundary of the Rodney Pratt property. The district\nincludes all property within 1,000 feet of the center line of the\nspecified sections of the above roads, except as noted below.'],
       ['',
        'Annex 

 20%|██████▍                         | 20/100 [00:03<00:07, 11.33it/s]

=> Top 3 tables in terms of similarity:

('a         special acts', array([['[House Joint Resolution No. 345.]'],
       ['[43]'],
       ['CONCERNING SCHOOL PROPERTY IN TOWN OF CLINTON.'],
       ['Resolved by this Assembly: That the Town of\nClinton shall succeed to all rights which any of its school districts\nenjoyed in any real estate within said Town, at the time when said\nTown voted to consolidate its school districts.'],
       ['Approved, March 21, 1895.'],
       ['General Assembly — January, 1895.'],
       ['Special Laws — Vol. XII, p. 43.']], dtype='<U228'), '[House Joint Resolution No. 345.] [43] CONCERNING SCHOOL PROPERTY IN TOWN OF CLINTON. Resolved by this Assembly: That the Town of\nClinton shall succeed to all rights which any of its school districts\nenjoyed in any real estate within said Town, at the time when said\nTown voted to consolidate its school districts. Approved, March 21, 1895. General Assembly — January, 1895. Special Laws — Vol. XII, p. 43.', '/127306

 24%|███████▋                        | 24/100 [00:03<00:05, 15.07it/s]

=> Top 3 tables in terms of similarity:

('ix      business incentive program', array([['15', '$3M or larger investment in construction or renovations'],
       ['14', '$1M or larger investment in construction or renovations'],
       ['13',
        '$800k or larger investment in construction or renovations'],
       ['12',
        '$500k or larger investment in construction or renovations'],
       ['11',
        'A business new to that parcel investing over $300k or a business\nexisting on that parcel investing at least $250k in construction or\nrenovations'],
       ['10',
        'A business new to that parcel investing over $200k or a business\nexisting on that parcel investing at least $100,000 in construction\nor renovations']],
      dtype='<U147'), '15 $3M or larger investment in construction or renovations 14 $1M or larger investment in construction or renovations 13 $800k or larger investment in construction or renovations 12 $500k or larger investment in construction or ren

 31%|█████████▉                      | 31/100 [00:03<00:04, 16.68it/s]

=> Top 3 tables in terms of similarity:

('fees for land use application processing and review', array([['',
        'One-family and two-family residences or administrative approval',
        '$100', '+', '$60', '=', '$160'],
       ['', 'Commercial multifamily residence per dwelling unit', '$50',
        '+', 'Basic Fee and State Fee', '=', 'Variable'],
       ['', 'If applicable add:', '', '', '', '', ''],
       ['', 'Public Hearing', '$175', '+', 'Basic Fee and State Fee',
        '=', 'Variable'],
       ['', 'Wetlands Jurisdiction Review', '$75', '+',
        'Basic Fee and State Fee', '=', 'Variable'],
       ['', 'Regulation Amendment', '$175', '+',
        'Basic Fee and State Fee', '=', 'Variable'],
       ['',
        'Subdivision: For each lot with wetlands or watercourses and/or\nin which activity is proposed in a wetland/regulated area',
        '$50/lot', '+', 'Basic Fee and State Fee', '=', 'Variable'],
       ['', 'Commercial Activity', '$300', '+',
        'Basic Fee 

 34%|██████████▉                     | 34/100 [00:03<00:04, 15.35it/s]

=> Top 3 tables in terms of similarity:

('affordable housing agency regulations', array([['',
        'HUD Middletown - Middlesex County area median income = X; (X\nx .80) /12 x .30 = amount available for housing = H. H (-taxes) (-hazard\ninsurance) = amount available for mortgage payment, or A.']],
      dtype='<U191'), ' HUD Middletown - Middlesex County area median income = X; (X\nx .80) /12 x .30 = amount available for housing = H. H (-taxes) (-hazard\ninsurance) = amount available for mortgage payment, or A.', '/15732539')


('affordable housing agency regulations', array([['',
        '$320,966 for Killingworth in 2007 using Middletown - Middlesex\nCounty area'],
       ['',
        '$254,918 for Killingworth in 2007 using New Haven County area']],
      dtype='<U74'), ' $320,966 for Killingworth in 2007 using Middletown - Middlesex\nCounty area  $254,918 for Killingworth in 2007 using New Haven County area', '/15732539')


('a         special acts', array([['',
        'January

=> # of tables found: 27
=> Top 3 tables in terms of similarity:

('housing code', array([['',
        'Exception: All existing residential\nrental housing units that are the property of the State of Connecticut\nshall be exempt from this code. This exemption shall not include residential\nrental housing units owned by an entity leasing real property from\nthe State of Connecticut.']],
      dtype='<U276'), ' Exception: All existing residential\nrental housing units that are the property of the State of Connecticut\nshall be exempt from this code. This exemption shall not include residential\nrental housing units owned by an entity leasing real property from\nthe State of Connecticut.', '/11768144')


('housing code', array([['The provisions of this code shall be governed\nby the following:'],
       ['Connecticut State Building Code and Connecticut\nSupplement'],
       ['Connecticut Fire Safety Code'],
       ['Connecticut Public Health Code'],
       ['Town of Mansfield Code of Ordi

 38%|████████████▏                   | 38/100 [00:04<00:05, 12.18it/s]

=> Top 3 tables in terms of similarity:

('ii      establishment of zoning districts and provision for official zoning map', array([['R', 'Residential Zone'],
       ['DMR', 'Designed Multiple Residence'],
       ['GC', 'General Commercial Zone'],
       ['VCD', 'Village Center District'],
       ['DBIZ', 'Designed Business/Industrial Zone'],
       ['GI', 'General Industrial Zone'],
       ['DI', 'Designed Industrial Zone'],
       ['DR', 'Designed Recreation Zone'],
       ['DC', 'Designed Commercial Zone']], dtype='<U33'), 'R Residential Zone DMR Designed Multiple Residence GC General Commercial Zone VCD Village Center District DBIZ Designed Business/Industrial Zone GI General Industrial Zone DI Designed Industrial Zone DR Designed Recreation Zone DC Designed Commercial Zone', '/29252344')


('vi      residential zone regulations', array([['a.',
        'On-site individual subsurface sewage disposal system and well',
        '80,000 square feet', '200 feet'],
       ['b.',
        '

 40%|████████████▊                   | 40/100 [00:04<00:05, 10.29it/s]

=> Top 3 tables in terms of similarity:

('enterprise zone', array([['', 'First', '100%'],
       ['', 'Second', '100%'],
       ['', 'Third', '50%'],
       ['', 'Fourth', '40%'],
       ['', 'Fifth', '30%'],
       ['', 'Sixth', '20%'],
       ['', 'Seventh', '10%']], dtype='<U7'), ' First 100%  Second 100%  Third 50%  Fourth 40%  Fifth 30%  Sixth 20%  Seventh 10%', '/8364279')


('i      tax abatements  exemptions  deferrals and incentive programs', array([['Year 1',
        '5% of the deferred assessment will be assessed in addition to\nthe base assessment.'],
       ['Year 2',
        '5% of the deferred assessment will be assessed in addition to\nthe base assessment.'],
       ['Year 3',
        '10% of the deferred assessment will be assessed in addition\nto the base assessment.'],
       ['Year 4',
        '15% of the deferred assessment will be assessed in addition\nto the base assessment.'],
       ['Year 5',
        '30% of the deferred assessment will be assessed in additio

 44%|██████████████                  | 44/100 [00:04<00:04, 11.99it/s]

=> Top 3 tables in terms of similarity:

('a         conveyance ordinances', array([['HJRes. 440', '', '', '6-26-1895',
        'Union Baptist Church authorized to take gifts\nof property'],
       ['O-Q', '1/465', '1/21', '10-24-1972',
        'Leasing of residence at Camp Oakdale (repealed\nby Ord. No. O-U-1)'],
       ['O-R', '1/466', '1/21', '10-24-1972',
        'Leasing of property at 298 Norwich-New London\nTurnpike'],
       ['O-S', '1/488', '1/22', '12-11-1972',
        'Gifford property utility easement'],
       ['O-T', '1/489', '1/23', '12-11-1972',
        'Leasing or sale of Bridge Street property adjacent\nto All-Time Manufacturing Company property'],
       ['O-V', '2/22', '1/24', '7-9-1973',
        'Maple Avenue property adjacent to Fred Johnson\nproperty'],
       ['O-G-1', '3/70', '1/34', '6-14-1976',
        'Southerly side of PTA Lane'],
       ['O-U-1', '4/180', '1/46', '1-14-1980',
        'Leasing of residence at Camp Oakdale (repeals\nOrd. No. O-U and is super

 46%|██████████████▋                 | 46/100 [00:05<00:04, 11.18it/s]

=> # of tables found: 40
=> Top 3 tables in terms of similarity:

('iii      establishment of zoning districts', array([['Farming and Residential - R-1/2'],
       ['Farming and Residential - R-1'],
       ['Farming and Residential - R-2'],
       ['Farming and Residential - R-3'],
       ['Multiple Family for Elderly Housing -EH-10'],
       ['Incentive Housing Overlay Zone - IHOZ'],
       ['Business and Professional Office - BPO'],
       ['Retail Business - B-1'],
       ['General Business - B-2'],
       ['Industrial - M-1'],
       ['Planned Commercial Development - M-2A[Added 4-17-2000]'],
       ['Industrial - M-3'],
       ['Industrial - M-4'],
       ['Industrial - M-5'],
       ['Special Development District #1 - SDD-1'],
       ['Fairfield Hills Adaptive Reuse - FHAR'],
       ['Conservation and Agriculture - CA[Added 5-18-1998]'],
       ['Hawleyville Center Design District (HCDD)[Added 8-9-1999]'],
       ['Sandy Hook Design District - SHDD[Added 10-2-1995]'],
       ['Pu

 49%|███████████████▋                | 49/100 [00:05<00:04, 10.69it/s]

=> # of tables found: 70
=> Top 3 tables in terms of similarity:

('zone designations', array([['', 'AAA Residence Zones'],
       ['', 'AA Residence Zones'],
       ['', 'A Residence Zones'],
       ['', 'B Residence Zones'],
       ['', 'C Residence Zones'],
       ['', 'D Residence Zones'],
       ['', 'Executive Office Zone'],
       ['', 'Business Zones No. 1'],
       ['', 'Business Zones No. 2'],
       ['', 'Rowayton Avenue Village District'],
       ['', 'Silvermine Tavern Village District'],
       ['', 'East Avenue Village District'],
       ['', 'Golden Hill Village District'],
       ['', 'SoNo Station Design District'],
       ['', 'Neighborhood Business Zone'],
       ['', 'South Norwalk Business District'],
       ['', 'Central Business District'],
       ['', 'Marine Commercial Zone'],
       ['', 'Industrial No. 1 Zone'],
       ['', 'Light Industrial No. 2 Zone'],
       ['', 'Restricted Industrial Zone'],
       ['', 'Research and Development Zone'],
       ['', 'Is

 51%|████████████████▎               | 51/100 [00:05<00:04, 11.15it/s]

=> Top 3 tables in terms of similarity:

('v      subdivision design and construction standards', array([['', 'Arterial', '250', '10', '250'],
       ['', 'Collector', '200', '10', '200'],
       ['', 'Local residential', '155', '12', '200']], dtype='<U17'), ' Arterial 250 10 250  Collector 200 10 200  Local residential 155 12 200', '/8842073')


('vii      road criteria', array([['Collector road', '60', '8', '30'],
       ['Local residential road', '50', '4', '24'],
       ['Private residential road', '50', '4', '22']], dtype='<U24'), 'Collector road 60 8 30 Local residential road 50 4 24 Private residential road 50 4 22', '/34254317')


('a         special acts affecting old lyme', array([['The Hatchetts Improvement Company, its successors and assigns,\nis authorized to construct and maintain a sea-wall or sea-walls in front of\nand around its land situated at Hatchetts Point, at South Lyme in the town\nof Old Lyme, bounded easterly and southerly by Long Island Sound, which sea-walls

 55%|█████████████████▌              | 55/100 [00:05<00:03, 11.96it/s]

=> # of tables found: 14
=> Top 3 tables in terms of similarity:

('a         special acts', array([['An Act Incorporating the Town of Plainville',
        'Approved July 6, 1869'],
       ['An Act Validating the Vote of the Town of Plainville To Unite\nIts School Districts',
        'Approved July 18, 1872'],
       ['An Act Authorizing the Town of Plainville To Issue Sidewalk\nConstruction Bonds',
        'Special Act 29-462, approved June 18, 1929'],
       ['An Act Concerning the Vesting of Title to Certain Real Estate\nLocated in the Town of Plainville',
        'Special Act 41-19, approved March 12, 1941'],
       ['An Act Validating Certain Tax Sales and Tax Deeds in the Town\nof Plainville',
        'Special Act 41-24, approved March 12, 1941'],
       ['An Act Validating Acts and Deeds, Valid Except for Certain Irregularities\nand Omissions',
        'Special Act 51-558, approved July 9, 1951'],
       ['An Act Placing the Town Clerk of Plainville on Salary',
        'Special 

 57%|██████████████████▏             | 57/100 [00:06<00:03, 12.93it/s]

=> Top 3 tables in terms of similarity:

('sewers and sewage disposal', array([['', 'Sand/Soil', 'Ledge Rock'],
       ['Single-, 2- or 3-family structure', '100 feet',
        'Sanitary sewer exists at property line'],
       ['Structure with 4 or more family units; residential developments with\nfewer than 10 dwelling units',
        '500 feet', '250 feet'],
       ['Residential development with 10 or more dwelling units',
        '3,000 feet', '1,500 feet'],
       ['Commercial structures or properties with multiple commercial structures\nwith a combined, designed sewage flow of less than 2,000 gallons per day',
        '750 feet', '500 feet']], dtype='<U144'), ' Sand/Soil Ledge Rock Single-, 2- or 3-family structure 100 feet Sanitary sewer exists at property line Structure with 4 or more family units; residential developments with\nfewer than 10 dwelling units 500 feet 250 feet Residential development with 10 or more dwelling units 3,000 feet 1,500 feet Commercial structures or pro

 59%|██████████████████▉             | 59/100 [00:06<00:03, 12.24it/s]

=> Top 3 tables in terms of similarity:

('iii      residential districts', array([['Somers', 'Somersville'],
       ['Residential', 'Agricultural']], dtype='<U12'), 'Somers Somersville Residential Agricultural', '/8851778')


('iii      residential districts', array([['Provisions may be different in an open space subdivision approved in accordance with Section 214-6.6 of the Zoning Regulations and §\xa0213-62 of the Somers Subdivision Regulations.']],
      dtype='<U178'), 'Provisions may be different in an open space subdivision approved in accordance with Section 214-6.6 of the Zoning Regulations and §\xa0213-62 of the Somers Subdivision Regulations.', '/8851778')


('iv      business industrial districts', array([['See Section 214-4.2 for declaration of the Village Business District as a "Village District" in accordance with C.G.S. Section 8-2j.']],
      dtype='<U132'), 'See Section 214-4.2 for declaration of the Village Business District as a "Village District" in accordance with

 63%|████████████████████▏           | 63/100 [00:06<00:03, 11.71it/s]

=> Top 3 tables in terms of similarity:

('section        retail commercial districts  ca and cnc', array([['RETAIL COMMERCIAL NEIGHBORHOOD DISTRICTS CNC'],
       ['(Neighborhood Shopping Centers)']], dtype='<U44'), 'RETAIL COMMERCIAL NEIGHBORHOOD DISTRICTS CNC (Neighborhood Shopping Centers)', '/35401595')


('noise', array([['', 'Residential', '62', '55', '55', '45'],
       ['', 'Commercial', '62', '62', '55', '45'],
       ['', 'Industrial', '70', '66', '61', '51']], dtype='<U11'), ' Residential 62 55 55 45  Commercial 62 62 55 45  Industrial 70 66 61 51', '/9062023')


('section        districts', array([['One-Family Residence Districts (RS), comprising:'],
       ['RS-1 Districts'],
       ['RS-2 Districts'],
       ['RS-3 Districts'],
       ['RS-4 Districts'],
       ['Resource Conservation Districts'],
       ['Multifamily Residence Districts (RM), comprising:'],
       ['RM-1 Districts'],
       ['Limited Business Districts, comprising:'],
       ['LB Districts'],
       ['L

 65%|████████████████████▊           | 65/100 [00:06<00:03, 11.10it/s]

=> Top 3 tables in terms of similarity:

('building construction', array([['Building, plumbing, mechanical, electrical and\ndemolition value',
        ''],
       ['0 to $2,000', '$28'],
       ['Over $2,000', '$14 per thousand or any part thereof'],
       ['Late fee', '$50 (for work started prior to permit)'],
       ['All certificates of occupancies, Housing Code certificates of\ncompliance and all reinspections',
        '$10 per unit']], dtype='<U94'), 'Building, plumbing, mechanical, electrical and\ndemolition value  0 to $2,000 $28 Over $2,000 $14 per thousand or any part thereof Late fee $50 (for work started prior to permit) All certificates of occupancies, Housing Code certificates of\ncompliance and all reinspections $10 per unit', '/8856019')


       ['', 'The accumulation of three or more unsatisfied'],
       ['', 'municipal vehicle parking citations upon any one'],
       ['', 'vehicle may result in the impoundment or'],
       ['', 'immobilization of said vehicle upon 

 69%|██████████████████████          | 69/100 [00:07<00:02, 12.63it/s]

=> Top 3 tables in terms of similarity:

('a         special acts', array([['21-482', '6-24-1921',
        'An Act Dividing the Town of Orange and Creating the Town of\nWest Haven'],
       ['23-21', '3-9-1923',
        'An Act Authorizing the Board of Selectmen of the Town of West\nHaven to Regulate the Construction of Marquees in the Town of West\nHaven'],
       ['23-22', '3-9-1923',
        'An Act Concerning the Board of Finance in the Town of West Haven'],
       ['23-23', '3-9-1923',
        'An Act Concerning the Board of Police Commissioners of the Town\nof West Haven'],
       ['25-205', '4-23-1925',
        'An Act Concerning the Board of Finance of the Town of West Haven'],
       ['29-273', '6-12-1929',
        'An Act Authorizing the Town of West Haven to Issue Bonds for\nPermanent Road Improvements'],
       ['31-136', '4-1-1931',
        'An Act Defining the Boundary Lines of the First Taxation District\nof the Town of West Haven'],
       ['31-137', '4-1-1931',
       

 71%|██████████████████████▋         | 71/100 [00:07<00:02, 13.66it/s]

=> Top 3 tables in terms of similarity:

('going out of business sales', array([['For a license period not exceeding\n15 days: $25.'],
       ['For a license period not exceeding\n30 days: $50.'],
       ['For a license period not exceeding\n60 days: $75.'],
       ['For an extension after the original\nlicense period, $3 per day and, in each instance, a further fee of\n$1 per $1,000 of the cost of the property to be sold under such license\nas set forth in the inventory attached to the application for such\nlicense. In the case of application by the licensee for a further\nextension of the license period within the maximum sixty-day license\nperiod, the applicant shall pay an additional license fee equal to\nthe difference between the fee for the total period of the license\nand the fee paid for the original license period.']],
      dtype='<U560'), 'For a license period not exceeding\n15 days: $25. For a license period not exceeding\n30 days: $50. For a license period not exceeding\n

 77%|████████████████████████▋       | 77/100 [00:07<00:01, 19.65it/s]

=> Top 3 tables in terms of similarity:

('vi      districts established', array([['R-1', 'Residential District'],
       ['R-1A', 'Boardwalk Residential District'],
       ['R-1B', 'Sea Villas Residential District'],
       ['R-2', 'Residential District'],
       ['C-1', 'Central Commercial District'],
       ['C-2', 'Neighborhood Commercial District'],
       ['CL-1', 'Commercial Lodging District'],
       ['MORE',
        'Municipal, Open Space, Recreation Facilities\nand Educational District']],
      dtype='<U69'), 'R-1 Residential District R-1A Boardwalk Residential District R-1B Sea Villas Residential District R-2 Residential District C-1 Central Commercial District C-2 Neighborhood Commercial District CL-1 Commercial Lodging District MORE Municipal, Open Space, Recreation Facilities\nand Educational District', '/11903136')


('xv      off street parking and loading', array([['', 'All commercial uses (except commercial lodging\nrooms)',
        '1,000 to 19,999', '1'],
       ['

 79%|█████████████████████████▎      | 79/100 [00:07<00:01, 12.20it/s]

=> Top 3 tables in terms of similarity:

('ix      overlay zoning districts', array([['', 'Any use permitted in the underlying zone.'],
       ['', 'Beekeeping.'],
       ['',
        'Cultivation for sale of agricultural products\ngrown to provide food, forage, or fibers.'],
       ['',
        'Horticultural and floricultural activities,\nsuch as flowers, shrubs, and trees intended for ornamental or landscaping\npurposes, including wholesale or retail nurseries limited to the sale\nof products grown on-site and in greenhouses on-site.'],
       ['', 'Mechanical repair facilities for agricultural\nequipment.'],
       ['', 'Orchards.'],
       ['',
        'Packing or processing of agricultural crops,\nanimals, and their by-products that entails more than picking, cutting,\nsorting and boxing or crating, but does not include rendering, tanning\nor reduction of meat.'],
       ['',
        'Roadside stands for the sale of agricultural\nproducts, provided adequate off-street parking sha

 81%|█████████████████████████▉      | 81/100 [00:08<00:01, 10.44it/s]

=> Top 3 tables in terms of similarity:

('enforcement and control of vehicles and traffic', array([['Section 1.', 'Definitions'],
       ['Section 3.',
        'Department of Public Transportation and Department of Safety\nand Homeland Security'],
       ['Section 4.', 'Reciprocal Agreements'],
       ['Section 5.',
        'Marking of Highways and Erection of Traffic Signals and Other\nSigns'],
       ['Section 7.', 'Enforcement; Arrest, Bail and Appeal'],
       ['Section 8.', 'Provisions Regarding Civil Traffic Offenses'],
       ['Section 21.', 'Registration of Vehicles'],
       ['Section 23.', 'Title and Liens'],
       ['Section 25.', 'Transfer of Title, Registration and Liens'],
       ['Section 26.', 'Uniform Commercial Driver License Act'],
       ['Section 27.', "Driver's License"],
       ['Section 28.', 'Habitual Offenders'],
       ['Section 29.', 'Motor Vehicle Safety--Responsibility'],
       ['Section 31.', 'Nondriver Identification Cards'],
       ['Section 41.', 'Ru

 83%|██████████████████████████▌     | 83/100 [00:08<00:01, 10.90it/s]

=> Top 3 tables in terms of similarity:

('zoning', array([['', 'Single-family dwelling or townhouse', '2 per dwelling', ''],
       ['', 'Church or temple, auditorium or place of assembly',
        '1 per 5 seats or bench seating spaces',
        '(Seats in main auditorium only)'],
       ['',
        'Public buildings such as public library, museum, art gallery\nor community center',
        '10 per use',
        'Plus 1 additional space for each 300 square feet of floor area'],
       ['', 'Motel/hotel', '1 per sleeping unit',
        'Plus 1 for each 10 rooms or portion thereof'],
       ['', 'Professional services building',
        '1 per 200 square feet of floor area',
        '6 minimum, 10 minimum for a clinic'],
       ['', 'Restaurant or eatery',
        '1 per 100 square feet of total patron area', ''],
       ['', 'Retail store', '1 per 300 square feet of floor area', '']],
      dtype='<U80'), ' Single-family dwelling or townhouse 2 per dwelling   Church or temple, audito

 85%|███████████████████████████▏    | 85/100 [00:08<00:01, 10.57it/s]

=> Top 3 tables in terms of similarity:

('ii      zoning classifications', array([['Single-Family Residential Zone (R-1)'],
       ['Duplex Residential Zone (R-2)'],
       ['Townhouse Residential Zone (R-3)'],
       ['Multifamily Residential Zone (R-4)'],
       ['Neighborhood Commercial Zone (C-1)'],
       ['Central Commercial Zone (C-2)'],
       ['Service Commercial Zone (C-3)'],
       ['Manufacturing Zone (M)'],
       ['Industrial Park Manufacturing Zone (IMP)'],
       ['Traditional Neighborhood Development Zone (TND)']], dtype='<U47'), 'Single-Family Residential Zone (R-1) Duplex Residential Zone (R-2) Townhouse Residential Zone (R-3) Multifamily Residential Zone (R-4) Neighborhood Commercial Zone (C-1) Central Commercial Zone (C-2) Service Commercial Zone (C-3) Manufacturing Zone (M) Industrial Park Manufacturing Zone (IMP) Traditional Neighborhood Development Zone (TND)', '/10132523')


('xv      parking and access', array([['Residential', '60%', '100%', '100%', '80%', '1

 87%|███████████████████████████▊    | 87/100 [00:08<00:01,  7.75it/s]

=> Top 3 tables in terms of similarity:

('iii      districts and district maps', array([['AC', 'Agricultural Conservation'],
       ['AR', 'Agricultural Residential'],
       ['RS1', 'Single-family Residential'],
       ['RS5', 'Medium-Density Residential'],
       ['RMH', 'Residential Manufactured Home'],
       ['RM', 'Multifamily Residential'],
       ['BN', 'Neighborhood Business'],
       ['BC', 'Business Complex'],
       ['BG', 'General Business'],
       ['IL', 'Limited Industrial'],
       ['IG', 'General Industrial']], dtype='<U29'), 'AC Agricultural Conservation AR Agricultural Residential RS1 Single-family Residential RS5 Medium-Density Residential RMH Residential Manufactured Home RM Multifamily Residential BN Neighborhood Business BC Business Complex BG General Business IL Limited Industrial IG General Industrial', '/7602290')


('vi      ar   agricultural residential district', array([['Airfields[1]'],
       ['Animal hospital'],
       ['Auction facilities'],
       ['

 89%|████████████████████████████▍   | 89/100 [00:09<00:01,  7.31it/s]

=> Top 3 tables in terms of similarity:

('iv      improvement specifications and design standards for major subdivisions', array([['Residential', '25', '150', '10%', '150', '26', '12']],
      dtype='<U11'), 'Residential 25 150 10% 150 26 12', '/7033895')


('iv      improvement specifications and design standards for major subdivisions', array([['', 'Main thoroughfares', '80 to 100', '44', 'Yes'],
       ['', 'Secondary streets', '60 to 80', '36 to 40', 'Yes'],
       ['', 'Residential streets and group housing and apartments', '50',
        '32', 'Yes'],
       ['', 'Residential streets', '50', '32', 'Yes']], dtype='<U52'), ' Main thoroughfares 80 to 100 44 Yes  Secondary streets 60 to 80 36 to 40 Yes  Residential streets and group housing and apartments 50 32 Yes  Residential streets 50 32 Yes', '/7033895')


('vii      off street parking and loading', array([['',
        'Retail store, department store, eating and drinking establishment,\nwholesale establishment, warehouse, genera

 92%|█████████████████████████████▍  | 92/100 [00:09<00:00,  9.18it/s]

=> Top 3 tables in terms of similarity:

('iii      zoning districts and zoning map', array([['MR', 'Medium-Density Residential'],
       ['HR', 'High-Density Residential'],
       ['MH', 'Mobile Home Park'],
       ['UB', 'Urban Business'],
       ['C', 'Commercial'],
       ['HC', 'Highway Commercial'],
       ['LI', 'Light Industry'],
       ['RPC', 'Residential Planned Community'],
       ['PC', 'Planned Commercial'],
       ['OS', 'Open Space'],
       ['WP', 'Wellhead Protection Overlay'],
       ['TND', 'Traditional Neighborhood Development']], dtype='<U36'), 'MR Medium-Density Residential HR High-Density Residential MH Mobile Home Park UB Urban Business C Commercial HC Highway Commercial LI Light Industry RPC Residential Planned Community PC Planned Commercial OS Open Space WP Wellhead Protection Overlay TND Traditional Neighborhood Development', '/8028074')


('subdivision of land', array([['', 'Medium-Density Residential (MR)', '15.1 - 25', '10%'],
       ['', '', '25.1 or mo

 94%|██████████████████████████████  | 94/100 [00:09<00:00, 10.04it/s]

=> # of tables found: 46
=> Top 3 tables in terms of similarity:

('vii      planned community zoning districts', array([['1.',
        'Gross tract area means the total area of land contained in the\npetition for reclassification to a planned community district.'],
       ['2.',
        'Net tract area means the area of land from which the minimum\ncommon open space has been subtracted.']],
      dtype='<U125'), '1. Gross tract area means the total area of land contained in the\npetition for reclassification to a planned community district. 2. Net tract area means the area of land from which the minimum\ncommon open space has been subtracted.', '/15703962')


('vii      planned community zoning districts', array([['', 'Dimensional standards',
        'May be established by ordinance enacted by the Town Council\nat the time a property is placed in a planned community zoning district'],
       ['',
        'Dimensional standards not specified by ordinance enacted at\nthe time a property

 96%|██████████████████████████████▋ | 96/100 [00:09<00:00,  7.95it/s]

=> # of tables found: 59
=> Top 3 tables in terms of similarity:

('ii      establishment of districts', array([['R-1 Residential District'],
       ['R-2 Residential District'],
       ['R-3 Residential District'],
       ['R-4 Residential District[Added 5-1-2006]'],
       ['MR Multifamily Residential District'],
       ['DR Duplex Residential District'],
       ['MH Mobile Home Park Residential District'],
       ['HR Historic Residential District'],
       ['HB Historic Business District'],
       ['GC General Commercial District'],
       ['IP Industrial Park District'],
       ['RPC Residential Planned Community District[Added 12-3-2012]']],
      dtype='<U59'), 'R-1 Residential District R-2 Residential District R-3 Residential District R-4 Residential District[Added 5-1-2006] MR Multifamily Residential District DR Duplex Residential District MH Mobile Home Park Residential District HR Historic Residential District HB Historic Business District GC General Commercial District IP I

=> # of tables found: 180


 98%|███████████████████████████████▎| 98/100 [00:10<00:00,  5.81it/s]

=> Top 3 tables in terms of similarity:

('ii      establishment of districts  maps', array([['AR-1', 'Agricultural Residential District'],
       ['AR-2', 'Agricultural Residential District'],
       ['MR', 'Medium-Density Residential District'],
       ['GR', 'General Residential District'],
       ['HR-1', 'High-Density Residential District'],
       ['HR-2', 'High-Density Residential District'],
       ['UR', 'Urban Residential District'],
       ['RPC', 'Residential Planned Community District'],
       ['VRP', 'Vacation-Retirement-Residential-Park District']],
      dtype='<U45'), 'AR-1 Agricultural Residential District AR-2 Agricultural Residential District MR Medium-Density Residential District GR General Residential District HR-1 High-Density Residential District HR-2 High-Density Residential District UR Urban Residential District RPC Residential Planned Community District VRP Vacation-Retirement-Residential-Park District', '/8883884')


('ii      establishment of districts  ma

100%|███████████████████████████████| 100/100 [00:10<00:00,  9.18it/s]

=> Top 3 tables in terms of similarity:

('iii      zoning districts', array([['Land Use Category', 'Zoning District'],
       ['Residential Low', 'RS-70 Single Family Residential'],
       ['Residential Low', 'RS-50 Single Family Residential'],
       ['Residential Low', 'RS-35 Single Family Residential'],
       ['Residential Low Medium', 'RM-10 Residential Multifamily'],
       ['Residential Medium', 'RM-12.5 Residential Multifamily'],
       ['Residential Medium', 'RM-15 Residential Multifamily'],
       ['Residential Medium', 'MH-13.5 Mobile Home Residential'],
       ['Residential High', 'RM-15.5 Residential Multifamily'],
       ['Commercial General', 'Commercial General FAR 0.55'],
       ['Commercial water-dependent', 'Commercial General FAR 0.50'],
       ['Institutional', 'Commercial General FAR 0.65'],
       ['Institutional-public', 'Commercial General FAR 0.65'],
       ['Preservation', 'Aquatic Lands, Preservation'],
       ['Rec/Open Space',
        'All districts excep




In [None]:
# Load each of the 100 scraped pickle files (0.pkl .. 99.pkl) in turn.
# `ts` is rebound on every iteration, so only the last loaded object is
# still referenced once the loop finishes.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this against trusted, locally produced files.
for i in tqdm(range(100)):
    # A context manager closes each file handle as soon as it has been read,
    # instead of leaving 100 descriptors open until garbage collection.
    with open('../data/scrapped/' + str(i) + '.pkl', 'rb') as pkl_file:
        ts = pickle.load(pkl_file)

In [None]:
# 1. Filter out tables with the wrong format.
# 2. Save all the tables to a local file format.
# 3. Change the scoring function.