In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib3
import warnings
from dataclasses import dataclass, field, asdict
import uuid
from datetime import datetime

# Silence urllib3's InsecureRequestWarning; requests in this notebook are made
# with verify=False, which would otherwise emit it on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Hide UserWarning messages raised from the bs4 module.
warnings.filterwarnings("ignore", category=UserWarning, module="bs4")

In [7]:
# Module-level lookup from the `id` attribute of a <span> on the parcel page
# to the field name used for that value in this notebook.
# NOTE(review): load_property_dict() builds its own local mapping with extra
# entries (certificate, book, page, ...); this module-level copy is the one
# read by the span-listing debug cell further down.
property_tag_mapping = {
    "MainContent_lblPid": "pid",
    "MainContent_lblAcctNum": "account_number",
    "lblTownName": "town_name",
    "MainContent_lblLocation": "address",
    "MainContent_lblGenOwner": "owner",
    "MainContent_lblAddr1": "owner_address",
    "MainContent_lblCoOwner": "co_owner",
    "MainContent_lblPrice": "sale_price",
    "MainContent_lblSaleDate": "sale_date",
    "MainContent_lblBp": "book_page",
    "MainContent_lblInstrument": "label_instrument",
    "MainContent_lblGenAssessment": "assesment_value",
    "MainContent_lblGenAppraisal": "appraisal_value",
    "MainContent_lblBldCount": "building_count",
    "MainContent_lblUseCodeDescription": "building_use",
    "MainContent_lblAltApproved": "land_alt_approved",
    "MainContent_lblUseCode": "land_use_code",
    "MainContent_lblZone": "land_zone",
    "MainContent_lblNbhd": "land_neighborhood_code",
    "MainContent_lblLndAcres": "land_size_acres",
    "MainContent_lblLndFront": "land_frontage",
    "MainContent_lblDepth": "land_depth",
    "MainContent_lblLndAsmt": "land_assessed_value",
    "MainContent_lblLndAppr": "land_appraised_value"
}

In [8]:
# Base URL of a parcel page; get_soup() appends the parcel id (pid) to it.
# Plain string literal: there are no placeholders, so no f-prefix is needed.
new_haven_url = "https://gis.vgsi.com/NewHavenCT/Parcel.aspx?pid="


def get_soup(url, pid, timeout=30):
    """Download one page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Base URL; ``pid`` is appended to it verbatim.
    pid : int or str
        Identifier appended to ``url`` (converted with ``str``).
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``. Without one a stalled
        server would block the kernel indefinitely. Defaults to 30.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails (connection error, timeout, ...).
    """
    # verify=False disables TLS certificate verification for this request.
    # NOTE(review): the HTTP status code is not checked, so an error page is
    # parsed like any other response.
    page = requests.get(f"{url}{pid}", verify=False, timeout=timeout)
    return BeautifulSoup(page.content, "html.parser")

In [9]:
# Fetch and parse the page for pid 300; the cells below use this `soup`.
soup = get_soup(new_haven_url, 300)

In [35]:
def load_property_dict(soup):
    """Collect known field values from the ``<span>`` elements of a parsed page.

    Every ``<span>`` returned by ``soup.find_all('span')`` is inspected. When
    its ``id`` is one of the known ids below, the span's text (whitespace
    stripped, inner pieces joined with a single space) is stored under the
    corresponding field name. Spans without an ``id`` or with an unknown
    ``id`` are skipped. If the same id occurs more than once, the last
    occurrence wins.

    Parameters
    ----------
    soup
        Parsed document exposing ``find_all``.

    Returns
    -------
    dict
        Field name -> text, in order of first appearance in the document.
    """
    # span id on the page -> field name in the result
    span_id_to_field = {
        "MainContent_lblPid": "pid",
        "MainContent_lblAcctNum": "account_number",
        "lblTownName": "town_name",
        "MainContent_lblLocation": "address",
        "MainContent_lblGenOwner": "owner",
        "MainContent_lblAddr1": "owner_address",
        "MainContent_lblCoOwner": "co_owner",
        "MainContent_lblPrice": "sale_price",
        "MainContent_lblCertificate": "certificate",
        "MainContent_lblSaleDate": "sale_date",
        "MainContent_lblBp": "book_page",
        "MainContent_lblBookLabel": "book_label",
        "MainContent_lblBook": "book",
        "MainContent_lblPageLabel": "page_label",
        "MainContent_lblPage": "page",
        "MainContent_lblInstrument": "label_instrument",
        "MainContent_lblGenAssessment": "assesment_value",
        "MainContent_lblGenAppraisal": "appraisal_value",
        "MainContent_lblBldCount": "building_count",
        "MainContent_lblUseCodeDescription": "building_use",
        "MainContent_lblAltApproved": "land_alt_approved",
        "MainContent_lblUseCode": "land_use_code",
        "MainContent_lblZone": "land_zone",
        "MainContent_lblNbhd": "land_neighborhood_code",
        "MainContent_lblLndAcres": "land_size_acres",
        "MainContent_lblLndFront": "land_frontage",
        "MainContent_lblDepth": "land_depth",
        "MainContent_lblLndAsmt": "land_assessed_value",
        "MainContent_lblLndAppr": "land_appraised_value",
    }

    fields = {}
    for span in soup.find_all('span'):
        span_id = span.get('id')
        # Covers both "span has no id" (None) and "id is not one we track".
        if span_id not in span_id_to_field:
            continue
        field_name = span_id_to_field[span_id]
        fields[field_name] = span.get_text(separator=' ', strip=True)

    return fields

In [36]:
# Extract the mapped fields from the sample page; the bare name on the last
# line makes the resulting dict the cell's displayed output.
property_dict = load_property_dict(soup)
property_dict

{'town_name': 'New Haven, CT',
 'address': '110 ELIZABETH ANN DR',
 'account_number': '022 0928 03700',
 'owner': 'DEITCH JOSHUA',
 'assesment_value': '$172,270',
 'appraisal_value': '$246,100',
 'pid': '300',
 'building_count': '1',
 'co_owner': '',
 'owner_address': '110 ELIZABETH ANN DR NEW HAVEN, CT 06512',
 'sale_price': '$252,000',
 'certificate': '',
 'book_page': '9597/0069',
 'book_label': '',
 'book': '',
 'page_label': '',
 'page': '',
 'sale_date': '07/12/2017',
 'label_instrument': '00',
 'land_use_code': '1040',
 'building_use': 'Two Family',
 'land_zone': 'RM1',
 'land_neighborhood_code': '0300',
 'land_alt_approved': 'No',
 'land_size_acres': '0.2',
 'land_frontage': '69',
 'land_depth': '124',
 'land_assessed_value': '$54,320',
 'land_appraised_value': '$77,600'}

In [39]:
# Debug listing: for every <span> that has an id, print
# "<field name> (<span id>): <text>". The field name comes from the
# module-level property_tag_mapping; ids it does not contain are shown as 'DNE'.
for span in soup.find_all('span'):
    span_id = span.get('id')
    if span_id is None:
        # A span without an id cannot be looked up or labelled; skip it.
        continue
    # Explicit lookups instead of bare `except:` clauses, which would also
    # swallow KeyboardInterrupt and hide genuine errors.
    field_name = property_tag_mapping.get(span_id, 'DNE')
    print(f"{field_name} ({span_id}): {span.get_text(separator=' ', strip=True)}")

town_name (lblTownName): New Haven, CT
DNE (legend): 
DNE (MainContent_lblTab1Title): 110 ELIZABETH ANN DR
DNE (printButton): Print
DNE (showMapLinks): Map
                                It
address (MainContent_lblLocation): 110 ELIZABETH ANN DR
DNE (MainContent_lblMbluLabel): Mblu
DNE (MainContent_lblMblu): 022/  0928/  03700/  /
DNE (MainContent_lblAcctNumLabel): Acct#
account_number (MainContent_lblAcctNum): 022 0928 03700
owner (MainContent_lblGenOwner): DEITCH JOSHUA
DNE (MainContent_lblGenAssessmentLabel): Assessment
assesment_value (MainContent_lblGenAssessment): $172,270
DNE (MainContent_lblGenAppraisalLabel): Appraisal
appraisal_value (MainContent_lblGenAppraisal): $246,100
DNE (MainContent_lblPidLabel): PID
pid (MainContent_lblPid): 300
building_count (MainContent_lblBldCount): 1
DNE (MainContent_lblDummyToPlaceSmartLinkInTheRightColumn): 
DNE (MainContent_lblDummyToPlaceSmartLinkInTheRightColumnText): 
DNE (MainContent_lblTab2Title): Owner of Record
DNE (MainContent_lblOwne

In [105]:
# Look up the sales grid and its rows once, then reuse them for both listings.
sales_table = soup.find('table', id="MainContent_grdSales")
sales_rows = sales_table.find_all('tr')

# Column headers, normalised to snake_case ('&' spelled out as 'and').
for row in sales_rows:
    for header_cell in row.find_all('th'):
        header_text = header_cell.get_text(separator = ' ', strip = True)
        print(header_text.replace('&', 'and').lower().replace(' ', '_'))

# Cell values of the row at index 1, lower-cased.
for cell in sales_rows[1].find_all('td'):
    cell_text = cell.get_text(separator = ' ', strip = True)
    print(cell_text.replace('&', 'and').lower())

owner
sale_price
certificate
book_and_page
instrument
sale_date
deitch joshua
$252,000

9597/0069
00
07/12/2017


In [113]:
# Display the raw markup of the sales grid's <tr> at index 1 to inspect its
# <td> cells.
soup.find('table', id="MainContent_grdSales").find_all('tr')[1]

<tr class="RowStyle">
<td>DEITCH JOSHUA</td><td align="right">$252,000</td><td align="center"> </td><td align="center">9597/0069</td><td align="center">00</td><td align="center">07/12/2017</td>
</tr>

In [107]:
# NOTE(review): `table_tag` is not assigned in any cell visible here, so this
# cell appears to rely on leftover kernel state and would raise NameError on a
# fresh Restart & Run All unless it is defined elsewhere — confirm or remove.
table_tag.format("tag2")

'tag'

In [108]:
# Scratch value used by the truthiness check in the next cell.
h = None

In [109]:
# Scratch truthiness check: the message is printed only when `h` is truthy,
# so with `h = None` this cell produces no output.
if h:
    print("dumbass")