In [1]:
import requests
import re
from bs4 import BeautifulSoup
from mechanize import Browser
import pandas as pd
from siuba import *
import usaddress
from datetime import datetime

# Scrape public notices

In [2]:
# Entry point of the scrape: the notice search page that the browser loads first.
url = "http://ny.mypublicnotices.com/PublicNotice.asp"

# Create a Browser instance to interact with the webpage
br = Browser()
br.set_handle_robots(False)  # Ignore robots.txt
# Load the search page. As the last expression in the cell, the response
# object's repr is echoed as the cell output; the later cells keep working
# through this same `br` object.
br.open(url)

<response_seek_wrapper at 0x11814ef50 whose wrapped object = <closeable_response at 0x117d80610 whose fp = <_io.BufferedReader name=70>>>

In [3]:
# Make the form at index nr=1 the browser's active form; the next cell fills
# in its fields and submits it. The index is positional, so it has to be
# re-checked if the page layout changes.
br.select_form(nr=1)
# Debugging aid for finding the right index -- list every form on the page:
# for f in br.forms():
#     print(f)

In [4]:
# Fill in the search form selected in the previous cell.
# Set the Date Range input to "Last 7 days"
br.form["DateRange"] = ["Last7"]
# Set the Category input to "Auction and Sale"
br.form["Category"] = ["1"]
# Submit the form; `response` holds the results page returned by the submit.
response = br.submit()

In [5]:
# Find the form to change to full notices on the resulting page
# (positional index nr=3 on the results page).
br.select_form(nr=3)

# Set the input to 100 complete notices
br["FullTextType"] = ["0"]
br["Count"] = ["100"]
# Re-submit; `response` is overwritten with the re-rendered results page.
response = br.submit()

In [6]:
# Parse the resulting page using BeautifulSoup
# (`response` is the result of the last br.submit(); "html.parser" selects
# the parser by name).
soup = BeautifulSoup(response.read(), "html.parser")

In [43]:
# Build one record per notice row found in the second "BorderedTable" of the
# results page: the row's full text plus an absolute URL made from the first
# link in that row.
table = soup.select("table.BorderedTable")[1]
all_rows = []
for child in table.children:
    # table.children also yields bare text/comment nodes. Those are str
    # subclasses, so .find("a") on them is str.find and returns an int offset
    # (not a tag); indexing that int with ["href"] would raise TypeError.
    # Only real tags carry a non-None .name, so skip everything else.
    if getattr(child, "name", None) is None:
        continue
    first_anchor = child.find("a")
    # Rows without any link (headers, spacers) are not notices.
    if first_anchor is None:
        continue
    href = first_anchor.get("href")
    # An anchor without an href cannot be turned into a notice URL.
    if href is None:
        continue
    all_rows.append({
        "text": child.get_text(),
        "url": f"http://ny.mypublicnotices.com{href}"
    })
# Explicit columns keep `text`/`url` present even when no rows were found, so
# the downstream cells fail on "no data" rather than on a missing column.
raw_text_df = pd.DataFrame(all_rows, columns=["text", "url"])

In [45]:
# Parse addresses

# text = df['text'].iloc[1]

def clean_address_tuple(t):
    """Strip stray legal-notice words from a parsed (token, label) pair.

    ``t`` is indexed as ``t[0]`` (the token text) and ``t[1]`` (its label).
    Every occurrence of 'Plaintiff' and then 'Dated' is removed from the token
    text; the label is passed through untouched.

    Returns a new ``(cleaned_token, label)`` tuple.
    """
    token, label = t[0], t[1]
    for noise_word in ('Plaintiff', 'Dated'):
        token = token.replace(noise_word, '')
    return (token, label)


def parse_addresses(text):
    """Extract candidate street addresses from free-form notice text.

    The text is tokenised with ``usaddress.parse``. Only tokens whose label is
    one of the address component types listed below are kept, each cleaned by
    ``clean_address_tuple``. Every 'AddressNumber' token opens a new address;
    the kept tokens that follow it, up to the next 'AddressNumber', belong to
    that address. Kept tokens that appear before the first 'AddressNumber' are
    discarded.

    Returns a list of address strings, each built by joining its tokens with a
    single space (an empty list when no 'AddressNumber' token is found).
    """
    wanted_labels = [
        'AddressNumber', 'StreetName', 'StreetNamePostType', 'PlaceName', 'StateName', 'ZipCode'
    ]

    # One inner list of token strings per address, in order of appearance.
    grouped_tokens = []
    for raw_field in usaddress.parse(text):
        if raw_field[1] not in wanted_labels:
            continue
        value, label = clean_address_tuple(raw_field)
        if label == 'AddressNumber':
            # A house number starts a fresh address.
            grouped_tokens.append([value])
        elif grouped_tokens:
            # Continuation of the address currently being built.
            grouped_tokens[-1].append(value)
        # else: token precedes any house number -> not part of an address.

    return [' '.join(parts) for parts in grouped_tokens]

# TODO: Filter out all the known courthouse addresses (count the addresses once they're parsed)
# TODO: Strip symbols from the end of addresses
# TODO: Parse the date(s)



In [47]:
# County <> Newspaper mapping
# `county_dict` is a list of {'county', 'newspaper'} records (one per
# newspaper); `counties` is the same data as a DataFrame, used to look up a
# notice's county from the newspaper it appeared in.
county_dict = [
    {'county': county, 'newspaper': newspaper}
    for county, newspaper in (
        ('Orange', 'Cornwall Local, The'),
        ('Orange', 'Mid Hudson Times'),
        ('Orange', 'News of the Highlands'),
        ('Orange', 'Southern Ulster Times'),
        ('Ulster', 'Daily Freeman'),
        # Note: Greene county has no newspapers listed on MyPublicNotices...
        ('Greene', 'Catskill Daily Mail'),
        ('Greene', 'Greene County Daily World'),
        ('Greene', 'The Register Star'),
        ('Greene', 'Register Star, The'),
        ('Greene', 'Windham Journal'),
        ('Greene', 'Greene County Record'),
        ('Unknown', 'Newsday'),
    )
]

counties = pd.DataFrame(county_dict)

In [50]:
# Narrow the scraped rows to notices that mention premises and derive the
# fields used downstream:
#   addresses - list of address strings found in the notice text
#   newspaper - publication name taken from the "Appeared in: <name> on ..." line
#   dates_raw - the unparsed date text that follows " on " on that same line
# reset_index() turns the old row labels into an `index` column.
df1 = (
    raw_text_df
    >> filter(
        # Drop vehicle and storage notices; keep rows mentioning "premise".
        ~_.text.str.contains('vehicle', case=False),
        ~_.text.str.contains('storage', case=False),
        _.text.str.contains('premise', case=False)
    )
    >> mutate(
        addresses=_.text.apply(parse_addresses),
        # expand=False makes str.extract return a Series for the single
        # capture group instead of a one-column DataFrame, which is the shape
        # a new column should have.
        newspaper=_.text.str.extract(r'Appeared in: (.+) on', expand=False),
        dates_raw=_.text.str.extract(r'Appeared in: .+ on (.+)', expand=False)
    )
).reset_index()

In [54]:
# Spread each notice's address list into positional columns (0, 1, 2, ...);
# shorter lists are padded with missing values.
addresses_only_df = pd.DataFrame(df1['addresses'].to_list())

# Reshape to one row per (notice, address), attach the county via the
# newspaper name, and drop rows that are not usable property addresses.
addresses_df = (
    pd.concat([df1, addresses_only_df], axis=1)
    >> select(-_.index)
    # Melt the positional address columns into (address_index, address) pairs.
    >> gather('address_index', 'address', -_.text, -_.addresses, -_.newspaper, -_.url, -_.dates_raw)
    >> filter(_.address.notna())
    >> mutate(address=_.address.str.replace('\n', ''))
    >> left_join(_, counties, on="newspaper")
    >> filter(
        # Addresses must contain a run of at least 3 consecutive letters
        _.address.str.contains('[a-zA-Z]{3,}'),
        # Remove courthouses: drop an address only when it matches BOTH the
        # courthouse street address and its town. Negating each part
        # separately (~street & ~town) would also discard every property
        # located in Kingston or Wampsville.
        ~(_.address.str.contains('285 Wall Street') & _.address.str.contains('Kingston')),
        ~(_.address.str.contains('138 Court Street') & _.address.str.contains('Wampsville')),
        # Only keep newspapers present in `counties`; the left join leaves
        # `county` missing for any other newspaper.
        _.county.notna()
    )
    >> arrange(_.text)
)
addresses_df.head()

Unnamed: 0,text,url,addresses,newspaper,dates_raw,address_index,address,county
65,\n\n\n\n\nLEGAL NOITCE SUPREME COURT OF THE ST...,http://ny.mypublicnotices.com/Link.asp?ID=AD05...,"[285 Wall Street, Kingston, New York 12401,, 4...",Daily Freeman,"04/16/2023, 04/23/2023 and 04/30/2023",1,"46 Butterville Road, NewPaltz, New York 12561.",Ulster
58,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,http://ny.mypublicnotices.com/Link.asp?ID=AD05...,"[285 Wall Street, Kingston NY 12401, 18 Villag...",Daily Freeman,04/25/2023 and 05/02/2023,1,"18 Village Drive, Saugerties, NY 12477.",Ulster
91,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,http://ny.mypublicnotices.com/Link.asp?ID=AD05...,"[285 Wall Street, Kingston NY 12401, 18 Villag...",Daily Freeman,04/25/2023 and 05/02/2023,2,"1775 Wehrle Drive Williamsville, NY 14221",Ulster
50,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,http://ny.mypublicnotices.com/Link.asp?ID=AD05...,"[285 Wall Street, Kingston NY 12401, 675 Platt...",Daily Freeman,"04/12/2023, 04/19/2023, 04/26/2023 and 05/03/2023",1,"675 Plattekill Ardonia Road, Plattekill, NY 12...",Ulster
83,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,http://ny.mypublicnotices.com/Link.asp?ID=AD05...,"[285 Wall Street, Kingston NY 12401, 675 Platt...",Daily Freeman,"04/12/2023, 04/19/2023, 04/26/2023 and 05/03/2023",2,"53 Gibson Street Bay Shore, NY 11706",Ulster


In [47]:
# Save the address table as a CSV on the Desktop; the file name is prefixed
# with today's date formatted as YYYY-MM-DD.
date_stamp = datetime.today().strftime('%Y-%m-%d')
filename = "~/Desktop/{}_auction_listings_search.csv".format(date_stamp)
addresses_df.to_csv(filename)