In [45]:
import requests
import re
from bs4 import BeautifulSoup
from mechanize import Browser
import pandas as pd
from siuba import *
import usaddress
from datetime import datetime

In [2]:
url = "http://ny.mypublicnotices.com/PublicNotice.asp"

# Create a Browser instance to interact with the webpage
br = Browser()
br.set_handle_robots(False)  # Ignore robots.txt
br.open(url)

<response_seek_wrapper at 0x1112dff90 whose wrapped object = <closeable_response at 0x11c8d51d0 whose fp = <_io.BufferedReader name=80>>>

In [3]:
# Find the form on the page
br.select_form(nr=1)
# for f in br.forms():
#     print(f)

In [4]:
# Set the Date Range input to "Last 7 days"
br.form["DateRange"] = ["Last7"]
# Set the Category input to "Auction and Sale"
br.form["Category"] = ["1"]
# Submit the form
response = br.submit()

In [5]:
# Find the form to change to full notices on the resulting page
br.select_form(nr=3)

# Set the input to 100 complete notices
br["FullTextType"] = ["0"]
br["Count"] = ["100"]
response = br.submit()

In [6]:
# Parse the resulting page using BeautifulSoup
soup = BeautifulSoup(response.read(), "html.parser")

In [7]:
table = soup.select("table.BorderedTable")[1]
all_rows = [child.get_text() for child in table.children]
raw_text_df = pd.DataFrame(all_rows, columns=['text'])

In [9]:
# Parse addresses

# text = df['text'].iloc[1]

def clean_address_tuple(t):
    address = t[0].replace('Plaintiff', '')
    address = address.replace('Dated', '')
    return (address, t[1])


def parse_addresses(text):
    allowed_types = [
        'AddressNumber', 'StreetName', 'StreetNamePostType', 'PlaceName', 'StateName', 'ZipCode'
    ]
    parsed = [clean_address_tuple(field) for field in usaddress.parse(text) if field[1] in allowed_types]
    address_start_indices = []
    for i, field in enumerate(parsed): 
        if field[1] == 'AddressNumber':
            address_start_indices.append(i)
            
    addresses = []

    for i, j in enumerate(address_start_indices):
        if i == len(address_start_indices) - 1:
            address_list = parsed[j:]
        else:        
            address_list = parsed[j:address_start_indices[i+1]]
        address = ' '.join([a[0] for a in address_list])
        addresses.append(address)

    return addresses

# TODO: Filter out all the known courthouse addresses (count the addresses once theyre parsed)
# TODO: Strip symbols from the end of addresses
# TODO: Parse the date(s)



In [36]:
# County <> Newspaper mapping
county_dict = [
    {
        'county': 'Orange',
        'newspaper': 'Cornwall Local, The'
    },
    {
        'county': 'Orange',
        'newspaper': 'Mid Hudson Times'
    },
    {
        'county': 'Orange',
        'newspaper': 'News of the Highlands'
    },
    {
        'county': 'Orange',
        'newspaper': 'Southern Ulster Times'
    },
    {
        'county': 'Ulster',
        'newspaper': 'Daily Freeman'
    },
    # Note: Greene county has no newspapers listed on MyPublicNotices...
    {
        'county': 'Greene',
        'newspaper': 'Catskill Daily Mail'
    },
    {
        'county': 'Greene',
        'newspaper': 'Greene County Daily World'
    },
    {
        'county': 'Greene',
        'newspaper': 'The Register Star'
    },
    {
        'county': 'Greene',
        'newspaper': 'Register Star, The'
    },
    {
        'county': 'Greene',
        'newspaper': 'Windham Journal'
    },
    {
        'county': 'Greene',
        'newspaper': 'Greene County Record'
    }
]

counties = pd.DataFrame(county_dict)

In [34]:
df1 = (
    raw_text_df
    >> filter(
        _.text != '\n', 
        ~_.text.str.contains('Select notice to print'),
        ~_.text.str.contains('vehicle', case=False),
        ~_.text.str.contains('storage', case=False),
        _.text.str.contains('premise', case=False)
    )
    >> mutate(
        addresses=_.text.apply(parse_addresses),
        newspaper=_.text.str.extract('Appeared in: (.+) on')
    )     
).reset_index()

In [44]:
addresses_only_df = pd.DataFrame(df1['addresses'].to_list())
# addresses_only_df
addresses_df = (
    pd.concat([df1, addresses_only_df], axis=1) 
    >> select(-_.index)
    >> gather('address_index', 'address', -_.text, -_.addresses, -_.newspaper)
    >> filter(_.address.notna())
    >> mutate(address=_.address.str.replace('\n', ''))
    >> left_join(_, counties, on="newspaper")
    >> filter(
        # Addresses must contain more than 3 letters
        _.address.str.contains('[a-zA-Z]{3,}'),
        # Remove courthouses
        (~_.address.str.contains('285 Wall Street') & ~_.address.str.contains('Kingston')),
        (~_.address.str.contains('138 Court Street') & ~_.address.str.contains('Wampsville')),
        # Only look in certain counties
        # (_.county == 'Ulster' | _.county == 'Orange' | _.county == 'Greene')
        _.county.notna()
    )
    >> arrange(_.text)
)

# addresses_df.to_csv("~/Desktop/test.csv")

# (
#     addresses_df 
#     >> distinct(_.address) 
# ).to_csv("~/Desktop/test.csv")

# count(_.address, sort=True)

# df1
# addresses_only_df
addresses_df 

Unnamed: 0,text,addresses,newspaper,address_index,address,county
65,\n\n\n\n\nLEGAL NOITCE SUPREME COURT OF THE ST...,"[285 Wall Street, Kingston, New York 12401,, 4...",Daily Freeman,1,"46 Butterville Road, NewPaltz, New York 12561.",Ulster
58,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,"[285 Wall Street, Kingston NY 12401, 18 Villag...",Daily Freeman,1,"18 Village Drive, Saugerties, NY 12477.",Ulster
92,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,"[285 Wall Street, Kingston NY 12401, 18 Villag...",Daily Freeman,2,"1775 Wehrle Drive Williamsville, NY 14221",Ulster
50,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,"[285 Wall Street, Kingston NY 12401, 675 Platt...",Daily Freeman,1,"675 Plattekill Ardonia Road, Plattekill, NY 12...",Ulster
84,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,"[285 Wall Street, Kingston NY 12401, 675 Platt...",Daily Freeman,2,"53 Gibson Street Bay Shore, NY 11706",Ulster
62,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,"[285 Wall Street, Kingston NY 12401, 393 MAIN ...",Daily Freeman,1,"393 MAIN STREET, ROSENDALE, NY 12472.",Ulster
96,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALE SUPREME ...,"[285 Wall Street, Kingston NY 12401, 393 MAIN ...",Daily Freeman,2,"1775 Wehrle Drive Williamsville, NY 14221",Ulster
34,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALESUPREME C...,"[285 Wall Street, Kingston, NY on, 44 STONY RO...",Daily Freeman,1,"44 STONY ROAD, ACCORD, NY",Ulster
68,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALESUPREME C...,"[285 Wall Street, Kingston, NY on, 44 STONY RO...",Daily Freeman,2,"242 Drexel Avenue, Westbury, NY",Ulster
69,\n\n\n\n\nLEGAL NOTICE NOTICE OF SALESUPREME C...,"[285 Wall Street,, 337 HasbrouckAvenue, Kingst...",Daily Freeman,2,"175 Mile Crossing BoulevardRochester, New York...",Ulster


In [47]:
filename = "~/Desktop/{}_auction_listings_search.csv".format(datetime.today().strftime('%Y-%m-%d'))
addresses_df.to_csv(filename)

In [None]:
zillow_url = "https://zillow.com"

# Create a Browser instance to interact with the webpage
brz = Browser()
brz.set_handle_robots(False)  # Ignore robots.txt
brz.open(zillow_url)

In [None]:
for f in brz.forms():
    print(f)

In [None]:
def parse_addresses(vec): 
    for v in vec:
        

In [None]:
# table = soup.find("table", class_="BorderedTable")
table = soup.select("table.BorderedTable")[1]
auctions = []
for child in table.children:
    text = child.get_text(strip=True)
    if text and text != '' and text !="Select notice to print" and not 'vehicle' in text:
        
        # Regular expression to find potential address substrings
        address_regex = r'\d+ [\w\s]+,(?: [A-Za-z]+,){1,2} \w{2} \d{5}'

        # Find all potential address substrings
        potential_addresses = re.findall(address_regex, child.get_text())
        
        auctions.append({
            'text': child.get_text(),
            'potential_addresses': potential_addresses
        })
        
        print(potential_addresses)
        print("============================")
        

In [None]:
# Get the top level tables

# main_div = soup.find("div", id="PublicNoticeContent")
# # findChildren("a" , recursive=False)
# print(
#     main_div
#     .select("table")[2]
#     .find("td")
#     .select("table")[6]
#     .get_text()
# )



# l1_tables = [child for child in main_div.children if child.name == "table"]
# # Get the second child table
# first_td = l1_tables[1].find("td")
# l2_tables = [child for child in first_td.children if child.name == "table"]
# # get the third child table
# first_td_2 = l2_tables[2].find('td')

# main_tr = [child for child in first_td_2.tbody.children][4]
# print(main_tr)
# len(main_tables)
# for child in main_div.children:
#     print(child.name)
# tables_l1 = main_div.find("table")
# len(tables_l1)

In [None]:
# Print the page content
# print(soup.prettify())

In [None]:
# find all td that contain "legal notice"
# legal_notices = soup.find_all("td", string=re.compile("legal notice", re.IGNORECASE))
all_td = soup.find_all("td")
pattern = re.compile("legal notice", re.IGNORECASE)
for td in all_td:
    text = td.get_text(strip=True)
    if pattern.search(text): 
        print(text)
        print("=====================")

In [None]:
# Get the pagination
rows = soup.find_all("tr")
pagination_nav = rows[-4]
pagination_nav.find_all("a")

In [None]:
def parse_auction_ids(soup):
    links = soup.find_all("a")
    auctions = [link.get("href") for link in links if "/Link.asp?ID=" in link.get("href")]
    return auctions

In [None]:
# Find all the links in the page
links = soup.find_all("a")

# Print the links
# for link in links:
#     print()
#     print("Link.asp?ID=" in link)
    
auctions = [link.get("href") for link in links if "/Link.asp?ID=" in link.get("href")]
print(auctions)