In [1]:
import usaddress
import string
from Levenshtein import distance as levenshtein_distance
import pandas as pd

In [2]:
punctuation_without_hash = string.punctuation.replace('#', '')
def remove_punctuation(text):
    try:
        return text.translate(str.maketrans('', '', punctuation_without_hash))
    except AttributeError:
        print(text)
        return text

directional_map = {
    'n': 'north', 'north': 'north',
    's': 'south', 'south': 'south',
    'e': 'east', 'east': 'east',
    'w': 'west', 'west': 'west',
    'ne': 'northeast', 'northeast': 'northeast',
    'nw': 'northwest', 'northwest': 'northwest',
    'se': 'southeast', 'southeast': 'southeast',
    'sw': 'southwest', 'southwest': 'southwest'
}
suffix_map = {
    'st': 'street', 'street': 'street',
    'ave': 'avenue', 'avenue': 'avenue',
    'blvd': 'boulevard', 'boulevard': 'boulevard',
    'rd': 'road', 'road': 'road',
    'dr': 'drive', 'drive': 'drive',
    'ct': 'court', 'court': 'court',
    'pl': 'place', 'place': 'place',
    'ln': 'lane', 'lane': 'lane',
    'ter': 'terrace', 'terrace': 'terrace'
}

def normalize_street_name(street_name):
    # Split the street name into parts
    parts = street_name.lower().split()
    normalized_parts = []

    for part in parts:
        # Remove punctuation
        part_clean = remove_punctuation(part)
        # Normalize directionals and suffixes
        if part_clean in directional_map:
            normalized_parts.append(directional_map[part_clean])
        elif part_clean in suffix_map:
            normalized_parts.append(suffix_map[part_clean])
        else:
            normalized_parts.append(part_clean)
    return ' '.join(normalized_parts)

def normalize_with_usaddress(address):
    try:
        # Clean the address before parsing
        # print(address)
        clean_address = remove_punctuation(address)
        # print(clean_address)
        try:
            parsed = usaddress.tag(clean_address)[0]
        except TypeError:
            print(f"Error parsing address '{address!r}'")

        # print(parsed)
        
        # Normalize StreetNamePreDirectional
        if 'StreetNamePreDirectional' in parsed:
            value = parsed['StreetNamePreDirectional'].lower()
            parsed['StreetNamePreDirectional'] = directional_map.get(value, value)
        
        # Normalize StreetNamePostType
        if 'StreetNamePostType' in parsed:
            value = parsed['StreetNamePostType'].lower()
            parsed['StreetNamePostType'] = suffix_map.get(value, value)
        
        # Normalize StreetName
        if 'StreetName' in parsed:
            parsed['StreetName'] = normalize_street_name(parsed['StreetName'])
        
        # Normalize OccupancyIdentifier
        occupancy_id = parsed.get('OccupancyIdentifier', '')
        if occupancy_id:
            parsed['OccupancyIdentifier'] = remove_punctuation(occupancy_id).lower()
        else:
            # Check for SubaddressIdentifier if OccupancyIdentifier is missing
            subaddress_id = parsed.get('SubaddressIdentifier', '')
            if subaddress_id:
                parsed['OccupancyIdentifier'] = remove_punctuation(subaddress_id).lower()
        
        # Reconstruct the normalized address without OccupancyType
        normalized_address = " ".join(filter(None, [
            parsed.get('AddressNumber', '').lower(),
            parsed.get('StreetNamePreDirectional', '').lower(),
            parsed.get('StreetName', '').lower(),
            parsed.get('StreetNamePostType', '').lower(),
            parsed.get('OccupancyIdentifier', '')
        ])).lower()
        
        return normalized_address
    
    except usaddress.RepeatedLabelError as e:
        # Log the error and return a cleaned, lowercased address
        print(f"Error parsing address '{address}': {e}")
        return remove_punctuation(address).lower()

def are_addresses_same(addr1, addr2, threshold=2):
    norm_addr1 = normalize_with_usaddress(addr1)
    norm_addr2 = normalize_with_usaddress(addr2)
    distance = levenshtein_distance(norm_addr1, norm_addr2)
    return distance <= threshold

In [3]:
# Example
address1 = "993 Dumont Avenue #2"
address2 = "993 Dumont Ave APT 2"

norm_addr1 = normalize_with_usaddress(address1)
norm_addr2 = normalize_with_usaddress(address2)

print("Normalized Address 1:", norm_addr1)
print("Normalized Address 2:", norm_addr2)
print("Are addresses the same?", are_addresses_same(address1, address2))

Normalized Address 1: 993 dumont avenue # 2
Normalized Address 2: 993 dumont avenue 2
Are addresses the same? True


In [4]:
streeteasy_df = pd.read_csv('streeteasy_listings.csv')
streeteasy_df.head()

Unnamed: 0,id,listedAt,daysOnMarket,availableFrom,address,price,borough,neighborhood,zipcode,propertyType,...,longitude,amenities,builtIn,description,building,agents,noFee,images,videos,floorplans
0,4597458,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,...,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926,Large newly renovated 1-bedroom \n\nFeatures: ...,{'id': '166259'},['Voro Purple LLC'],False,['https://photos.zillowstatic.com/fp/9deccb9a3...,[],[]
1,4596229,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,...,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926,Beautifully newly Renovated Two-Bedroom Apartm...,{'id': '165748'},['Kevin Cameron'],False,['https://photos.zillowstatic.com/fp/49380b70a...,[],[]
2,4596223,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,...,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning Newly Renovated One-Bedroom Apartment...,{'id': '140629'},['Kevin Cameron'],False,['https://photos.zillowstatic.com/fp/e941cdddc...,[],[]
3,4596221,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,...,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning newly renovated two-bedroom apt avail...,{'id': '140629'},['Kevin Cameron'],False,['https://photos.zillowstatic.com/fp/8189b5ea3...,[],[]
4,4594153,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,...,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,{'id': '167432'},['Eli Loebenstein'],False,['https://photos.zillowstatic.com/fp/f7a8a2c7d...,[],[]


In [5]:
zillow_df = pd.read_csv('rent_data_zillow.csv')
zillow_df.head()

Unnamed: 0,providerListingId,hasImage,price,addressStreet,addressCity,addressState,addressZipcode,isUndisclosedAddress,beds,baths,...,brokerName,carouselPhotos,marketingTreatments,timeOnZillow,houseType,latitude,longitude,timeOnZillowText,daysOnZillowHDP,timeOnZillowHDP
0,g3qqbttyw1g7,True,"$2,449/mo",498 Jefferson Ave APT 3B,Brooklyn,NY,11221,False,2,1.0,...,Listing by: Voro Purple LLC,5,paid,,Apartment,40.68445,-73.937904,2 days ago,2,198026000.0
1,4msp054rpy0w3,True,"$2,400/mo",371 Kosciuszko St APT 1,Brooklyn,NY,11221,False,2,1.0,...,Listing by: Miracle Capital,12,paid,,Apartment,40.69212,-73.940094,3 days ago,3,294307000.0
2,56cyrsnd6f4z1,True,"$2,395/mo",373 Kosciuszko St #1A,Brooklyn,NY,11221,False,2,1.0,...,Listing by: Skyhigh Realty NYC LLC,11,paid,,Apartment,40.692127,-73.940025,2 days ago,2,227011000.0
3,1v8znm7bxz78f,True,"$2,600/mo",48 Jefferson St #1E,Brooklyn,NY,11206,False,2,1.0,...,Listing by: Nooklyn NYC LLC,17,paid,,Apartment,40.698406,-73.933365,4 days ago,4,385412000.0
4,5at3ufj2pccje,True,"$2,100/mo",573 Evergreen Ave,Brooklyn,NY,11221,False,1,1.0,...,Listing by: Fifth & Forever LLC,10,paid,,Apartment,40.68941,-73.913506,2 days ago,2,239925000.0


## Remove stuff from zillow DF


In [6]:
zillow_df.drop(columns=['providerListingId', 'hasImage', 'variableData', 'hdpData', 'has3DModel', 'brokerName', 'marketingTreatments', 'timeOnZillow',
'daysOnZillowHDP', 'timeOnZillowHDP', 'daysOnZillowHDP'], inplace=True)

## Rename stuff for homegenity

In [7]:
def convert_to_days(time_str):
    if time_str is None or str(time_str) == 'nan':
        return -1
    try:
        borken_down = time_str.split()
    except AttributeError:
        print(time_str)
        return -1

    if borken_down[1].startswith('day'):
        return int(borken_down[0])
    elif borken_down[1].startswith('hour'):
        return 0
    else:
        return -1

In [8]:
zillow_df.rename(columns={'addressStreet': 'street', 'addressCity': 'borough', 'addressState': 'state', 'addressZipcode': 'zip', 'carouselPhotos' : 'PhotosNum', 'availabilityDate':
'availableFrom', 'houseType': 'propertyType', 'PhotosNum': 'photosNum'}, inplace=True)
zillow_df['daysOnMarket'] = zillow_df['timeOnZillowText'].apply(lambda x: convert_to_days(x))
zillow_df.drop(columns=['timeOnZillowText'], inplace=True)
zillow_df.head()


Unnamed: 0,price,street,borough,state,zip,isUndisclosedAddress,beds,baths,hasVideo,isFeaturedListing,availableFrom,PhotosNum,propertyType,latitude,longitude,daysOnMarket
0,"$2,449/mo",498 Jefferson Ave APT 3B,Brooklyn,NY,11221,False,2,1.0,False,True,2024-11-21 00:00:00,5,Apartment,40.68445,-73.937904,2
1,"$2,400/mo",371 Kosciuszko St APT 1,Brooklyn,NY,11221,False,2,1.0,False,True,,12,Apartment,40.69212,-73.940094,3
2,"$2,395/mo",373 Kosciuszko St #1A,Brooklyn,NY,11221,False,2,1.0,False,True,2024-11-21 00:00:00,11,Apartment,40.692127,-73.940025,2
3,"$2,600/mo",48 Jefferson St #1E,Brooklyn,NY,11206,False,2,1.0,False,True,2024-11-19 00:00:00,17,Apartment,40.698406,-73.933365,4
4,"$2,100/mo",573 Evergreen Ave,Brooklyn,NY,11221,False,1,1.0,False,True,2024-12-01 00:00:00,10,Apartment,40.68941,-73.913506,2


## Delete Stuff from streeteasy DF

In [9]:
streeteasy_df.drop(columns=['id', 'sqft', 'type', 'building', 'agents', 'noFee', 'floorplans',], inplace=True)

## Rename Stuff from streeteasy DF

In [10]:

streeteasy_df

Unnamed: 0,listedAt,daysOnMarket,availableFrom,address,price,borough,neighborhood,zipcode,propertyType,bedrooms,bathrooms,latitude,longitude,amenities,builtIn,description,images,videos
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,1,40.662186,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926,Large newly renovated 1-bedroom \n\nFeatures: ...,['https://photos.zillowstatic.com/fp/9deccb9a3...,[]
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,1,40.663817,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926,Beautifully newly Renovated Two-Bedroom Apartm...,['https://photos.zillowstatic.com/fp/49380b70a...,[]
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning Newly Renovated One-Bedroom Apartment...,['https://photos.zillowstatic.com/fp/e941cdddc...,[]
3,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,2,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning newly renovated two-bedroom apt avail...,['https://photos.zillowstatic.com/fp/8189b5ea3...,[]
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,1,40.657349,-73.917750,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,['https://photos.zillowstatic.com/fp/f7a8a2c7d...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4095,2024-10-25,32,2024-11-26,187 South Portland Avenue #3,3800,brooklyn,fort-greene,11217,rental,2,1,40.683906,-73.973647,"['fios_available', 'hardwood_floors', 'smoke_f...",2002,"NO PETS, Hardwood, High Ceilings, Stainless St...",['https://photos.zillowstatic.com/fp/50db98f1e...,[]
4096,2024-10-25,32,2024-10-25,253 Cumberland Street #302,2750,brooklyn,fort-greene,11205,rental,1,1,40.688336,-73.972567,"['city_view', 'elevator', 'hardwood_floors', '...",1928,Prime location 1 bed\r\n\r\n40x the rent after...,['https://photos.zillowstatic.com/fp/9e9362546...,[]
4097,2024-10-20,37,2024-11-01,87 Adelphi Street #1,3300,brooklyn,fort-greene,11205,rental,1,1,40.695201,-73.971962,"['central_ac', 'courtyard', 'dishwasher', 'fio...",2001,87 Adelphi St #1 is a completely renovated gar...,['https://photos.zillowstatic.com/fp/f9f64f35b...,[]
4098,2024-09-16,61,2024-12-01,125 South Oxford Street UNIT-2R,3500,brooklyn,fort-greene,11217,rental,1,1,40.685525,-73.972986,"['city_view', 'dishwasher', 'fios_available', ...",1930,Only Two Way to Move in (Rent Special the firs...,['https://photos.zillowstatic.com/fp/39817fd66...,[]


In [11]:
# streeteasy_df.rename(columns={})
streeteasy_df['hasVideo'] = streeteasy_df['videos'].apply(lambda x: 1 if len(x) > 0 else 0)
streeteasy_df['PhotosNum'] = streeteasy_df['images'].apply(lambda x: len(x))
streeteasy_df.drop(columns=['videos', 'images'], inplace=True)
streeteasy_df.rename(columns={'address': 'street', 'zipcode': 'zip', 'property_type': 'propertyType', 'bedrooms':'beds', 'bathrooms':'baths'}, inplace=True)
streeteasy_df.head()


Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,baths,latitude,longitude,amenities,builtIn,description,hasVideo,PhotosNum
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,1,40.662186,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926,Large newly renovated 1-bedroom \n\nFeatures: ...,1,320
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,1,40.663817,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926,Beautifully newly Renovated Two-Bedroom Apartm...,1,400
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning Newly Renovated One-Bedroom Apartment...,1,560
3,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,2,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning newly renovated two-bedroom apt avail...,1,400
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,1,40.657349,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,1,320


In [12]:
total_df = pd.concat([streeteasy_df, zillow_df], ignore_index=True)
total_df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,latitude,longitude,amenities,builtIn,description,hasVideo,PhotosNum,state,isUndisclosedAddress,isFeaturedListing
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,...,40.662186,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926.0,Large newly renovated 1-bedroom \n\nFeatures: ...,1,320,,,
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,...,40.663817,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926.0,Beautifully newly Renovated Two-Bedroom Apartm...,1,400,,,
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,...,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930.0,Stunning Newly Renovated One-Bedroom Apartment...,1,560,,,
3,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,2,...,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930.0,Stunning newly renovated two-bedroom apt avail...,1,400,,,
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,...,40.657349,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961.0,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,1,320,,,


In [13]:
addresses = []
total_df['duplicate'] = False
global failed
failed =  0
def remove_duplicate_custom(row):
    try:
        full_add = normalize_with_usaddress(row['street'])
    except (AttributeError, KeyError):
        print(row)
        failed += 1
        return row

    if full_add is None:
        failed += 1
        return row
    
    for address in addresses:
        if levenshtein_distance(full_add, address) < 3:
            row['duplicate'] = True
            return row
    addresses.append(full_add)
    return row

total_df.drop(6104, inplace=True)

In [14]:
total_df = total_df.apply(remove_duplicate_custom, axis=1)
print(failed)
print(total_df.columns)
indices_to_drop = total_df[total_df['duplicate'] == True].index
total_df.drop(indices_to_drop, inplace=True)
total_df

Error parsing address '312 E 116th St #PENTHOUSE 2B': 
ERROR: Unable to tag this string because more than one area of the string has the same label

ORIGINAL STRING:  312 E 116th St #PENTHOUSE 2B
PARSED TOKENS:    [('312', 'AddressNumber'), ('E', 'StreetNamePreDirectional'), ('116th', 'StreetName'), ('St', 'StreetNamePostType'), ('#', 'OccupancyIdentifier'), ('PENTHOUSE', 'OccupancyType'), ('2B', 'OccupancyIdentifier')]
UNCERTAIN LABEL:  OccupancyIdentifier

When this error is raised, it's likely that either (1) the string is not a valid person/corporation name or (2) some tokens were labeled incorrectly

To report an error in labeling a valid name, open an issue at https://github.com/datamade/usaddress/issues/new - it'll help us continue to improve probablepeople!

For more information, see the documentation at https://usaddress.readthedocs.io/
Error parsing address '547 W 47th St #PENTHOUSE 2': 
ERROR: Unable to tag this string because more than one area of the string has the same la

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,longitude,amenities,builtIn,description,hasVideo,PhotosNum,state,isUndisclosedAddress,isFeaturedListing,duplicate
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,...,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926.0,Large newly renovated 1-bedroom \n\nFeatures: ...,1,320,,,,False
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,...,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926.0,Beautifully newly Renovated Two-Bedroom Apartm...,1,400,,,,False
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,...,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930.0,Stunning Newly Renovated One-Bedroom Apartment...,1,560,,,,False
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,...,-73.917750,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961.0,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,1,320,,,,False
5,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,brooklyn,brownsville,11212,rental,3,...,-73.922602,"['central_ac', 'fios_available', 'hardwood_flo...",1910.0,Newly Renovated – 3BR/1BA in Brownsville on Cl...,1,720,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22345,,9,2024-11-14 00:00:00,12-1 26th Ave #2,"$4,400/mo",Astoria,,11102,Apartment,4,...,-73.925920,,,,False,0,NY,,True,False
22346,,-1,2024-11-15 00:00:00,12-20 Astoria Blvd #2A,"$3,500/mo",Astoria,,11102,Apartment,2,...,-73.929320,,,,False,0,NY,,True,False
22348,,5,2024-11-18 00:00:00,11 Maiden Ln APT 6B,"$3,750/mo",New York,,10038,Apartment,2,...,-74.009330,,,,False,5,NY,False,True,False
22352,,-1,,88 Fulton St #TOWNHOUSE 1,"$3,200/mo",New York,,10038,Apartment,1,...,-74.006035,,,,False,0,NY,,True,False
