In [10]:
import usaddress
import string
from Levenshtein import distance as levenshtein_distance
import pandas as pd

In [11]:
punctuation_without_hash = string.punctuation.replace('#', '')
def remove_punctuation(text):
    try:
        return text.translate(str.maketrans('', '', punctuation_without_hash))
    except AttributeError:
        print(text)
        return text

directional_map = {
    'n': 'north', 'north': 'north',
    's': 'south', 'south': 'south',
    'e': 'east', 'east': 'east',
    'w': 'west', 'west': 'west',
    'ne': 'northeast', 'northeast': 'northeast',
    'nw': 'northwest', 'northwest': 'northwest',
    'se': 'southeast', 'southeast': 'southeast',
    'sw': 'southwest', 'southwest': 'southwest'
}
suffix_map = {
    'st': 'street', 'street': 'street',
    'ave': 'avenue', 'avenue': 'avenue',
    'blvd': 'boulevard', 'boulevard': 'boulevard',
    'rd': 'road', 'road': 'road',
    'dr': 'drive', 'drive': 'drive',
    'ct': 'court', 'court': 'court',
    'pl': 'place', 'place': 'place',
    'ln': 'lane', 'lane': 'lane',
    'ter': 'terrace', 'terrace': 'terrace'
}

def normalize_street_name(street_name):
    # Split the street name into parts
    parts = street_name.lower().split()
    normalized_parts = []

    for part in parts:
        # Remove punctuation
        part_clean = remove_punctuation(part)
        # Normalize directionals and suffixes
        if part_clean in directional_map:
            normalized_parts.append(directional_map[part_clean])
        elif part_clean in suffix_map:
            normalized_parts.append(suffix_map[part_clean])
        else:
            normalized_parts.append(part_clean)
    return ' '.join(normalized_parts)

def normalize_with_usaddress(address):
    try:
        # Clean the address before parsing
        # print(address)
        clean_address = remove_punctuation(address)
        # print(clean_address)
        try:
            parsed = usaddress.tag(clean_address)[0]
        except TypeError:
            print(f"Error parsing address '{address!r}'")

        # print(parsed)
        
        # Normalize StreetNamePreDirectional
        if 'StreetNamePreDirectional' in parsed:
            value = parsed['StreetNamePreDirectional'].lower()
            parsed['StreetNamePreDirectional'] = directional_map.get(value, value)
        
        # Normalize StreetNamePostType
        if 'StreetNamePostType' in parsed:
            value = parsed['StreetNamePostType'].lower()
            parsed['StreetNamePostType'] = suffix_map.get(value, value)
        
        # Normalize StreetName
        if 'StreetName' in parsed:
            parsed['StreetName'] = normalize_street_name(parsed['StreetName'])
        
        # Normalize OccupancyIdentifier
        occupancy_id = parsed.get('OccupancyIdentifier', '')
        if occupancy_id:
            parsed['OccupancyIdentifier'] = remove_punctuation(occupancy_id).lower()
        else:
            # Check for SubaddressIdentifier if OccupancyIdentifier is missing
            subaddress_id = parsed.get('SubaddressIdentifier', '')
            if subaddress_id:
                parsed['OccupancyIdentifier'] = remove_punctuation(subaddress_id).lower()
        
        # Reconstruct the normalized address without OccupancyType
        normalized_address = " ".join(filter(None, [
            parsed.get('AddressNumber', '').lower(),
            parsed.get('StreetNamePreDirectional', '').lower(),
            parsed.get('StreetName', '').lower(),
            parsed.get('StreetNamePostType', '').lower(),
            parsed.get('OccupancyIdentifier', '')
        ])).lower()
        
        return normalized_address
    
    except usaddress.RepeatedLabelError as e:
        # Log the error and return a cleaned, lowercased address
        print(f"Error parsing address '{address}': {e}")
        return remove_punctuation(address).lower()

def are_addresses_same(addr1, addr2, threshold=2):
    norm_addr1 = normalize_with_usaddress(addr1)
    norm_addr2 = normalize_with_usaddress(addr2)
    distance = levenshtein_distance(norm_addr1, norm_addr2)
    return distance <= threshold

In [3]:
# Example
address1 = "993 Dumont Avenue #2"
address2 = "993 Dumont Ave APT 2"

norm_addr1 = normalize_with_usaddress(address1)
norm_addr2 = normalize_with_usaddress(address2)

print("Normalized Address 1:", norm_addr1)
print("Normalized Address 2:", norm_addr2)
print("Are addresses the same?", are_addresses_same(address1, address2))

Normalized Address 1: 993 dumont avenue # 2
Normalized Address 2: 993 dumont avenue 2
Are addresses the same? True


In [12]:
streeteasy_df = pd.read_csv('./streetezy_scraper/streeteasy_rentals.csv')
streeteasy_df.head()

Unnamed: 0,id,listedAt,daysOnMarket,availableFrom,address,price,borough,neighborhood,zipcode,propertyType,...,longitude,amenities,builtIn,description,building,agents,noFee,images,videos,floorplans
0,4597458,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,...,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926,Large newly renovated 1-bedroom \n\nFeatures: ...,{'id': '166259'},['Voro Purple LLC'],False,['https://photos.zillowstatic.com/fp/9deccb9a3...,[],[]
1,4596229,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,...,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926,Beautifully newly Renovated Two-Bedroom Apartm...,{'id': '165748'},['Kevin Cameron'],False,['https://photos.zillowstatic.com/fp/49380b70a...,[],[]
2,4596223,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,...,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning Newly Renovated One-Bedroom Apartment...,{'id': '140629'},['Kevin Cameron'],False,['https://photos.zillowstatic.com/fp/e941cdddc...,[],[]
3,4596221,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,...,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning newly renovated two-bedroom apt avail...,{'id': '140629'},['Kevin Cameron'],False,['https://photos.zillowstatic.com/fp/8189b5ea3...,[],[]
4,4594153,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,...,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,{'id': '167432'},['Eli Loebenstein'],False,['https://photos.zillowstatic.com/fp/f7a8a2c7d...,[],[]


In [13]:
zillow_df = pd.read_csv('./zillow_scraper/rent_data_zillow.csv')
zillow_df.head()

Unnamed: 0,providerListingId,hasImage,price,addressStreet,addressCity,addressState,addressZipcode,isUndisclosedAddress,beds,baths,...,brokerName,carouselPhotos,marketingTreatments,timeOnZillow,houseType,latitude,longitude,timeOnZillowText,daysOnZillowHDP,timeOnZillowHDP
0,g3qqbttyw1g7,True,"$2,449/mo",498 Jefferson Ave APT 3B,Brooklyn,NY,11221,False,2,1.0,...,Listing by: Voro Purple LLC,5,paid,,Apartment,40.68445,-73.937904,2 days ago,2,198026000.0
1,4msp054rpy0w3,True,"$2,400/mo",371 Kosciuszko St APT 1,Brooklyn,NY,11221,False,2,1.0,...,Listing by: Miracle Capital,12,paid,,Apartment,40.69212,-73.940094,3 days ago,3,294307000.0
2,56cyrsnd6f4z1,True,"$2,395/mo",373 Kosciuszko St #1A,Brooklyn,NY,11221,False,2,1.0,...,Listing by: Skyhigh Realty NYC LLC,11,paid,,Apartment,40.692127,-73.940025,2 days ago,2,227011000.0
3,1v8znm7bxz78f,True,"$2,600/mo",48 Jefferson St #1E,Brooklyn,NY,11206,False,2,1.0,...,Listing by: Nooklyn NYC LLC,17,paid,,Apartment,40.698406,-73.933365,4 days ago,4,385412000.0
4,5at3ufj2pccje,True,"$2,100/mo",573 Evergreen Ave,Brooklyn,NY,11221,False,1,1.0,...,Listing by: Fifth & Forever LLC,10,paid,,Apartment,40.68941,-73.913506,2 days ago,2,239925000.0


## Remove stuff from zillow DF


In [14]:
zillow_df.drop(columns=['providerListingId', 'hasImage', 'variableData', 'hdpData', 'has3DModel', 'brokerName', 'marketingTreatments', 'timeOnZillow',
'daysOnZillowHDP', 'timeOnZillowHDP', 'daysOnZillowHDP'], inplace=True)

## Rename stuff for homegenity

In [15]:
def convert_to_days(time_str):
    if time_str is None or str(time_str) == 'nan':
        return -1
    try:
        borken_down = time_str.split()
    except AttributeError:
        print(time_str)
        return -1

    if borken_down[1].startswith('day'):
        return int(borken_down[0])
    elif borken_down[1].startswith('hour'):
        return 0
    else:
        return -1

In [16]:
zillow_df.rename(columns={'addressStreet': 'street', 'addressCity': 'borough', 'addressState': 'state', 'addressZipcode': 'zip', 'carouselPhotos' : 'PhotosNum', 'availabilityDate':
'availableFrom', 'houseType': 'propertyType', 'PhotosNum': 'photosNum'}, inplace=True)
zillow_df['daysOnMarket'] = zillow_df['timeOnZillowText'].apply(lambda x: convert_to_days(x))
zillow_df.drop(columns=['timeOnZillowText'], inplace=True)
zillow_df.head(5)


Unnamed: 0,price,street,borough,state,zip,isUndisclosedAddress,beds,baths,hasVideo,isFeaturedListing,availableFrom,PhotosNum,propertyType,latitude,longitude,daysOnMarket
0,"$2,449/mo",498 Jefferson Ave APT 3B,Brooklyn,NY,11221,False,2,1.0,False,True,2024-11-21 00:00:00,5,Apartment,40.68445,-73.937904,2
1,"$2,400/mo",371 Kosciuszko St APT 1,Brooklyn,NY,11221,False,2,1.0,False,True,,12,Apartment,40.69212,-73.940094,3
2,"$2,395/mo",373 Kosciuszko St #1A,Brooklyn,NY,11221,False,2,1.0,False,True,2024-11-21 00:00:00,11,Apartment,40.692127,-73.940025,2
3,"$2,600/mo",48 Jefferson St #1E,Brooklyn,NY,11206,False,2,1.0,False,True,2024-11-19 00:00:00,17,Apartment,40.698406,-73.933365,4
4,"$2,100/mo",573 Evergreen Ave,Brooklyn,NY,11221,False,1,1.0,False,True,2024-12-01 00:00:00,10,Apartment,40.68941,-73.913506,2


## Delete Stuff from streeteasy DF

In [17]:
streeteasy_df.drop(columns=['id', 'sqft', 'type', 'building', 'agents', 'noFee', 'floorplans',], inplace=True)

## Rename Stuff from streeteasy DF

In [18]:

streeteasy_df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,address,price,borough,neighborhood,zipcode,propertyType,bedrooms,bathrooms,latitude,longitude,amenities,builtIn,description,images,videos
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,1,40.662186,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926,Large newly renovated 1-bedroom \n\nFeatures: ...,['https://photos.zillowstatic.com/fp/9deccb9a3...,[]
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,1,40.663817,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926,Beautifully newly Renovated Two-Bedroom Apartm...,['https://photos.zillowstatic.com/fp/49380b70a...,[]
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning Newly Renovated One-Bedroom Apartment...,['https://photos.zillowstatic.com/fp/e941cdddc...,[]
3,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,2,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning newly renovated two-bedroom apt avail...,['https://photos.zillowstatic.com/fp/8189b5ea3...,[]
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,1,40.657349,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,['https://photos.zillowstatic.com/fp/f7a8a2c7d...,[]


In [19]:
# streeteasy_df.rename(columns={})
streeteasy_df['hasVideo'] = streeteasy_df['videos'].apply(lambda x: 1 if len(x) > 0 else 0)
streeteasy_df['PhotosNum'] = streeteasy_df['images'].apply(lambda x: len(x))
streeteasy_df.drop(columns=['videos', 'images'], inplace=True)
streeteasy_df.rename(columns={'address': 'street', 'zipcode': 'zip', 'property_type': 'propertyType', 'bedrooms':'beds', 'bathrooms':'baths'}, inplace=True)
streeteasy_df.head()


Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,baths,latitude,longitude,amenities,builtIn,description,hasVideo,PhotosNum
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,1,40.662186,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926,Large newly renovated 1-bedroom \n\nFeatures: ...,1,320
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,1,40.663817,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926,Beautifully newly Renovated Two-Bedroom Apartm...,1,400
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning Newly Renovated One-Bedroom Apartment...,1,560
3,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,2,1,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930,Stunning newly renovated two-bedroom apt avail...,1,400
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,1,40.657349,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,1,320


In [20]:
total_df = pd.concat([streeteasy_df, zillow_df], ignore_index=True)
total_df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,latitude,longitude,amenities,builtIn,description,hasVideo,PhotosNum,state,isUndisclosedAddress,isFeaturedListing
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,...,40.662186,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926.0,Large newly renovated 1-bedroom \n\nFeatures: ...,1,320,,,
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,...,40.663817,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926.0,Beautifully newly Renovated Two-Bedroom Apartm...,1,400,,,
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,...,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930.0,Stunning Newly Renovated One-Bedroom Apartment...,1,560,,,
3,2024-11-22,4,2024-11-22,501 Saratoga Avenue #C,2000,brooklyn,brownsville,11212,rental,2,...,40.668506,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930.0,Stunning newly renovated two-bedroom apt avail...,1,400,,,
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,...,40.657349,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961.0,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,1,320,,,


In [21]:
addresses = []
total_df['duplicate'] = False
global failed
failed =  0
def remove_duplicate_custom(row):
    try:
        full_add = normalize_with_usaddress(row['street'])
    except (AttributeError, KeyError):
        print(row)
        failed += 1
        return row

    if full_add is None:
        failed += 1
        return row
    
    for address in addresses:
        if levenshtein_distance(full_add, address) < 3:
            row['duplicate'] = True
            return row
    addresses.append(full_add)
    return row

total_df.drop(6104, inplace=True)

In [22]:
total_df = total_df.apply(remove_duplicate_custom, axis=1)
print(failed)
print(total_df.columns)
indices_to_drop = total_df[total_df['duplicate'] == True].index
total_df.drop(indices_to_drop, inplace=True)
total_df.head()

Error parsing address '312 E 116th St #PENTHOUSE 2B': 
ERROR: Unable to tag this string because more than one area of the string has the same label

ORIGINAL STRING:  312 E 116th St #PENTHOUSE 2B
PARSED TOKENS:    [('312', 'AddressNumber'), ('E', 'StreetNamePreDirectional'), ('116th', 'StreetName'), ('St', 'StreetNamePostType'), ('#', 'OccupancyIdentifier'), ('PENTHOUSE', 'OccupancyType'), ('2B', 'OccupancyIdentifier')]
UNCERTAIN LABEL:  OccupancyIdentifier

When this error is raised, it's likely that either (1) the string is not a valid person/corporation name or (2) some tokens were labeled incorrectly

To report an error in labeling a valid name, open an issue at https://github.com/datamade/usaddress/issues/new - it'll help us continue to improve probablepeople!

For more information, see the documentation at https://usaddress.readthedocs.io/
Error parsing address '547 W 47th St #PENTHOUSE 2': 
ERROR: Unable to tag this string because more than one area of the string has the same la

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,longitude,amenities,builtIn,description,hasVideo,PhotosNum,state,isUndisclosedAddress,isFeaturedListing,duplicate
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,brooklyn,brownsville,11212,rental,1,...,-73.922293,"['fios_available', 'hardwood_floors', 'nyc_eva...",1926.0,Large newly renovated 1-bedroom \n\nFeatures: ...,1,320,,,,False
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,brooklyn,brownsville,11212,rental,2,...,-73.924122,"['fios_available', 'hardwood_floors', 'live_in...",1926.0,Beautifully newly Renovated Two-Bedroom Apartm...,1,400,,,,False
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,brooklyn,brownsville,11212,rental,1,...,-73.917055,"['fios_available', 'hardwood_floors', 'live_in...",1930.0,Stunning Newly Renovated One-Bedroom Apartment...,1,560,,,,False
4,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,brooklyn,brownsville,11212,rental,1,...,-73.91775,"['cats', 'doorman', 'elevator', 'fios_availabl...",1961.0,BIG 1 Bedroom LUXURY APARTMENT\n\nThis 1 bathr...,1,320,,,,False
5,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,brooklyn,brownsville,11212,rental,3,...,-73.922602,"['central_ac', 'fios_available', 'hardwood_flo...",1910.0,Newly Renovated – 3BR/1BA in Brownsville on Cl...,1,720,,,,False


In [23]:
pd.set_option('display.max_rows', None)
total_df['borough'].value_counts()

borough
New York                  1754
Brooklyn                  1693
manhattan                 1360
brooklyn                   831
Buffalo                    629
Bronx                      544
Rochester                  457
Staten Island              319
Southampton                244
East Hampton               243
Albany                     212
Yonkers                    200
Sag Harbor                 179
Jamaica                    174
Syracuse                   168
Flushing                   164
Astoria                    157
Ithaca                     138
bronx                      135
astoria                    129
Troy                       109
Long Beach                 106
Niagara Falls              100
Montauk                    100
Water Mill                  95
Binghamton                  94
Schenectady                 82
Amagansett                  80
Ridgewood                   72
Hampton Bays                70
Poughkeepsie                67
Saratoga Springs            66


In [60]:
pd.set_option('display.max_rows', None)

nyc_boroughs = {
    'Manhattan': [
        'Manhattan', 'Harlem', 'Greenwich Village', 'Astoria', 'Hudson Yards',
        'Upper East Side', 'Upper West Side', 'Inwood', 'Washington Heights',
        'Chelsea', 'East Village', 'Financial District', 'Tribeca', 'New York',
    ],
    'Brooklyn': [
        'Brooklyn', 'Williamsburg', 'Brooklyn Heights', 'Park Slope', 'Bushwick',
        'DUMBO', 'Fort Greene', 'Crown Heights', 'Greenpoint', 'Bensonhurst',
        'Bay Ridge', 'Coney Island', 'Brighton Beach'
    ],
    'Queens': [
        'Queens', 'Astoria', 'Flushing', 'Jamaica', 'Long Island City',
        'Forest Hills', 'Woodside', 'Ridgewood', 'Bayside', 'Corona',
        'Jackson Heights', 'Elmhurst', 'Sunnyside'
    ],
    'Bronx': [
        'Bronx', 'Riverdale', 'Fordham', 'Pelham Bay', 'Throgs Neck',
        'Morris Park', 'Belmont', 'Kingsbridge', 'Parkchester'
    ],
    'Staten Island': [
        'Staten Island', 'St. George', 'Tottenville', 'Great Kills',
        'New Dorp', 'Port Richmond'
    ]
}

# Flatten the dictionary to map each neighborhood to its borough
borough_mapping = {
    neighborhood.lower(): borough
    for borough, neighborhoods in nyc_boroughs.items()
    for neighborhood in neighborhoods
}

# Normalize column and map values
total_df['borough_normalized'] = total_df['borough'].str.lower().map(borough_mapping)

# Identify rows that will be removed
removed_rows = total_df[total_df['borough_normalized'].isna()]

# Filter the DataFrame to keep only valid boroughs
total_df_filtered = total_df.dropna(subset=['borough_normalized'])

# Reset the DataFrame with cleaned borough information
total_df_filtered['borough'] = total_df_filtered['borough_normalized']
total_df_filtered = total_df_filtered.drop(columns=['borough_normalized'])
print(total_df_filtered['borough'].value_counts())



borough
Manhattan        3122
Brooklyn         2525
Queens           1188
Bronx             679
Staten Island     332
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_df_filtered['borough'] = total_df_filtered['borough_normalized']


# CENSUS DATA

In [61]:
zip_code_df = pd.read_csv('./census_data/census_data.csv')

In [62]:
rows_with_negative_666666 = zip_code_df[(zip_code_df == -666666666).any(axis=1)]

zip_code_df_cleaned = zip_code_df[(zip_code_df != -666666666).all(axis=1)]

print("Dropped rows:")
print(rows_with_negative_666666)

Dropped rows:
     Total Population   Median Age  Male Population  Female Population  \
17                  0 -666666666.0                0                  0   
34              19138         39.3             8426              10712   
48                  0 -666666666.0                0                  0   
118                 0 -666666666.0                0                  0   
130                 0 -666666666.0                0                  0   

     White Alone  Black or African American Alone  Asian Alone  \
17             0                                0            0   
34          2544                            12593          731   
48             0                                0            0   
118            0                                0            0   
130            0                                0            0   

     Hispanic or Latino  Number of Households  Median Household Income  ...  \
17                    0                     0               -6666

In [63]:
zip_code_df_cleaned.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Total Population,134.0,54022.746269,26887.244673,3736.0,32704.0,51089.5,76292.0,112750.0
Median Age,134.0,38.315672,4.887853,25.8,34.8,37.55,41.675,51.9
Male Population,134.0,25978.91791,12903.088847,1996.0,16636.25,24825.5,36278.0,59755.0
Female Population,134.0,28043.828358,14115.569858,1740.0,16889.25,27023.0,40046.5,56197.0
White Alone,134.0,20875.261194,15097.630907,1074.0,9402.5,17577.0,28094.75,64293.0
Black or African American Alone,134.0,11954.022388,16392.382049,10.0,1589.75,4396.5,17095.5,81608.0
Asian Alone,134.0,7489.641791,9516.468054,52.0,1536.75,4041.0,9019.75,57749.0
Hispanic or Latino,134.0,16508.164179,16001.235103,203.0,6267.0,9820.5,24694.0,85529.0
Number of Households,134.0,20801.462687,9751.214977,1990.0,13813.0,20766.5,29451.5,41653.0
Median Household Income,134.0,88444.783582,42293.405948,26400.0,59374.25,79367.5,107289.5,250001.0


In [64]:
# Convert both columns to string type
total_df_filtered['zip'] = total_df_filtered['zip'].astype(str)
zip_code_df['zip code tabulation area'] = zip_code_df['zip code tabulation area'].astype(str)

# Now merge
df = pd.merge(total_df_filtered, zip_code_df, 
              left_on='zip', 
              right_on='zip code tabulation area',
              how='left')

In [65]:
df.columns

Index(['listedAt', 'daysOnMarket', 'availableFrom', 'street', 'price',
       'borough', 'neighborhood', 'zip', 'propertyType', 'beds', 'baths',
       'latitude', 'longitude', 'amenities', 'builtIn', 'description',
       'hasVideo', 'PhotosNum', 'state', 'isUndisclosedAddress',
       'isFeaturedListing', 'duplicate', 'Total Population', 'Median Age',
       'Male Population', 'Female Population', 'White Alone',
       'Black or African American Alone', 'Asian Alone', 'Hispanic or Latino',
       'Number of Households', 'Median Household Income', 'Per Capita Income',
       'Population Below Poverty Level', 'Employed', 'Unemployed',
       'Total Income Distribution', 'Median Gross Rent', 'Median Home Value',
       'Occupied Housing Units', 'Vacant Housing Units',
       'Owner-Occupied Units (value < $100,000)', 'Monthly Housing Costs',
       'High School Graduate (Age 25+)', 'Bachelor’s Degree (Age 25+)',
       'Graduate or Professional Degree (Age 25+)', 'English Only', 'Spanis

# POLICE PRECINCT

In [66]:
police_df = pd.read_csv('./external_data/police_p.csv')
print(police_df.iloc[0])

the_geom      MULTIPOLYGON (((-74.04387761573958 40.69018767...
Precinct                                                      1
Shape_Leng                                         79979.409545
Shape_Area                                        47182160.4145
Name: 0, dtype: object


In [67]:
police_df.columns

Index(['the_geom', 'Precinct', 'Shape_Leng', 'Shape_Area'], dtype='object')

In [68]:
from shapely import wkt
import geopandas as gpd

# Convert to GeoDataFrames
gdf = gpd.GeoDataFrame(
    df, 
    geometry=gpd.points_from_xy(df.longitude, df.latitude)
)
police_gdf = gpd.GeoDataFrame(
    police_df,
    geometry=police_df['the_geom'].apply(wkt.loads)
)

# Spatial join
df = gpd.sjoin(gdf, police_gdf, predicate='intersects')

In [69]:
df.columns

Index(['listedAt', 'daysOnMarket', 'availableFrom', 'street', 'price',
       'borough', 'neighborhood', 'zip', 'propertyType', 'beds', 'baths',
       'latitude', 'longitude', 'amenities', 'builtIn', 'description',
       'hasVideo', 'PhotosNum', 'state', 'isUndisclosedAddress',
       'isFeaturedListing', 'duplicate', 'Total Population', 'Median Age',
       'Male Population', 'Female Population', 'White Alone',
       'Black or African American Alone', 'Asian Alone', 'Hispanic or Latino',
       'Number of Households', 'Median Household Income', 'Per Capita Income',
       'Population Below Poverty Level', 'Employed', 'Unemployed',
       'Total Income Distribution', 'Median Gross Rent', 'Median Home Value',
       'Occupied Housing Units', 'Vacant Housing Units',
       'Owner-Occupied Units (value < $100,000)', 'Monthly Housing Costs',
       'High School Graduate (Age 25+)', 'Bachelor’s Degree (Age 25+)',
       'Graduate or Professional Degree (Age 25+)', 'English Only', 'Spanis

In [70]:
df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,Never Married,Currently Married,Divorced,zip code tabulation area,geometry,index_right,the_geom,Precinct,Shape_Leng,Shape_Area
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,Brooklyn,brownsville,11212,rental,1,...,15805.0,8426.0,2263.0,11212,POINT (-73.92229 40.66219),27,MULTIPOLYGON (((-73.90755209573513 40.65118592...,67,43288.943138,93744760.0
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,Brooklyn,brownsville,11212,rental,2,...,15805.0,8426.0,2263.0,11212,POINT (-73.92412 40.66382),27,MULTIPOLYGON (((-73.90755209573513 40.65118592...,67,43288.943138,93744760.0
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,Brooklyn,brownsville,11212,rental,1,...,15805.0,8426.0,2263.0,11212,POINT (-73.91705 40.66851),45,MULTIPOLYGON (((-73.90404639808897 40.67922059...,73,33034.66094,51771860.0
3,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,Brooklyn,brownsville,11212,rental,1,...,15805.0,8426.0,2263.0,11212,POINT (-73.91775 40.65735),27,MULTIPOLYGON (((-73.90755209573513 40.65118592...,67,43288.943138,93744760.0
4,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,Brooklyn,brownsville,11212,rental,3,...,15805.0,8426.0,2263.0,11212,POINT (-73.9226 40.65983),27,MULTIPOLYGON (((-73.90755209573513 40.65118592...,67,43288.943138,93744760.0


In [71]:
df = df.drop(columns=['the_geom', 'Shape_Leng', 'Shape_Area', 'index_right', 'geometry', 'zip code tabulation area'], axis=1)

# SCHOOLS

In [72]:
school_df = pd.read_csv('./external_data/schools.csv')
school_df.head()

Unnamed: 0,fiscal_year,system_code,location_code,location_name,BEDS,Managed_by_name,location_type_description,Location_Category_Description,Grades_text,Grades_final_text,...,Administrative_District_Name,community_school_sup_name,Tier_3_Support_Location_Name,Tier_3_Support_Leader_Name,Tier_2_Support_Location_Name,HighSchool_Network_Location_Code,HighSchool_Network_Name,HighSchool_Network_Superintendent,Community_district 1,Police_precinct
0,2020,15K001,K001,P.S. 001 The Bergen,331500010001,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",...,COMMUNITY SCHOOL DISTRICT 15,"SKOP, ANITA",NYCDOE Borough Office - Brooklyn North,,School Support Team 5- Brooklyn North,,,,307.0,72.0
1,2020,17K002,K002,Parkside Preparatory Academy,331700010002,DOE,General Academic,Junior High-Intermediate-Middle,"06,07,08,SE",060708,...,COMMUNITY SCHOOL DISTRICT 17,"ELLIS, CLARENCE",NYCDOE Borough Office - Brooklyn South,Mauriciere de Govia,School Support Team 2- Brooklyn South,,,,309.0,71.0
2,2020,13K003,K003,P.S. 003 The Bedford Village,331300010003,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",...,COMMUNITY SCHOOL DISTRICT 13,"SAMUELS, KAMAR",NYCDOE Borough Office - Brooklyn North,,School Support Team 3- Brooklyn North,,,,303.0,79.0
3,2020,75K004,K004,P.S. K004,307500013004,DOE,Special Education,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05,06,07,SE",...,CITYWIDE SPECIAL EDUCATION,"LOUISSAINT, KETLER",D75 CITYWIDE BCO,Tillman Roberto,Children First Network 752,,,,305.0,75.0
4,2020,16K005,K005,P.S. 005 Dr. Ronald McNair,331600010005,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",...,COMMUNITY SCHOOL DISTRICT 16,"MARTIN, YOLANDA",NYCDOE Borough Office - Brooklyn North,,School Support Team 4- Brooklyn North,,,,303.0,81.0


In [73]:
school_df.columns

Index(['fiscal_year', 'system_code', 'location_code', 'location_name', 'BEDS',
       'Managed_by_name', 'location_type_description',
       'Location_Category_Description', 'Grades_text', 'Grades_final_text',
       'open_date', 'Status_descriptions', 'Primary_building_code',
       'primary_address_line_1', 'State_code', 'X_COORDINATE', 'Y_COORDINATE',
       'LONGITUDE', 'LATITUDE', 'Community_district', 'Council-district',
       'Census_tract', 'Borough_block_lot', 'NTA', 'NTA_Name',
       'Principal_Name', 'Principal_title', 'Principal_phone_number',
       'fax_number', 'Geographical_District_code',
       'Administrative_District_Code', 'Administrative_District_Name',
       'community_school_sup_name', 'Tier_3_Support_Location_Name',
       'Tier_3_Support_Leader_Name', 'Tier_2_Support_Location_Name',
       'HighSchool_Network_Location_Code', 'HighSchool_Network_Name',
       'HighSchool_Network_Superintendent', 'Community_district 1',
       'Police_precinct'],
      dtype=

In [74]:
# Filter out rows with missing Police_precinct values
filtered_school_df = school_df.dropna(subset=['Police_precinct'])

# Group by Police_precinct and count
police_precinct_counts = filtered_school_df['Police_precinct'].value_counts().to_dict()

# Display the dictionary
print(police_precinct_counts)

{40.0: 72, 75.0: 71, 42.0: 69, 44.0: 63, 43.0: 58, 73.0: 56, 48.0: 51, 79.0: 46, 105.0: 42, 67.0: 41, 46.0: 39, 47.0: 38, 114.0: 37, 83.0: 37, 49.0: 36, 23.0: 36, 52.0: 36, 71.0: 35, 120.0: 34, 34.0: 34, 107.0: 34, 113.0: 34, 45.0: 34, 109.0: 33, 81.0: 33, 41.0: 33, 77.0: 32, 72.0: 31, 60.0: 30, 90.0: 29, 122.0: 28, 69.0: 28, 50.0: 28, 25.0: 28, 20.0: 28, 70.0: 27, 7.0: 27, 18.0: 26, 61.0: 26, 84.0: 26, 13.0: 26, 110.0: 25, 1.0: 25, 32.0: 25, 62.0: 24, 111.0: 24, 103.0: 24, 28.0: 23, 104.0: 23, 106.0: 23, 88.0: 23, 108.0: 22, 102.0: 22, 66.0: 21, 19.0: 20, 115.0: 20, 101.0: 20, 33.0: 19, 9.0: 19, 26.0: 19, 94.0: 18, 76.0: 17, 68.0: 17, 78.0: 16, 112.0: 16, 10.0: 15, 100.0: 15, 5.0: 15, 123.0: 15, 24.0: 15, 30.0: 11, 63.0: 11, 6.0: 10, 121.0: 7, 14.0: 6, 17.0: 4}


In [75]:
# Add new column using map function
df['schools_in_precinct'] = df['Precinct'].map(police_precinct_counts)

# Convert to integer type since counts should be whole numbers
df['schools_in_precinct'] = df['schools_in_precinct'].astype(int)

In [76]:
df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,High School Graduate (Age 25+),Bachelor’s Degree (Age 25+),Graduate or Professional Degree (Age 25+),English Only,Spanish,Never Married,Currently Married,Divorced,Precinct,schools_in_precinct
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,Brooklyn,brownsville,11212,rental,1,...,16815.0,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,Brooklyn,brownsville,11212,rental,2,...,16815.0,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,Brooklyn,brownsville,11212,rental,1,...,16815.0,6667.0,201.0,,,15805.0,8426.0,2263.0,73,56
3,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,Brooklyn,brownsville,11212,rental,1,...,16815.0,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41
4,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,Brooklyn,brownsville,11212,rental,3,...,16815.0,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41


# SUBWAYS

In [77]:
subway_df = pd.read_csv('./external_data/subway.csv')
subway_df.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,101,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,1,
1,101N,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101.0
2,101S,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101.0
3,103,,238 St,,40.884667,-73.90087,,,1,
4,103N,,238 St,,40.884667,-73.90087,,,0,103.0


In [78]:
subway_df.columns

Index(['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat',
       'stop_lon', 'zone_id', 'stop_url', 'location_type', 'parent_station'],
      dtype='object')

In [79]:
import numpy as np
def vectorized_count_stations(df, subway_df, radius=0.3):
    R = 3959.87433  # Earth's radius in miles
    
    # Convert to radians
    lat1 = np.radians(df['latitude'].values)[:, None]
    lon1 = np.radians(df['longitude'].values)[:, None]
    lat2 = np.radians(subway_df['stop_lat'].values)
    lon2 = np.radians(subway_df['stop_lon'].values)
    
    # Calculate distances
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    distances = R * c
    
    # Count stations within radius
    return (distances <= radius).sum(axis=1)

df['nearby_subway_stations'] = vectorized_count_stations(df, subway_df)

In [80]:
df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,Bachelor’s Degree (Age 25+),Graduate or Professional Degree (Age 25+),English Only,Spanish,Never Married,Currently Married,Divorced,Precinct,schools_in_precinct,nearby_subway_stations
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,Brooklyn,brownsville,11212,rental,1,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,3
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,Brooklyn,brownsville,11212,rental,2,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,3
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,Brooklyn,brownsville,11212,rental,1,...,6667.0,201.0,,,15805.0,8426.0,2263.0,73,56,0
3,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,Brooklyn,brownsville,11212,rental,1,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,3
4,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,Brooklyn,brownsville,11212,rental,3,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,0


In [81]:
df.columns

Index(['listedAt', 'daysOnMarket', 'availableFrom', 'street', 'price',
       'borough', 'neighborhood', 'zip', 'propertyType', 'beds', 'baths',
       'latitude', 'longitude', 'amenities', 'builtIn', 'description',
       'hasVideo', 'PhotosNum', 'state', 'isUndisclosedAddress',
       'isFeaturedListing', 'duplicate', 'Total Population', 'Median Age',
       'Male Population', 'Female Population', 'White Alone',
       'Black or African American Alone', 'Asian Alone', 'Hispanic or Latino',
       'Number of Households', 'Median Household Income', 'Per Capita Income',
       'Population Below Poverty Level', 'Employed', 'Unemployed',
       'Total Income Distribution', 'Median Gross Rent', 'Median Home Value',
       'Occupied Housing Units', 'Vacant Housing Units',
       'Owner-Occupied Units (value < $100,000)', 'Monthly Housing Costs',
       'High School Graduate (Age 25+)', 'Bachelor’s Degree (Age 25+)',
       'Graduate or Professional Degree (Age 25+)', 'English Only', 'Spanis

# Crime

In [82]:
import kagglehub
# Download latest version
path = kagglehub.dataset_download("supreeth888/nypd-data")

In [83]:
import shutil
import os

# Source path (from cache)
# change this path according to your system
cache_path = '/Users/abhishekmahajan/.cache/kagglehub/datasets/supreeth888/nypd-data/versions/1'

# Destination path (your project directory)
project_path = './external_data'  # Change this to your desired project subdirectory

# Create the destination directory if it doesn't exist
os.makedirs(project_path, exist_ok=True)

# Copy all files from cache to project directory
for file in os.listdir(cache_path):
    src_file = os.path.join(cache_path, file)
    dst_file = os.path.join(project_path, file)
    shutil.copy2(src_file, dst_file)

In [84]:
complaints_data = pd.read_csv('./external_data/NYPD_Complaint_Data_Historic.csv')

  complaints_data = pd.read_csv('./external_data/NYPD_Complaint_Data_Historic.csv')


In [85]:
complaints_data.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,...,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,394506329,12/31/2019,17:30:00,,,32.0,12/31/2019,118,DANGEROUS WEAPONS,793.0,...,,,40.820927,-73.943324,"(40.82092679700002, -73.94332421899996)",PATROL BORO MAN NORTH,,UNKNOWN,UNKNOWN,E
1,968873685,12/29/2019,16:31:00,12/29/2019,16:54:00,47.0,12/29/2019,113,FORGERY,729.0,...,,,40.885701,-73.86164,"(40.885701406000074, -73.86164032499995)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E
2,509837549,12/15/2019,18:45:00,,,109.0,12/29/2019,578,HARRASSMENT 2,638.0,...,M,,40.742281,-73.819824,"(40.74228115600005, -73.81982408)",PATROL BORO QUEENS NORTH,,25-44,WHITE HISPANIC,F
3,352454313,12/28/2019,01:00:00,,,47.0,12/28/2019,126,MISCELLANEOUS PENAL LAW,117.0,...,M,,40.875311,-73.847545,"(40.87531145100007, -73.84754521099995)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E
4,248803469,09/05/2008,21:41:00,,,,09/05/2008,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,,,40.698827,-73.938819,"(40.698827283, -73.938819047)",,,25-44,BLACK,M


In [86]:
complaints_data['OFNS_DESC'].nunique()

71

In [87]:
complaints_data['OFNS_DESC'].unique()

array(['DANGEROUS WEAPONS', 'FORGERY', 'HARRASSMENT 2',
       'MISCELLANEOUS PENAL LAW', 'MURDER & NON-NEGL. MANSLAUGHTER',
       'BURGLARY', 'DANGEROUS DRUGS', 'PETIT LARCENY',
       'OFF. AGNST PUB ORD SENSBLTY &', 'GRAND LARCENY', 'FELONY ASSAULT',
       'ASSAULT 3 & RELATED OFFENSES', 'ARSON', 'RAPE', 'SEX CRIMES',
       'GRAND LARCENY OF MOTOR VEHICLE', 'ROBBERY',
       'CRIMINAL MISCHIEF & RELATED OF', 'THEFT-FRAUD',
       'VEHICLE AND TRAFFIC LAWS', 'CRIMINAL TRESPASS',
       'OFFENSES INVOLVING FRAUD', 'FRAUDS',
       'OFFENSES AGAINST PUBLIC ADMINI', 'OFFENSES AGAINST THE PERSON',
       'ADMINISTRATIVE CODE', 'INTOXICATED & IMPAIRED DRIVING',
       'ESCAPE 3', 'NYS LAWS-UNCLASSIFIED FELONY',
       'POSSESSION OF STOLEN PROPERTY', 'THEFT OF SERVICES',
       'KIDNAPPING & RELATED OFFENSES', 'OTHER OFFENSES RELATED TO THEF',
       'UNAUTHORIZED USE OF A VEHICLE', "BURGLAR'S TOOLS",
       'ENDAN WELFARE INCOMP', 'FRAUDULENT ACCOSTING',
       'AGRICULTURE & MRKTS LA

In [88]:
complaints_data.columns

Index(['CMPLNT_NUM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT',
       'CMPLNT_TO_TM', 'ADDR_PCT_CD', 'RPT_DT', 'KY_CD', 'OFNS_DESC', 'PD_CD',
       'PD_DESC', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD', 'BORO_NM',
       'LOC_OF_OCCUR_DESC', 'PREM_TYP_DESC', 'JURIS_DESC', 'JURISDICTION_CODE',
       'PARKS_NM', 'HADEVELOPT', 'HOUSING_PSA', 'X_COORD_CD', 'Y_COORD_CD',
       'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT',
       'Latitude', 'Longitude', 'Lat_Lon', 'PATROL_BORO', 'STATION_NAME',
       'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX'],
      dtype='object')

In [89]:
# Convert ADDR_PCT_CD to integer, replacing any non-numeric values with NaN
complaints_data['ADDR_PCT_CD'] = pd.to_numeric(complaints_data['ADDR_PCT_CD'], errors='coerce')

# Drop any NaN values
complaints_df = complaints_data.dropna(subset=['ADDR_PCT_CD'])

# Convert to integer type
complaints_df['ADDR_PCT_CD'] = complaints_df['ADDR_PCT_CD'].astype(int)

# Calculate count of crimes per precinct
crime_count_per_precinct = complaints_df['ADDR_PCT_CD'].value_counts()

# Calculate total number of complaints
total_complaints = len(complaints_df)

# Calculate crime rate per precinct
crime_rate_per_precinct = (crime_count_per_precinct / total_complaints) * 100

# Sort by precinct number
crime_rate_per_precinct = crime_rate_per_precinct.sort_index()

# Display results
print("Crime Rate per Precinct:")
for precinct, rate in crime_rate_per_precinct.items():
    print(f"Precinct {precinct}: {rate:.2f}%")

Crime Rate per Precinct:
Precinct 1: 1.26%
Precinct 5: 0.89%
Precinct 6: 1.07%
Precinct 7: 0.85%
Precinct 9: 1.18%
Precinct 10: 0.84%
Precinct 13: 1.47%
Precinct 14: 2.26%
Precinct 17: 0.69%
Precinct 18: 1.59%
Precinct 19: 1.37%
Precinct 20: 0.83%
Precinct 22: 0.08%
Precinct 23: 1.30%
Precinct 24: 1.00%
Precinct 25: 1.36%
Precinct 26: 0.67%
Precinct 28: 1.07%
Precinct 30: 0.93%
Precinct 32: 1.36%
Precinct 33: 0.90%
Precinct 34: 1.10%
Precinct 40: 2.41%
Precinct 41: 1.35%
Precinct 42: 1.62%
Precinct 43: 2.56%
Precinct 44: 2.48%
Precinct 45: 1.26%
Precinct 46: 2.10%
Precinct 47: 1.91%
Precinct 48: 1.63%
Precinct 49: 1.35%
Precinct 50: 0.92%
Precinct 52: 2.09%
Precinct 60: 1.26%
Precinct 61: 1.22%
Precinct 62: 1.20%
Precinct 63: 1.03%
Precinct 66: 0.95%
Precinct 67: 1.89%
Precinct 68: 1.00%
Precinct 69: 0.96%
Precinct 70: 1.57%
Precinct 71: 1.38%
Precinct 72: 1.09%
Precinct 73: 2.04%
Precinct 75: 3.21%
Precinct 76: 0.55%
Precinct 77: 1.42%
Precinct 78: 0.73%
Precinct 79: 1.59%
Precinct 81

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complaints_df['ADDR_PCT_CD'] = complaints_df['ADDR_PCT_CD'].astype(int)


In [90]:
# Create a mapping dictionary of crime rates per precinct
crime_rate_per_precinct_mapping = crime_rate_per_precinct.to_dict()

# Add new column using map function
df['crime_rate'] = df['Precinct'].map(crime_rate_per_precinct_mapping)

In [91]:
df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,Graduate or Professional Degree (Age 25+),English Only,Spanish,Never Married,Currently Married,Divorced,Precinct,schools_in_precinct,nearby_subway_stations,crime_rate
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,Brooklyn,brownsville,11212,rental,1,...,201.0,,,15805.0,8426.0,2263.0,67,41,3,1.893937
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,Brooklyn,brownsville,11212,rental,2,...,201.0,,,15805.0,8426.0,2263.0,67,41,3,1.893937
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,Brooklyn,brownsville,11212,rental,1,...,201.0,,,15805.0,8426.0,2263.0,73,56,0,2.036421
3,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,Brooklyn,brownsville,11212,rental,1,...,201.0,,,15805.0,8426.0,2263.0,67,41,3,1.893937
4,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,Brooklyn,brownsville,11212,rental,3,...,201.0,,,15805.0,8426.0,2263.0,67,41,0,1.893937


In [92]:
len(df)

7842

In [93]:
df.to_csv('final_rental_merged.csv')