In [None]:
import usaddress
import string
from Levenshtein import distance as levenshtein_distance
import pandas as pd
import ast

In [None]:
punctuation_without_hash = string.punctuation.replace('#', '')
def remove_punctuation(text):
    try:
        return text.translate(str.maketrans('', '', punctuation_without_hash))
    except AttributeError:
        print(text)
        return text

directional_map = {
    'n': 'north', 'north': 'north',
    's': 'south', 'south': 'south',
    'e': 'east', 'east': 'east',
    'w': 'west', 'west': 'west',
    'ne': 'northeast', 'northeast': 'northeast',
    'nw': 'northwest', 'northwest': 'northwest',
    'se': 'southeast', 'southeast': 'southeast',
    'sw': 'southwest', 'southwest': 'southwest'
}
suffix_map = {
    'st': 'street', 'street': 'street',
    'ave': 'avenue', 'avenue': 'avenue',
    'blvd': 'boulevard', 'boulevard': 'boulevard',
    'rd': 'road', 'road': 'road',
    'dr': 'drive', 'drive': 'drive',
    'ct': 'court', 'court': 'court',
    'pl': 'place', 'place': 'place',
    'ln': 'lane', 'lane': 'lane',
    'ter': 'terrace', 'terrace': 'terrace'
}

def normalize_street_name(street_name):
    # Split the street name into parts
    parts = street_name.lower().split()
    normalized_parts = []

    for part in parts:
        # Remove punctuation
        part_clean = remove_punctuation(part)
        # Normalize directionals and suffixes
        if part_clean in directional_map:
            normalized_parts.append(directional_map[part_clean])
        elif part_clean in suffix_map:
            normalized_parts.append(suffix_map[part_clean])
        else:
            normalized_parts.append(part_clean)
    return ' '.join(normalized_parts)

def normalize_with_usaddress(address):
    try:
        # Clean the address before parsing
        # print(address)
        clean_address = remove_punctuation(address)
        # print(clean_address)
        try:
            parsed = usaddress.tag(clean_address)[0]
        except TypeError:
            print(f"Error parsing address '{address!r}'")

        # print(parsed)
        
        # Normalize StreetNamePreDirectional
        if 'StreetNamePreDirectional' in parsed:
            value = parsed['StreetNamePreDirectional'].lower()
            parsed['StreetNamePreDirectional'] = directional_map.get(value, value)
        
        # Normalize StreetNamePostType
        if 'StreetNamePostType' in parsed:
            value = parsed['StreetNamePostType'].lower()
            parsed['StreetNamePostType'] = suffix_map.get(value, value)
        
        # Normalize StreetName
        if 'StreetName' in parsed:
            parsed['StreetName'] = normalize_street_name(parsed['StreetName'])
        
        # Normalize OccupancyIdentifier
        occupancy_id = parsed.get('OccupancyIdentifier', '')
        if occupancy_id:
            parsed['OccupancyIdentifier'] = remove_punctuation(occupancy_id).lower()
        else:
            # Check for SubaddressIdentifier if OccupancyIdentifier is missing
            subaddress_id = parsed.get('SubaddressIdentifier', '')
            if subaddress_id:
                parsed['OccupancyIdentifier'] = remove_punctuation(subaddress_id).lower()
        
        # Reconstruct the normalized address without OccupancyType
        normalized_address = " ".join(filter(None, [
            parsed.get('AddressNumber', '').lower(),
            parsed.get('StreetNamePreDirectional', '').lower(),
            parsed.get('StreetName', '').lower(),
            parsed.get('StreetNamePostType', '').lower(),
            parsed.get('OccupancyIdentifier', '')
        ])).lower()
        
        return normalized_address
    
    except usaddress.RepeatedLabelError as e:
        # Log the error and return a cleaned, lowercased address
        print(f"Error parsing address '{address}': {e}")
        return remove_punctuation(address).lower()

def are_addresses_same(addr1, addr2, threshold=2):
    norm_addr1 = normalize_with_usaddress(addr1)
    norm_addr2 = normalize_with_usaddress(addr2)
    distance = levenshtein_distance(norm_addr1, norm_addr2)
    return distance <= threshold

In [None]:
# Example
address1 = "993 Dumont Avenue #2"
address2 = "993 Dumont Ave APT 2"

norm_addr1 = normalize_with_usaddress(address1)
norm_addr2 = normalize_with_usaddress(address2)

print("Normalized Address 1:", norm_addr1)
print("Normalized Address 2:", norm_addr2)
print("Are addresses the same?", are_addresses_same(address1, address2))

In [None]:
streeteasy_df = pd.read_csv('./streetezy_scraper/streeteasy_rentals.csv')
streeteasy_df.head()

In [None]:
zillow_df = pd.read_csv('./zillow_scraper/rent_data_zillow.csv')
zillow_df.head()

## Remove stuff from zillow DF


In [None]:
zillow_df.drop(columns=['providerListingId', 'hasImage', 'variableData', 'hdpData', 'has3DModel', 'brokerName', 'marketingTreatments', 'timeOnZillow',
'daysOnZillowHDP', 'timeOnZillowHDP', 'daysOnZillowHDP'], inplace=True)

## Rename stuff for homegenity

In [None]:
def convert_to_days(time_str):
    if time_str is None or str(time_str) == 'nan':
        return -1
    try:
        borken_down = time_str.split()
    except AttributeError:
        print(time_str)
        return -1

    if borken_down[1].startswith('day'):
        return int(borken_down[0])
    elif borken_down[1].startswith('hour'):
        return 0
    else:
        return -1

In [None]:
zillow_df.rename(columns={'addressStreet': 'street', 'addressCity': 'borough', 'addressState': 'state', 'addressZipcode': 'zip', 'carouselPhotos' : 'PhotosNum', 'availabilityDate':
'availableFrom', 'houseType': 'propertyType', 'PhotosNum': 'photosNum'}, inplace=True)
zillow_df['daysOnMarket'] = zillow_df['timeOnZillowText'].apply(lambda x: convert_to_days(x))
zillow_df.drop(columns=['timeOnZillowText'], inplace=True)
zillow_df.head(5)


## Delete Stuff from streeteasy DF

In [None]:
streeteasy_df.drop(columns=['id', 'sqft', 'type', 'building', 'agents', 'noFee', 'floorplans',], inplace=True)

## Rename Stuff from streeteasy DF

In [None]:
streeteasy_df.head()

streeteasy_df['images'] = streeteasy_df['images'].apply(lambda x: len(ast.literal_eval(x)))
streeteasy_df['images'].isnull().sum()

In [None]:
# streeteasy_df.rename(columns={})
streeteasy_df['hasVideo'] = streeteasy_df['videos'].apply(lambda x: 1 if len(x) > 0 else 0)
streeteasy_df['PhotosNum'] = streeteasy_df['images']
streeteasy_df.drop(columns=['videos', 'images'], inplace=True)
streeteasy_df.rename(columns={'address': 'street', 'zipcode': 'zip', 'property_type': 'propertyType', 'bedrooms':'beds', 'bathrooms':'baths'}, inplace=True)
streeteasy_df.head()


In [None]:
total_df = pd.concat([streeteasy_df, zillow_df], ignore_index=True)
total_df.head()

In [None]:
pd.set_option('display.max_rows', None)
total_df['PhotosNum'].value_counts()


In [None]:
addresses = []
total_df['duplicate'] = False
global failed
failed =  0
def remove_duplicate_custom(row):
    try:
        full_add = normalize_with_usaddress(row['street'])
    except (AttributeError, KeyError):
        print(row)
        failed += 1
        return row

    if full_add is None:
        failed += 1
        return row
    
    for address in addresses:
        if levenshtein_distance(full_add, address) < 3:
            row['duplicate'] = True
            return row
    addresses.append(full_add)
    return row

total_df.drop(6104, inplace=True)

In [None]:
total_df = total_df.apply(remove_duplicate_custom, axis=1)
print(failed)
print(total_df.columns)
indices_to_drop = total_df[total_df['duplicate'] == True].index
total_df.drop(indices_to_drop, inplace=True)
total_df.head()

In [None]:
pd.set_option('display.max_rows', None)
total_df['borough'].value_counts()

In [None]:
pd.set_option('display.max_rows', None)

nyc_boroughs = {
    'Manhattan': [
        'Manhattan', 'Harlem', 'Greenwich Village', 'Astoria', 'Hudson Yards',
        'Upper East Side', 'Upper West Side', 'Inwood', 'Washington Heights',
        'Chelsea', 'East Village', 'Financial District', 'Tribeca', 'New York',
    ],
    'Brooklyn': [
        'Brooklyn', 'Williamsburg', 'Brooklyn Heights', 'Park Slope', 'Bushwick',
        'DUMBO', 'Fort Greene', 'Crown Heights', 'Greenpoint', 'Bensonhurst',
        'Bay Ridge', 'Coney Island', 'Brighton Beach'
    ],
    'Queens': [
        'Queens', 'Astoria', 'Flushing', 'Jamaica', 'Long Island City',
        'Forest Hills', 'Woodside', 'Ridgewood', 'Bayside', 'Corona',
        'Jackson Heights', 'Elmhurst', 'Sunnyside'
    ],
    'Bronx': [
        'Bronx', 'Riverdale', 'Fordham', 'Pelham Bay', 'Throgs Neck',
        'Morris Park', 'Belmont', 'Kingsbridge', 'Parkchester'
    ],
    'Staten Island': [
        'Staten Island', 'St. George', 'Tottenville', 'Great Kills',
        'New Dorp', 'Port Richmond'
    ]
}

# Flatten the dictionary to map each neighborhood to its borough
borough_mapping = {
    neighborhood.lower(): borough
    for borough, neighborhoods in nyc_boroughs.items()
    for neighborhood in neighborhoods
}

# Normalize column and map values
total_df['borough_normalized'] = total_df['borough'].str.lower().map(borough_mapping)

# Identify rows that will be removed
removed_rows = total_df[total_df['borough_normalized'].isna()]

# Filter the DataFrame to keep only valid boroughs
total_df_filtered = total_df.dropna(subset=['borough_normalized'])

# Reset the DataFrame with cleaned borough information
total_df_filtered['borough'] = total_df_filtered['borough_normalized']
total_df_filtered = total_df_filtered.drop(columns=['borough_normalized'])
print(total_df_filtered['borough'].value_counts())



# CENSUS DATA

In [None]:
zip_code_df = pd.read_csv('./census_data/census_data.csv')

In [None]:
rows_with_negative_666666 = zip_code_df[(zip_code_df == -666666666).any(axis=1)]

zip_code_df_cleaned = zip_code_df[(zip_code_df != -666666666).all(axis=1)]

print("Dropped rows:")
print(rows_with_negative_666666)

In [None]:
zip_code_df_cleaned.describe(include='all').T

In [None]:
# Convert both columns to string type
total_df_filtered['zip'] = total_df_filtered['zip'].astype(str)
zip_code_df['zip code tabulation area'] = zip_code_df['zip code tabulation area'].astype(str)

# Now merge
df = pd.merge(total_df_filtered, zip_code_df, 
              left_on='zip', 
              right_on='zip code tabulation area',
              how='left')

In [None]:
df.columns

# POLICE PRECINCT

In [None]:
police_df = pd.read_csv('./external_data/police_p.csv')
print(police_df.iloc[0])

In [None]:
police_df.columns

In [None]:
from shapely import wkt
import geopandas as gpd

# Convert to GeoDataFrames
gdf = gpd.GeoDataFrame(
    df, 
    geometry=gpd.points_from_xy(df.longitude, df.latitude)
)
police_gdf = gpd.GeoDataFrame(
    police_df,
    geometry=police_df['the_geom'].apply(wkt.loads)
)

# Spatial join
df = gpd.sjoin(gdf, police_gdf, predicate='intersects')

In [None]:
df.columns

In [None]:
df.head()

In [None]:
df = df.drop(columns=['the_geom', 'Shape_Leng', 'Shape_Area', 'index_right', 'geometry', 'zip code tabulation area'], axis=1)

# SCHOOLS

In [None]:
school_df = pd.read_csv('./external_data/schools.csv')
school_df.head()

In [None]:
school_df.columns

In [None]:
# Filter out rows with missing Police_precinct values
filtered_school_df = school_df.dropna(subset=['Police_precinct'])

# Group by Police_precinct and count
police_precinct_counts = filtered_school_df['Police_precinct'].value_counts().to_dict()

# Display the dictionary
print(police_precinct_counts)

In [None]:
# Add new column using map function
df['schools_in_precinct'] = df['Precinct'].map(police_precinct_counts)

# Convert to integer type since counts should be whole numbers
df['schools_in_precinct'] = df['schools_in_precinct'].astype(int)

In [None]:
df.head()

In [None]:
df['PhotosNum'].value_counts()

# SUBWAYS

In [52]:
subway_df = pd.read_csv('./external_data/subway.csv')
subway_df.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,101,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,1,
1,101N,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101.0
2,101S,,Van Cortlandt Park - 242 St,,40.889248,-73.898583,,,0,101.0
3,103,,238 St,,40.884667,-73.90087,,,1,
4,103N,,238 St,,40.884667,-73.90087,,,0,103.0


In [53]:
subway_df.columns

Index(['stop_id', 'stop_code', 'stop_name', 'stop_desc', 'stop_lat',
       'stop_lon', 'zone_id', 'stop_url', 'location_type', 'parent_station'],
      dtype='object')

In [54]:
import numpy as np
def vectorized_count_stations(df, subway_df, radius=0.3):
    R = 3959.87433  # Earth's radius in miles
    
    # Convert to radians
    lat1 = np.radians(df['latitude'].values)[:, None]
    lon1 = np.radians(df['longitude'].values)[:, None]
    lat2 = np.radians(subway_df['stop_lat'].values)
    lon2 = np.radians(subway_df['stop_lon'].values)
    
    # Calculate distances
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    distances = R * c
    
    # Count stations within radius
    return (distances <= radius).sum(axis=1)

df['nearby_subway_stations'] = vectorized_count_stations(df, subway_df)

In [55]:
df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,Bachelor’s Degree (Age 25+),Graduate or Professional Degree (Age 25+),English Only,Spanish,Never Married,Currently Married,Divorced,Precinct,schools_in_precinct,nearby_subway_stations
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,Brooklyn,brownsville,11212,rental,1,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,3
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,Brooklyn,brownsville,11212,rental,2,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,3
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,Brooklyn,brownsville,11212,rental,1,...,6667.0,201.0,,,15805.0,8426.0,2263.0,73,56,0
3,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,Brooklyn,brownsville,11212,rental,1,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,3
4,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,Brooklyn,brownsville,11212,rental,3,...,6667.0,201.0,,,15805.0,8426.0,2263.0,67,41,0


In [56]:
df.columns

Index(['listedAt', 'daysOnMarket', 'availableFrom', 'street', 'price',
       'borough', 'neighborhood', 'zip', 'propertyType', 'beds', 'baths',
       'latitude', 'longitude', 'amenities', 'builtIn', 'description',
       'hasVideo', 'PhotosNum', 'state', 'isUndisclosedAddress',
       'isFeaturedListing', 'duplicate', 'Total Population', 'Median Age',
       'Male Population', 'Female Population', 'White Alone',
       'Black or African American Alone', 'Asian Alone', 'Hispanic or Latino',
       'Number of Households', 'Median Household Income', 'Per Capita Income',
       'Population Below Poverty Level', 'Employed', 'Unemployed',
       'Total Income Distribution', 'Median Gross Rent', 'Median Home Value',
       'Occupied Housing Units', 'Vacant Housing Units',
       'Owner-Occupied Units (value < $100,000)', 'Monthly Housing Costs',
       'High School Graduate (Age 25+)', 'Bachelor’s Degree (Age 25+)',
       'Graduate or Professional Degree (Age 25+)', 'English Only', 'Spanis

In [57]:
df['PhotosNum'].value_counts()

PhotosNum
0     3793
6      431
7      415
8      364
9      355
5      299
10     291
20     245
11     238
12     215
4      207
13     164
14     159
15     125
16     101
17      97
18      74
3       72
19      55
1       33
2       22
23      11
22      11
25      10
21       8
24       8
32       6
27       5
28       5
26       5
29       3
33       2
30       2
36       2
48       1
46       1
34       1
44       1
35       1
51       1
73       1
37       1
31       1
Name: count, dtype: int64

# Crime

In [58]:
import kagglehub
# Download latest version
path = kagglehub.dataset_download("supreeth888/nypd-data")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import shutil
import os

# Source path (from cache)
# change this path according to your system
cache_path = '/Users/abhishekmahajan/.cache/kagglehub/datasets/supreeth888/nypd-data/versions/1'

# Destination path (your project directory)
project_path = './external_data'  # Change this to your desired project subdirectory

# Create the destination directory if it doesn't exist
os.makedirs(project_path, exist_ok=True)

# Copy all files from cache to project directory
for file in os.listdir(cache_path):
    src_file = os.path.join(cache_path, file)
    dst_file = os.path.join(project_path, file)
    shutil.copy2(src_file, dst_file)

In [59]:
complaints_data = pd.read_csv('./external_data/NYPD_Complaint_Data_Historic.csv')

  complaints_data = pd.read_csv('./external_data/NYPD_Complaint_Data_Historic.csv')


In [60]:
complaints_data.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,...,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,394506329,12/31/2019,17:30:00,,,32.0,12/31/2019,118,DANGEROUS WEAPONS,793.0,...,,,40.820927,-73.943324,"(40.82092679700002, -73.94332421899996)",PATROL BORO MAN NORTH,,UNKNOWN,UNKNOWN,E
1,968873685,12/29/2019,16:31:00,12/29/2019,16:54:00,47.0,12/29/2019,113,FORGERY,729.0,...,,,40.885701,-73.86164,"(40.885701406000074, -73.86164032499995)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E
2,509837549,12/15/2019,18:45:00,,,109.0,12/29/2019,578,HARRASSMENT 2,638.0,...,M,,40.742281,-73.819824,"(40.74228115600005, -73.81982408)",PATROL BORO QUEENS NORTH,,25-44,WHITE HISPANIC,F
3,352454313,12/28/2019,01:00:00,,,47.0,12/28/2019,126,MISCELLANEOUS PENAL LAW,117.0,...,M,,40.875311,-73.847545,"(40.87531145100007, -73.84754521099995)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E
4,248803469,09/05/2008,21:41:00,,,,09/05/2008,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,,,40.698827,-73.938819,"(40.698827283, -73.938819047)",,,25-44,BLACK,M


In [61]:
complaints_data['OFNS_DESC'].nunique()

71

In [62]:
complaints_data['OFNS_DESC'].unique()

array(['DANGEROUS WEAPONS', 'FORGERY', 'HARRASSMENT 2',
       'MISCELLANEOUS PENAL LAW', 'MURDER & NON-NEGL. MANSLAUGHTER',
       'BURGLARY', 'DANGEROUS DRUGS', 'PETIT LARCENY',
       'OFF. AGNST PUB ORD SENSBLTY &', 'GRAND LARCENY', 'FELONY ASSAULT',
       'ASSAULT 3 & RELATED OFFENSES', 'ARSON', 'RAPE', 'SEX CRIMES',
       'GRAND LARCENY OF MOTOR VEHICLE', 'ROBBERY',
       'CRIMINAL MISCHIEF & RELATED OF', 'THEFT-FRAUD',
       'VEHICLE AND TRAFFIC LAWS', 'CRIMINAL TRESPASS',
       'OFFENSES INVOLVING FRAUD', 'FRAUDS',
       'OFFENSES AGAINST PUBLIC ADMINI', 'OFFENSES AGAINST THE PERSON',
       'ADMINISTRATIVE CODE', 'INTOXICATED & IMPAIRED DRIVING',
       'ESCAPE 3', 'NYS LAWS-UNCLASSIFIED FELONY',
       'POSSESSION OF STOLEN PROPERTY', 'THEFT OF SERVICES',
       'KIDNAPPING & RELATED OFFENSES', 'OTHER OFFENSES RELATED TO THEF',
       'UNAUTHORIZED USE OF A VEHICLE', "BURGLAR'S TOOLS",
       'ENDAN WELFARE INCOMP', 'FRAUDULENT ACCOSTING',
       'AGRICULTURE & MRKTS LA

In [63]:
complaints_data.columns

Index(['CMPLNT_NUM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT',
       'CMPLNT_TO_TM', 'ADDR_PCT_CD', 'RPT_DT', 'KY_CD', 'OFNS_DESC', 'PD_CD',
       'PD_DESC', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD', 'BORO_NM',
       'LOC_OF_OCCUR_DESC', 'PREM_TYP_DESC', 'JURIS_DESC', 'JURISDICTION_CODE',
       'PARKS_NM', 'HADEVELOPT', 'HOUSING_PSA', 'X_COORD_CD', 'Y_COORD_CD',
       'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT',
       'Latitude', 'Longitude', 'Lat_Lon', 'PATROL_BORO', 'STATION_NAME',
       'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX'],
      dtype='object')

In [64]:
# Convert ADDR_PCT_CD to integer, replacing any non-numeric values with NaN
complaints_data['ADDR_PCT_CD'] = pd.to_numeric(complaints_data['ADDR_PCT_CD'], errors='coerce')

# Drop any NaN values
complaints_df = complaints_data.dropna(subset=['ADDR_PCT_CD'])

# Convert to integer type
complaints_df['ADDR_PCT_CD'] = complaints_df['ADDR_PCT_CD'].astype(int)

# Calculate count of crimes per precinct
crime_count_per_precinct = complaints_df['ADDR_PCT_CD'].value_counts()

# Calculate total number of complaints
total_complaints = len(complaints_df)

# Calculate crime rate per precinct
crime_rate_per_precinct = (crime_count_per_precinct / total_complaints) * 100

# Sort by precinct number
crime_rate_per_precinct = crime_rate_per_precinct.sort_index()

# Display results
print("Crime Rate per Precinct:")
for precinct, rate in crime_rate_per_precinct.items():
    print(f"Precinct {precinct}: {rate:.2f}%")

Crime Rate per Precinct:
Precinct 1: 1.26%
Precinct 5: 0.89%
Precinct 6: 1.07%
Precinct 7: 0.85%
Precinct 9: 1.18%
Precinct 10: 0.84%
Precinct 13: 1.47%
Precinct 14: 2.26%
Precinct 17: 0.69%
Precinct 18: 1.59%
Precinct 19: 1.37%
Precinct 20: 0.83%
Precinct 22: 0.08%
Precinct 23: 1.30%
Precinct 24: 1.00%
Precinct 25: 1.36%
Precinct 26: 0.67%
Precinct 28: 1.07%
Precinct 30: 0.93%
Precinct 32: 1.36%
Precinct 33: 0.90%
Precinct 34: 1.10%
Precinct 40: 2.41%
Precinct 41: 1.35%
Precinct 42: 1.62%
Precinct 43: 2.56%
Precinct 44: 2.48%
Precinct 45: 1.26%
Precinct 46: 2.10%
Precinct 47: 1.91%
Precinct 48: 1.63%
Precinct 49: 1.35%
Precinct 50: 0.92%
Precinct 52: 2.09%
Precinct 60: 1.26%
Precinct 61: 1.22%
Precinct 62: 1.20%
Precinct 63: 1.03%
Precinct 66: 0.95%
Precinct 67: 1.89%
Precinct 68: 1.00%
Precinct 69: 0.96%
Precinct 70: 1.57%
Precinct 71: 1.38%
Precinct 72: 1.09%
Precinct 73: 2.04%
Precinct 75: 3.21%
Precinct 76: 0.55%
Precinct 77: 1.42%
Precinct 78: 0.73%
Precinct 79: 1.59%
Precinct 81

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complaints_df['ADDR_PCT_CD'] = complaints_df['ADDR_PCT_CD'].astype(int)


In [65]:
# Create a mapping dictionary of crime rates per precinct
crime_rate_per_precinct_mapping = crime_rate_per_precinct.to_dict()

# Add new column using map function
df['crime_rate'] = df['Precinct'].map(crime_rate_per_precinct_mapping)

In [66]:
df.head()

Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,Graduate or Professional Degree (Age 25+),English Only,Spanish,Never Married,Currently Married,Divorced,Precinct,schools_in_precinct,nearby_subway_stations,crime_rate
0,2024-11-25,1,2024-11-25,171 East 96th Street #4A,1855,Brooklyn,brownsville,11212,rental,1,...,201.0,,,15805.0,8426.0,2263.0,67,41,3,1.893937
1,2024-11-22,4,2024-11-22,93 East 96th Street #11,2235,Brooklyn,brownsville,11212,rental,2,...,201.0,,,15805.0,8426.0,2263.0,67,41,3,1.893937
2,2024-11-22,4,2024-11-22,501 Saratoga Avenue #B,1750,Brooklyn,brownsville,11212,rental,1,...,201.0,,,15805.0,8426.0,2263.0,73,56,0,2.036421
3,2024-11-20,6,2024-11-20,1115 Willmohr Street #2P,1850,Brooklyn,brownsville,11212,rental,1,...,201.0,,,15805.0,8426.0,2263.0,67,41,3,1.893937
4,2024-11-19,7,2024-11-19,1075 Clarkson Avenue #2B,3000,Brooklyn,brownsville,11212,rental,3,...,201.0,,,15805.0,8426.0,2263.0,67,41,0,1.893937


In [67]:
len(df)

7842

In [69]:
df.to_csv('final_rental_merged.csv')

In [68]:
df['PhotosNum'].value_counts()

PhotosNum
0     3793
6      431
7      415
8      364
9      355
5      299
10     291
20     245
11     238
12     215
4      207
13     164
14     159
15     125
16     101
17      97
18      74
3       72
19      55
1       33
2       22
23      11
22      11
25      10
21       8
24       8
32       6
27       5
28       5
26       5
29       3
33       2
30       2
36       2
48       1
46       1
34       1
44       1
35       1
51       1
73       1
37       1
31       1
Name: count, dtype: int64