In [170]:
import pandas as pd

In [171]:
# Load the business-license export; every column is read as pandas' string dtype.
biz_csv_path = "../datasets/biz/biz.csv"
biz = pd.read_csv(biz_csv_path, dtype='string')

# Display (rows, columns) of the raw table.
biz.shape

(281413, 27)

In [172]:
# Both columns are used as join keys further down, so rows missing either one
# cannot be matched to a street and are discarded here.
required_columns = ['Address Street Name', 'Borough Code']
biz = biz.dropna(subset=required_columns)

# Display (rows, columns) after the filter.
biz.shape

(170627, 27)

In [173]:
# Fixed-width layout of the street dictionary: field name -> (start, end)
# character offsets, interpreted by read_fwf as half-open intervals.
street_dict_layout = {
    "address_name": (2, 34),
    "borough": (36, 37),
    "street_code": (37, 42),
}
colspecs = list(street_dict_layout.values())

# The file has no header row; all three fields are kept as strings.
sc = pd.read_fwf(
    '../datasets/street_dict.txt',
    colspecs=colspecs,
    header=None,
    names=list(street_dict_layout.keys()),
    dtype='string',
)

In [174]:
# Normalize street names on both sides of the join so that they compare equal.
#
# Every rule states explicitly whether its pattern is a regular expression.
# The default of `Series.str.replace(regex=...)` differs between pandas
# versions; if left implicit, a pattern such as 'st.' could be treated as a
# regex ("st" followed by ANY character) and corrupt names like "stanton".
# Regex patterns are raw strings so that '\s' / '\d' are not flagged as
# invalid escape sequences by newer Python versions.

# Ordered (pattern, replacement, is_regex) rules applied to the business
# street names only. Order matters: later rules see the output of earlier ones.
STREET_NAME_RULES = [
    # Ordinal suffixes: "5th" -> "5", "1st" -> "1"
    (r'(\d+)(st|nd|rd|th)', r'\1', True),
    # Avenue
    (' ave. ', ' avenue ', False),
    (' ave ', ' avenue ', False),
    (r' ave$', ' avenue', True),
    # Street
    (' st ', ' street ', False),
    (r' st$', ' street', True),
    # Drop the period of a literal "st." (applied after the street rules above).
    # NOTE(review): a trailing "st." is therefore left as "st", not "street" -
    # confirm whether that is intended.
    ('st.', 'st', False),
    # Compass directions
    (r'^w ', 'west ', True),
    (r' w$', ' west', True),
    (r'^e ', 'east ', True),
    ('blvd', 'boulevard', False),
    (r' rd$', ' road', True),
    (' rd ', ' road ', False),
    ('saint', 'st', False),
    (r' pl$', ' place', True),
    ('pkwy', 'parkway', False),
    (r' dr$', ' drive', True),
    (r'^s ', 'south ', True),
    (r' cir$', ' circle', True),
    (r' ter$', ' terrace', True),
    (r' expy$', ' expressway', True),
    (r' hwy$', ' highway', True),
]

# Shared first step for both tables: lower-case, then collapse runs of whitespace.
street_names = biz['Address Street Name'].str.lower().str.replace(r'\s+', ' ', regex=True)
sc['address_name'] = sc['address_name'].str.lower().str.replace(r'\s+', ' ', regex=True)

# Abbreviation rules are applied to the business side only.
for pattern, replacement, is_regex in STREET_NAME_RULES:
    street_names = street_names.str.replace(pattern, replacement, regex=is_regex)

biz['Address Street Name'] = street_names

In [175]:
# Inner join of the street dictionary with the businesses: a row is kept only
# when both the normalized street name and the borough code agree.
merged = sc.merge(
    biz,
    how='inner',
    left_on=['address_name', 'borough'],
    right_on=['Address Street Name', 'Borough Code'],
)

In [176]:
# Display (rows, columns) of the inner-join result.
merged.shape

(164108, 30)

In [177]:
# List the joined columns: the street-dictionary fields followed by the business fields.
merged.columns

Index(['address_name', 'borough', 'street_code', 'DCA License Number',
       'License Type', 'License Expiration Date', 'License Status',
       'License Creation Date', 'Industry', 'Business Name', 'Business Name 2',
       'Address Building', 'Address Street Name',
       'Secondary Address Street Name', 'Address City', 'Address State',
       'Address ZIP', 'Contact Phone Number', 'Address Borough',
       'Borough Code', 'Community Board', 'Council District', 'BIN', 'BBL',
       'NTA', 'Census Tract', 'Detail', 'Longitude', 'Latitude', 'Location'],
      dtype='object')

Count how many businesses are on each street, keep only those counts, and save them.

In [178]:
# Count the matched business rows for each street-dictionary entry.
street_keys = ['street_code', 'borough', 'address_name']
counts = merged.groupby(street_keys).size().reset_index(name='num_businesses')

# Attach the counts to every street in the dictionary. The join uses the full
# key, including street_code: joining on name and borough alone would
# cross-multiply rows whenever one name carries more than one street code in a
# borough, pairing counts with the wrong code and duplicating rows.
merged_counts = sc.merge(counts, on=street_keys, how='left')

# Streets without any matched business have no count after the join; make that 0.
merged_counts['num_businesses'] = merged_counts['num_businesses'].fillna(0).astype(int)

# Keep the column order this cell has always produced.
merged_counts = merged_counts.reindex(
    columns=['borough', 'address_name', 'num_businesses', 'street_code']
)

# Display only the streets that have at least one business.
merged_counts[merged_counts['num_businesses'] > 0]

Unnamed: 0,borough,address_name,num_businesses,street_code
2,1,1 avenue,1183,10010
7,1,2 avenue,1617,10110
10,1,3 avenue,1337,10210
15,1,4 avenue,46,10350
17,1,5 avenue,839,10410
...,...,...,...,...
21811,5,yeomalt avenue,1,56485
21814,5,yetman avenue,11,56500
21816,5,york avenue,1,56600
21817,5,york terrace,1,56615


In [179]:
# Mapping based on the documentation here:
# https://data.cityofnewyork.us/City-Government/Street-Name-Dictionary/w4v2-rv6b/about_data
borough_labels = {
    "1": 'NY',
    "2": 'BX',
    "3": 'K',
    "4": 'Q',
    "5": 'R',
}

# Relabel the borough column of both result frames in place, one code at a time.
for frame in (merged, merged_counts):
    for code, label in borough_labels.items():
        frame.loc[frame["borough"] == code, "borough"] = label

In [181]:
# Write both result frames as snappy-compressed Parquet files:
# the per-street counts first, then the full business/street join.
parquet_outputs = {
    "../datasets/biz/biz_per_street.parquet": merged_counts,
    "../datasets/biz/biz_with_sc.parquet": merged,
}
for destination, frame in parquet_outputs.items():
    frame.to_parquet(destination, compression='snappy')

The code below finds the business street names that have no match in the street dictionary, so that I can add normalization rules to fix them.

In [182]:
# Right join: every business row is kept, whether or not its
# (street name, borough) pair exists in the street dictionary.
# Note that this rebinds `merged`, replacing the inner-join result from above.
dictionary_keys = ['address_name', 'borough']
business_keys = ['Address Street Name', 'Borough Code']
merged = sc.merge(biz, how='right', left_on=dictionary_keys, right_on=business_keys)

# A missing dictionary-side name means the business found no matching street.
unmatched_mask = merged['address_name'].isna()
filtered = merged[unmatched_mask]

In [183]:
# Preview the first ten unmatched (street name, borough code) pairs.
filtered[['Address Street Name', 'Borough Code']].head(10)

Unnamed: 0,Address Street Name,Borough Code
77,6 avenue,1
197,dr martin l king jr boulevard,2
198,frederick douglass boulevard,1
207,vanderwater avenue,4
245,whitestone expressway,4
261,6 avenue,1
319,mcclean avenue,5
351,main avenue,2
358,union tpke,4
365,horace harding expressway,4
