In [1]:
import pandas as pd
import numpy as np
import sys
import os

# Add the project root to sys.path so the project-local `db` module imports.
# The root can be overridden with the DATA_PROBLEM_SOLVING_ROOT environment
# variable; the original hard-coded location is kept as the default.
PROJECT_ROOT = os.environ.get("DATA_PROBLEM_SOLVING_ROOT", r"D:\data-problem-solving")
if PROJECT_ROOT not in sys.path:  # re-running the cell must not add duplicates
    sys.path.append(PROJECT_ROOT)

from db import load_table, engine

# Read every row and column of sf_restaurant_health_violations into sf_df.
sf_df = pd.read_sql(
    'SELECT * FROM "street_count_per_zip".sf_restaurant_health_violations',
    engine
)

In [3]:
# Keep only rows that have both a postal code and an address; work on a copy
# so sf_df itself is left untouched.
df = sf_df.dropna(
    axis='index',
    how='any',
    subset=['business_address', 'business_postal_code'],
).copy()

# Function to extract normalized street name
def extract_street_name(address):
    """Return the lower-cased first word of the street name in ``address``.

    The address is split on whitespace. A leading house number (a token made
    up only of digits) is skipped when another token follows it, so
    "123 Main St" gives "main". In every other case the first token is used,
    so "Pier 39" and "39 Pier" both give "pier" and a bare "500" gives "500".

    Parameters
    ----------
    address : object
        Address value; anything that is not missing is converted with ``str``.

    Returns
    -------
    str or float
        The lower-cased token, or ``np.nan`` when ``address`` is missing
        (``None``, NaN, ``pd.NA``) or contains no non-whitespace characters.
    """
    # str(None) / str(np.nan) would otherwise become the bogus street names
    # "none" / "nan"; report missing input as missing output instead.
    if pd.api.types.is_scalar(address) and pd.isna(address):
        return np.nan

    parts = str(address).split()
    if not parts:
        return np.nan

    # Skip a leading house number only when something follows it; otherwise
    # ("Pier 39", "Broadway", "500") the first token is the name.
    if len(parts) > 1 and parts[0].isdigit():
        return parts[1].lower()
    return parts[0].lower()

# Derive the street-name key for every address
df['street_name'] = df['business_address'].map(extract_street_name)

# Distinct street names per postal code (nunique skips missing names)
result = df.groupby('business_postal_code')['street_name'].nunique()
result = result.reset_index(name='n_streets')

# Most streets first; ties are ordered by ascending postal code
result = result.sort_values(
    by=['n_streets', 'business_postal_code'],
    ascending=[False, True],
)
result = result.reset_index(drop=True)

print(result)

    business_postal_code  n_streets
0                94103.0         16
1                94133.0         11
2                94102.0         10
3                94109.0          9
4                94107.0          8
5                94108.0          8
6                94110.0          8
7                94112.0          8
8                94104.0          7
9                94105.0          7
10               94114.0          6
11               94111.0          5
12               94115.0          5
13               94122.0          5
14               94118.0          4
15               94121.0          4
16               94132.0          4
17               94134.0          4
18               94117.0          3
19               94123.0          3
20               94124.0          3
21               94116.0          2
22               94127.0          2
23               94131.0          1
