In [None]:
import requests
import os
import pandas as pd
import time
from dotenv import load_dotenv
import logging

# Lift every pandas display limit so DataFrames are rendered in full.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

# Load environment variables through python-dotenv before reading the key.
load_dotenv()

# Subscription key sent with every Geoclient request; None when unset.
GEOCLIENT_KEY = os.environ.get("GEOCLIENT_KEY")

# Address endpoint queried by get_lat_lon.
url = "https://api.nyc.gov/geoclient/v2/address.json"

In [None]:
def get_lat_lon(house_number: str, street: str, borough: str, timeout: float = 10.0) -> tuple:
    """
    Look up the coordinates of an address through the Geoclient address endpoint.

    Args:
        house_number (str): House/building number of the address
        street (str): Street name
        borough (str): Borough name
        timeout (float): Seconds to wait for the API before giving up

    Returns:
        tuple: (latitude, longitude) taken from the response's "address"
            object. (None, None) when the response carries no such object;
            either element may be None when its field is absent.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the API does not answer within `timeout` seconds.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        "houseNumber": house_number,
        "street": street,
        "borough": borough
    }
    headers = {
        "Ocp-Apim-Subscription-Key": GEOCLIENT_KEY
    }
    # requests waits indefinitely unless a timeout is given; a single stalled
    # connection would otherwise freeze a multi-thousand-row geocoding run.
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    data = response.json()

    # Guard the shape of the payload: an error body may be a non-dict, and
    # "address" may be present but null.
    address = data.get("address") if isinstance(data, dict) else None
    if isinstance(address, dict):
        return (address.get("latitude"), address.get("longitude"))
    return (None, None)

In [None]:
def process_listings_with_coordinates(input_file_path, batch_size=100, requests_per_minute=2000):
    """
    Process listings CSV file to add latitude and longitude coordinates.

    Rows whose LATITUDE or LONGITUDE is missing are geocoded one by one with
    get_lat_lon, using the BOROUGH, STREET and BUILDING_NO columns. Rows that
    lack any of those three values are skipped. Progress is checkpointed to
    the output file every `batch_size` rows, and the output file is always
    written before returning (also when there was nothing to geocode or the
    run was interrupted with Ctrl-C).

    Args:
        input_file_path (str): Path to the input CSV file
        batch_size (int): Number of addresses to process in each batch
        requests_per_minute (int): Maximum requests per minute to stay under API limits

    Returns:
        str: Path to the output CSV file with coordinates
    """
    # Set up logging to both geocoding.log and the console
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('geocoding.log'),
            logging.StreamHandler()
        ]
    )

    # Load the CSV data
    logging.info(f"Loading data from {input_file_path}")
    df = pd.read_csv(input_file_path)
    total_rows = len(df)
    logging.info(f"Loaded {total_rows} listings")

    # Add latitude and longitude columns if they don't exist
    if 'LATITUDE' not in df.columns:
        df['LATITUDE'] = None
    if 'LONGITUDE' not in df.columns:
        df['LONGITUDE'] = None

    # Calculate time needed between requests to stay under rate limit
    seconds_per_request = 60 / requests_per_minute

    processed_count = 0
    error_count = 0

    # Find rows without coordinates
    rows_to_process = df[(df['LATITUDE'].isna()) | (df['LONGITUDE'].isna())].index
    logging.info(f"Found {len(rows_to_process)} listings without coordinates")

    # Fixed output filename: every checkpoint and the final save overwrite it
    output_file = "listings_with_coordinates.csv"

    try:
        for i, idx in enumerate(rows_to_process):
            row = df.loc[idx]

            # Clean building number (handle ranges like "303 TO 309")
            building_no = str(row['BUILDING_NO']).split(' TO ')[0].strip() if pd.notna(row['BUILDING_NO']) else ""

            # Only addresses with all three components can be geocoded
            if pd.notna(row['BOROUGH']) and pd.notna(row['STREET']) and building_no:
                start_time = time.time()

                try:
                    lat, lon = get_lat_lon(
                        house_number=building_no,
                        street=str(row['STREET']),
                        borough=str(row['BOROUGH'])
                    )
                    df.at[idx, 'LATITUDE'] = lat
                    df.at[idx, 'LONGITUDE'] = lon
                    processed_count += 1

                    # Log progress periodically
                    if processed_count % 10 == 0:
                        logging.info(f"Processed {processed_count}/{len(rows_to_process)} addresses")

                except Exception as e:
                    # Best effort: one failing address must not stop the run
                    logging.error(f"Error processing {building_no} {row['STREET']}, {row['BOROUGH']}: {str(e)}")
                    error_count += 1

                # Control rate limiting
                elapsed = time.time() - start_time
                if elapsed < seconds_per_request:
                    time.sleep(seconds_per_request - elapsed)

            # Checkpoint at every batch boundary, even when the row sitting on
            # the boundary was skipped for lacking address components.
            if (i + 1) % batch_size == 0:
                df.to_csv(output_file, index=False)
                logging.info(f"Saved progress to {output_file} after {i+1} addresses")

    except KeyboardInterrupt:
        logging.info("Process interrupted by user. Saving current progress...")

    # Always write the result. Saving here rather than on the last loop
    # iteration guarantees the returned path exists even when no row needed
    # geocoding, and also covers the interrupted case.
    df.to_csv(output_file, index=False)

    logging.info(f"Processing complete. Processed {processed_count} addresses with {error_count} errors")
    logging.info(f"Results saved to {output_file}")

    return output_file


In [None]:
# Smoke-test get_lat_lon against one known Manhattan address
lat, long = get_lat_lon(
    house_number="246",
    street="10th Avenue",
    borough="Manhattan",
)
print(f"Test coordinates: {lat}, {long}")

In [None]:
# Geocode every listing in the cleaned address file from the previous step
input_path = '../2_flatten_and_clean_addresses/listings_flattened_and_cleaned_address.csv'
process_listings_with_coordinates(input_file_path=input_path)

In [21]:
df = pd.read_csv('listings_with_coordinates.csv')

# Keep only the rows where both coordinates are missing (NaN or empty string)
lat_missing = df['LATITUDE'].isna() | df['LATITUDE'].eq('')
lon_missing = df['LONGITUDE'].isna() | df['LONGITUDE'].eq('')
filtered = df[lat_missing & lon_missing]
filtered

Unnamed: 0,BOROUGH,ZIP,BUILDING_NO,CLEAN_BUILDING_NO,STREET,BLOCK,LOT,COUNTY,CITY,STATUS1,STATUS2,STATUS3,LATITUDE,LONGITUDE
148,MANHATTAN,10002,37.5,37 1/2,Allen Street,300.0,18.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
234,MANHATTAN,10002,123.5,123 1/2,Chrystie Street,423.0,19.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
293,MANHATTAN,10002,168.5,168 1/2,Delancey Street,348.0,76.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
401,MANHATTAN,10002,27.5,27 1/2,Essex Street,310.0,32.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
1267,MANHATTAN,10003,47.5,47 1/2,East 7th Street,449.0,1.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
1268,MANHATTAN,10003,48.5,48 1/2,East 7th Street,448.0,7.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
1387,MANHATTAN,10003,34.5,34 1/2,Saint Marks Place,463.0,27.0,62,NEW YORK,MULTIPLE DWELLING B,,,,
1390,MANHATTAN,10003,37.5,37 1/2,Saint Marks Place,450.0,1.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
1473,MANHATTAN,10009,120.5,120 1/2,1st Avenue,435.0,2.0,62,NEW YORK,MULTIPLE DWELLING A,,,,
1979,MANHATTAN,10009,93.5,93 1/2,East 7th Street,435.0,57.0,62,NEW YORK,MULTIPLE DWELLING A,,,,


In [None]:
import re

def fix_building_no(building_no):
    """
    Normalise a raw BUILDING_NO value into a house number suitable for geocoding.

    Rules, applied in order:
      * missing values (None / NaN) -> None
      * ranges such as "303 TO 309" -> the first number of the range
      * half numbers such as "37.5" -> "37 1/2"
      * float-parsed whole numbers such as "37.0" -> "37"
      * anything else is returned as a stripped string

    Args:
        building_no: Raw cell value (string, number, None or NaN)

    Returns:
        str or None: The normalised house number, or None for a missing value
    """
    if pd.isna(building_no):
        return None
    s = str(building_no).strip()

    # Handle range: keep only the first number, then normalise it like any
    # other value so "37.5 TO 39.5" also becomes "37 1/2".
    if " TO " in s:
        s = s.split(" TO ")[0].strip()

    # Handle decimals: ".5" means a half address, ".0" is an integer that was
    # parsed as a float and must lose its fractional part.
    m = re.match(r'^(\d+)\.(0|5)$', s)
    if m:
        whole, fraction = m.groups()
        if fraction == "5":
            return f"{int(whole)} 1/2"
        return str(int(whole))

    # Just return as string
    return s

# Always start from the saved geocoding results instead of whatever `df`
# happens to be in the kernel, so the cell gives the same result on a fresh
# kernel and on a re-run.
df = pd.read_csv("listings_with_coordinates.csv")

# Find rows with missing coordinates and problematic BUILDING_NO
mask = (df['LATITUDE'].isna() | df['LONGITUDE'].isna())
problem_rows = df[mask].copy()

for idx, row in problem_rows.iterrows():
    orig_building_no = row['BUILDING_NO']
    s = str(orig_building_no).strip() if pd.notna(orig_building_no) else ""
    fixed_building_no = fix_building_no(orig_building_no)

    # Nothing to retry when the value is missing or normalisation changed nothing
    if not fixed_building_no or fixed_building_no == s:
        continue

    print(f"Processing: BUILDING_NO='{orig_building_no}' (using '{fixed_building_no}') on STREET='{row['STREET']}'")
    try:
        lat, lon = get_lat_lon(
            house_number=fixed_building_no,
            street=str(row['STREET']),
            borough=str(row['BOROUGH'])
        )
    except Exception as e:
        # One failed lookup must not abort the loop: the fixes already made
        # would be lost because the save below would never run.
        print(f"Error processing {fixed_building_no} {row['STREET']}, {row['BOROUGH']}: {e}")
        continue

    if lat and lon:
        df.at[idx, 'LATITUDE'] = lat
        df.at[idx, 'LONGITUDE'] = lon
        # Only half numbers are rewritten in the data; ranges keep their
        # original text.
        if re.match(r'^\d+\.5$', s):
            df.at[idx, 'BUILDING_NO'] = fixed_building_no

# Save updated DataFrame
df.to_csv("listings_with_coordinates.csv", index=False)
print("Done fixing and updating coordinates.")