In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

from shapely.geometry import Point
import geopandas as gpd

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

In [None]:
# Load the raw dataset from 'data.csv' (relative path, resolved against the
# current working directory). Later code reads its Latitude, Longitude,
# PostalCode, UnparsedAddress, City and StateOrProvince columns.
all_data = pd.read_csv('data.csv')

In [None]:
def _full_address(frame):
    """Return the one-line geocoder query for every row of *frame*.

    The non-null street address, city and state values are stripped and
    joined with single spaces. The result is a Series aligned to
    ``frame.index`` (and is a well-formed empty Series for an empty frame,
    unlike ``DataFrame.apply(..., axis=1)``).
    """
    parts = frame[['UnparsedAddress', 'City', 'StateOrProvince']]
    return pd.Series(
        [' '.join(str(x).strip() for x in row if pd.notna(x))
         for row in parts.itertuples(index=False, name=None)],
        index=frame.index,
        dtype=object,
    )


# Rows that need geocoding: missing coordinates, or a postal code that is
# missing / not exactly 5 characters long.
missing_loc_data = all_data[(all_data['Latitude'].isna()) |
                            (all_data['Longitude'].isna()) |
                            (all_data['PostalCode'].str.len() != 5)]

# Explicit copy so adding a column does not write into a slice of all_data.
addresses = missing_loc_data.loc[:, ['UnparsedAddress', 'City', 'StateOrProvince']].copy()
addresses['FullAddress'] = _full_address(addresses)

# Geocoding to fill missing values.
# The public Nominatim service allows at most one request per second, so
# requests are spaced 1 s apart (error_wait_seconds must not be smaller than
# min_delay_seconds).
geolocator = Nominatim(user_agent="idx_cleaning_script", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.0, max_retries=1, error_wait_seconds=1.0)

# Lookup tables keyed by the full address string. Addresses that could not be
# geocoded are deliberately left out, so the corresponding cells stay NaN and
# the rows are removed by the dropna calls below.
lat = {}
lon = {}
zip_code = {}
city = {}

# Geocode each distinct address once; duplicates share the result.
for query in addresses['FullAddress'].unique():
    if not query:
        # Every address part was null: there is nothing to look up.
        continue
    try:
        # addressdetails=True is required for Nominatim to include the
        # structured 'address' dict (postcode, city, ...) in location.raw.
        location = geocode(query, addressdetails=True)
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"Geocoding timed out for: {query} – {e}")
        continue
    if location is None:
        print(f"Address not found: {query}")
        continue

    lat[query] = location.latitude
    lon[query] = location.longitude

    details = location.raw.get('address', {})
    zip_code[query] = details.get('postcode', '')
    city[query] = details.get('city', '')

# Build the lookup key for every row *before* filling City, using the same
# recipe that produced the dictionary keys, so the keys actually match.
full_address = _full_address(all_data)

# NOTE: fillna only fills null cells. Postal codes that are present but not
# 5 characters long were geocoded above but are not replaced here.
all_data['Latitude'] = all_data['Latitude'].fillna(full_address.map(lat))
all_data['Longitude'] = all_data['Longitude'].fillna(full_address.map(lon))
all_data['PostalCode'] = all_data['PostalCode'].fillna(full_address.map(zip_code))
all_data['City'] = all_data['City'].fillna(full_address.map(city))

# Drop rows where geocoding failed
all_data = all_data.dropna(subset=['Latitude', 'Longitude'])
all_data = all_data.dropna(subset=['PostalCode', 'City'])

# Remove outliers in isolated areas of California - Jun
# (keeps only rows with Latitude > 30 and Longitude < -50)
all_data = all_data[(all_data['Latitude']>30)&(all_data['Longitude']<-50)]