In this notebook Longitude and latitude columns are added in order for further visualisations and processing. This proccess take a lot of time, therefore it is done in separate notebook and columns added before all further manipulation with the dataset.

Set Google API key

In [None]:
import googlemaps
import pandas as pd
GOOGLE_API_KEY = "YOUR_API_KEY_HERE"
gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

Uploading dataset:

In [2]:
df = pd.read_parquet("../data/raw/pollution_dataset.parquet", engine="pyarrow")
df.head()

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,1.145833,4.2,21,
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,0.878947,2.2,23,25.0
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,1.145833,4.2,21,
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,0.878947,2.2,23,25.0
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,Parts per billion,1.958333,3.0,22,4.0,Parts per million,0.85,1.6,23,


Address Cleaning Function

In [3]:
import re

def clean_address(addr, city, state):
    if pd.isna(addr):
        return None

    # Remove extra characters
    addr = re.sub(r"\s+", " ", addr.strip())

    # Build normalized searchable address
    parts = [addr]

    if isinstance(city, str) and city.lower() != "not in a city":
        parts.append(city)

    if isinstance(state, str):
        parts.append(state)

    parts.append("USA")

    return ", ".join(parts)


Google Geocoding Function with Error Handling

In [4]:
import time

def geocode_google(address, retries=3):
    if not address:
        return None, None
    
    for attempt in range(retries):
        try:
            result = gmaps.geocode(address)

            if result:
                loc = result[0]["geometry"]["location"]
                return loc["lat"], loc["lng"]

        except Exception as e:
            print(f"Google error: {e}, retrying...")
            time.sleep(1)

    return None, None


Build a unique address table (prevents 1M unnecessary geocoding calls)

In [5]:
site_keys = df[["State Code", "County Code", "Site Num", "Address", "City", "State"]].drop_duplicates()

site_keys["CleanAddress"] = site_keys.apply(
    lambda row: clean_address(row["Address"], row["City"], row["State"]),
    axis=1
)

site_keys["Latitude"] = None
site_keys["Longitude"] = None


Run Google Geocoding Only Once Per Site

In [6]:
for i, row in site_keys.iterrows():
    lat, lon = geocode_google(row["CleanAddress"])
    site_keys.at[i, "Latitude"] = lat
    site_keys.at[i, "Longitude"] = lon
    print(f"{i+1}/{len(site_keys)} → {lat}, {lon}")
    time.sleep(0.1)   # Avoid rate limits


1/204 → 33.4584262, -112.0465744
1397/204 → 33.4798999, -111.9172923
2129/204 → 32.2062306, -110.8793786
3517/204 → 37.9362718, -122.025806
4969/204 → 38.006241, -121.643328
6385/204 → 37.9630537, -122.3337857
7849/204 → 38.0291956, -121.8969494
9301/204 → 32.6760968, -115.4883507
10553/204 → 35.356401, -119.0626585
10667/204 → 34.1760151, -118.3169943
12131/204 → 34.0652706, -118.2274436
13355/204 → 33.84205910000001, -118.1921311
14819/204 → 33.92358189999999, -118.3711918
16267/204 → 33.6743124, -117.9254494
17731/204 → 34.0005504, -117.4151657
19075/204 → 38.7101347, -121.373737
20183/204 → 38.6135997, -121.3677609
21135/204 → 34.510834, -117.3253137
22527/204 → 32.6315405, -117.0589723
23949/204 → 32.7094142, -117.1538038
25333/204 → 32.5520361, -116.9374431
26789/204 → 37.7661221, -122.3993836
28233/204 → 34.4794353, -120.042365
29673/204 → 34.6378258, -120.4576809
31117/204 → 34.445138, -119.8296878
32553/204 → 34.7420267, -120.5724404
34001/204 → 37.0115806, -122.1941483
35401/

Save geocoded results for reuse

In [7]:
site_keys.to_parquet("../data/raw/google_geocoded.parquet", index=False)
print("Saved geocoded site metadata.")


Saved geocoded site metadata.


Merge latitude + longitude back into your pollution dataset

In [8]:
df = df.merge(
    site_keys[["State Code", "County Code", "Site Num", "Latitude", "Longitude"]],
    on=["State Code", "County Code", "Site Num"],
    how="left"
)


Check missing coordinates

In [9]:
df[["Latitude", "Longitude"]].isna().sum()


Latitude     0
Longitude    0
dtype: int64

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,Latitude,Longitude
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,9.0,21,13.0,Parts per million,1.145833,4.2,21,,33.458426,-112.046574
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,9.0,21,13.0,Parts per million,0.878947,2.2,23,25.0,33.458426,-112.046574
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,6.6,23,,Parts per million,1.145833,4.2,21,,33.458426,-112.046574
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,6.6,23,,Parts per million,0.878947,2.2,23,25.0,33.458426,-112.046574
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,3.0,22,4.0,Parts per million,0.85,1.6,23,,33.458426,-112.046574


In [None]:
../data/preprocessed/pollution_dataset_geocoded.parquetdf.to_parquet("", index=False)