In [1]:
import numpy as np
import pandas as pd
import overpy

https://wiki.openstreetmap.org/wiki/Overpass_API/Overpass_QL#By_tag_.28has-kv.29

https://towardsdatascience.com/loading-data-from-openstreetmap-with-python-and-the-overpass-api-513882a27fd0

https://wiki.openstreetmap.org/wiki/Key:man_made
    A tag for identifying man-made (artificial) structures added to the landscape.


In [2]:
# api = overpy.Overpass()

# # Define bounding box for California (this is approximate and you might need to adjust)
# # Format: (south latitude, west longitude, north latitude, east longitude)
# bbox = "32.5343,-124.4096,42.0095,-114.1308"

# # Fetch all ways with wastewater treatment tags in California
# query = f"""
#     way["man_made"="wastewater_plant"]({bbox});
#     (._;>;);
#     out body;
#     """

# result = api.query(query)

# # Store results in desired format
# plants = {}

# for way in result.ways:
#     # Use the name of the plant or its ID if the name is not available
#     plant_name = way.tags.get("name", f"Plant_{way.id}")

#     # Extract nodes
#     nodes_coords = {f"node{node.id}": (node.lon, node.lat) for node in way.nodes}
    
#     plants[plant_name] = nodes_coords


In [3]:
api = overpy.Overpass()

# Fetch all wastewater treatment plants within California's boundary
query = f"""
    area[admin_level=4]["name"="California"]->.searchArea;
    (
      way["man_made"="wastewater_plant"](area.searchArea);
    );
    (._;>;);
    out body;
    """

result = api.query(query)

# Store results in desired format
plants = {}

for way in result.ways:
    # Use the name of the plant or its ID if the name is not available
    plant_name = way.tags.get("name", f"Plant_{way.id}")

    # Extract nodes lat and lon without id
    nodes_coords = [(node.lon, node.lat) for node in way.nodes]

    # get rid of "Decimal"  in the coordinates
    nodes_coords = [tuple(map(float, i)) for i in nodes_coords]
    
    plants[plant_name] = nodes_coords


In [4]:
print(f"Number of plants: {len(plants)}")
plants

Number of plants: 3104


{'Plant_24298754': [(-121.8093956, 37.6930507),
  (-121.805817, 37.6930115),
  (-121.8056706, 37.6930099),
  (-121.8056706, 37.6904833),
  (-121.8058937, 37.6899807),
  (-121.8059778, 37.6894001),
  (-121.8060333, 37.6892735),
  (-121.8061508, 37.6892261),
  (-121.8071187, 37.6892498),
  (-121.8095244, 37.6896004),
  (-121.8096322, 37.6904367),
  (-121.809724, 37.6914042),
  (-121.8097223, 37.6915312),
  (-121.809522, 37.6915461),
  (-121.8093956, 37.6930507)],
 'Plant_24396910': [(-121.7836023, 36.8042341),
  (-121.7836343, 36.8037659),
  (-121.7833675, 36.8022232),
  (-121.7828697, 36.7990947),
  (-121.782329, 36.7983538),
  (-121.7828526, 36.7974329),
  (-121.7828135, 36.7971656),
  (-121.7827041, 36.7968984),
  (-121.7821745, 36.7964432),
  (-121.7809986, 36.7959414),
  (-121.7799, 36.7950548),
  (-121.7790588, 36.795261),
  (-121.7784065, 36.796072),
  (-121.7771877, 36.7992885),
  (-121.7776512, 36.8002576),
  (-121.7762693, 36.800938),
  (-121.7764839, 36.8015634),
  (-121.77790

In [5]:
# filter the plants that have an official name
plants_with_official_name = {key: value for key, value in plants.items() if not key.startswith("Plant_")}
print(f"Number of official plants: {len(plants_with_official_name)}")
plants_with_official_name

Number of official plants: 214


{'EBMUD Wastewater Treatment Plant': [(-122.2930951, 37.8228457),
  (-122.2923804, 37.8241395),
  (-122.2923464, 37.8241322),
  (-122.2922493, 37.8243101),
  (-122.2921713, 37.8246004),
  (-122.2921681, 37.8247452),
  (-122.2921969, 37.8251887),
  (-122.2920161, 37.8251593),
  (-122.2919721, 37.8254359),
  (-122.2919162, 37.8257025),
  (-122.2918483, 37.8260651),
  (-122.2916564, 37.826488),
  (-122.2917244, 37.8266896),
  (-122.2921959, 37.8269055),
  (-122.2925522, 37.8270548),
  (-122.2926746, 37.8270912),
  (-122.292804, 37.8271548),
  (-122.2928341, 37.82724),
  (-122.2932189, 37.8271637),
  (-122.293225, 37.8272237),
  (-122.2934566, 37.8272571),
  (-122.2937212, 37.8272765),
  (-122.2938809, 37.8272546),
  (-122.2941777, 37.8271973),
  (-122.2943274, 37.8271752),
  (-122.2953185, 37.8269706),
  (-122.2953333, 37.8269667),
  (-122.2953491, 37.8269619),
  (-122.2953661, 37.8269544),
  (-122.2953807, 37.8269469),
  (-122.2954638, 37.8269015),
  (-122.2954865, 37.8268899),
  (-122.2

In [6]:
print(*plants_with_official_name, sep = "\n")

EBMUD Wastewater Treatment Plant
Hyperion Wastewater Treatment Plant
San José–Santa Clara Regional Wastewater Facility
Sunnyvale Water Pollution Control Plant
Burlingame Wastewater Treatment Plant
SMD3
Farmersville Wastewater Treatment Plant
Wastewater Treatment
Wastewater Treatment Plant
Regional Water Quality Control Plant
Harry Tracy Water Treatment Plant
Davis Wetlands
Sewage Treatment Plant and Regional Pump Station
Sewage Disposal Plant
Lode Street Wastewater Facility
Pleasant Grove Wastewater Treatment Plant
Sewer Plant #3
Sewer Plant #2
Los Alisos Water Reclamation Plant
Fresno Clovis Regional Waste Water Treatment Plant
Wood Duck Pond
aeration pond
Pismo Wastewater Treatment
settling pond
Tejon Reservoir Number Two
Tejon Reservoir Number One
61/60
93
5/4
53/52
72/73
33/32
Holding Pond
MPUD
81
69
Percolation Basin
23
77
89
24
36/37
9
14/15/16
Ventura Avenue Water Purification Plant
Mustang Pond
Jackrabbit Lake
Kingbird Pond
Pacific Placer Reservoir
Greenhorn Reservoir
South Oxi

In [7]:
# filter out the name with "pond" or "Pond" or "Westland" or "lake"
plants_with_official_name = {key: value for key, value in plants_with_official_name.items() if 'pond' not in key.lower() and 'westland' not in key.lower() and 'lake' not in key.lower()}
print(f"Number of official plants reduced to: {len(plants_with_official_name)}")

Number of official plants reduced to: 202


In [8]:
# save to csv
import pandas as pd
df = pd.DataFrame.from_dict(plants_with_official_name, orient='index')
df = df.stack().reset_index()
# rename the columns
df.columns = ['plant_name', 'node', 'coordinates']
df.head()

# save to csv
df.to_csv('wastewater_plants_filtered_1.csv', index=False)
