# Preprocessing landvote data
---

Note: Landvote data can be downloaded from [<u>TPL's site</u>](https://tpl.quickbase.com/db/bbqna2qct?a=dbpage&pageID=8)

The raw data only lists the location of the measure and doesn't have a geometry column. For that reason, I join the landvote data with census spatial data (state, county, subdivision, and places) so we can map Landvote measures. 

The preprocessing scripts for the census data can be found in the `datasets/census` directory.  

In [None]:
import ibis
from ibis import _
import ibis.expr.datatypes as dt  
import re
from cng.utils import *
from cng.h3 import *
from minio import Minio

# presumably installs the H3 extension for DuckDB (helper comes from one of the cng star imports) — confirm
duckdb_install_h3()
# DuckDB connection with the spatial extension loaded; every table in this notebook is read through it.
con = ibis.duckdb.connect(extensions = ["spatial"])
# presumably registers the credentials needed for the s3:// paths used below — confirm in cng.utils
set_secrets(con)

In [None]:
# Raw LandVote measures table; the CSV is decoded as Windows-1252.
landvote = con.read_csv('s3://public-tpl/landvote/raw/Measures.csv',encoding='cp1252')

# snake_case column name -> header used in the raw CSV.
_landvote_renames = {
    'jurisdiction': "Jurisdiction Type",
    'state': "State",
    'name': "Jurisdiction Name",
    'conservation_funds_at_stake': 'Conservation Funds at Stake',
    'conservation_funds_approved': 'Conservation Funds Approved',
    'total_funds_at_stake': 'Total Funds at Stake',
    'total_funds_approved': 'Total Funds Approved',
    'description': 'Description',
    'finance_mechanism': 'Finance Mechanism',
    'purpose': "Purpose",
    'date': 'Date',
    'status': "Status",
    'percent_yes': '% Yes',
    'percent_no': '% No',
    'notes': "Notes",
    'other_comment': '"Other" Comment',
    'voted_acq_measure': 'Voted Acq. Measure',
}

# Strip "$" and thousands separators so the amount can be cast to a float.
_funds_at_stake = (
    _.conservation_funds_at_stake
     .replace('$', '')
     .replace(',', '')
     .cast('float')
)

# "% Yes" sometimes holds a word instead of a number; those become NULL.
# Numeric values are re-rendered as a string rounded to two decimals plus "%".
_percent_yes = ibis.ifelse(
    _.percent_yes.isin(['Pass','None','Fail']),
    None,
    (
        _.percent_yes.replace('%', '')
         .cast('float')
         .round(2)
         .cast(dt.float64)
         .cast(dt.string)
        + '%'
    ),
)

# The jurisdiction name is copied into `municipal` unless the measure is
# state- or county-level ...
_municipal = ibis.ifelse(_.jurisdiction.isin(['State','County']), None, _.name)

# ... and into `county` only when it is not State / Municipal / Special District.
_county = ibis.ifelse(
    _.jurisdiction.isin(['State','Municipal','Special District']),
    None,
    _.name,
)

landvote_df = (
    landvote
    .rename(**_landvote_renames)
    .mutate(state=_.state.substitute({'Ore':'OR'}))
    .mutate(conservation_funds_at_stake=_funds_at_stake)
    .mutate(year=_['date'].year().cast('int32'))
    # NOTE(review): row_number() has no explicit ordering here, yet later cells
    # patch rows by hard-coded id — this assumes the CSV row order is stable.
    .mutate(id=ibis.row_number().over())
    .mutate(percent_yes=_percent_yes)
    .mutate(municipal=_municipal)
    .mutate(county=_county)
    .drop(['Pass?'])
)


In [None]:
# using the same set of columns to make joining easier
# Shared output schema: each per-jurisdiction table is projected onto exactly
# these columns, in this order, so the results line up when combined.
cols = [
    # identity and location
    'id',
    'state',
    'county',
    'municipal',
    'jurisdiction',
    # vote outcome
    'status',
    'percent_yes',
    'percent_no',
    'year',
    'date',
    # funding
    'total_funds_at_stake',
    'total_funds_approved',
    'conservation_funds_at_stake',
    'conservation_funds_approved',
    'finance_mechanism',
    'other_comment',
    # free text
    'purpose',
    'description',
    'notes',
    'voted_acq_measure',
    # geometry attached by the joins
    'geometry',
]

In [None]:
# helper functions to make sure every entry is joined to a geometry 

def check_matches(df1, df2, id_col="id"):
    """
    Print a join-quality report for two tables that share an ID column.

    Both arguments must expose ``.execute()`` returning a DataFrame. Two
    lines are printed: how many IDs of ``df1`` are absent from ``df2``, and
    (via ``check_duplicates``) how many IDs of ``df1`` occur more than once
    in ``df2``.

    Parameters
    ----------
    df1 : table before the join (the reference set of IDs).
    df2 : table after the join.
    id_col : name of the ID column present in both tables.

    Returns
    -------
    None. The function only prints.
    """
    df1 = df1.execute()
    df2 = df2.execute()

    ids1 = set(df1[id_col])
    ids2 = set(df2[id_col])

    missing_ids = ids1 - ids2
    print(f'There are {len(missing_ids)} ids missing')
    # Forward the caller's id_col; a hard-coded "id" here would raise a
    # KeyError for any other column name.
    check_duplicates(ids1, ids2, df1, df2, id_col=id_col)
    # Debugging aid: to inspect the offending rows, temporarily return
    # df1[df1[id_col].isin(missing_ids)] (unmatched) or the df2 rows whose
    # id is in the set returned by check_duplicates (duplicated).


def check_duplicates(ids1, ids2, df1, df2, id_col="id"):
    """
    Return the IDs from ``ids1`` that occur more than once in ``df2[id_col]``.

    Also prints how many such IDs there are.

    Parameters
    ----------
    ids1 : set of IDs to restrict the report to.
    ids2 : unused; kept so existing callers keep working.
    df1 : unused; kept so existing callers keep working.
    df2 : DataFrame whose ``id_col`` column is scanned for repeats.
    id_col : name of the ID column in ``df2``.

    Returns
    -------
    set of IDs that are in ``ids1`` and appear at least twice in ``df2``.
    """
    dup_counts = df2[id_col].value_counts()
    dup_ids = set(dup_counts[dup_counts > 1].index)

    # only report duplicates relevant to df1
    dup_ids = dup_ids.intersection(ids1)
    print(f"There are {len(dup_ids)} ids from df1 that appear more than once in df2.")
    return dup_ids

# State data

In [None]:
state_boundaries = 's3://public-census/2024/state/2024_us_state.parquet'

# LandVote measures decided at the state level.
landvote_states = landvote_df.filter(_.jurisdiction == "State")

# Census state polygons. After the rename, the file's "state" column is called
# state_name, its "state_id" column is called state, and "geom" is geometry.
state = con.read_parquet(state_boundaries)
state = state.rename(state_name = "state", state = "state_id", geometry='geom')
state = state.select('state','geometry','state_name')

# Attach a polygon to every state-level measure, then keep the shared schema.
df_state = state.join(landvote_states, "state",how = "inner")
df_state = df_state.drop('state_name','name')
df_state = df_state.select(cols)

# Report measures that found no geometry, or more than one.
check_matches(landvote_states, df_state)

# County data

Note that we need to rename a few jurisdiction names in LandVote so that they join to the correct county.

In [None]:
# One or more whitespace characters; used to squeeze runs of spaces in names.
collapse_spaces = r"\s+"

# Words and phrases removed from county-level jurisdiction names so that only
# the county name remains. The order is significant for the regex alternation
# and is kept exactly as is.
_county_district_terms = (
    "Great Parks of ",
    "Columbus and",
    "Recreation",
    "Flood Control",
    "Park and",
    "Park and Recreation",
    "Agricultural Preservation and",
    "Metropolitan Parks",
    "Regional",
    "District",
    "Historical Parks District",
    "Open Space District",
    "Open Space Authority",
    "Metroparks",
    "Conservation District",
    "Metro Parks",
    "Park District",
    "Metroparks District",
    "Forest Preserve District",
    "Regional Park and Open Space District",
    "Forest Preserves",
    "of",
    "Preservation Parks",
    "Preservation Park",
)
county_match_pattern = r"(?i)\s*(" + "|".join(_county_district_terms) + r")\b"

# Jurisdiction names that cannot be reduced to a county by the pattern above,
# mapped by hand to the county they belong to.
county_vals = {
    'Athens-Clarke County': 'Clarke County',
    'City and County of San Francisco': 'San Francisco County',
    'Cleveland Metropolitan Park District': 'Cuyahoga County',
    'Denver City and County': 'Denver County',
    'Five Rivers MetroParks': 'Montgomery County',
    'Jacksonville': 'Duval County',
    'Johnny Appleseed Park District': 'Allen County',
    'Licking Park District': 'Licking County',
    'Marin County Community Facilities District No. 2022-1': 'Marin County',
    'Metropolitan Park District of the Toledo Area': 'Lucas County',
    'Metropolitan Park District of Toledo Area': 'Lucas County',
    'Metropolitan Park District of Toledo Area ': 'Lucas County',
    'Portage Park District': 'Portage County',
    'Santa Clara Valley Water District': 'Santa Clara County',
    'Summit Metro Parks': 'Summit County',
}

# County-level measures with the jurisdiction name normalised to a county name.
landvote_county = (landvote_df
    .filter(_.jurisdiction == "County")
    # squeeze repeated whitespace, then trim
    .mutate(name=_.name.re_replace(collapse_spaces, " ").strip())
    # strip district-type words so that (ideally) only "<X> County" is left
    .mutate(name=_.name.re_replace(county_match_pattern, "").strip())
    # `county` still holds the raw jurisdiction name at this point: names listed in
    # county_vals get their hand-picked county, everything else falls back to the cleaned `name`
    .mutate(county = _.county.substitute(value = county_vals,else_= _.name))
    )


county_boundaries = "s3://public-census/2024/county/2024_us_county.parquet"

# Census county polygons. After the rename, the file's "state" column is called
# state_name, its "state_id" column is called state, and "geom" is geometry.
county_geom = (con
    .read_parquet(county_boundaries)
    .rename(state_name = "state", state = "state_id", geometry='geom')
    # the right-hand side only carries state_name, so this inner join just keeps
    # counties whose state_name also appears in the `state` table
    .join(state.drop('geometry','state'), "state_name", how = "inner")
         )

# Attach a county polygon to each county-level measure (matched on county + state)
# and keep the shared schema.
df_county = (county_geom
            .select('state','state_name','county','geometry')
            .join(landvote_county, ["county","state"],how = "inner")
            .select(cols)
             )

# checking if any ids didn't match
check_matches(landvote_county, df_county)

# City data

The municipal data is messy because we have to join with multiple census sources (places + county subdivisions). Some municipalities span multiple counties, and a state may have several municipalities with the same name in different counties. In the latter case, the correct county is usually specified in the `notes` column. 

TL;DR we need to do lots of regex so it all matches up to the right location. 

In [None]:
@ibis.udf.scalar.python
def extract_counties_udf(note: str) -> list[str]:
    """Pull county names out of a free-text note.

    Three forms are recognised, in this order:
      1. a list before the word "counties"
         ("San Mateo, Santa Clara, and Santa Cruz counties"),
      2. "<Name> County",
      3. "<Name> Co" / "<Name> Co." (matched case-insensitively).

    Every hit is returned as "<Name> County". Duplicates are dropped while
    keeping first-seen order. An empty or missing note yields an empty list.
    """
    import re

    if not note:
        return []

    # A county name is a run of capitalised words.
    cname = r"[A-Z][A-Za-z.'-]*(?:\s+[A-Z][A-Za-z.'-]*)*"

    found = []

    # 1. Plural list: comma/"and"-separated capitalised names before "counties".
    #    Only the first such list in the note is used.
    plural_list_pattern = (
        rf"({cname}"
        rf"(?:\s*,\s*{cname})*"     # zero or more comma elements
        rf"(?:\s+and\s+{cname})?"   # optional final "and <name>"
        rf")\s+counties\b"
    )
    plural_hit = re.search(plural_list_pattern, note)
    if plural_hit is not None:
        found.extend(
            piece.strip() + " County"
            for piece in re.split(r"\s*,\s*|\s+and\s+", plural_hit.group(1))
        )

    # 2. Full singular "X County".
    singular_pattern = rf"({cname})\s+County\b"
    found.extend(
        hit.group(1) + " County"
        for hit in re.finditer(singular_pattern, note)
    )

    # 3. Abbreviated "X Co" or "X Co.".
    abbrev_pattern = rf"({cname})\s+Co\.?\b"
    found.extend(
        hit.group(1) + " County"
        for hit in re.finditer(abbrev_pattern, note, flags=re.IGNORECASE)
    )

    # dict keys keep insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(found))


In [None]:
# Hand-maintained renames so LandVote municipality names line up with the names
# used in the census place / county-subdivision tables.
# Layout: state abbreviation -> {LandVote name (after cleaning): census name}.
# The renames are scoped per state, so a name is only rewritten in the state listed.
city_vals = {
    "FL": {
        "North Bay Village City": "North Bay Village",
    },
    "ID": {
        "Boise": "Boise City",
    },
    "IL": {
        "Godfrey Village": "Godfrey",
    },
    "MA": {
        "Agawam": "Agawam Town",
        "Amherst": "Amherst Town",
        "Barnstable": "Barnstable Town",
        "Braintree": "Braintree Town",
        "Bridgewater": "Bridgewater Town",
        "Franklin": "Franklin Town",
        "North Attleborough": "North Attleborough Town",
        "Palmer": "Palmer Town",
        "Randolph": "Randolph Town",
        "West Springfield": "West Springfield Town",
        "Weymouth": "Weymouth Town",
        "Winthrop": "Winthrop Town",

    },
    "MI": {
        "Augusta Charter": "Augusta",
        "Berlin Charter": "Berlin",
        "Charter of Oakland": "Oakland",
        "Charter of Meridian": "Meridian",
    },
    "NJ": {
        "Carney's Point": "Carneys Point",
        "Loch Arbor Village": "Loch Arbour",
        "Peapack-Gladstone": "Peapack and Gladstone",
        "Ridgewood Village": "Ridgewood",
        "West Paterson": "Woodland Park",
    },
    "NV": {
        "Moapa": "Moapa Town",
    },
    "NY": {
        "Ardsley Village": "Ardsley",
    },
    "OR": {
        "Willamalane": "Springfield",
    },
    "UT": {
        "Alpine City": "Alpine",
        "South Salt Lake City": "South Salt Lake",
        "West Valley": "West Valley City",
    },
}

# Case-insensitive pattern for jurisdiction-type words that are removed from
# municipality names before the renames above are applied.
city_match_pattern = r"(?i)\s*( Township| Borough|Village of | Park District| Recreation and Park District| Park and Recreation District| Park & Recreation District| Metropolitan| Open Space Park Facilities District No. 1)\b"

# Fold city_vals into a single deferred expression:
#   if (state & name match) -> new value, else fall through to the previous expr.
# The innermost fallback is the unchanged name.
expr = _.name
# The loop variable is deliberately NOT called `state`: that name is bound to
# the census state table, and rebinding it to a string here would break any
# later (re-)run of the county cell, which calls state.drop(...).
for state_abbr, mapping in city_vals.items():
    for old, new in mapping.items():
        expr = ibis.ifelse((_.state == state_abbr) & (_.name == old), new, expr)

# Exact-match replacements for the `notes` column, applied before county names
# are extracted from it. Keys are whole note strings (including stray spaces and
# misspellings); values are the corrected county, or '' where the note should
# not contribute a county at all (an empty note yields no extracted counties).
notes_vals ={'Carrabus County ': 'Cabarrus County',
            'Dekalb County':'DeKalb County',
            'District encompasses all of Springfield, Oregon.  Lane County, Measures 20-199':'Lane County',
            'Merrimack County': '',
            'Lewis and Clark County':'',
            'Middlessex Co':'Middlesex County',
            'Prop 1 needs to pass as well as Prop 2, Grand Traverse and Leelenau County':'',
            'St. Louis is independent city (not part of county)':''}


In [None]:
# Municipal measures, with names and notes normalised so they can be matched
# against the census place / county-subdivision names.
landvote_city = landvote_df.filter(_.jurisdiction == "Municipal")

# Squeeze whitespace, then strip jurisdiction-type words (Township, Borough, ...).
landvote_city = landvote_city.mutate(name=_.name.re_replace(collapse_spaces, " ").strip())
landvote_city = landvote_city.mutate(name=_.name.re_replace(city_match_pattern, "").strip())

# Replace known-bad notes wholesale; all other notes are left untouched.
landvote_city = landvote_city.mutate(notes = _.notes.substitute(value = notes_vals,else_= _.notes))

# Two measures are renamed by id.
landvote_city = landvote_city.mutate(name=ibis.ifelse(_.id.isin([2196,2548]), "Robbinsville", _.name))

# Edge cases where the notes lack the county: set the note by measure id.
for _measure_id, _county_note in (
    (2665, "Roanoke city"),
    (3022, "Fulton County"),
    (1526, "Orange County"),
    (2793, "Summit County"),
    (1287, "Sussex County"),
):
    landvote_city = landvote_city.mutate(
        notes=ibis.ifelse(_.id.isin([_measure_id]), _county_note, _.notes)
    )

# Finally apply the per-state renames built from city_vals.
landvote_city = landvote_city.mutate(name=expr)

# Load census places and county subdivisions and stack them into one lookup of
# (state, name, county) -> geometry.
places_url = 's3://public-census/2025/places/*.parquet'
places_geom = con.read_parquet(places_url).select('state','name','county','geometry')
cousub_url = 's3://public-census/2025/cousub/*.parquet'
cousub_geom = con.read_parquet(cousub_url).select('state','name','county','geometry')
# One row per (state, name, county); when both sources have an entry, whichever row comes first is kept.
city_geom = places_geom.union(cousub_geom).distinct(on=['state','name','county'], keep='first')

# Join municipal measures to places/subdivisions on name + state. A name can
# exist in several counties of one state, so the notes are used to disambiguate.
df_city = (city_geom
            .join(landvote_city, ["name","state"], how = "inner")
            # if county is specified in notes, use that county 
            .mutate(county_list=extract_counties_udf(_.notes)) 
            # one row per extracted county; rows with no extracted county are kept (county_list is NULL)
            .unnest(["county_list"],keep_empty = True)
            # keep a candidate if the notes named no county, or if its county is one the notes named
            .filter((_.county_list == None) | (_.county == _.county_list))
            .select(cols)
          )

# Prints the missing / duplicated id counts. check_matches has no return
# statement, so `df` is None here.
df = check_matches(landvote_city, df_city)


# Special Districts

Same deal as the municipal data! Lots of regex...


In [None]:
# Some special districts cover whole counties (or most of one). For those we
# map the district to every census place / county subdivision in the listed
# counties, minus a hand-picked exclusion set where the district is smaller.

def _place_names(state_abbr, counties):
    """Return the names of all city_geom entries in `counties` of one state."""
    return (
        city_geom
        .filter(_.state == state_abbr)
        .filter(_.county.isin(counties))
        .select('name')
        .execute()['name']
        .tolist()
    )


east_bay_regional_cities = _place_names('CA', ['Alameda County', 'Contra Costa County'])
mont_oh_places = _place_names('OH', ['Montgomery County'])

# NOTE(review): the third entry used to read ' Santa Cruz County County', which
# cannot match any county, so no Santa Cruz places were ever included. The
# district's own note mentions only "a portion of unincorporated Santa Cruz
# County" — extend excluded_midpen if the whole county is too broad.
midpen_cities = _place_names('CA', ['Santa Clara County', 'San Mateo County', 'Santa Cruz County'])
excluded_midpen = {
    'Daly City',
    'Brisbane',
    'Broadmoor',
    'Colma',
    'Millbrae',
    'Burlingame',
    'San Mateo',
    'San Bruno',
    'South San Francisco',
    'Foster City',
    'Belmont',
    'Alum Rock',
    'East Foothills',
    'Milpitas',
    'Diablo Range',
    'Gilroy',
    'Morgan Hill',
    'San Martin',
    'South Santa Clara Valley'
}
midpen_cities = [c for c in midpen_cities if c not in excluded_midpen]

greater_clark_cities = _place_names('WA', ['Clark County'])
excluded_greater_clark = {'Washougal', 'La Center', 'Vancouver', 'Battle Ground', 'Camas', 'Woodland'}
greater_clark_cities = [c for c in greater_clark_cities if c not in excluded_greater_clark]

# Special-district name (exactly as it appears in LandVote after whitespace
# squeezing) -> list of census place / county-subdivision names whose geometry
# stands in for the district. A district listed with several places gets one
# geometry per place. Keys must match the data verbatim, including spelling
# variants and typos (e.g. the two "Roaring Fork ..." spellings, "Metroplitan").
sd_vals = {
    "Tri-Lakes Park and Recreation District": ["Monument"],
    "Urban Drainage and Flood Control District": ["Denver"],
    "Blue Heron Recreation District": ["Phoenix"],
    "Mountains Recreation and Conservation Authority": ["Santa Monica"],
    "St. Helena Parish Recreation and Parks District": ["Greensburg"],
    "West Geauga Park and Recreation District": ["Chardon"],
    "Marin County Open Space District": ["San Rafael"],
    "Marinwood Community Services District": ["San Rafael"],
    "Tualatin Hills Park and Recreation District": ["Tualatin"],
    "Portland Metro": ["Portland"],
    "South Whidbey Parks and Recreation": ["Langley"],
    "Dundee Township Park District": ["Dundee"],
    "Monterey Peninsula Regional Park District": ["Monterey"],
    "South Suburban Park and Recreation District": ["Littleton"],
    "Prospect Recreation and Park District": ["Wheat Ridge"],
    "Butterfield Park District": ["York", "Milton"],
    "Plainfield Township Park District":["Plainfield"],
    # county-wide lists computed from city_geom
    "East Bay Regional Park District":east_bay_regional_cities,
    "Foothills Park and Recreation District":["Littleton"],
    "Foothills Park and Recreation District (Jefferson County)":["Dakota Ridge"],
    "North Clackamas Parks and Recreation District":["Oregon City"],
    "Bethel Lake Wylie Land Acquisition & Preservation Tax District":["York"],
    "Midpeninsula Regional Open Space District":midpen_cities,
    "Roaring Fork Open Space Parks and Recreation District":["Basalt","El Jebel","Eagle-Gypsum"],
    "Roaring Fork Open Space Park and Recreation District":["Basalt","El Jebel","Eagle-Gypsum"],
    "Perry Park Metroplitan District (in Douglas County)":["Larkspur"],
    "Hyland Hills Park and Recreation District":["Federal Heights"],
    "Ken-Caryl Ranch Metropolitan District":["Ken Caryl"],
    "Coal Creek Canyon Park and Recreation District":["Coal Creek"],
    "Recreational Authority of Traverse City and Garfield Township":["Traverse City","Garfield"],
    "Montgomery County Park District":mont_oh_places,
    "Ward's Creek Park District":['Downers Grove'],
    "Si View Metropolitan Park District":['North Bend'],
    "Howell Area Parks and Recreation Authority":["Howell"],
    "Fox Valley Park District":["Aurora", "North Aurora", "Montgomery", "Sugar Grove"],
    "Greater Clark Parks District":greater_clark_cities,
    "Chuckanut Community Forest Park District":["Bellingham"],
    "Peninsula Metropolitan Park District":["Gig Harbor Peninsula","Gig Harbor"],
    "Northwest Park and Recreation District 2":["Blaine"],
    "Homewood-Flossmoor Park District":["Homewood","Flossmoor"],
    "Bellbrook-Sugarcreek Park District":["Bellbrook"],
    "Grandwood Park District":["Grandwood Park"],
    "Arapahoe Parks and Recreation District":["Centennial"],
    "Atfalati Recreation Partnership (cities of Tigard and Tualatin, the Tigard-Tualatin School District, Washington County)":["Tigard","Tualatin"]
}

# Flatten sd_vals into a two-column lookup table with one row per
# (district name, mapped place) pair, so it can be joined against the measures.
sd_map = ibis.memtable(
    [
        {"sd_name": district, "mapped_name": place}
        for district, places in sd_vals.items()
        for place in places
    ]
)

# Notes of special-district measures that must not be used for county
# extraction: each exact note string is replaced by '' (an empty note yields
# no extracted counties). Keys have to match the data byte-for-byte.
# NOTE(review): the "â€™" sequence in the Midpen note looks like a mis-decoded
# apostrophe; presumably that is how the text comes out of the cp1252-decoded
# CSV — do not "fix" it without checking the raw data.
sd_notes_vals = {'Woodland Hills, Encino, and Tarzana, Measure FF  Community Facilities District':'',
                'Santa Monica Mountains and Hollywood Hills east of 405\r\nfreeway, Measure GG  Community Facilities District':'',
                "The measure requires a two-thirds vote in Midpenâ€™s jurisdiction in San Mateo and Santa Clara counties and a portion of unincorporated Santa Cruz County":'',
                 'Western Summit County':''
}

# Case-insensitive pattern for district-type phrases stripped from special-district
# names that have no entry in sd_vals, leaving (ideally) just a place name.
sd_match_pattern = r"(?i)\s*( Assessment District for Park and Recreation| Community Park District|Metro Parks | County Regional Park and Open Space District| Valley Open Space Authority| Parks And Recreation District| Mountains Open Space Preservation Assessment District 1| Mountains Open Space Preservation Assessment District 2| Park and Recreation District| Park District| Basin Special Recreation District)\b"


# Special-district measures with `name` rewritten to a census place name.
landvote_sd = (
    landvote_df
    .filter(_.jurisdiction == "Special District")
    .drop("county")
    # squeeze repeated whitespace, then trim, so names can match the sd_vals keys
    .mutate(name = _.name.re_replace(collapse_spaces, " ").strip())
    # districts listed in sd_vals pick up their mapped place(s); a district mapped to
    # several places becomes several rows that share the same measure id
    .left_join(sd_map, _.name == sd_map.sd_name)
    # use the mapped place when there is one, otherwise keep the district name
    .mutate(name = _.mapped_name.coalesce(_.name))
    .drop("sd_name", "mapped_name")
    # for everything else, strip district-type phrases from the name
    .mutate(name = _.name.re_replace(sd_match_pattern, "").strip())
    # blank out notes that would otherwise yield a misleading county
    .mutate(notes = _.notes.substitute(value=sd_notes_vals, else_= _.notes))
    .mutate(name=ibis.ifelse(_.id.isin([1399]), "Vancouver", _.name)) # edge case: this measure is mapped to Vancouver by id
)


# Join special-district measures to census places / county subdivisions on
# name + state, using counties named in the notes to disambiguate.
df_sd = (city_geom
            .join(landvote_sd, ["name","state"], how = "inner")
            # if county is specified in notes, use that county 
            .mutate(county_list=extract_counties_udf(_.notes)) 
            # one row per extracted county; rows with no extracted county are kept (county_list is NULL)
            .unnest(["county_list"],keep_empty = True)
            # keep a candidate if the notes named no county, or if its county is one the notes named
            .filter((_.county_list == None) | (_.county == _.county_list))
            .select(cols)
          )

# Prints the missing / duplicated id counts. Districts mapped to several places
# are expected to show up as duplicated ids. check_matches has no return
# statement, so `df` is None here.
df = check_matches(landvote_sd, df_sd)

# Combine all 4 jurisdiction types into a single file

In [None]:
# Destination of the combined table.
save_url='s3://public-tpl/landvote/landvote_geom_v2.parquet'
# Stack the state, county, municipal and special-district tables; each was
# projected onto `cols`, so the schemas line up.
df_final = df_state.union(df_county).union(df_city).union(df_sd)
df_final.to_parquet(save_url)

# Save as pmtiles

In [None]:
# Explicit import: os.path is used below, and none of the notebook's explicit
# imports bind `os` (it would otherwise have to leak in through a star import).
import os

# Intermediate vector file handed to the tiler. Despite the variable name the
# extension is .fgb.
geobuf_file = 'landvote_geom_v2.fgb'
# Materialise the combined table locally and write it to disk.
df_final.execute().to_file(geobuf_file)

pmtiles_file = 'landvote_geom_v2.pmtiles'
pmtiles = f"s3://public-tpl/landvote/{pmtiles_file}"
# Layer name = file name without extension, with every non-word character removed.
source_layer_name = re.sub(r'\W+', '', os.path.splitext(os.path.basename(pmtiles_file))[0])

options =[f'--layer={source_layer_name}',
            '--extend-zooms-if-still-dropping',  
         ]
# presumably to_pmtiles runs the tiler with these options and returns the path
# of the file it wrote, which s3_cp then uploads — confirm in cng.utils
new_pmtiles = to_pmtiles(geobuf_file, pmtiles_file, options = options )
s3_cp(new_pmtiles,pmtiles)

Map the PMTiles layer to make sure the export worked.

In [None]:
import leafmap.maplibregl as leafmap

m = leafmap.Map(style="positron")

# Public HTTPS address of the PMTiles archive uploaded in the previous cell.
pmtiles = f'https://minio.carlboettiger.info/public-tpl/landvote/{pmtiles_file}'

# Style in the sources + layers shape used by leafmap's PMTiles examples.
# The layer must carry an id, the source it reads from and the source-layer
# name written into the tiles.
_source_name = "landvote"
style_dict = {
    "version": 8,
    "sources": {
        _source_name: {
            "type": "vector",
            "url": "pmtiles://" + pmtiles,
            "attribution": "PMTiles",
        },
    },
    "layers": [
        {
            "id": source_layer_name,
            "source": _source_name,
            "source-layer": source_layer_name,
            "type": "fill",
            "paint": {
                "fill-color": "#1f78b4",
                "fill-opacity": 0.6
            },
        },
    ],
}

# Pass the style explicitly; without it the dictionary above is never used and
# leafmap falls back to its default styling. opacity mirrors the fill-opacity
# above so the layer is drawn at 0.6 either way.
m.add_pmtiles(pmtiles, style=style_dict, opacity=0.6)
m
