In [110]:
import copy
import datetime
import multiprocessing as mp
import pathlib

import dask.dataframe as dd
import geopandas as gpd
import joblib
import numpy as np
import pandas as pd
import pyarrow as pa
from dask.distributed import Client, LocalCluster
from shapely.geometry import Point
from shapely.strtree import STRtree


In [111]:
from src.utils.id_powerline_endpoints import load_states, load_counties, load_geography

In [112]:
# Load the powerline layer through the project helper (its implementation is
# not visible in this notebook).
powerlines = load_geography("powerlines.geojson")
# Geometric boundary of every powerline geometry. For line geometries this is
# presumably the pair of start/end points -- NOTE(review): confirm the layer
# contains only (multi)line geometries.
boundary = powerlines.geometry.boundary

In [113]:
# Number of powerlines whose VOLTAGE is negative (these are patched in a later cell).
powerlines["VOLTAGE"].lt(0).sum()

14062

In [114]:
# Number of powerlines whose VOLT_CLASS is the empty string.
blank_volt_class = powerlines["VOLT_CLASS"].isin([""])
blank_volt_class.sum()

0

In [115]:
# Mean of the strictly positive voltages within each VOLT_CLASS; used as the
# replacement value for negative voltages of the same class.
fill_val = powerlines[powerlines["VOLTAGE"] > 0].groupby("VOLT_CLASS")["VOLTAGE"].mean()


def replace_negative_voltage(row):
    """Return the voltage to use for a single powerline row.

    A negative ``VOLTAGE`` is replaced by the mean positive voltage of the
    row's ``VOLT_CLASS`` (looked up in ``fill_val``). If that class has no
    entry in ``fill_val`` the original (negative) value is returned unchanged.
    Non-negative voltages are returned as they are.

    This is the scalar form of the rule; the frame-wide update below applies
    the same rule without a Python-level call per row.
    """
    if row["VOLTAGE"] < 0:
        return fill_val.get(row["VOLT_CLASS"], row["VOLTAGE"])  # Use class avg or keep original
    return row["VOLTAGE"]


# Vectorised equivalent of ``powerlines.apply(replace_negative_voltage, axis=1)``:
# only rows that are negative AND whose class has a known mean are replaced,
# every other row keeps its current value.
voltage = powerlines["VOLTAGE"]
volt_class = powerlines["VOLT_CLASS"]
replaceable = (voltage < 0) & volt_class.isin(fill_val.index)
powerlines["VOLTAGE"] = voltage.mask(replaceable, volt_class.map(fill_val))

In [116]:
# Split every boundary into its individual parts, one row per part.
# index_parts=True keeps the original row label as the outer index level and
# adds an inner level numbering the parts; later cells group on level 0 to get
# back to one group per powerline.
start_end = boundary.explode(index_parts=True)

In [117]:
# Load the state and county layers through the project helpers (implementations
# not visible here). Later cells read the "STUSPS" / "geometry" columns of
# `states` and the "GEOID" / "geometry" columns of `counties`.
states = load_states()
counties = load_counties()

In [118]:
# One boolean column per state (labelled by its STUSPS code): True where the
# endpoint in `start_end` lies within that state's geometry.
ids = pd.DataFrame(
    {
        state_code: start_end.within(state_geometry)
        for state_code, state_geometry in zip(states["STUSPS"], states["geometry"])
    }
)


In [119]:
# One boolean column per county (labelled by its GEOID): True where the
# endpoint in `start_end` lies within that county's geometry.
county_membership = [
    start_end.within(county_geometry).rename(geoid)
    for geoid, county_geometry in zip(counties["GEOID"], counties["geometry"])
]
# Columns share the index of `start_end`, so a column-wise concat lines them up.
county_ids = pd.concat(county_membership, axis=1)

In [120]:
from collections import defaultdict

# `df` is kept as an alias because other cells may refer to the county
# membership frame by this name.
df = county_ids

# Step 1: one sub-frame per outer index label (i.e. per original powerline row).
grouped = df.groupby(level=0)

# Step 2: the membership matrix of each group as a 2D boolean array, keyed by
# its outer index label.
group_arrays = {k: grouped.get_group(k).to_numpy(dtype=bool) for k in grouped.groups}

# Step 3: bucket together the labels whose membership matrices are identical.
# The key is the array's shape plus its raw bytes: for boolean data this is
# equal exactly when the arrays are element-wise equal, and it is far smaller
# and cheaper to hash than a tuple holding one Python object per element.
seen = {}
for k, arr in group_arrays.items():
    arr_key = (arr.shape, arr.tobytes())
    seen.setdefault(arr_key, []).append(k)

# List of lists: each inner list holds the labels that share one matrix.
groups = list(seen.values())

In [131]:
# Turn each group of powerlines (rows with identical county-membership
# matrices) into one graph edge between two counties.
#
# A group is accepted only when
#   * its endpoints fall in exactly two counties, and
#   * the group has exactly twice as many endpoint rows as the larger of the
#     two per-county endpoint counts.
# Every other group is counted in `trash_count` and skipped.
#
# The checks are plain conditionals rather than `assert` statements so the
# filtering still happens when Python runs with assertions disabled (-O).
# `county_ids` is read directly instead of through the `df` alias, because a
# later cell rebinds `df` to the edge table and would break a re-run of this
# cell.
keep = []
graph = []
trash_count = 0
for group in groups:
    allequal = county_ids.loc[group]
    endpoints_per_county = allequal.sum()
    nonzero = allequal.columns[endpoints_per_county > 0]

    is_two_county_link = (
        len(nonzero) == 2
        # Membership values are booleans, so the row count equals the
        # per-column non-null count used previously.
        and len(allequal) == 2 * endpoints_per_county[nonzero].max()
    )
    if not is_two_county_link:
        trash_count += 1
        continue

    # Only accepted groups need their voltages summed.
    total_voltage = powerlines["VOLTAGE"].loc[group].sum()
    keep.append(group)
    graph.append((*nonzero, total_voltage))
print(trash_count)

3116


In [132]:
# Display the edge list: one (county GEOID, county GEOID, total voltage) tuple
# per accepted group.
graph

[('39095', '39173', 890.0),
 ('01117', '01073', 1265.0),
 ('45091', '45021', 593.5257191139194),
 ('23015', '23005', 690.0),
 ('42111', '42063', 230.0),
 ('21155', '21217', 230.0),
 ('46115', '46025', 69.0),
 ('47123', '47105', 322.0),
 ('37057', '37151', 338.0),
 ('01073', '01127', 1426.0),
 ('51650', '51700', 575.0),
 ('37191', '37195', 230.0),
 ('27077', '27135', 230.0),
 ('21237', '21025', 69.0),
 ('13085', '13227', 115.0),
 ('40103', '40047', 483.0),
 ('12057', '12101', 207.0),
 ('01087', '01081', 460.0),
 ('54007', '54041', 169.0),
 ('47121', '47107', 161.0),
 ('21211', '21185', 69.0),
 ('42037', '42093', 230.0),
 ('51069', '51043', 276.0),
 ('51147', '51011', 115.0),
 ('29109', '29209', 161.0),
 ('45065', '45001', 66.7628595569597),
 ('13121', '13063', 960.0),
 ('13009', '13319', 345.0),
 ('21031', '21227', 161.0),
 ('51187', '51139', 138.0),
 ('24003', '24033', 870.9769940828403),
 ('05045', '05029', 822.0),
 ('12099', '12085', 1506.0),
 ('26057', '26081', 345.0),
 ('17165', '1

In [133]:
# Sanity check: positions of any edge tuples that do not have exactly three items.
badidx = [position for position in range(len(graph)) if len(graph[position]) != 3]

In [134]:
# Show the malformed edge tuples themselves (an empty list means none were found).
[graph[bad_position] for bad_position in badidx]

[]

In [135]:
# Tabulate the edge tuples. Note that this rebinds `df`, which an earlier cell
# used as an alias for `county_ids`.
edge_columns = ["src", "dest", "total_voltage"]
df = pd.DataFrame(data=graph, columns=edge_columns)

In [136]:
# Write the edge table to graph.csv relative to the current working directory.
# With pandas' default index=True the row index is written as the first column.
df.to_csv("graph.csv")

In [123]:
# Display the geometry column of the state layer for inspection.
states["geometry"]

0     POLYGON ((-104.05351 41.15726, -104.05267 41.2...
1     MULTIPOLYGON (((-122.32834 48.02134, -122.3217...
2     POLYGON ((-109.05017 31.48, -109.04984 31.4995...
3     POLYGON ((-104.0577 44.99743, -104.05021 44.99...
4     POLYGON ((-106.64548 31.89867, -106.64084 31.9...
5     MULTIPOLYGON (((-118.60338 33.4781, -118.59878...
6     MULTIPOLYGON (((-89.40565 36.52816, -89.39868 ...
7     MULTIPOLYGON (((-82.73571 41.60336, -82.7188 4...
8     MULTIPOLYGON (((-88.04374 30.51742, -88.03661 ...
9     POLYGON ((-85.60516 34.98468, -85.55259 34.984...
10    MULTIPOLYGON (((-86.9562 45.35201, -86.95339 4...
11    POLYGON ((-124.55244 42.84057, -124.50014 42.9...
12    POLYGON ((-80.51989 40.90666, -80.51909 40.921...
13    MULTIPOLYGON (((-88.51067 30.21702, -88.49238 ...
14    POLYGON ((-95.77355 40.5782, -95.76853 40.5833...
15    MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ...
16    POLYGON ((-103.00256 36.52659, -103.00219 36.6...
17    POLYGON ((-82.643 38.16956, -82.63905 38.1