In [2]:
# ======================================
# BLOCK A — INSPECT THE GRAPH
# Purpose: Load the graph and understand
# what one road edge contains
# ======================================

import re
import numpy as np
import pandas as pd
import osmnx as ox
import networkx as nx


In [3]:
# Download the drivable street network for the study area
place = "Atlanta, Georgia, USA"
G = ox.graph_from_place(place, simplify=True, network_type="drive")

# Report graph size; edges are counted with their multigraph keys
for label, count in (("Nodes:", len(G.nodes)), ("Edges:", len(G.edges(keys=True)))):
    print(label, count)


Nodes: 12912
Edges: 33645


In [4]:
# Peek at the first edge to see what a raw attribute dictionary looks like
u, v, k, data = next(iter(G.edges(keys=True, data=True)))

print("u, v, k:", u, v, k)
print("Edge keys:", list(data))

# Attributes that matter for travel-time estimation (absent ones print None)
for label, attr in (
    ("Length:", "length"),
    ("Highway:", "highway"),
    ("Oneway:", "oneway"),
    ("Lanes:", "lanes"),
    ("Maxspeed:", "maxspeed"),
):
    print(label, data.get(attr))

data


u, v, k: 68145665 462857057 0
Edge keys: ['osmid', 'name', 'highway', 'oneway', 'reversed', 'length']
Length: 74.605
Highway: tertiary
Oneway: False
Lanes: None
Maxspeed: None


{'osmid': 9167391,
 'name': 'Flat Shoals Road Southeast',
 'highway': 'tertiary',
 'oneway': False,
 'reversed': True,
 'length': np.float64(74.605)}

In [5]:
# ================================
# END BLOCK A
# ================================


In [14]:
# ======================================
# BLOCK B — DEFINE BASELINE EDGE TIME
# Purpose: Convert each edge (road segment)
# from distance (meters) into time (seconds)
# using: time = distance / speed
# 1 row = 1 road segment
# ======================================

def normalize_highway(hw):
    """Return a lowercase, stripped highway tag, or "road" when it is unknown.

    Parameters
    ----------
    hw : str | list | tuple | None
        Raw ``highway`` edge attribute. When it is a list or tuple, only the
        first element is used.

    Returns
    -------
    str
        The cleaned tag. Falls back to ``"road"`` for None, NaN, an empty
        list/tuple, or a blank string, so the result is always usable as a
        lookup key.
    """
    # A list/tuple holds several tags; keep the first, or treat empty as missing
    if isinstance(hw, (list, tuple)):
        hw = hw[0] if len(hw) > 0 else None

    # None and float NaN both mean "no tag" (NaN is the only value != itself)
    if hw is None or (isinstance(hw, float) and hw != hw):
        return "road"

    cleaned = str(hw).strip().lower()
    # A blank tag carries no information, so use the same fallback
    return cleaned or "road"

def normalize_oneway(x):
    """Coerce a raw ``oneway`` attribute to a plain boolean.

    Booleans pass through unchanged and None counts as two-way. Any other
    value is compared, as a stripped lowercase string, against the accepted
    truthy spellings ("yes", "true", "1", "-1").
    """
    if isinstance(x, bool):
        return x
    if x is None:
        return False
    truthy_spellings = ("yes", "true", "1", "-1")
    return str(x).strip().lower() in truthy_spellings

def normalize_lanes(x):
    """Parse a raw ``lanes`` attribute into a float lane count.

    Accepts None, numbers, strings, and lists (first element is used).
    Returns ``np.nan`` when no number can be extracted.
    """
    if x is None:
        return np.nan

    is_number = isinstance(x, (int, float))
    if is_number and not pd.isna(x):
        return float(x)

    # For a non-empty list, only the first entry is considered
    first = x[0] if (isinstance(x, list) and len(x) > 0) else x

    # Grab the first integer/decimal that appears in the text
    found = re.search(r"(\d+(\.\d+)?)", str(first).strip().lower())
    return float(found.group(1)) if found else np.nan

# One record per graph edge, with attributes cleaned by the helpers above
rows = [
    {
        "u": src,
        "v": dst,
        "key": edge_key,
        "length_m": float(attrs.get("length")),
        "highway": normalize_highway(attrs.get("highway")),
        "oneway": normalize_oneway(attrs.get("oneway")),
        "lanes": normalize_lanes(attrs.get("lanes")),
        "maxspeed_raw": attrs.get("maxspeed"),  # kept raw; parsed in a later cell
        "name": attrs.get("name"),
    }
    for src, dst, edge_key, attrs in G.edges(keys=True, data=True)
]

df_edges = pd.DataFrame(rows)

print("df_edges shape:", df_edges.shape)
df_edges.head()

df_edges shape: (33645, 9)


Unnamed: 0,u,v,key,length_m,highway,oneway,lanes,maxspeed_raw,name
0,68145665,462857057,0,74.605,tertiary,False,,,Flat Shoals Road Southeast
1,68145665,68155527,0,134.654,tertiary,False,,,Flat Shoals Road Southeast
2,68147058,69555000,0,291.43,trunk,False,5.0,35 mph,Moreland Avenue Northeast
3,68147058,2511829141,0,18.206,trunk,True,2.0,35 mph,Moreland Avenue Northeast
4,68147083,2511829139,0,217.498,trunk_link,True,1.0,,


In [15]:
# Coverage report for the edge table (values are fractions in [0, 1])
print("df_edges shape:", df_edges.shape)

# Columns that should never be missing
for column in ("length_m", "highway", "oneway"):
    print(f"{column} null %:", df_edges[column].isna().mean())

# Optional OSM tags: report how often they are populated
print("lanes present %:", df_edges["lanes"].notna().mean())
print("maxspeed present %:", df_edges["maxspeed_raw"].notna().mean())

df_edges.head(5)


df_edges shape: (33645, 9)
length_m null %: 0.0
highway null %: 0.0
oneway null %: 0.0
lanes present %: 0.1985138950809927
maxspeed present %: 0.11906672611086343


Unnamed: 0,u,v,key,length_m,highway,oneway,lanes,maxspeed_raw,name
0,68145665,462857057,0,74.605,tertiary,False,,,Flat Shoals Road Southeast
1,68145665,68155527,0,134.654,tertiary,False,,,Flat Shoals Road Southeast
2,68147058,69555000,0,291.43,trunk,False,5.0,35 mph,Moreland Avenue Northeast
3,68147058,2511829141,0,18.206,trunk,True,2.0,35 mph,Moreland Avenue Northeast
4,68147083,2511829139,0,217.498,trunk_link,True,1.0,,


In [16]:
# Assumed speed (mph) per highway class, used when an edge has no maxspeed
HIGHWAY_SPEED_MPH = dict(
    motorway=65,
    motorway_link=55,
    trunk=55,
    trunk_link=45,
    primary=45,
    primary_link=35,
    secondary=35,
    secondary_link=30,
    tertiary=30,
    tertiary_link=25,
    residential=25,
    living_street=15,
    service=15,
    unclassified=25,
    road=25,
)

# Speed (mph) for any highway class missing from the table above
DEFAULT_MPH = 25


In [17]:
# Turn a raw maxspeed tag into a numeric mph value whenever possible
# (accepted shapes: 35, "35 mph", "50", "80 km/h", lists of those, etc.)
def parse_maxspeed_mph(x):
    """Return the speed limit in mph as a float, or ``np.nan`` if unparseable.

    Numbers are returned as-is (taken to be mph). For a non-empty list only
    the first entry is used. Text containing "km" or "kph" is converted from
    km/h; any other text is taken to be mph.
    """
    if x is None:
        return np.nan
    if isinstance(x, (int, float)) and not pd.isna(x):
        return float(x)

    # e.g. ["35 mph", "40 mph"] -> "35 mph"
    candidate = x[0] if (isinstance(x, list) and len(x) > 0) else x
    text = str(candidate).lower().strip()

    # First integer/decimal appearing in the text
    found = re.search(r"(\d+(\.\d+)?)", text)
    if found is None:
        return np.nan
    number = float(found.group(1))

    # Metric units are converted; everything else is treated as mph
    is_metric = any(unit in text for unit in ("km", "kph"))
    return number * 0.621371 if is_metric else number

# Parse every raw tag, then store the numeric column on the edge table
maxspeed_mph = df_edges["maxspeed_raw"].apply(parse_maxspeed_mph)
df_edges["maxspeed_mph"] = maxspeed_mph

present_share = maxspeed_mph.notna().mean()
print("maxspeed_mph present %:", present_share)
df_edges.loc[:, ["maxspeed_raw", "maxspeed_mph"]].head(10)


maxspeed_mph present %: 0.11906672611086343


Unnamed: 0,maxspeed_raw,maxspeed_mph
0,,
1,,
2,35 mph,35.0
3,35 mph,35.0
4,,
5,35 mph,35.0
6,35 mph,35.0
7,,
8,,
9,35 mph,35.0


In [18]:
# Baseline speed rule, in priority order:
#   1) the parsed maxspeed_mph, when present and positive
#   2) the default for the edge's highway type
#   3) DEFAULT_MPH for highway types without a default

def baseline_speed_mph(row):
    """Pick the baseline speed (mph) for one edge row.

    ``row`` must expose "maxspeed_mph" and "highway" by key.
    """
    posted = row["maxspeed_mph"]
    if pd.isna(posted) or not (posted > 0):
        # No usable posted limit: fall back to the per-class table
        return HIGHWAY_SPEED_MPH.get(row["highway"], DEFAULT_MPH)
    return posted

# Row-wise speed selection, then unit conversion (1 mph = 0.44704 m/s)
speed_mph = df_edges.apply(baseline_speed_mph, axis=1)
df_edges["speed_mph"] = speed_mph
df_edges["speed_mps"] = df_edges["speed_mph"] * 0.44704

df_edges.loc[:, ["highway", "maxspeed_mph", "speed_mph", "speed_mps"]].head(10)


Unnamed: 0,highway,maxspeed_mph,speed_mph,speed_mps
0,tertiary,,30.0,13.4112
1,tertiary,,30.0,13.4112
2,trunk,35.0,35.0,15.6464
3,trunk,35.0,35.0,15.6464
4,trunk_link,,45.0,20.1168
5,secondary,35.0,35.0,15.6464
6,secondary,35.0,35.0,15.6464
7,trunk_link,,45.0,20.1168
8,trunk_link,,45.0,20.1168
9,trunk,35.0,35.0,15.6464


In [19]:
# Baseline travel time per edge: seconds = meters / (meters per second)
lengths_m = df_edges["length_m"]
speeds_mps = df_edges["speed_mps"]
df_edges["tt_base_s"] = lengths_m / speeds_mps


In [20]:
tt_base = df_edges["tt_base_s"]

# Validity checks: share of missing values and count of negative times
print("NaN % tt_base_s:", tt_base.isna().mean())
print("negatives:", (tt_base < 0).sum())

# Distribution of per-edge travel times (seconds)
tt_base.describe(percentiles=[0.25, 0.5, 0.75, 0.9]).round(2)


NaN % tt_base_s: 0.0
negatives: 0


count    33645.00
mean        12.44
std         11.46
min          0.04
25%          5.36
50%          9.52
75%         15.67
90%         25.44
max        192.87
Name: tt_base_s, dtype: float64

In [21]:
# ================================
# END BLOCK B
# You now have tt_base_s for every edge
# ================================


In [24]:
# ======================================
# BLOCK C — LEARN EDGE TIME WITH ML
# Goal: predict tt_base_s from edge features
# (Week 1: plumbing test; later we’ll swap target to real observed time)
# ======================================


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Impute lanes with the column median; use 1 lane if the median is NaN
lanes_median = df_edges["lanes"].median()
lanes_median = 1.0 if pd.isna(lanes_median) else lanes_median

# Missing and non-positive lane counts both receive the imputed value
df_edges["lanes_filled"] = (
    df_edges["lanes"]
    .fillna(lanes_median)
    .mask(lambda lanes: lanes <= 0, lanes_median)
)

# Target: baseline travel time in seconds
y = df_edges["tt_base_s"].copy()

# Features, with highway one-hot encoded (all categories kept)
X = pd.get_dummies(
    df_edges[["length_m", "highway", "oneway", "lanes_filled"]],
    columns=["highway"],
    drop_first=False,
)
print("X shape:", X.shape)


X shape: (33645, 18)


In [26]:
# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Random forest settings gathered in one place
rf_params = dict(n_estimators=200, min_samples_leaf=2, n_jobs=-1, random_state=42)
model = RandomForestRegressor(**rf_params)

# fit() returns the estimator itself, so prediction can be chained
pred = model.fit(X_train, y_train).predict(X_test)

# Hold-out metrics
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

print("MAE (sec):", round(mae, 3))
print("R^2:", round(r2, 4))
print("pred min/median/max:", round(pred.min(),2), round(np.median(pred),2), round(pred.max(),2))


MAE (sec): 0.195
R^2: 0.9837
pred min/median/max: 0.17 9.49 158.53


In [27]:
import os, pickle

# Make sure the target folder exists before writing
os.makedirs("artifacts", exist_ok=True)

# Bundle everything needed to rebuild the feature matrix at inference time
artifact = dict(
    model=model,
    feature_columns=X.columns.tolist(),
    lanes_median=float(lanes_median),
)

with open("artifacts/edge_tt_model.pkl", "wb") as fh:
    pickle.dump(artifact, fh)

print("Saved artifacts/edge_tt_model.pkl")


Saved artifacts/edge_tt_model.pkl


In [28]:
# ======================================
# BLOCK D — INJECT COST INTO ROUTING
# Purpose: write tt_base_s and tt_pred_s into G
# and show routing changes
# ====================

In [29]:
# Rebuild the feature matrix for every edge with the training-time encoding
feature_columns = artifact["feature_columns"]
X_all = pd.get_dummies(
    df_edges[["length_m", "highway", "oneway", "lanes_filled"]],
    columns=["highway"],
    drop_first=False,
)

# Add any training column that is absent here, then match the training order
absent_columns = [name for name in feature_columns if name not in X_all.columns]
for name in absent_columns:
    X_all[name] = 0
X_all = X_all[feature_columns]

# Predict, flooring at 0.01 s so every edge cost is strictly positive
df_edges["tt_pred_s"] = np.maximum(model.predict(X_all), 0.01)

df_edges.loc[:, ["length_m", "tt_base_s", "tt_pred_s"]].head()


Unnamed: 0,length_m,tt_base_s,tt_pred_s
0,74.605,5.562888,5.562645
1,134.654,10.040414,10.053411
2,291.43,18.62601,17.446028
3,18.206,1.16359,1.142293
4,217.498,10.811759,11.141747


In [30]:
# Index both travel-time columns by (u, v, key) in a single pass
tt_base_lookup, tt_pred_lookup = {}, {}
for rec in df_edges.itertuples(index=False):
    edge_id = (rec.u, rec.v, rec.key)
    tt_base_lookup[edge_id] = float(rec.tt_base_s)
    tt_pred_lookup[edge_id] = float(rec.tt_pred_s)

# Write both costs onto the matching graph edges (attribute dicts are live)
for src, dst, edge_key, attrs in G.edges(keys=True, data=True):
    edge_id = (src, dst, edge_key)
    if edge_id not in tt_base_lookup:
        continue
    attrs["tt_base_s"] = tt_base_lookup[edge_id]
    attrs["tt_pred_s"] = tt_pred_lookup[edge_id]

# Spot-check the first edge
u0, v0, k0, d0 = next(iter(G.edges(keys=True, data=True)))
print(d0["tt_base_s"], d0["tt_pred_s"])


5.562887735623956 5.562644523985873


In [31]:
# Pick two deterministic endpoints at 1/5 and 4/5 of the node list
nodes = list(G.nodes)
n_nodes = len(nodes)
orig, dest = nodes[n_nodes // 5], nodes[n_nodes * 4 // 5]

print("Origin:", orig)
print("Destination:", dest)

# Same origin/destination, two different edge costs
route_base, route_ai = (
    nx.shortest_path(G, orig, dest, weight=cost)
    for cost in ("tt_base_s", "tt_pred_s")
)

# "Length" here is the number of nodes on each route
print("Baseline route length:", len(route_base))
print("AI route length:", len(route_ai))
print("Routes identical?", route_base == route_ai)



Origin: 69202524
Destination: 2507712042
Baseline route length: 33
AI route length: 33
Routes identical? True


In [32]:
def path_time(G, path, weight):
    """Sum ``weight`` along ``path``, a sequence of node ids.

    For each consecutive node pair, the smallest ``weight`` among the
    parallel edges between them is used. Returns 0.0 for paths with fewer
    than two nodes.
    """
    elapsed = 0.0
    for tail, head in zip(path, path[1:]):
        parallel_edges = G[tail][head]
        elapsed += min(attrs[weight] for attrs in parallel_edges.values())
    return elapsed

# Each route is scored with the cost it was optimized for
t_base, t_ai = (
    path_time(G, route_base, "tt_base_s"),
    path_time(G, route_ai, "tt_pred_s"),
)

print("Baseline time (s):", round(t_base, 2))
print("AI time (s):", round(t_ai, 2))


Baseline time (s): 491.58
AI time (s): 491.79


In [33]:
import random

def find_different_pair(G, tries=300, seed=None):
    """Search random origin/destination pairs for one where the routes differ.

    Parameters
    ----------
    G : graph
        Graph whose edges carry "tt_base_s" and "tt_pred_s" attributes.
    tries : int
        Maximum number of random node pairs to test.
    seed : int | None
        When given, sampling uses a private ``random.Random(seed)`` so the
        search is reproducible. When None, the module-level ``random`` state
        is used, as before.

    Returns
    -------
    tuple | None
        ``(origin, destination, baseline_route, ai_route)`` for the first
        pair whose shortest paths differ, or None if none was found.
    """
    nodes = list(G.nodes)
    # Two distinct endpoints are required; sampling would fail otherwise
    if len(nodes) < 2:
        return None

    sampler = random if seed is None else random.Random(seed)

    for _ in range(tries):
        o, d = sampler.sample(nodes, 2)
        try:
            rb = nx.shortest_path(G, o, d, weight="tt_base_s")
            ra = nx.shortest_path(G, o, d, weight="tt_pred_s")
        except nx.NetworkXNoPath:
            # Unreachable pair: skip it. Any other error is a real bug
            # and is allowed to propagate.
            continue
        if rb != ra:
            return o, d, rb, ra
    return None

res = find_different_pair(G)

if res is not None:
    # Unpack the pair and both routes, then compare size and travel time
    o, d, rb, ra = res
    print("Found different routing:")
    print("Baseline nodes:", len(rb))
    print("AI nodes:", len(ra))
    print("Baseline time:", round(path_time(G, rb, "tt_base_s"),2))
    print("AI time:", round(path_time(G, ra, "tt_pred_s"),2))
else:
    print("Could not find differing routes quickly (still OK).")


Found different routing:
Baseline nodes: 101
AI nodes: 74
Baseline time: 878.23
AI time: 879.22


In [35]:
# Create the output folder first; to_csv does not create missing directories,
# so this cell would otherwise fail on a fresh checkout.
# `os` is imported in the model-saving cell earlier in the notebook.
os.makedirs("outputs", exist_ok=True)

df_edges.to_csv("outputs/df_edges_week1.csv", index=False)
print("Saved outputs/df_edges_week1.csv")


Saved outputs/df_edges_week1.csv


In [36]:
# One-line recap of the differing-route search. The routes exist only when
# the search succeeded, so read them from `res` instead of relying on
# variables that may be undefined (or stale) in the kernel.
if res is None:
    print("No differing route pair was found; nothing to recap.")
else:
    rb, ra = res[2], res[3]
    print("Different paths?", rb != ra, "| base_time:", round(path_time(G, rb, "tt_base_s"),2), "| ai_time:", round(path_time(G, ra, "tt_pred_s"),2))


Different paths? True | base_time: 878.23 | ai_time: 879.22
