# Node and Edge features
This notebook provides additional information about which geo-spatial features make up the node and edge feature matrices in TAP.

In [4]:
import pickle

import numpy as np
import osmnx as ox
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Seed NumPy's global random number generator so any NumPy-based randomness is repeatable.
np.random.seed(17)
# Show the OSMnx version in the output; graph downloads and APIs differ between releases.
print('OSMnx version:', ox.__version__)

OSMnx version: 1.3.0


In [9]:
# Full name -> two-letter postal abbreviation.
us_state_to_abbrev = {
    # The fifty states
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ",
    "Arkansas": "AR", "California": "CA", "Colorado": "CO",
    "Connecticut": "CT", "Delaware": "DE", "Florida": "FL",
    "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA",
    "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA",
    "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
    "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC",
    "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI",
    "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN",
    "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
    # Federal district and territories
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

# Reverse lookup: two-letter abbreviation -> full name.
us_abbrev_to_state = {abbrev: name for name, abbrev in us_state_to_abbrev.items()}

In [5]:
# Load the crash table, then fill rows whose 'City' is missing with the value
# from the 'City_Old' column of the same row.
df = pd.read_csv('datasets/bing_crash_geocoded.csv')
df['City'] = df['City'].fillna(df['City_Old'])
df.shape

State null values: 0 
City null values: 1632549


(2845342, 23)

In [3]:
# Read the pickled list of (city, state) pairs that the rest of the notebook iterates over.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted pickles.
with open('datasets/v16_1k_city_ls.pkl', 'rb') as city_file:
    all_city_ls = pickle.load(city_file)
len(all_city_ls)

1000

In [19]:
# df1 = df[df.apply(lambda row: (row['City'], row['State']) in all_city_ls, axis=1)]
# df1.shape

(1952926, 23)

In [15]:
# Preview the first ten entries; each one is indexed below as (city, state).
all_city_ls[:10]

[('Miami', 'Florida'),
 ('Los Angeles', 'California'),
 ('Orlando', 'Florida'),
 ('Dallas', 'Texas'),
 ('Houston', 'Texas'),
 ('New York', 'New York'),
 ('Charlotte', 'North Carolina'),
 ('San Diego', 'California'),
 ('Nashville', 'Tennessee'),
 ('Sacramento', 'California')]

In [16]:
def _replace_list_cells(column, replacement):
    """Return `column` with every list-valued cell swapped for `replacement`.

    Some edge attribute cells hold a list of values instead of a scalar;
    those cells cannot be one-hot or label encoded as they are.
    """
    return column.map(lambda cell: replacement if isinstance(cell, list) else cell)


def _collapse_lane_list(cell):
    """Reduce a list of lane counts to a single string; pass scalars through.

    A list becomes the truncated mean of its entries, e.g. ['2', '3'] -> '2'.
    Returns None when the list is empty or an entry is not an integer, so the
    caller can substitute a fallback value.
    """
    if not isinstance(cell, list):
        return cell
    try:
        return str(int(np.mean([int(lane) for lane in cell])))
    except (TypeError, ValueError):
        return None


# Cities flagged while processing. They are created here so the cell runs on a
# fresh kernel; nothing else in the notebook defines them.
nearest_nodes_bug_check_ls = []
cities_no_highway_node_attr_ls = []

crash_cols = ['Severity', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Start_Time', 'End_Time', 'osm_id']

for e in all_city_ls[5:6]:  # only the entry at index 5 is processed here
    print('Current City:', e)
    city_name, state_name = e

    # .copy() so that adding the 'osmid' column below does not write into a view of `df`.
    in_city = (df.State == state_name) & (df.City == city_name)
    df_city = df.loc[in_city, crash_cols].copy()
    G_city = ox.graph_from_place({"city": city_name, "state": state_name}, simplify=True, network_type='drive')

    # Snap every crash to its nearest graph node.
    nearest_nodes = ox.distance.nearest_nodes(G_city, df_city['Start_Lng'], df_city['Start_Lat'])
    # Map OSM node to accident node? or delete node with no accident
    # Sanity check: if the first ten crashes all snap to the same node, the
    # matching is suspicious. At least two crashes are needed for the check to
    # mean anything, so cities with 0 or 1 crashes are not flagged.
    first_matches = list(nearest_nodes[:10])
    if len(first_matches) > 1 and len(set(first_matches)) == 1:
        nearest_nodes_bug_check_ls.append(e)
        print('WARNING: may have a nearest_nodes bug')
    df_city['osmid'] = nearest_nodes

    gdf_nodes, gdf_edges = ox.graph_to_gdfs(G_city)

    # Aggregate crashes per node in one grouped pass instead of re-filtering
    # df_city for every node of the graph.
    crashes_by_node = df_city.groupby('osmid')
    start_by_node = crashes_by_node['Start_Time'].agg(list).to_dict()
    end_by_node = crashes_by_node['End_Time'].agg(list).to_dict()
    # Time stamps of the crashes at each node; an empty list means no crash.
    gdf_nodes['start_time'] = pd.Series(
        [start_by_node.get(node_id, []) for node_id in gdf_nodes.index],
        index=gdf_nodes.index, dtype=object,
    )
    gdf_nodes['end_time'] = pd.Series(
        [end_by_node.get(node_id, []) for node_id in gdf_nodes.index],
        index=gdf_nodes.index, dtype=object,
    )
    # Float dtype is kept because the previous row-wise assignment produced floats.
    gdf_nodes['accident_cnt'] = (
        crashes_by_node['Severity'].count().reindex(gdf_nodes.index, fill_value=0).astype(float)
    )
    # 0 denotes no accident
    gdf_nodes['severity'] = crashes_by_node['Severity'].mean().reindex(gdf_nodes.index).fillna(0)

    le = LabelEncoder()

    # Without a node-level 'highway' attribute the one-hot node features cannot
    # be built, so the city is recorded and skipped. Any other error propagates.
    if 'highway' not in gdf_nodes.columns:
        cities_no_highway_node_attr_ls.append(e)
        continue
    gdf_nodes['highway'] = gdf_nodes['highway'].fillna('nan')
    gdf_nodes = pd.concat([gdf_nodes, pd.get_dummies(gdf_nodes.highway, prefix='high')], axis=1)

    # Some cells contain list values; replace them with a single default.
    gdf_edges['highway'] = _replace_list_cells(gdf_edges['highway'].fillna('nan'), 'residential')
    gdf_edges['oneway'] = _replace_list_cells(gdf_edges['oneway'], False)

    edge_attrs = ['highway', 'oneway', 'length', 'bridge', 'lanes']
    if 'bridge' in gdf_edges.columns:
        bridge = _replace_list_cells(gdf_edges['bridge'].fillna('nan'), 'viaduct')
        gdf_edges['bridge'] = le.fit_transform(bridge)
    else:
        edge_attrs.remove('bridge')

    if 'lanes' in gdf_edges.columns:
        # '-1' marks edges with no lane information.
        lanes = gdf_edges['lanes'].fillna('-1').map(_collapse_lane_list)
        # Lists that could not be averaged come back as None; give them the
        # most frequent lane value, counted over the scalar cells only.
        if lanes.isna().any():
            lane_counts = lanes.value_counts()
            fallback = lane_counts.index[0] if len(lane_counts) else '-1'
            lanes = lanes.fillna(fallback)
        # NOTE(review): lanes are label-encoded as strings, so the codes follow
        # string sort order rather than numeric lane count. Confirm this is intended.
        gdf_edges['lanes'] = le.fit_transform(lanes)
    else:
        edge_attrs.remove('lanes')

    gdf_edges = gdf_edges[edge_attrs]
    gdf_edges = pd.concat(
        [
            gdf_edges,
            pd.get_dummies(gdf_edges.oneway, prefix='oneway'),
            pd.get_dummies(gdf_edges.highway, prefix='high'),
        ],
        axis=1,
    )

    crash_time = gdf_nodes[['start_time', 'end_time']]

    # Some cities don't have the 'ref' feature, so only drop columns that exist.
    node_attrs_remove = [
        col for col in ('ref', 'geometry', 'highway', 'start_time', 'end_time')
        if col in gdf_nodes.columns
    ]
    gdf_nodes = gdf_nodes.drop(columns=node_attrs_remove)

    # Re-index nodes to 0..N-1 and express every edge with those positions.
    node_ls = gdf_nodes.index.tolist()
    node_idx_map = {node_id: position for position, node_id in enumerate(node_ls)}
    my_edge_index = [(node_idx_map[edge[0]], node_idx_map[edge[1]]) for edge in gdf_edges.index]
    my_edge_attr = gdf_edges.drop(['highway', 'oneway'], axis=1)

    # for the occurrence prediction task only: 1 if the node has any crash, else 0
    labels = (gdf_nodes.accident_cnt > 0).astype(int)
    features = gdf_nodes.drop(['accident_cnt', 'x', 'y', 'severity'], axis=1)
    print(f'Node shape: {features.shape} features: {features.columns}')
    print(f'Edge shape: {my_edge_attr.shape} features: {my_edge_attr.columns}')

    # print(labels.value_counts())
    # print('Crash %:', round(100 * labels.values.sum() / len(labels.values)))

    # accident_percent = 100 * labels.sum() / labels.shape[0]
    # x=features.values, occur_labels=labels.values, edge_attr=my_edge_attr.values

Current City: ('New York', 'New York')
Node shape: (55292, 12) features: Index(['street_count', 'high_crossing', 'high_give_way',
       'high_motorway_junction', 'high_nan', 'high_priority', 'high_stop',
       'high_toll_gantry', 'high_traffic_signals',
       'high_traffic_signals;crossing', 'high_turning_circle',
       'high_turning_loop'],
      dtype='object')
Edge shape: (139463, 19) features: Index(['length', 'bridge', 'lanes', 'oneway_False', 'oneway_True',
       'high_busway', 'high_living_street', 'high_motorway',
       'high_motorway_link', 'high_primary', 'high_primary_link',
       'high_residential', 'high_secondary', 'high_secondary_link',
       'high_tertiary', 'high_tertiary_link', 'high_trunk', 'high_trunk_link',
       'high_unclassified'],
      dtype='object')
