In [None]:
import numpy as np
import pandas as pd
import shapely.geometry as geom
import data_utils as utils
import time 
import reverse_geocoder as rg
import importlib
import osmnx as ox
import re
import math

In [None]:
# Road data; presumably road segments joined with AADT traffic counts (TODO confirm).
# Columns read later in this notebook: 'County', 'geometry', 'Count_2014' ... 'Count_2019'.
data = pd.read_csv("joined_aadt.csv")
# Accident records; columns read later in this notebook: 'longitude', 'latitude', 'node'.
nodesdf = pd.read_csv("accident_node_data.csv")

In [None]:
# Quick look at both inputs: first rows of each frame, plus the row count of `data`.
display(data.head())
print(len(data))
display(nodesdf.head())

# Selecting points in Manhattan

Here we will select only the sections that are relevant to us, namely those pertaining to Manhattan. First, I select the nodes that are in Manhattan.

In [None]:
import geopandas as gpd

In [None]:
# Borough boundary polygons; the Manhattan polygon is picked by row position.
boro = gpd.read_file('borough_boundaries.geojson')
# NOTE(review): assumes row 2 of the file is Manhattan — confirm against boro.head().
# A positional index silently selects another borough if the file's row order changes.
manhattan_poly = boro['geometry'].iloc[2]

In [None]:
# Show the first rows of the borough table (useful to check which row holds which borough).
boro.head()

In [None]:
# Flag every accident record whose (longitude, latitude) point lies within the Manhattan polygon.
# `inside_manhattan` holds one boolean per row of nodesdf, in row order.
points = nodesdf[['longitude', 'latitude']].to_numpy()
coordinates = list(map(geom.Point, points))
inside_manhattan = list(map(lambda location: location.within(manhattan_poly), coordinates))


In [None]:
# Keep only the rows flagged as inside Manhattan and write them to disk
# (to_csv is called with its defaults, so the row index is written as well).
manhattan_accidents = nodesdf.loc[inside_manhattan]
manhattan_accidents.to_csv("manhattan_accidents_node_data.csv")

In [None]:
# Reload the Manhattan subset from disk so later cells can start from the saved file.
manhattan_accidents = pd.read_csv("manhattan_accidents_node_data.csv")

In [None]:
# Preview the reloaded Manhattan accident records.
manhattan_accidents.head()

Now I will obtain the point (coordinates) for each of the distinct nodes inside Manhattan.

In [None]:
# Sorted distinct node ids among the Manhattan accidents, together with the
# position of the first occurrence of each id.
unique_nodes, index_unique = np.unique(
    manhattan_accidents['node'].to_numpy(),
    return_index=True,
)


I will use the graphml file, which contains the latitude and longitude of every node, to obtain this information.

In [None]:
# Load the street graph; later cells read each node's 'x' and 'y' attributes
# and its adjacency (graph[node]) from it.
graph_file = 'newyork.graphml'
graph = ox.io.load_graphml(graph_file)

In [None]:
# For every distinct accident node, read its coordinates from the graph and
# record how many neighbours it has (the length of its adjacency entry).
num_connect = {node: len(graph[node]) for node in unique_nodes}

# Column vectors of the 'x' and 'y' node attributes, one row per node
unique_x = np.array([graph.nodes[node]['x'] for node in unique_nodes])[:, np.newaxis]
unique_y = np.array([graph.nodes[node]['y'] for node in unique_nodes])[:, np.newaxis]

# (n, 2) array whose rows are [x, y], in the same order as unique_nodes
unique_points = np.hstack((unique_x, unique_y))


Now we will repeat something very similar but with the road sections. We will select only those that are located in Manhattan

In [None]:
# Keep the road sections whose 'County' value is exactly 'New York' and save them.
manhattan_roads = data[data['County'] == 'New York']
manhattan_roads.to_csv('manhattan_road_segments_aadt.csv')

# Extracting line segments

In [None]:
# Pull every (optionally signed) decimal number out of each geometry string,
# in order of appearance; each row of `xy` becomes a flat list of number strings.
p = re.compile(r'[-+]?[0-9]*\.?[0-9]+')
xy = manhattan_roads['geometry'].apply(p.findall)

In [None]:
def transform(x):
    """
    Turn a flat sequence of numeric strings into an array of 2-D points.

    Consecutive entries are paired as [first, second] and every entry is
    converted with float(). A trailing entry without a partner is converted
    but not kept. Returns np.array of the pairs (an empty array when no
    pair can be formed).
    """
    values = [float(item) for item in x]
    pair_count = len(values) // 2
    return np.array([[values[2 * k], values[2 * k + 1]] for k in range(pair_count)])
    

In [None]:
def find_min_segment(point, segments, cutoff=True, max_dist_km=0.01):
    """
    Find the road(s) closest to a point.

    point: np.array([x, y]).
    segments: sequence of roads; each road is an array of consecutive
        vertices, so rows k and k+1 form one line piece of that road.
    cutoff: if True, return the indices of all roads closer than
        max_dist_km (possibly an empty array); if False, return the index
        of the single closest road.
    max_dist_km: distance threshold in km, used only when cutoff is True.
        The default reproduces the previously hard-coded 0.01 km.

    Roads with fewer than two vertices have no line piece to measure
    against; they get a NaN distance, so they are never selected.
    """
    # Distances are measured in coordinate units (degrees); one unit is taken
    # as about 111 km.
    # NOTE(review): the same factor is applied to both axes, so this is only
    # a rough conversion for longitude — confirm it is accurate enough.
    km_per_degree = 111

    def return_min(segment):
        # A road needs at least two vertices to contain a line piece.
        if len(segment) < 2:
            return np.nan
        return np.nanmin(utils.lineseg_dists(point, segment[:-1], segment[1:]))

    distances = np.array([return_min(segment) for segment in segments], dtype=float)
    if cutoff:
        # NaN distances compare as False and are therefore excluded.
        return (distances < (max_dist_km / km_per_degree)).nonzero()[0]
    else:
        # nanargmin ignores NaN entries (and raises if there is no valid one).
        return np.nanargmin(distances)


In [None]:
# Represent the road coordinates as an array of arrays: one entry per road,
# each entry being that road's array of [x, y] vertices built by `transform`.
segments = xy.apply(transform).to_numpy()

In [None]:
# For every node, find the roads within the cutoff distance and record, per
# count column, the largest value among those roads.
minim = []  # per node: comma-joined positions of the nearby roads ("" if none)
start = time.time()
nodes_roads_data = {}

# Yearly count columns Count_2014 ... Count_2019
cols = ['Count_201' + str(n) for n in range(4, 10)]

# Row-wise mean over the yearly counts (missing years are skipped by mean)
data['Count_mean'] = data[cols].mean(axis=1)
cols.append('Count_mean')
display(data.head())
for col in cols:
    nodes_roads_data[col] = []

# `segments` was built from `manhattan_roads`, so the positions returned by
# find_min_segment are row positions in manhattan_roads, NOT in the unfiltered
# `data`. Look the counts up in a Manhattan-only frame so the positions line up.
roads_aadt = manhattan_roads.assign(
    Count_mean=manhattan_roads[cols[:-1]].mean(axis=1)
)

for i in range(len(unique_points)):
    if i % 100 == 0:
        print(i)  # progress indicator
    road_ids = list(find_min_segment(unique_points[i], segments, cutoff=True))
    nearby_roads = roads_aadt.iloc[road_ids]
    for col in cols:
        # max() skips missing values; it yields NaN when there is no nearby
        # road or when every nearby road lacks a value for this column.
        max_AADT = nearby_roads[col].max()
        nodes_roads_data[col].append(max_AADT)

    minim.append(",".join(map(str, road_ids)))
print(time.time() - start)


In [None]:
# Neighbour counts listed in the same order as unique_nodes
num_connect_arr = [num_connect[node] for node in unique_nodes]

In [None]:
# Add the per-node identifiers next to the collected count columns, then build
# the table and drop the nodes for which every count column is missing.
nodes_roads_data.update({
    'nodes': unique_nodes,
    'num_connect': num_connect_arr,
    'roads': minim,
})

intersections = pd.DataFrame(data=nodes_roads_data).dropna(subset=cols, how="all")

In [None]:
# Count the nodes for which no road was found (their joined id string is empty)
num = sum(1 for joined_ids in minim if len(joined_ids) == 0)
print(num)

In [None]:
# Persist the node-level table (one row per node that has at least one count value).
intersections.to_csv("nodes_roads.csv")