Setting local docker image for ORSM back-end - https://hub.docker.com/r/osrm/osrm-backend/

- wget http://download.geofabrik.de/europe/germany/berlin-latest.osm.pbf  
    
- docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-extract -p /opt/car.lua /data/berlin-latest.osm.pbf   

- docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-partition /data/berlin-latest.osrm

- docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-customize /data/berlin-latest.osrm

- docker run -t -i -p 5000:5000 -v "${PWD}:/data" osrm/osrm-backend osrm-routed --algorithm mld /data/berlin-latest.osrm  

- Test: curl "http://localhost:5000/route/v1/driving/13.388860,52.517037;13.385983,52.496891?steps=true"

API Documentation

http://project-osrm.org/docs/v5.5.1/api/#general-options

In [1]:
import requests
import folium
import pandas as pd

server = 'localhost:5000'

# ROUTE API

In [2]:
# constants
SERVICE = 'route'
OPTIONALS = 'geometries=geojson&alternatives=true&steps=true&annotations=true'

In [3]:
optionals = f'?{OPTIONALS}' if OPTIONALS != '' else ''
pickup = [13.401115, 52.506327]
dropoff = [13.385983, 52.496891]

coords = [pickup, dropoff]
coords = ';'.join([f'{lon},{lat}' for lon,lat in coords])

query = f'http://{server}/{SERVICE}/v1/driving/{coords}{optionals}'
print(f'query: {query}')

response = requests.get(query)
print(f'response: {response}\n')
response = response.json()

print('----------------------------------------------------')
print('Fields of interest:\n')

main_route = response['routes'][0]
coordinates = [pickup] + main_route['geometry']['coordinates']
print(f'main_route coordinates [lon, lat]: {coordinates}\n')

distance = main_route['distance']
print(f'main_route distance: {distance}')

duration = main_route['duration']
print(f'main_route duration: {duration}')

main_route_steps = main_route['legs'][0]['steps']
main_route_intersections = [step['intersections'] for step in main_route_steps]
main_route_n_intersections = len(main_route_intersections)
print(f'main_route_n_intersections: {main_route_n_intersections}\n')

n_routes = len(response['routes'])
print(f'number of possible routes: {n_routes}\n')

query: http://localhost:5000/route/v1/driving/13.401115,52.506327;13.385983,52.496891?geometries=geojson&alternatives=true&steps=true&annotations=true
response: <Response [200]>

----------------------------------------------------
Fields of interest:

main_route coordinates [lon, lat]: [[13.401115, 52.506327], [13.401101, 52.506307], [13.398282, 52.50707], [13.394545, 52.502516], [13.394236, 52.501744], [13.39429, 52.501101], [13.395725, 52.498439], [13.395024, 52.496279], [13.394057, 52.496283], [13.39025, 52.495967], [13.385856, 52.49655]]

main_route distance: 2126.1
main_route duration: 222.9
main_route_n_intersections: 6

number of possible routes: 2



# Map Matching

In [2]:
def get_distance_haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

def __segment_traces(df, min_size_trace=5):
    # segment traces based on:
    # 1 - points can't be separated for more than 30 seconds
    # 2 - change of state
    df['trace_rank'] = np.zeros(df.shape[0])
    inv_timediff = df['diff_time_sec'] > 30
    df.loc[inv_timediff, 'trace_rank'] = 1

    changed_status = df['onlinestatus'] != df['onlinestatus'].shift(1)
    df.loc[changed_status, 'trace_rank'] = 1
    df.loc[0,'trace_rank'] = 0

    df['trace_rank'] = df['trace_rank'].cumsum()
    
    valid_traces = df['trace_rank'].value_counts()
    valid_traces = valid_traces[valid_traces > min_size_trace].index.values
    return df.loc[df['trace_rank'].isin(valid_traces)].copy().reset_index(drop=True)

def __filter_bad_points(df, speed_lim_km_h=100, accel_lim_km_h=10):

    # remove all POOLING:false status
    df = df.loc[df['locationforfleettypeupdated'] != 'POOLING:false']

    # removed duplicated timestamps
    no_duplicates_ix = df['lastupdate'].drop_duplicates().index
    df = df.loc[no_duplicates_ix].reset_index(drop=True)

    lat_lon2 = df[['coordinatelatitude', 'coordinatelongitude', 'lastupdate']].copy()

    # calculate and filter based on unreal speed and acceleration
    lat_lon2 = lat_lon2.merge(lat_lon2.shift(-1), how='left', left_index=True, right_index=True).iloc[:-1]

    lat_lon2['distance_meters'] = get_distance_haversine(lat_lon2['coordinatelatitude_x'], lat_lon2['coordinatelongitude_x'], 
                                                  lat_lon2['coordinatelatitude_y'], lat_lon2['coordinatelongitude_y'])*1000
    
    lat_lon2['diff_time_sec'] = (lat_lon2['lastupdate_y'] - lat_lon2['lastupdate_x']).dt.seconds
    
    lat_lon2['avg_speed_km_h'] = (lat_lon2['distance_meters']/lat_lon2['diff_time_sec']) * 3.6

    lat_lon2['diff_avg_speed_km_h'] = lat_lon2['avg_speed_km_h'].diff(1)

    lat_lon2['avg_acceleration_km_h'] = lat_lon2['diff_avg_speed_km_h']/lat_lon2['diff_time_sec']

    copy_cols_name = ['distance_meters', 'diff_time_sec', 'avg_speed_km_h', 'diff_avg_speed_km_h', 'avg_acceleration_km_h']
    df[copy_cols_name] = lat_lon2[copy_cols_name]

    ### filter
    ix1 = lat_lon2.loc[np.abs(lat_lon2['avg_speed_km_h']) < speed_lim_km_h].index.values 
    ix2 = lat_lon2.loc[lat_lon2['avg_acceleration_km_h'] < accel_lim_km_h].index.values
    ix_valid = list(set(ix1.tolist() + ix2.tolist()))

    return df.loc[ix_valid].copy().reset_index(drop=True) 
 

def read_process_gps(path):

    df = pd.read_csv(path).sort_values('lastupdate').reset_index(drop=True)
    df['lastupdate'] = pd.to_datetime(df['lastupdate'])
    df.drop(['currentfleettypes', 'activefleettypes', 
             'outdatedfleettypes', 's2cell'], axis=1, inplace=True) 
    
    # remove phisically wrong datapoints
    df = __filter_bad_points(df)

    # segment traces based on:
    # 1 - points can't be separated for more than 30 seconds
    # 2 - change of state
    df = __segment_traces(df) 

    return df

def trajectory_map(lat, lon, markers=None):
    coords_lat_lon = [[lat_, lon_] for lon_,lat_ in zip(lon,lat)]
    match_map = folium.Map(location=coords_lat_lon[0], zoom_start=20)

    folium.PolyLine(coords_lat_lon, color="red", weight=2.5, opacity=1).add_to(match_map)

    if(markers is not None):
        for lat_lon_text in markers:
            folium.Marker(
                location=[lat_lon_text[0], lat_lon_text[1]],
                popup=lat_lon_text[2]
            ).add_to(match_map)
    return match_map




In [3]:
df = read_process_gps('gps driver berlin.csv')

display(df.head(2))

Unnamed: 0,accuracy,coordinatelatitude,coordinatelongitude,heading,id,lastupdate,locationforfleettypeupdated,onlinestatus,lastupdate_part,distance_meters,diff_time_sec,avg_speed_km_h,diff_avg_speed_km_h,avg_acceleration_km_h,trace_rank
0,5.0,52.453137,13.298937,325.581636,1073699,2019-05-31 05:16:55,POOLING:true|TAXI:true,FREE,2019-05-31,43.315903,5.0,31.18745,-2.538245,-0.507649,1.0
1,5.0,52.453458,13.298576,325.591528,1073699,2019-05-31 05:17:00,POOLING:true|TAXI:true,FREE,2019-05-31,72.46741,10.0,26.088268,-5.099183,-0.509918,1.0


In [4]:
traces_overview = df['trace_rank'].drop_duplicates(keep='first').index
traces_overview = df.loc[traces_overview][['onlinestatus', 'trace_rank']]
traces_overview


Unnamed: 0,onlinestatus,trace_rank
0,FREE,1.0
35,OCCUPIED,2.0
46,OCCUPIED,4.0
73,FREE,5.0
139,FREE,8.0
649,FREE,11.0
1192,FREE,14.0


In [5]:
trace_rank = 8
df_filtered = df.loc[df['trace_rank'] == trace_rank].copy()

N_FILTER = 100 # limit 100 in ORSM Match
df_filtered = df_filtered.iloc[:N_FILTER]
lat = df_filtered['coordinatelatitude'].values
lon = df_filtered['coordinatelongitude'].values

trajectory_map(df_filtered['coordinatelatitude'], df_filtered['coordinatelongitude'])



In [198]:
SERVICE = 'match'

timestamp_unix = df_filtered['lastupdate'].dt.strftime('%s')
timestamp_unix = ';'.join(timestamp_unix)

# increase chance of finding correct candidate by doubling std (95% chance)
RADIUS_TOLERANCE_MULT = 1
radiuses = (df_filtered['accuracy'] * RADIUS_TOLERANCE_MULT).astype(str) 
radiuses = ';'.join(radiuses)

OPTIONALS = f'timestamps={timestamp_unix}'
OPTIONALS += f'&radiuses={radiuses}'
OPTIONALS += f'&overview=simplified'
# OPTIONALS += f'&annotations=nodes'
OPTIONALS += '&geometries=geojson'
# OPTIONALS = 'geometries=geojson'

In [199]:
coords = [[lon_, lat_] for lon_,lat_ in zip(df_filtered['coordinatelongitude'],df_filtered['coordinatelatitude'])]
coords_str = ';'.join([f'{lon},{lat}' for lon,lat in coords])

optionals = f'?{OPTIONALS}' if OPTIONALS != '' else ''
query = f'http://{server}/{SERVICE}/v1/driving/{coords_str}{optionals}'
print(query)

response = requests.get(query).json()
print(response)


http://localhost:5000/match/v1/driving/13.3708156272769,52.511569394587404;13.370850160717962,52.51148185728203;13.370532989501948,52.51102355889302;13.36909532546997,52.51077685933975;13.36821421980858,52.51063361380235;13.36722582578659,52.51046812585522;13.36624145507812,52.510303453506296;13.36526077240705,52.51013816837203;13.3642878010869,52.509974923184814;13.36323738098145,52.5098090246421;13.362311348319052,52.509786782354425;13.361370228230948,52.50981555448586;13.36044654250145,52.509844122541;13.35955068469048,52.50986942566003;13.358561284840109,52.5098967693368;13.357576914131641,52.509895544993455;13.35658885538578,52.50985493758562;13.355577997863287,52.50976862121214;13.35461139678955,52.50968128437803;13.353613279759882,52.509605782769434;13.352687917649751,52.509647818816156;13.351726345717909,52.50968373307667;13.35165962576866,52.509679243795716;13.351654261350632,52.50967883567925;13.351654261350632,52.50967883567925;13.351654261350632,52.50967883567925;13.3517045

In [145]:
matched_lon_lat = response['matchings'][0]['geometry']['coordinates']
lon = [elem[0] for elem in matched_lon_lat]
lat = [elem[1] for elem in matched_lon_lat]

lat_lon_text = [[lat_, lon_, str(text)] for lat_, lon_, text in zip(df_filtered['coordinatelatitude'],
                                                             df_filtered['coordinatelongitude'],
                                                             list(enumerate(df_filtered['lastupdate'])))]

lat_lon_text2 = [[lat_, lon_, str(text)] for lat_, lon_, text in zip(lat,
                                                             lon,
                                                             range(len(lat)))]


display(trajectory_map(df_filtered['coordinatelatitude'], df_filtered['coordinatelongitude'], lat_lon_text))
display(trajectory_map(lat, lon, lat_lon_text2))

# Some quality metrics?
