In [105]:
import pandas as pd

id,
node_start, node_start_lat, node_start_lon,  # Local node start
node_finish, node_finish_lat, node_finish_lon,  # Local node finish
global_node_start, global_node_start_lat, global_node_start_lon,  # Full ride node start
global_node_finish, global_node_finish_lat, global_node_finish_lon,  # Full ride node finish
distance, avg_speed, temp, weather

In [107]:
orders_df = pd.read_csv('../../../data/initial/orders.csv')
nodes_df = pd.read_csv('../../../data/initial/nodes.csv')

In [108]:
orders_df.head()

Unnamed: 0,Id,running_time,completed_time,route_distance_km,delta_time
0,-4773019581999572651,2022-01-24 18:30:21,2022-01-24 18:44:43,3.74,862.0
1,-7575630690398473489,2022-01-24 06:53:53,2022-01-24 07:06:26,3.526,753.0
2,-6264582368520213833,2022-01-24 10:00:59,2022-01-24 10:15:58,5.071,899.0
3,5964315354301636538,2022-01-24 14:28:05,2022-01-24 14:35:08,2.867,423.0
4,1372379574816145639,2022-01-24 11:57:29,2022-01-24 12:06:29,3.751,540.0


Complete nodes set

In [110]:
nodes_df.head()

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,-2627062893189810184,10980432,2133368107,17.414917,32.0
1,-2627062893189810184,10980433,5212387954,17.186539,26.0
2,-2627062893189810184,10980445,5221700954,28.513481,26.0
3,-2627062893189810184,10980498,10980445,154.26612200000002,25.0
4,-2627062893189810184,10980647,1986137911,8.542823999999998,29.0


In [111]:
all_nodes = set(nodes_df['node_start'].to_list() + nodes_df['node_finish'].to_list())
len(all_nodes)

20368

Get all nodes lat and lon

http://overpass-turbo.eu/

```
/*
This has been generated by the overpass-turbo wizard.
The original search was:
“highway=* and type:way”
*/
[out:json][timeout:250];
// gather results
area[name="Odesa"];
(
  // query part for: “highway=*”
  node["highway"]({{bbox}});
);
// print results
out body;
>;
out skel qt;
```

In [112]:
node_ids_string = ','.join([str(node) for node in all_nodes])
query = f'[out:json];' \
        f'node(id:{node_ids_string});' \
        f'out body;'

In [113]:
import json

In [114]:
with open('../../../data/nodes_uklon.geojson', 'r', encoding='utf-8') as f:
    data = json.load(f)
    nodes_json = data['features']

In [115]:
all_odesa_nodes = {int(node['properties']['@id'].replace('node/', '')): (node['geometry']['coordinates'][1], node['geometry']['coordinates'][0]) for node in nodes_json}

In [116]:
not_in_nodes = []

for node in all_nodes:
    if node not in all_odesa_nodes:
        not_in_nodes.append(node)

len(not_in_nodes), 'Nodes not exist in OSM'

(716, 'Nodes not exist in OSM')

In [117]:
import pickle

with open('../../../data/processed/all_odesa_nodes.pickle', 'wb') as f:
    pickle.dump(all_odesa_nodes, f)

In [118]:
min_date = min(orders_df['running_time'])
max_date = max(orders_df['completed_time'])

min_date, max_date

('2022-01-24 00:30:04', '2022-01-25 00:07:09')

In [119]:
from datetime import datetime

from meteostat import Point, Hourly

In [120]:
meteo_station = 33837  # Odesa meteo-station

In [121]:
weather_data = Hourly(meteo_station, datetime(2022, 1, 24, 0, 0, 0), datetime(2022, 1, 25, 0, 10, 0))

In [122]:
weather_df = weather_data.fetch()

In [123]:
weather_df.to_pickle('../../../data/processed/weather_df.pickle')

Processing

1. Add lat and lon to nodes

In [126]:
nodes_df['node_start_lat'] = nodes_df['node_start'].apply(lambda x: all_odesa_nodes[x][0] if x in all_odesa_nodes else None)
nodes_df['node_start_lon'] = nodes_df['node_start'].apply(lambda x: all_odesa_nodes[x][1] if x in all_odesa_nodes else None)

nodes_df['node_finish_lat'] = nodes_df['node_finish'].apply(lambda x: all_odesa_nodes[x][0] if x in all_odesa_nodes else None)
nodes_df['node_finish_lon'] = nodes_df['node_finish'].apply(lambda x: all_odesa_nodes[x][1] if x in all_odesa_nodes else None)

2. Add weather data

In [127]:
dates_for_orders = {}
for _, row in orders_df.iterrows():
    dates_for_orders[row['Id']] = row['running_time']

In [128]:
def get_date(x):
    try:
        return dates_for_orders[x]
    except Exception as e:
        return None

In [129]:
ids = orders_df['Id'].to_list()
nodes_df['running_time'] = nodes_df['Id'].apply(get_date)

In [132]:
len(nodes_df[nodes_df['running_time'].isna()]), len(nodes_df)

(78980, 480291)

In [138]:
temps_for_dates = {}
weather_for_dates = {}

for temp, row in weather_df.iterrows():
    temps_for_dates[temp] = row['temp']
    weather_for_dates[temp] = int(row['coco'])

In [152]:
def get_temp(x):
    if x is None:
        return None
    else:
        date = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        return temps_for_dates[datetime(date.year, date.month, date.day, date.hour)]

def get_weather(x):
    if x is None:
        return None
    else:
        date = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        return weather_for_dates[datetime(date.year, date.month, date.day, date.hour)]

In [151]:
nodes_df['temp'] = nodes_df['running_time'].apply(get_temp)

In [155]:
nodes_df['weather'] = nodes_df['running_time'].apply(get_weather)

In [156]:
nodes_df

Unnamed: 0,Id,node_start,node_finish,distance,speed,node_start_lat,node_start_lon,node_finish_lat,node_finish_lon,running_time,temp,weather
0,-2627062893189810184,10980432,2133368107,17.414916999999999092096913955174,32.000000000000000000000000000000,46.472665200000001561875251354650,30.739248100000001073794919648208,46.472681500000000198724592337385,30.739022099999999682040652260184,,,
1,-2627062893189810184,10980433,5212387954,17.186538999999999788315108162351,26.000000000000000000000000000000,46.472507399999997801387507934123,30.741319699999998249495547497645,46.472524399999997513077687472105,30.741096500000001157104634330608,,,
2,-2627062893189810184,10980445,5221700954,28.513480999999998743987816851586,26.000000000000000000000000000000,46.472085700000000940690370043740,30.747110299999999227793523459695,46.472114300000001207990862894803,30.746740200000001408398020430468,,,
3,-2627062893189810184,10980498,10980445,154.266122000000024172550183720887,25.000000000000000000000000000000,46.471949100000003340937837492675,30.749114200000001062562660081312,46.472085700000000940690370043740,30.747110299999999227793523459695,,,
4,-2627062893189810184,10980647,1986137911,8.542823999999999529109118157066,29.000000000000000000000000000000,46.473095499999999447027221322060,30.753936800000001738908395054750,46.473020200000000556883605895564,30.753920999999998286966729210690,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
480286,-8229597404562288405,8952439761,317189358,4.847929999999999850501808396075,23.000000000000000000000000000000,39.909422100000000455111148767173,-86.185904800000002978777047246695,46.475932600000000149975676322356,30.717883300000000446061676484533,2022-01-24 11:48:51,-4.000000000000000000000000000000,21.000000000000000000000000000000
480287,-8229597404562288405,8952439762,8952439761,131.325685000000021318555809557438,26.000000000000000000000000000000,39.909422300000002792330633383244,-86.185943399999999314786691684276,39.909422100000000455111148767173,-86.185904800000002978777047246695,2022-01-24 11:48:51,-4.000000000000000000000000000000,21.000000000000000000000000000000
480288,-8229597404562288405,8952439763,8952439762,44.026544000000001233274815604091,38.000000000000000000000000000000,39.909571399999997254326444817707,-86.185446200000001226726453751326,39.909422300000002792330633383244,-86.185943399999999314786691684276,2022-01-24 11:48:51,-4.000000000000000000000000000000,21.000000000000000000000000000000
480289,-8229597404562288405,8952439764,8952439763,0.111226000000000005418776538590,34.000000000000000000000000000000,39.909720700000001158969098469242,-86.185448600000000851650838740170,39.909571399999997254326444817707,-86.185446200000001226726453751326,2022-01-24 11:48:51,-4.000000000000000000000000000000,21.000000000000000000000000000000


In [158]:
nodes_df.to_pickle('../../../data/processed/nodes_df_loc_weather.pickle')

3. Add nodes for full ride

In [44]:
from collections import defaultdict

In [72]:
rides = defaultdict(lambda: [[], []])

for _, row in nodes_df.iterrows():
    rides[row['Id']][0].append(row['node_start'])
    rides[row['Id']][1].append(row['node_finish'])

In [73]:
rides

defaultdict(<function __main__.<lambda>()>,
            {-2.62706289318981e+18: [[10980432.0,
               10980433.0,
               10980445.0,
               10980498.0,
               10980647.0,
               278078475.0,
               278078576.0,
               289703354.0,
               290404203.0,
               290404211.0,
               290404226.0,
               290404232.0,
               290404240.0,
               290404304.0,
               298820898.0,
               298820919.0,
               308922743.0,
               317199395.0,
               317200411.0,
               317201208.0,
               317201226.0,
               317201228.0,
               317201230.0,
               317473325.0,
               317474638.0,
               317477004.0,
               409551378.0,
               668842805.0,
               704314815.0,
               704314817.0,
               965088641.0,
               1570777032.0,
               1570777036.0,
            

In [74]:
rides_last_nodes = {}
rides_first_nodes = {}

for ride in rides:
    rides_first_nodes[ride] = [node for node in rides[ride][0] if node not in rides[ride][1]]
    rides_last_nodes[ride] = [node for node in rides[ride][1] if node not in rides[ride][0]]

In [75]:
rides_last_nodes

{-2.62706289318981e+18: [2321040846.0, 3719876031.0, 1746751159.0],
 -6.374252502568485e+18: [1749519518.0, 290404203.0],
 -8.799295196620867e+18: [8952393497.0, 651364615.0],
 1.4690470198783923e+18: [3910640102.0, 312712596.0],
 -5.990164307390025e+18: [6958445110.0],
 5.173549360062654e+17: [6029531872.0],
 -7.659553965239667e+17: [290932348.0, 2141798084.0, 6952959845.0],
 3.4265411731232404e+18: [321323560.0, 1988203304.0],
 -3.4741927539153505e+18: [8952393764.0, 5222301453.0],
 3.538643101914166e+18: [4791429771.0, 4441087546.0],
 8.6678189643256e+18: [2176760751.0, 4775690605.0],
 -1.7832369278161608e+18: [5216935099.0, 8952386634.0],
 -8.71877548072136e+18: [4412353452.0, 6727038908.0],
 -5.725674174539908e+17: [1930113206.0],
 -3.4691427803215887e+18: [8952416255.0, 4910626595.0, 4910626604.0],
 2.0834421017639066e+18: [1751042423.0,
  3129148774.0,
  10980433.0,
  290404226.0,
  8952394068.0],
 -5.873178351253077e+18: [5242505115.0, 317473325.0],
 8.397710352572035e+18: [697

In [80]:
one_ride = nodes_df[nodes_df['Id'] == -2627062893189810184]

In [82]:
one_ride_start_c = one_ride['node_start'].to_list()
one_ride_finish_c = one_ride['node_finish'].to_list()

In [83]:
[x for x in one_ride_start_c if x not in one_ride_finish_c]

[10980647, 290404304, 6029495798]

In [84]:
[x for x in one_ride_finish_c if x not in one_ride_start_c]

[2321040846, 3719876031, 1746751159]

In [86]:
paths = {}

for node_pairs in one_ride[['node_start', 'node_finish']].to_numpy():
    paths[node_pairs[0]] = node_pairs[1]

In [87]:
paths

{10980432: 2133368107,
 10980433: 5212387954,
 10980445: 5221700954,
 10980498: 10980445,
 10980647: 1986137911,
 278078475: 5215254840,
 278078576: 5165806198,
 289703354: 4634811974,
 290404203: 4634811976,
 290404211: 4634777183,
 290404226: 4483039346,
 290404232: 290404240,
 290404240: 290404226,
 290404304: 10980498,
 298820898: 8952405950,
 298820919: 5957893285,
 308922743: 2133368041,
 317199395: 308922743,
 317200411: 4977695816,
 317201208: 965088641,
 317201226: 5052281014,
 317201228: 4775768433,
 317201230: 4775768426,
 317473325: 5172747804,
 317474638: 4371565010,
 317477004: 4371565074,
 409551378: 4801174730,
 668842805: 409551378,
 704314815: 4873902420,
 704314817: 2471550404,
 965088641: 1751397870,
 1570777032: 8952426169,
 1570777036: 5165806199,
 1751397870: 5215254913,
 1751397871: 317201208,
 1973779676: 4775763987,
 1986137911: 3719876055,
 2133368041: 4775697625,
 2133368072: 10980432,
 2133368107: 6029586988,
 2321040854: 2418535105,
 2321040856: 2321040854