In [62]:
import pandas as pd
import pytz
from shapely.wkt import loads
from datetime import datetime
from geopy.distance import geodesic

def str_to_time(time_str: str):
    """Convert a ``YYYY-MM-DDTHH:MM:SSZ`` string into integer Unix seconds.

    The string is parsed as a naive datetime and then tagged as UTC, so the
    trailing ``Z`` is matched literally rather than interpreted as an offset.

    Args:
        time_str: timestamp text in the exact format ``%Y-%m-%dT%H:%M:%SZ``.

    Returns:
        The corresponding Unix timestamp, truncated to an ``int``.

    Raises:
        ValueError: if ``time_str`` does not match the expected format.
    """
    parsed = datetime.strptime(time_str, "%Y-%m-%dT%H:%M:%SZ")
    utc_aware = parsed.replace(tzinfo=pytz.UTC)
    return int(utc_aware.timestamp())

def time_to_str(timestamp):
    """Format a Unix timestamp as a UTC ``YYYY-MM-DDTHH:MM:SSZ`` string.

    Args:
        timestamp: seconds since the epoch; anything accepted by ``int()``
            (fractional seconds are truncated).

    Returns:
        The UTC time formatted with ``%Y-%m-%dT%H:%M:%SZ``.
    """
    # Function-scope import: the file only imports ``datetime`` itself.
    from datetime import timezone

    # ``datetime.utcfromtimestamp`` is deprecated since Python 3.12; building
    # an aware UTC datetime yields exactly the same formatted string.
    dt_object = datetime.fromtimestamp(int(timestamp), tz=timezone.utc)
    return dt_object.strftime('%Y-%m-%dT%H:%M:%SZ')

def cal_dist(c1, c2):
    """Return the geodesic distance in metres between two coordinate pairs.

    Each argument is indexable with at least two elements. Index 1 is passed
    to ``geodesic`` first and index 0 second, i.e. the inputs are swapped
    before the call (presumably ``[lon, lat]`` in, ``(lat, lon)`` out --
    confirm against the data source).

    Args:
        c1: first coordinate pair.
        c2: second coordinate pair.

    Returns:
        Distance between the two points, in metres.
    """
    first = (c1[1], c1[0])
    second = (c2[1], c2[0])
    return geodesic(first, second).meters

# generate usr
# One row per distinct entity id found in the trajectory table, sorted ascending.
df_traj = pd.read_csv('data/traj.csv')
df_usr = pd.DataFrame(
    {'usr_id': df_traj['entity_id'].drop_duplicates().sort_values()}
)
df_usr.to_csv('jump.usr', index=False)

# generate geo
def wkt_to_coordinates(wkt):
    """Render the x/y of a WKT geometry as the text ``"[x, y]"``.

    The parsed geometry must expose ``.x`` and ``.y`` (i.e. presumably a
    ``POINT``).

    Args:
        wkt: WKT string to parse.

    Returns:
        A string of the form ``"[<x>, <y>]"``.
    """
    point = loads(wkt)
    return "[{}, {}]".format(point.x, point.y)

def wkt_to_list(wkt):
    """Parse a WKT geometry and return its coordinates as ``[x, y]``.

    The parsed geometry must expose ``.x`` and ``.y`` (i.e. presumably a
    ``POINT``).

    Args:
        wkt: WKT string to parse.

    Returns:
        A two-element list ``[x, y]``.
    """
    point = loads(wkt)
    x, y = point.x, point.y
    return [x, y]

# Load the node table and report how many nodes it contains.
df_node_split = pd.read_csv('data/node_split.csv')
total_node = len(df_node_split['id'])
print(f'total node: {total_node}')

# df_geo = pd.DataFrame({
#     'geo_id': df_node_split['id'],  # Inheriting the 'geo_id' from 'id'
#     'type': 'Point',  # Setting 'type' as 'Point' for all rows
#     'coordinates': df_node_split['WKT'].apply(wkt_to_coordinates)  # Extracting coordinates from 'WKT'
# })
# df_geo.to_csv('jump.geo', index=False)
# print(list(df_node_split['id']))
# print(list(df_geo['coordinates']))

total node: 49730


In [76]:
# generate dyna

df_road = pd.read_csv('data/road_split.csv')

# Per-trajectory lookups taken from the raw trajectory table.
traj_to_entity = dict(zip(df_traj['traj_id'], df_traj['entity_id']))
traj_to_time_list = {traj_id: [] for traj_id in traj_to_entity}  # {traj_id: [time1, time2, ...]}

# Per-road and per-node lookups.
road_to_cost = dict(zip(df_road['id'], df_road['cost']))
road_to_source = dict(zip(df_road['id'], df_road['source']))
node_to_wktstr = dict(zip(df_node_split['id'], df_node_split['WKT']))
node_to_coord = {node_id: wkt_to_list(wkt) for node_id, wkt in node_to_wktstr.items()}

# Fill each trajectory's list with its timestamps (Unix seconds), in the row
# order in which they appear in df_traj.
for traj_id, group in df_traj.groupby('traj_id'):
    traj_to_time_list[traj_id].extend(str_to_time(time_str) for time_str in group['time'])

# Load the map-matching result and drop rows whose matched geometry is empty.
df_stmatch = (
    pd.read_csv('data/mr_stmatch.csv', sep=';')
    .loc[lambda frame: frame['mgeom'] != 'LINESTRING()']
)


    

In [77]:
# Geo table: starts with every road-network node; one extra entry is appended
# for each per-sample point of every trajectory while walking the loop below.
list_coord = list(df_node_split['WKT'].apply(wkt_to_coordinates))
total_node = len(df_node_split['id'])
print('total node:', total_node)

output_loc, output_time, output_entity = [], [], []
def append_output(node_id, time_str, entity_id):
    """Append one dyna record (location id, time string, entity id)."""
    output_loc.append(node_id)
    output_time.append(time_str)
    output_entity.append(entity_id)

for _, row in df_stmatch.iterrows():
    traj_id = row.id
    entity_id = traj_to_entity[traj_id]
    # collect matching information
    # pgeom: one point per sample (presumably the map-matched position);
    # opath: edge id per sample; tpath: edge sequence between consecutive samples.
    pgeom_list = [[point[0], point[1]] for point in loads(row.pgeom).coords]
    opath_list = [int(x) for x in row.opath.split(',')]
    tpath_list = [[int(y) for y in x.split(',')] for x in row.tpath.split('|')]
    # calculate duration (seconds between consecutive samples)
    time_list = traj_to_time_list[traj_id]
    duration_list = [time_list[i+1] - time_list[i] for i in range(len(time_list) - 1)]
    # calculate cost
    # offset = distance in metres from each sample point to the source node of its edge.
    offset_list = [cal_dist(pgeom_list[i], node_to_coord[road_to_source[opath_list[i]]]) for i in range(len(pgeom_list))]
    # Cost travelled between sample i and i+1: every edge of the segment except
    # the last one, minus the part of the first edge already behind sample i,
    # plus the part of the last edge in front of its source node.
    # NOTE(review): assumes road cost uses the same unit as the offsets (metres) - confirm.
    cost_list = list()
    for i, tpath in enumerate(tpath_list):
        cost = sum(road_to_cost[tpath[j]] for j in range(len(tpath) - 1))
        cost_list.append(cost + offset_list[i+1] - offset_list[i])

    for i, tpath in enumerate(tpath_list):
        # Sample i gets a fresh geo id and its OWN timestamp. (The previous
        # code used time_list[-1] here, stamping every sample with the
        # trajectory's end time.)
        append_output(len(list_coord), time_to_str(time_list[i]), entity_id)
        list_coord.append(str(pgeom_list[i]))
        # Cost from sample i to the end of its current edge.
        cost = road_to_cost[tpath[0]] - offset_list[i]
        for j in range(len(tpath) - 1):
            node_id = road_to_source[tpath[j+1]]
            # Interpolate the node's time by the fraction of the segment cost
            # covered so far. The zero-cost guard applies to the fraction only;
            # without the explicit split the conditional expression swallowed
            # the whole sum and produced timestamp 0 (1970-01-01).
            progress = cost / cost_list[i] if cost_list[i] else 0
            timestamp = time_list[i] + duration_list[i] * progress
            append_output(node_id, time_to_str(timestamp), entity_id)
            cost += road_to_cost[tpath[j+1]]
    # Final sample of the trajectory.
    append_output(len(list_coord), time_to_str(time_list[-1]), entity_id)
    list_coord.append(str(pgeom_list[-1]))

total node: 49730


In [79]:

# Geo table: geo_id is simply the position of each entry in list_coord.
# NOTE(review): dyna rows for network nodes store the raw node id as location,
# which only lines up with geo_id if node ids equal their row position in
# node_split.csv - confirm.
n_geo = len(list_coord)
df_geo = pd.DataFrame({
    'geo_id': list(range(n_geo)),
    'type': 'Point',  # every geo entry is a point
    'coordinates': list_coord,  # "[x, y]" strings: network nodes first, then per-sample points
})
df_geo.to_csv('jump.geo', index=False)

# Dyna table: one record per visited location; the row index is written out
# as the dyna_id column.
df_dyna = pd.DataFrame({
    'location': output_loc,
    'time': output_time,
    'entity_id': output_entity,
    'type': 'trajectory',
})
df_dyna.to_csv('jump.dyna', index=True, index_label='dyna_id')