In [194]:
import geopandas as gp
import pandas as pd
from tqdm import tqdm
import numpy as np

In [154]:
# load data frames
import load_geolife

In [155]:
# Load preprocessed data 
raw_full_trip_gdf, raw_trip_sp_gdf, raw_trip_ep_gdf, tesselation_gdf = load_geolife.geolife_raw_full_trip_gdf, load_geolife.geolife_raw_sp_gdf, load_geolife.geolife_raw_ep_gdf, load_geolife.geolife_tesselation_gdf
assert len(raw_full_trip_gdf) == len(raw_trip_sp_gdf) == len(raw_trip_ep_gdf)

# Merge Start Points (SP) and End Points (EP) with Tessellation

In [6]:
def match_boundary_points_with_tessellation(raw_trip_sp_gdf, raw_trip_ep_gdf, tesselation_gdf):
    """This function matches the boundary points of the raw trips with the tesselation. 

    Args:
        raw_trip_sp_gdf (_type_): _description_
        raw_trip_ep_gdf (_type_): _description_
        tesselation_gdf (_type_): _description_

    Returns:
        _type_: This function returns two data frames, one for the start points and one for the end points. These data frames contain the tile_id of the tesselation that the point is located in.
    """
    # SP
    # Spatial join points to polygons
    gdf_sp = gp.sjoin(
        tesselation_gdf[["tile_id", "geometry"]],
        raw_trip_sp_gdf,
        how="inner"
    ).drop('index_right', axis=1)

    # Spatial join points to polygons
    gdf_ep = gp.sjoin(
        tesselation_gdf[["tile_id", "geometry"]],
        raw_trip_ep_gdf,
        how="inner"
    ).drop('index_right', axis=1)

    return gdf_sp, gdf_ep

In [236]:
gdf_sp, gdf_ep = match_boundary_points_with_tessellation(raw_trip_sp_gdf, raw_trip_ep_gdf, tesselation_gdf)

# Extract Full Trips that Start and End within Tessellation Area

In [238]:
gdf_sp_ids = gdf_sp.TRIP_ID
gdf_ep_ids = gdf_ep.TRIP_ID

full_trip_gdf = raw_full_trip_gdf.query("TRIP_ID in @gdf_sp_ids and TRIP_ID in @gdf_ep_ids")
trip_sp_gdf = raw_trip_sp_gdf.query("TRIP_ID in @gdf_ep_ids")
trip_ep_gdf = raw_trip_ep_gdf.query("TRIP_ID in @gdf_sp_ids")

gdf_sp = gdf_sp.query("TRIP_ID in @gdf_ep_ids")
gdf_ep = gdf_ep.query("TRIP_ID in @gdf_sp_ids")

assert len(full_trip_gdf) == len(trip_sp_gdf) == len(trip_ep_gdf) == len(gdf_sp) == len(gdf_ep) == len(set(trip_sp_gdf.TRIP_ID).intersection(set(trip_ep_gdf.TRIP_ID))) # this last intersection checks that for all unique trip ids we have exactly ONE SP and EP

print(f"Number of trips that start and end wihin tessellation area: {len(full_trip_gdf)}")
print(f"Number of trips outside and therefore dropped: {len(raw_full_trip_gdf) - len(full_trip_gdf)}")

Number of trips that start and end wihin tessellation area: 13664
Number of trips outside and therefore dropped: 3069


## Build mapping of trip chains

In [239]:
def build_trip_chain_mapping(gdf_sp, gdf_ep, INFLOW_HR_DIFF_THRESHOLD=4, HR_DIFF_THRESHOLD=8):
    """This function returns a list of trip chains that are continued trips that happened subsequent to and from same tile within a given time threshold.

    Args:
        gdf_sp (_type_): _description_
        gdf_ep (_type_): _description_
        inflow_hr_diff_threshold (int, optional): _description_. Defaults to 4.
        hr_diff_threshold (int, optional): _description_. Defaults to 8.

    Returns:
        _type_: _description_
    """

    # Calculate mapping of continued trips that happened subsequent to and from same tile
    mapping_cont_trips = []
    for index, trip in tqdm(gdf_ep.sort_values('TRIP_ID').iterrows(), total=len(gdf_ep)):
        te_1_id = trip.TRIP_ID
        te_1_tid = trip.tile_id
        te_1_dt = pd.to_datetime(trip['TRIP_END'], format='%Y-%m-%d %H:%M:%S')
        ts_1_dt = pd.to_datetime(trip['TRIP_START'], format='%Y-%m-%d %H:%M:%S')

        inflow = gdf_ep.query("tile_id == @te_1_tid")
        inflow['TRIP_END'] = inflow.TRIP_END.astype('datetime64[ns]')
        inflow['TRIP_START'] = inflow.TRIP_START.astype('datetime64[ns]')
        inflow['INFLOW_HR_DIFF'] = inflow.TRIP_END.apply(lambda x: (x - te_1_dt).total_seconds()/3600)
        inflow = inflow.query("(INFLOW_HR_DIFF <= @INFLOW_HR_DIFF_THRESHOLD) and (INFLOW_HR_DIFF >= @INFLOW_HR_DIFF_THRESHOLD)") # Take trips 
        inflow = inflow.query("(TRIP_START > @te_1_dt) or (@ts_1_dt > TRIP_END)") # Ignore trips that have happened simultaneously

        # if more than one trip has arrived in +- hour window, then do not merge this trip
        if len(inflow) > 1:
            continue

        
        # Get all trips that started from same tile as t_1 has ended in
        ts_2 = gdf_sp.query("tile_id == @te_1_tid")

        # get difference between two trips hours (seconds divided by 3600 gets hours)
        ts_2['TRIP_START'] = ts_2.TRIP_START.astype('datetime64[ns]')
        ts_2['TRIP_END'] = ts_2.TRIP_END.astype('datetime64[ns]')
    
        ts_2['hr_diff'] = ts_2['TRIP_START'].apply(lambda x: (x - te_1_dt).total_seconds()/3600)

        # Only consider trips that started within a certain time after the initial trip ended in the same tessellation tile
        ts_2 = ts_2[(ts_2['hr_diff'].astype(str).astype(float) <= HR_DIFF_THRESHOLD) & (ts_2['hr_diff'].astype(str).astype(float) >= 0)]

        # Only consider trips that are not simultaneously
        ts_2 = ts_2.query("(TRIP_START > @te_1_dt) or (@ts_1_dt > TRIP_END)")

        # Only consider connection if exactly one trip started from same tile in time window
        if len(ts_2) == 1:
            mapping_cont_trips.append({
                'TRIP_ID': te_1_id,
                'TRIP_ID_CONT': ts_2.TRIP_ID.iloc[0]
            })

    return mapping_cont_trips
    


In [45]:
mapping_cont_trips = build_trip_chain_mapping(gdf_sp, gdf_ep)

In [48]:
def evaluate_trip_chaining(mapping_cont_trips, full_trip_gdf):
    """This function evaluates the trip chaining by checking if the chained trips are from the same person.

    Args:
        mapping_cont_trips (_type_): Dictionary of trip ids that are chained. Output of build_trip_chain_mapping()
        full_trip_gdf (_type_): The full trip gdf that contains all trips.
    Returns:
        _type_: None
    """
    mistakes = []
    for conn in mapping_cont_trips:
        trip_ids = [conn['TRIP_ID'],  conn['TRIP_ID_CONT']]
        unique_person = full_trip_gdf.query("TRIP_ID in @trip_ids").PERSON_ID.nunique()

        if unique_person > 1:
            mistakes.append(full_trip_gdf.query("TRIP_ID in @trip_ids"))


    print(f"Number of edges (matched) between trips: {len(mapping_cont_trips)}")
    print(f"Number of wrong matches: {len(mistakes)}")


In [248]:
evaluate_trip_chaining(mapping_cont_trips, full_trip_gdf)

Number of edges (matched) between trips: 5670
Number of wrong matches: 229


## Merge trips according to matching

In [241]:
def getTripChain(trip_id, mapping_cont_trips, chain=[]):
    """ Recursive function that returns a list for all chained trips for a give orig trip_id


    Args:
        trip_id (_type_): _description_
        chain (list, optional): _description_. Defaults to [].
        mapping_cont_trips (_type_): Mapping of continued trips. Output of build_trip_chain_mapping().

    Returns:
        _type_: _description_
    """
    if type(trip_id) == str:
        trip_id = int(trip_id)

    # add orig trip_id to output list
    if len(chain) == 0:
        chain.append(trip_id)

    # recursively find all chained trips originating from the orig trip_id
    for edge in mapping_cont_trips:
        if edge['TRIP_ID'] == trip_id:
            chain.append(edge['TRIP_ID_CONT'])
            getTripChain(edge['TRIP_ID_CONT'], mapping_cont_trips, chain)
            
        
    return chain

In [242]:
def merge_trips_from_matching(gdf_sp, mapping_cont_trips, full_trip_gdf):
    ### Merge trips according to matching

    # Get trip chain for each trip (Start Point)
    print("Building trip chains...")
    trip_chains = [getTripChain(trip, mapping_cont_trips, chain=[]) for trip in tqdm(gdf_sp.TRIP_ID)]
    print("Done.")

    # Sort for longest chain first
    trip_chains.sort(key = len, reverse = True)

    # Create dictionary to store mappings for evaluation
    trip_concat_dict = {}

    covered_trips = []
    merged_trips_gdf = []
    print("Merging trips...")
    for chain in tqdm(trip_chains, total=len(trip_chains)):
        # Check if any of the trips in the current chain has already been merged as part of another chain
        # Since we start with the longest chain and iterate through descending sorted list, we only retain the complete chains
        if set(chain).intersection(set(covered_trips)):
            continue

        # add trip chain to dict for evaluation later
        trip_concat_dict[chain[0]] = chain[1:]
        
        # add all trip ids part of current chain to list so that every trip is only contained in longest chain of it
        covered_trips += chain

        trips = full_trip_gdf.query("TRIP_ID in @chain").sort_values("TRIP_START")
        trips["temp"] = 1

        trips = trips.groupby('temp').agg(list).reset_index(drop=True).rename(columns={'TRIP_ID': 'TRIP_ID_CHAIN'})

        trips["wkt_trip"] = trips['geometry'].apply(lambda x: ", ".join([str(i) for i in x]).replace("), LINESTRING (", ", "))
        trips['TRIP_START'] = trips['TRIP_START'].apply(lambda x: min(x))
        trips['TRIP_END'] = trips['TRIP_END'].apply(lambda x: max(x))
        trips['TRIP_LEN_IN_MTRS'] = trips['TRIP_LEN_IN_MTRS'].apply(lambda x: sum(x))
        #trips['TRIP_DURATION_IN_SECS'] = trips['TRIP_DURATION_IN_SECS'].apply(lambda x: sum(x))
        trips['TRIP_WD'] = trips['TRIP_WD'].apply(lambda x: x[0]) # see below
        trips['TRIP_DATE'] = trips['TRIP_DATE'].apply(lambda x: x[0]) # see below
        trips['TRIP_ID'] = trips['TRIP_ID_CHAIN'].apply(lambda x: x[0]) # assign trip_id of first trip in chain to concatenated trip
        # This is the TRIP_ID of the last trip in the chain to be concatenated
        trips['TRIP_ID_LAST'] = trips['TRIP_ID_CHAIN'].apply(lambda x: x[-1]) 

        # Note: Here we are assigning the PERSON_ID of the first trip to the concatenated trip. This of course can be erroneous if the concatenation itself is wrong
        trips['PERSON_ID'] = trips['PERSON_ID'].apply(lambda x: x[0])
        trips = trips.drop(['geometry', 'TRIP_ID_CHAIN'], axis=1)

        trips = gp.GeoDataFrame(trips, geometry=gp.GeoSeries.from_wkt(trips['wkt_trip'])).drop('wkt_trip', axis=1)

        merged_trips_gdf.append(trips)
    print("Done.")

    trip_merged_gdf = pd.concat(merged_trips_gdf)

    print(f"Number of trips that were matched at least once: {len(set(covered_trips))}/{len(set(gdf_sp.TRIP_ID))}")

    # Concatenate all trips that were unmerged with the merged trips into a new gdf
    print("Concatenating merged and unmerged trips...")
    unmerged_trips = full_trip_gdf.query("TRIP_ID not in @covered_trips")
    full_trips_concat_gdf = pd.concat([unmerged_trips, trip_merged_gdf])
    full_trips_concat_gdf['TRIP_ID_FIRST'] = full_trips_concat_gdf['TRIP_ID'] # This is the same as TRIP_ID
    print("Done.")

    # Assign TRIP_ID as TRIP_ID_LAST in case TRIP has not been merged and first and last TRIP_Id are in fact the same
    full_trips_concat_gdf['TRIP_ID_LAST'] = np.where(full_trips_concat_gdf.TRIP_ID_LAST.isnull(), full_trips_concat_gdf.TRIP_ID, full_trips_concat_gdf.TRIP_ID_LAST)


    return full_trips_concat_gdf.reset_index(drop=True), trip_concat_dict

In [210]:
full_trips_concat_gdf, trip_concat_dict = merge_trips_from_matching(gdf_sp, mapping_cont_trips, full_trip_gdf)

Building trip chains...


100%|██████████| 13664/13664 [00:12<00:00, 1103.34it/s]


Done.
Merging trips...


100%|██████████| 13664/13664 [03:10<00:00, 71.87it/s] 


Done.
Number of trips that were matched at least once: 13366/13664
Concatenating merged and unmerged trips...
Done.


In [243]:
# Filter for those trip_ids that are still the start of a trip even after the concatenation (of trip chains)
t_id_sp = full_trips_concat_gdf.TRIP_ID_FIRST
t_id_ep = full_trips_concat_gdf.TRIP_ID_LAST

# Also filter dfs that contain points
gdf_sp_concat = gdf_sp.query("TRIP_ID in @t_id_sp")
trip_sp_gdf_concat = trip_sp_gdf.query("TRIP_ID in @t_id_sp")

gdf_ep_concat = gdf_ep.query("TRIP_ID in @t_id_ep")
trip_ep_gdf_concat = trip_ep_gdf.query("TRIP_ID in @t_id_ep")

assert len(trip_sp_gdf_concat) == len(trip_ep_gdf_concat) == len(gdf_sp_concat) == len(gdf_ep_concat)

In [219]:
def getIndexInList(trip_id, full_trip_gdf):
    """This function takes in a trip_id and returns the list index of this trip's position in the ground truth clustering.

    Args:
        trip_id (int): TRIP_ID

    Returns:
        int: The index of this TRIP_ID in the ground truth clustering vector.
    """
    index_list = full_trip_gdf.sort_values('TRIP_ID').TRIP_ID.to_list()

    return index_list.index(trip_id)


def build_clustering_after_concatenation(full_trips_concat_gdf, trip_concat_dict):
    """This function builds the clustering vector after the concatenation step.

    Args:
        full_trips_concat_gdf (GeoDataFrame): GeoDataFrame containing all trips after the concatenation step.
        trip_concat_dict (dict): Dictionary containing the trip chains that were concatenated.

    Returns:
        int: The index of this TRIP_ID in the ground truth clustering vector.
    """

    # This creates the array with clustering IDs after the concatenation step
    clustering_concat = {}
    for index, trip in full_trips_concat_gdf.reset_index().sort_values('TRIP_ID').iterrows():
        trip_order_index = getIndexInList(trip.TRIP_ID, full_trip_gdf)

        clustering_concat[trip_order_index] = index

        if trip.TRIP_ID in trip_concat_dict:
            for t in trip_concat_dict[trip.TRIP_ID]:
                clustering_concat[getIndexInList(t, full_trip_gdf)] = index

    clustering_concat = list(dict(sorted(clustering_concat.items())).values())

    print(f"Number of unique clusters: {len(set(clustering_concat))}")

    return clustering_concat


In [246]:
gdf_sp.query("TRIP_ID == 74")

Unnamed: 0,tile_id,geometry,TRIP_ID,TRIP_START,TRIP_END,TRIP_LENGTH_IN_MTRS,PERSON_ID,TRIP_WD,TRIP_DATE


In [245]:
full_trip_gdf.query("TRIP_ID == 74")

Unnamed: 0,TRIP_ID,TRIP_START,TRIP_END,geometry,TRIP_LENGTH_IN_MTRS,PERSON_ID,TRIP_WD,TRIP_DATE,TRIP_LEN_IN_MTRS


In [230]:
full_trip_gdf.sort_values('TRIP_ID').TRIP_ID.to_list()[74]

76

In [247]:
raw_full_trip_gdf.query("TRIP_ID == 74")

Unnamed: 0,TRIP_ID,TRIP_START,TRIP_END,geometry,TRIP_LENGTH_IN_MTRS,PERSON_ID,TRIP_WD,TRIP_DATE,TRIP_LEN_IN_MTRS
74,74,2009-04-14 12:40:21,2009-04-14 13:35:11,"LINESTRING (116.47565 39.90689, 116.47621 39.9...",31649.988277,0,Tuesday,2009-04-14,31649.988277


In [225]:
full_trips_concat_gdf

Unnamed: 0,TRIP_ID,TRIP_START,TRIP_END,geometry,TRIP_LENGTH_IN_MTRS,PERSON_ID,TRIP_WD,TRIP_DATE,TRIP_LEN_IN_MTRS,TRIP_ID_LAST,TRIP_ID_FIRST
0,74,2009-04-14 12:40:21,2009-04-14 13:35:11,"LINESTRING (116.47565 39.90689, 116.47621 39.9...",31649.988277,0,Tuesday,2009-04-14,31649.988277,74.0,74
1,75,2009-04-14 13:52:36,2009-04-15 10:06:55,"LINESTRING (116.80137 39.95207, 116.80129 39.9...",53090.957331,0,Tuesday,2009-04-14,53090.957331,75.0,75
2,78,2009-04-16 04:12:32,2009-04-16 04:32:01,"LINESTRING (116.32543 39.99605, 116.32513 39.9...",2454.918599,0,Thursday,2009-04-16,2454.918599,78.0,78
3,85,2009-04-17 04:21:21,2009-04-17 05:14:32,"LINESTRING (116.32517 39.99625, 116.32528 39.9...",5067.63454,0,Friday,2009-04-17,5067.634540,85.0,85
4,87,2009-04-19 00:53:31,2009-04-19 04:06:41,"LINESTRING (116.31490 40.00954, 116.31521 40.0...",60592.681897,0,Sunday,2009-04-19,60592.681897,87.0,87
...,...,...,...,...,...,...,...,...,...,...,...
11356,3629,2009-07-22 11:52:04,2009-07-22 12:05:29,"LINESTRING (116.42170 40.04310, 116.42172 40.0...",[1519.308674817583],144,Wednesday,2009-07-22,1519.308675,3629.0,3629
11357,7421,2008-11-07 08:53:45,2008-11-08 03:55:12,"LINESTRING (116.37989 39.85883, 116.37989 39.8...",[17377.251492553263],19,Friday,2008-11-07,17377.251493,7421.0,7421
11358,5889,2012-02-19 14:28:41,2012-02-19 15:44:44,"LINESTRING (116.45877 39.98437, 116.45868 39.9...",[3471.532567338531],163,Sunday,2012-02-19,3471.532567,5889.0,5889
11359,4508,2010-02-10 09:05:35,2010-02-11 06:15:11,"LINESTRING (116.44691 40.01943, 116.44520 40.0...",[52146.03841004418],153,Wednesday,2010-02-10,52146.038410,4508.0,4508


In [226]:
build_clustering_after_concatenation(full_trips_concat_gdf.reset_index(drop=True), trip_concat_dict)

ValueError: 74 is not in list