In [1]:
import sys
sys.path.append('..')

import attack

In [2]:
import geopandas as gp
import pandas as pd
from tqdm import tqdm
import numpy as np
import libpysal
import itertools
from joblib import Parallel, delayed
import random


In [3]:
# load data frames (~ 40 secs)
import load_geolife

In [4]:
# Bind the preprocessed GeoLife frames exposed by the loader module:
# full trips, trip start points (SP), trip end points (EP) and the tessellation.
raw_full_trip_gdf = load_geolife.geolife_raw_full_trip_gdf
raw_trip_sp_gdf = load_geolife.geolife_raw_sp_gdf
raw_trip_ep_gdf = load_geolife.geolife_raw_ep_gdf
tesselation_gdf = load_geolife.geolife_tesselation_gdf

# Sanity check: the three trip frames must all have the same length.
assert len(raw_full_trip_gdf) == len(raw_trip_sp_gdf) == len(raw_trip_ep_gdf)

In [5]:
# Select n random person ids from the dataset.
n_users = 4

# Seed the global RNGs so that a fresh "Restart & Run All" draws the same users.
# NOTE(review): this assumes the helper samples through `random` or `np.random`;
# if it uses its own generator the seeding is a harmless no-op — confirm in attack.py.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Sample from the untouched frames on the loader module instead of the names
# that are being reassigned below. Otherwise re-running this cell would draw
# users from an already subsampled dataset.
raw_full_trip_gdf, raw_trip_sp_gdf, raw_trip_ep_gdf = attack.select_n_random_users_from_dataframes(
    n_users,
    load_geolife.geolife_raw_full_trip_gdf,
    load_geolife.geolife_raw_sp_gdf,
    load_geolife.geolife_raw_ep_gdf,
)

## Evaluation Functions

# Merge Start Points (SP) and End Points (EP) with Tessellation

In [6]:
# Match the raw trip start points (SP) and end points (EP) against the
# tessellation; the two resulting frames are reused by the later cells.
gdf_sp, gdf_ep = attack.match_boundary_points_with_tessellation(
    raw_trip_sp_gdf,
    raw_trip_ep_gdf,
    tesselation_gdf,
)

# Extract Full Trips that Start and End within Tessellation Area

In [7]:
# Keep only the trips that start and end inside the tessellation area
# (the helper prints how many trips were kept and how many were dropped).
# Note: gdf_sp / gdf_ep are passed in and overwritten by the returned frames.
(
    full_trip_gdf,
    trip_sp_gdf,
    trip_ep_gdf,
    gdf_sp,
    gdf_ep,
) = attack.extract_trips_that_start_end_in_tessellation(
    raw_full_trip_gdf,
    raw_trip_sp_gdf,
    raw_trip_ep_gdf,
    gdf_sp,
    gdf_ep,
)


Number of trips that start and end wihin tessellation area: 136
Number of trips outside and therefore dropped: 16


## Build mapping of trip chains

In [8]:
# Build the trip-chain mapping from the tessellation-matched SP and EP frames.
mapping_cont_trips = attack.build_trip_chain_mapping(
    gdf_sp,
    gdf_ep,
)

100%|██████████| 136/136 [00:02<00:00, 62.86it/s]


In [9]:
# Report the chaining quality against the full trips
# (the helper prints the number of matched edges and of wrong matches).
attack.evaluate_trip_chaining(
    mapping_cont_trips,
    full_trip_gdf,
)

Number of edges (matched) between trips: 8
Number of wrong matches: 0


## Merge trips according to matching

In [10]:
# Merge the trips according to the chain mapping. Returns the frame of
# concatenated trips together with a dict describing the concatenation.
full_trips_concat_gdf, trip_concat_dict = attack.merge_trips_from_matching(
    gdf_sp,
    mapping_cont_trips,
    full_trip_gdf,
)

Building trip chains...


100%|██████████| 136/136 [00:00<00:00, 135750.91it/s]


Done.
Merging trips...


100%|██████████| 136/136 [00:01<00:00, 76.10it/s]

Done.
Number of trips that were matched at least once: 136/136
Concatenating MERGED and UNMERGED trips...
Done.





In [11]:
# Derive the SP/EP frames that correspond to the concatenated trips.
(
    gdf_sp_concat,
    trip_sp_gdf_concat,
    gdf_ep_concat,
    trip_ep_gdf_concat,
) = attack.extract_concatenated_trips(
    full_trips_concat_gdf,
    gdf_sp,
    trip_sp_gdf,
    gdf_ep,
    trip_ep_gdf,
)

## Build Clustering after Concatenation Step

In [12]:
# Clustering induced by the concatenation step alone; evaluated later
# against full_trip_gdf as the baseline.
clustering_concat = attack.build_clustering_after_concatenation(
    full_trips_concat_gdf,
    trip_concat_dict,
    full_trip_gdf,
)

Number of unique clusters: 128


# Build Home Locations (HL)

## From Start Points (SPs)

We use the SP-tessellation matching that still contains all SPs (and therefore all potential HLs), not just the SPs and EPs of the concatenated trips. We do this because we do not want to lose potential HLs contributed by subtrips that were concatenated into a chain.

In [13]:
# Home-location (HL) candidates derived from the start points.
# Uses the full SP-tessellation matching (gdf_sp), not the concatenated one.
gdf_hl_combined_sp = attack.build_hl_from_start_points(
    gdf_sp,
)

 There are 11 disconnected components.
 There are 9 islands with ids: 0, 1, 2, 3, 4, 7, 16, 17, 20.


## From End Points (EPs)

In [14]:
# Home-location (HL) candidates derived from the end points.
gdf_hl_combined_ep = attack.build_hl_from_end_points(
    gdf_ep,
)

 There are 5 disconnected components.
 There are 4 islands with ids: 0, 1, 2, 3.


## Merge (concatenate)

In [15]:
# Combine the SP-based and EP-based HL candidates into one set
# (the helper prints per-HL user counts and the number of unique HL tiles).
gp_combined, HL_table = attack.concatenate_hl(
    gdf_hl_combined_sp,
    gdf_hl_combined_ep,
)

Count unique PERSON_IDs per HL:  PERSON_ID
1            12
dtype: int64
Number of users for which at least on Home Location has been identified:  3
Number of unique HL tiles: 12


 There are 12 disconnected components.
 There are 9 islands with ids: 2, 3, 6, 7, 8, 9, 24, 25, 26.


# Match trips with Home Location tiles

## Match concatenated trips

In [16]:
# Match the concatenated trips to HL tiles via their start and end points.
# Besides the HL table, the helper returns the unmatched trips, the trips
# matched to different HL tiles with SP and EP, and the unmatched count.
(
    HL_table_se_concat,
    unmatched_trips,
    double_assigned_trips,
    nr_unmatched,
) = attack.match_trips_to_HL(
    gp_combined,
    HL_table,
    trip_sp_gdf_concat,
    trip_ep_gdf_concat,
    full_trips_concat_gdf,
)

Number of unmatched trajectories (concatenated) that do neither start nor end in a HL tile: 49/128
Number of trajectories (concatenated) that start AND end in a HL tile: 20/128
Number of trips that match different HL tiles with their SP and EP: 11


## Assign double matched trips to one unique HL

Iterate over all double-matched trips and compare each of them to the trips that are uniquely assigned to the potential HLs it has been matched with. Then assign the trip to the HL that yields the single maximum LCSS score between the trip in question and any trip uniquely assigned to that HL tile.

In [17]:
# Resolve trips whose SP and EP matched different HL tiles so that each
# trip ends up with a single HL (runs in parallel via joblib, see log below).
HL_table_trips_concat = attack.assign_double_matched_trips_to_unique_hl(
    HL_table_se_concat,
    full_trips_concat_gdf,
    unmatched_trips,
    double_assigned_trips,
    nr_unmatched,
)


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done  12 out of  22 | elapsed:    8.7s remaining:    7.2s
[Parallel(n_jobs=-2)]: Done  15 out of  22 | elapsed:    8.8s remaining:    4.1s
[Parallel(n_jobs=-2)]: Done  18 out of  22 | elapsed:    8.9s remaining:    1.9s
[Parallel(n_jobs=-2)]: Done  22 out of  22 | elapsed:    9.2s finished


## Get trajectories that happened during the same time

In [18]:
# Collect, for the concatenated trips, the trajectories that happened
# during the same time (result is consumed by the clustering steps below).
full_trips_concat_gdf_overlap_dict = attack.getTripOverlaps(
    full_trips_concat_gdf,
)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Batch computation too fast (0.0890s.) Setting batch_size=2.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-2)]: Done  22 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-2)]: Done  40 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-2)]: Done  58 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-2)]: Done  80 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-2)]: Done 102 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-2)]: Done 128 out of 128 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done 128 out of 128 | elapsed:    2.3s finished


## Create clustering after HL assignment step

In [48]:
# Build the clustering that results from the HL assignment, using the
# temporal-overlap dict computed above.
clustering_after_HL, HL_table_dict = attack.build_clustering_after_HL_assignment(
    HL_table_trips_concat,
    full_trip_gdf,
    trip_concat_dict,
    full_trips_concat_gdf_overlap_dict,
)

100%|██████████| 8/8 [00:00<00:00, 159.98it/s]


In [49]:
# Baseline: clustering obtained right after the concatenation step.
print("Clustering results after concatenation step:")
print(f"Number of unique clusters: {len(set(clustering_concat))}")
attack.evaluate(clustering_concat, full_trip_gdf)

# Flatten the dict-based HL clustering into a list of cluster ids ordered by
# the dict's sorted items, which is the form attack.evaluate is called with.
clustering_HL = [
    cluster_id for _key, cluster_id in sorted(clustering_after_HL.items())
]

# Same report for the clustering obtained after the HL matching step.
print("\nClustering results after HL matching step:")
print(f"Number of unique clusters: {len(set(clustering_HL))}")
attack.evaluate(clustering_HL, full_trip_gdf)

Clustering results after concatenation step:
Number of unique clusters: 128
Homogeneity: 1.000
Completeness: 0.150
V-measure: 0.261
Rand index: 0.484
ARI: 0.002
MI: 0.724
NMI: 0.261
AMI: 0.019

Clustering results after HL matching step:
Number of unique clusters: 55
Homogeneity: 0.981
Completeness: 0.244
V-measure: 0.391
Rand index: 0.611
ARI: 0.241
MI: 0.710
NMI: 0.391
AMI: 0.272


## Assign Trips Without Match

In [45]:
# Similarity threshold handed to the helper.
# NOTE(review): presumably the minimum similarity required to join an existing
# cluster rather than opening a new one — confirm in attack.assign_trips_without_match.
sim_thresh_for_no_match = 0.25

# Assign the trips that were not matched to any HL, based on the HL clustering.
clustering_after_double_assign_HL = attack.assign_trips_without_match(
    clustering_after_HL,
    HL_table_dict,
    full_trips_concat_gdf,
    full_trips_concat_gdf_overlap_dict,
    full_trip_gdf,
    trip_concat_dict,
    SIM_THRESH_FOR_NO_MATCH=sim_thresh_for_no_match,
)


Comparing trips that were not assigned to any HL_ID with trips that were assigned to a HL_ID...


  8%|▊         | 4/49 [00:09<01:32,  2.05s/it]

existing match and assign cluster id 2.0 to trip 27889


 10%|█         | 5/49 [00:10<01:15,  1.72s/it]

existing match and assign cluster id 2.0 to trip 27917


 12%|█▏        | 6/49 [00:11<01:04,  1.51s/it]

existing match and assign cluster id 2.0 to trip 27829


 14%|█▍        | 7/49 [00:12<00:58,  1.40s/it]

no match and assign new cluster id 56 to trips 27840 27814


 22%|██▏       | 11/49 [00:18<00:56,  1.48s/it]

no match and assign new cluster id 57 to trips 28137 27960


 29%|██▊       | 14/49 [00:22<00:47,  1.36s/it]

existing match and assign cluster id 2.0 to trip 27883


 31%|███       | 15/49 [00:24<00:53,  1.59s/it]

existing match and assign cluster id 57 to trip 27798


 33%|███▎      | 16/49 [00:26<00:51,  1.58s/it]

no match and assign new cluster id 58 to trips 27891 27966


 35%|███▍      | 17/49 [00:28<00:54,  1.69s/it]

no match and assign new cluster id 59 to trips 27688 28228


 39%|███▉      | 19/49 [00:30<00:46,  1.55s/it]

existing match and assign cluster id 2.0 to trip 27781


 41%|████      | 20/49 [00:31<00:38,  1.33s/it]

existing match and assign cluster id 2.0 to trip 27849


 47%|████▋     | 23/49 [00:37<00:43,  1.69s/it]

existing match and assign cluster id 2.0 to trip 27971


 49%|████▉     | 24/49 [00:38<00:42,  1.68s/it]

existing match and assign cluster id 2.0 to trip 27704


 55%|█████▌    | 27/49 [00:43<00:32,  1.49s/it]

no match and assign new cluster id 60 to trips 27874 27803


 57%|█████▋    | 28/49 [00:45<00:31,  1.50s/it]

existing match and assign cluster id 2.0 to trip 27921


 59%|█████▉    | 29/49 [00:46<00:32,  1.60s/it]

no match and assign new cluster id 61 to trips 27942 27950
Done.
Assigning clustering IDs to all trips that are part of a new cluster...
Done.





In [52]:
# Flatten each dict-based clustering exactly once into a list of cluster ids
# ordered by sorted key (the original rebuilt every list twice per report).
# NOTE(review): keys are presumably trip ids — confirm against attack.py.
labels_after_HL = [
    clustering_after_HL[key] for key in sorted(clustering_after_HL)
]
labels_after_double_assign_HL = [
    clustering_after_double_assign_HL[key]
    for key in sorted(clustering_after_double_assign_HL)
]

# (step name, cluster ids) in pipeline order; clustering_concat is used as is.
clustering_by_step = [
    ("concatenation", clustering_concat),
    ("HL matching", labels_after_HL),
    ("double assign HL", labels_after_double_assign_HL),
]

# One report per step, evaluated against full_trip_gdf.
for step_idx, (step_name, cluster_labels) in enumerate(clustering_by_step):
    # Blank line between consecutive reports, none before the first one.
    separator = "\n" if step_idx else ""
    print(f"{separator}Clustering results after {step_name} step:")
    print(f"Number of unique clusters: {len(set(cluster_labels))}")
    attack.evaluate(cluster_labels, full_trip_gdf)

Clustering results after concatenation step:
Number of unique clusters: 128
Homogeneity: 1.000
Completeness: 0.150
V-measure: 0.261
Rand index: 0.484
ARI: 0.002
MI: 0.724
NMI: 0.261
AMI: 0.019

Clustering results after HL matching step:
Number of unique clusters: 55
Homogeneity: 0.981
Completeness: 0.244
V-measure: 0.391
Rand index: 0.611
ARI: 0.241
MI: 0.710
NMI: 0.391
AMI: 0.272

Clustering results after double assign HL step:
Number of unique clusters: 39
Homogeneity: 0.981
Completeness: 0.282
V-measure: 0.438
Rand index: 0.653
ARI: 0.322
MI: 0.710
NMI: 0.438
AMI: 0.353
