# Trajectory preprocessing 

In [128]:
import warnings
warnings.filterwarnings('ignore')

In [129]:
# import the libraries
import os

import duckdb
import pandas as pd
import numpy as np

import skmob
import geopandas as gpd
import folium

from skmob.utils.plot import plot_gdf
from skmob.preprocessing import filtering
from skmob.preprocessing import compression
from skmob.preprocessing import detection

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.figure_factory as ff

from skmob.measures.individual import jump_lengths

from IPython.display import display

In [130]:
# Importing a custom module in a different file
import sys
sys.path.append('C:\Camilo\Estudio\Padova\Master thesis\master-thesis-code')
import utils

Assigning the path for reading the filtered and split file

In [131]:
path_locations_split_filtered = r'..\..\Datasets\Processed\wifi_traces_filtered_split'

Assign the path for writing the output (preprocessed) trajectories and writing plots, if necessary.

In [132]:
path_interactive_plots = r'.\3-exploration-interactive-plots'
path_preprocessed_trajectories = r'..\..\Datasets\Processed\trajectories_preprocessed'

Setting up the DuckDB connection.

In [133]:
# Creating a duckDB connection for consistency in the configurations I need
duckdb_conn = duckdb.connect()

# Hiding the progress bar for queries
duckdb_conn.execute("PRAGMA disable_progress_bar;")

<duckdb.duckdb.DuckDBPyConnection at 0x1e4e0f36770>

In [134]:
# Setting the connection timezone explicitly to avoid any confusions
duckdb_conn.execute("SET TimeZone='Europe/Madrid'")

<duckdb.duckdb.DuckDBPyConnection at 0x1e4e0f36770>

## Sónar by Night process

### Reading the trajectories

In [135]:
locations_night_filtered = duckdb_conn.read_csv(os.path.join(path_locations_split_filtered,r'wifi_traces_night_filtered.csv'))

Printing the statistical summary as reference.

In [136]:
description_columns_night = ['observations_user_night', 'timespan_minutes_night', 'num_distinct_h3_cells_night', 'num_distinct_stage_night',
                             'observations_per_cell', 'observations_per_minute', 'minutes_per_cell', 'minutes_per_stage']

info_per_user_night = duckdb_conn.sql(f"""SELECT DISTINCT anonymized_macaddr, {', '.join(description_columns_night)} FROM locations_night_filtered""")

stats_summary_night = info_per_user_night.to_df().describe(percentiles=[.1, .25, .5, .75, .9]).round(2)
stats_summary_night

Unnamed: 0,observations_user_night,timespan_minutes_night,num_distinct_h3_cells_night,num_distinct_stage_night,observations_per_cell,observations_per_minute,minutes_per_cell,minutes_per_stage
count,3846.0,3846.0,3846.0,3846.0,3846.0,3846.0,3846.0,3846.0
mean,414.52,373.03,53.46,4.78,6.37,1.26,21.1,104.75
std,492.98,153.97,46.45,2.08,3.03,1.4,30.41,94.74
min,9.0,71.17,2.0,1.0,1.22,0.01,0.98,12.23
10%,28.0,153.32,7.0,2.0,3.5,0.07,2.68,33.76
25%,63.25,248.99,14.0,3.0,4.41,0.17,3.75,48.96
50%,226.0,381.44,40.0,5.0,5.58,0.74,7.8,71.07
75%,596.0,494.84,82.0,7.0,7.65,1.94,25.97,123.44
90%,1091.0,569.05,122.5,8.0,10.09,3.19,58.59,214.77
max,4990.0,681.88,289.0,9.0,30.54,13.51,269.87,681.47


In [137]:
locations_night_filtered.columns

['anonymized_macaddr',
 'macaddr_randomized',
 'timestamp_ap',
 'h3_cell',
 'client_latitude',
 'client_longitude',
 'location_name',
 'slug',
 'rssi',
 'confidence_factor',
 'vendor_name',
 'sonar_type',
 'label_night',
 'stage',
 'observations_user_night',
 'timespan_minutes_night',
 'num_distinct_h3_cells_night',
 'num_distinct_stage_night',
 'observations_per_cell',
 'observations_per_minute',
 'minutes_per_cell',
 'minutes_per_stage']

The reduced, split and filtered connections-events tables files should represent raw trajectories (before the specific preprocessing for trajectories is performed). Each of them contain the uid (`anonymized_macaddr`), the latitude and longitude (`client_latitude`, `client_longitude`), and the datetime (`timestamp`). I will consider each night (`label_night`) as a different trajectory from the same user.

In [138]:
# I first read the data as a pandas dataframe (to manage the timestamp clearly)
tdf_night = duckdb_conn.sql("""
                             SELECT anonymized_macaddr, macaddr_randomized,
                                label_night,
                                timestamp_ap,
                                client_latitude,
                                client_longitude,
                                rssi,
                                confidence_factor,
                                vendor_name,
                                stage, 
                                h3_cell,
                                observations_user_night,
                                timespan_minutes_night,
                                num_distinct_stage_night,
                                minutes_per_stage
                             FROM locations_night_filtered
                             """).to_df()
tdf_night['datetime'] = pd.to_datetime(tdf_night['timestamp_ap'], unit='s')
tdf_night['datetime'] = tdf_night['datetime'].dt.tz_localize('UTC').dt.tz_convert('Europe/Madrid')  

# Converting the standard dataframe into a TrajectoryDataFrame
# I will consider each night as a different trajectory from the same user
tdf_night = skmob.TrajDataFrame(
    tdf_night, 
    longitude='client_longitude', latitude='client_latitude',
    datetime='datetime', 
    user_id='anonymized_macaddr',
    trajectory_id='label_night').sort_values(by=['uid','tid','datetime'])

tdf_night.head()

Unnamed: 0,uid,macaddr_randomized,tid,timestamp_ap,lat,lng,rssi,confidence_factor,vendor_name,stage,h3_cell,observations_user_night,timespan_minutes_night,num_distinct_stage_night,minutes_per_stage,datetime
493157,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718483927,41.353501,2.129162,-68.0,,,NA-Entrada,8d394461e82b5bf,586,340.9,7,48.7,2024-06-15 22:38:47+02:00
527834,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718483959,41.35344,2.129005,,,,NA-Entrada,8d394461e82a67f,586,340.9,7,48.7,2024-06-15 22:39:19+02:00
476599,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484037,41.353501,2.129162,-68.0,,,NA-Entrada,8d394461e82b5bf,586,340.9,7,48.7,2024-06-15 22:40:37+02:00
30285,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484555,41.353426,2.128963,-78.0,56.0,,NA-Entrada,8d394461e82a67f,586,340.9,7,48.7,2024-06-15 22:49:15+02:00
564470,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484561,41.353365,2.12897,-71.0,64.0,,NA-Entrada,8d394461e82a6ff,586,340.9,7,48.7,2024-06-15 22:49:21+02:00


In [139]:
print(f'Period tdf_night for night 1: {tdf_night.loc[tdf_night["tid"]==1].datetime.min()} - {tdf_night.loc[tdf_night["tid"]==1].datetime.max()}')
print(f'Period tdf_night for night 2: {tdf_night.loc[tdf_night["tid"]==2].datetime.min()} - {tdf_night.loc[tdf_night["tid"]==2].datetime.max()}')

Period tdf_night for night 1: 2024-06-14 19:50:00+02:00 - 2024-06-15 07:59:44+02:00
Period tdf_night for night 2: 2024-06-15 19:50:03+02:00 - 2024-06-16 08:00:00+02:00


### Visualizing the raw trajectories

Visualizing the raw trajectories

In [140]:
utils.plot_trajectories_in_context(tdf=tdf_night, background='night', max_users=10)

Raw trajectories with only non-randomized MAC adresses:

In [141]:
tdf_non_random_night = tdf_night.loc[tdf_night['macaddr_randomized']==0]
utils.plot_trajectories_in_context(tdf=tdf_non_random_night, background='night', max_users=10)

Raw trajectories with only randomized MAC adresses:

In [142]:
tdf_random_night = tdf_night.loc[tdf_night['macaddr_randomized']==1]
utils.plot_trajectories_in_context(tdf=tdf_random_night, background='night', max_users=10)

In [143]:
# TODO: Código temporal: borrar

In [144]:
temp_tdf = tdf_night.loc[tdf_night['uid']=='00172b78731070270dce8dedf92bbdf28e6de7fda23a8036aae23ca09276aef4']
temp_tdf

Unnamed: 0,uid,macaddr_randomized,tid,timestamp_ap,lat,lng,rssi,confidence_factor,vendor_name,stage,h3_cell,observations_user_night,timespan_minutes_night,num_distinct_stage_night,minutes_per_stage,datetime
226198,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,1,1718401292,41.352950,2.129395,-61.0,,Xiaomi Communications Co Ltd,NA-Entrada,8d394461e808c7f,185,443.53,8,55.44,2024-06-14 23:41:32+02:00
250165,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,1,1718401314,41.352950,2.129395,-60.0,40.0,Xiaomi Communications Co Ltd,NA-Entrada,8d394461e808c7f,185,443.53,8,55.44,2024-06-14 23:41:54+02:00
251288,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,1,1718401424,41.352950,2.129395,-60.0,40.0,Xiaomi Communications Co Ltd,NA-Entrada,8d394461e808c7f,185,443.53,8,55.44,2024-06-14 23:43:44+02:00
245663,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,1,1718401950,41.353934,2.131697,-78.0,40.0,Xiaomi Communications Co Ltd,NA-Restauración,8d394461e84473f,185,443.53,8,55.44,2024-06-14 23:52:30+02:00
114735,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,1,1718402006,41.353695,2.131785,-63.0,128.0,Xiaomi Communications Co Ltd,NA-Restauración,8d394461e840cbf,185,443.53,8,55.44,2024-06-14 23:53:26+02:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14158,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,2,1718514581,41.354615,2.130570,-64.0,64.0,Xiaomi Communications Co Ltd,SonarLab x Printworks,8d394461e95b23f,84,341.88,6,56.98,2024-06-16 07:09:41+02:00
3918,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,2,1718514584,41.354154,2.130885,-65.0,80.0,Xiaomi Communications Co Ltd,SonarLab x Printworks,8d394461e87567f,84,341.88,6,56.98,2024-06-16 07:09:44+02:00
139296,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,2,1718514644,41.354148,2.130714,-64.0,72.0,Xiaomi Communications Co Ltd,SonarLab x Printworks,8d394461e87547f,84,341.88,6,56.98,2024-06-16 07:10:44+02:00
124976,00172b78731070270dce8dedf92bbdf28e6de7fda23a80...,0,2,1718514646,41.354148,2.130714,-64.0,72.0,Xiaomi Communications Co Ltd,SonarLab x Printworks,8d394461e87547f,84,341.88,6,56.98,2024-06-16 07:10:46+02:00


In [145]:
utils.plot_trajectories_in_context(tdf=temp_tdf, background='night', max_users=10)

In [146]:
# TODO: Fin código temporal

### Checking some metrics before the preprocessing

The time between of observations and the distance between observations give a better understanding of the data, which is useful for the definition of the preprocessing steps that are related with the trajectory shapes (not the user-based filters performed in the previous script).

#### Checking the time between observations

I need to check the time difference between observations of the same user and the same day/night. I can do it on the original table since I have not performed any cleaning steps yet.

In [147]:
tdf_night_sample_rate = duckdb_conn.sql("""
                                            SELECT 
                                                *,
                                                (timestamp_ap - LAG(timestamp_ap) OVER (
                                                    PARTITION BY uid, macaddr_randomized, tid
                                                    ORDER BY timestamp_ap
                                                )) / 60.0 AS time_diff_minutes
                                            FROM 
                                                tdf_night
                                            ORDER BY 
                                                uid, macaddr_randomized, tid, timestamp_ap
                                            """)
tdf_night_sample_rate

┌──────────────────────────────────────────────────────────────────┬────────────────────┬───────┬──────────────┬────────────────────┬────────────────────┬────────┬───────────────────┬─────────────────────────────────────┬────────────┬─────────────────┬─────────────────────────┬────────────────────────┬──────────────────────────┬───────────────────┬──────────────────────────┬─────────────────────┐
│                               uid                                │ macaddr_randomized │  tid  │ timestamp_ap │        lat         │        lng         │  rssi  │ confidence_factor │             vendor_name             │   stage    │     h3_cell     │ observations_user_night │ timespan_minutes_night │ num_distinct_stage_night │ minutes_per_stage │         datetime         │  time_diff_minutes  │
│                             varchar                              │       int64        │ int64 │    int64     │       double       │       double       │ double │      double       │               va

In [148]:
tdf_night_sample_rate_df = tdf_night_sample_rate.to_df()

In [149]:
"""# Creating a histogram
fig = px.histogram(
    tdf_night_sample_rate_df,
    x="time_diff_minutes",
    color="macaddr_randomized",
    nbins=5000,
    title="Time difference between observations (Sónar by Night)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Time difference between observations (minutes)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'night_sample_rate_hist_raw.html'))"""

'# Creating a histogram\nfig = px.histogram(\n    tdf_night_sample_rate_df,\n    x="time_diff_minutes",\n    color="macaddr_randomized",\n    nbins=5000,\n    title="Time difference between observations (Sónar by Night)",\n    category_orders={"macaddr_randomized": [0, 1]})\n\n# Updating the layout\nfig.update_layout(\n    xaxis_title="Time difference between observations (minutes)",\n    yaxis_title="Observations",\n    title_font_size=16,\n    xaxis_title_font_size=14,\n    yaxis_title_font_size=14)\n\n# Showing the plot\nfig.write_html(os.path.join(path_interactive_plots,\'night_sample_rate_hist_raw.html\'))'

In [150]:
print('Distribution of the time difference between observations (in minutes):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_night_sample_rate
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_night_sample_rate
                """)


Distribution of the time difference between observations (in minutes):


┌────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬──────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬────────────────────────┐
│ macaddr_randomized │ min_time_diff_minutes │ p05_time_diff_minutes │ p10_time_diff_minutes │ p25_time_diff_minutes │ median_time_diff_minutes │ p75_time_diff_minutes │ p90_time_diff_minutes │ p95_time_diff_minutes │ max_time_diff_minutes │ mean_time_diff_minutes │
│      varchar       │        double         │        double         │        double         │        double         │          double          │        double         │        double         │        double         │        double         │         double         │
├────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────┼───────────────────────┼───────────────────────┼──────

In [151]:
del tdf_night_sample_rate

Most of the observations are less than 2 minutes apart, so there is a small time-granularity (except from exceptional cases). This means that itis safe to asume a sequential behavior on the trajectories and there are few very sparse cases.

#### Checking the raw jump lengths

The distance between observations of the raw data is also insightful.

In [152]:
jump_lengths_night = tdf_night.groupby(by='macaddr_randomized').apply(jump_lengths, merge=True).reset_index()
jump_lengths_night.rename(columns={0:'jump_lengths_km'}, inplace=True)
jump_lengths_night = jump_lengths_night.explode('jump_lengths_km').reset_index(drop=True)
jump_lengths_night['jump_lengths_km'] = jump_lengths_night['jump_lengths_km'].astype(float)

jump_lengths_night['jump_lengths_m'] = round(1000 * jump_lengths_night['jump_lengths_km'], 2)
del jump_lengths_night['jump_lengths_km']

100%|██████████| 453/453 [00:01<00:00, 442.89it/s]
100%|██████████| 2342/2342 [00:07<00:00, 320.48it/s]


In [153]:
print('Distribution of the jump lengths (in meters):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_night
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_night
                """)


Distribution of the jump lengths (in meters):


┌────────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬─────────────────────┬──────────────────┬──────────────────┬───────────────────┬──────────────────┐
│ macaddr_randomized │ min_jump_lengths │ p05_jump_lengths │ p10_jump_lengths │ p25_jump_lengths │ median_jump_lengths │ p75_jump_lengths │ p90_jump_lengths │ p95_jump_lengths  │ max_jump_lengths │
│      varchar       │      double      │      double      │      double      │      double      │       double        │      double      │      double      │      double       │      double      │
├────────────────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────────┼──────────────────┼──────────────────┼───────────────────┼──────────────────┤
│ 0                  │              0.0 │              0.0 │              0.0 │              0.0 │                1.01 │            13.05 │             36.1 │ 64.09149999999994 │            362.7 │
│ 1       

In [154]:
'''# Creating a histogram
fig = px.histogram(
    jump_lengths_night,
    x="jump_lengths_m",
    color="macaddr_randomized",
    nbins=5000,
    title="Jump lenght distribution (Sónar by Night)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Jump lenght (meters)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'night_jump_lengths_hist_raw.html'))'''

'# Creating a histogram\nfig = px.histogram(\n    jump_lengths_night,\n    x="jump_lengths_m",\n    color="macaddr_randomized",\n    nbins=5000,\n    title="Jump lenght distribution (Sónar by Night)",\n    category_orders={"macaddr_randomized": [0, 1]})\n\n# Updating the layout\nfig.update_layout(\n    xaxis_title="Jump lenght (meters)",\n    yaxis_title="Observations",\n    title_font_size=16,\n    xaxis_title_font_size=14,\n    yaxis_title_font_size=14)\n\n# Showing the plot\nfig.write_html(os.path.join(path_interactive_plots,\'night_jump_lengths_hist_raw.html\'))'

Much of the data are "repeated points", and half of the points are less than 2.62 meters apart (less than the radius of a circle with the same area as the H# cells).

In [155]:
del jump_lengths_night

### Noise Filtering

I am using the `filter` function, from scikit-mobility. This method filters out a point if the speed from the previous point is higher than the parameter `max_speed_kmh`, which is by default set to 500km/h. In this case I work with 10 km/h, which is quite conservative since it this speed is above the [average running speed of men](https://www.healthline.com/health/how-fast-can-a-human-run).

In [156]:
max_speed_filter = 10

In [157]:
# Defining a function to filter the trajectory dataframe and show the difference for the first examples
def filter_and_report(tdf, max_speed_kmh, draw_plot=True, background_sonar='night', max_users_plot=10):
    f_tdf = filtering.filter(tdf, max_speed_kmh=max_speed_kmh)

    print(f'Number of points in the original tdf: {len(tdf)}')
    print(f'Number of points in the filtered tdf: {len(f_tdf)}')
    print(f'Number of filtered points: {len(tdf) - len(f_tdf)}')
    print(f'Parameters:\n{f_tdf.parameters}')

    if draw_plot:
        print(f'\nTrajectories pre-filter (black shadow lines) and post-filter (colored solid lines) \nfor the first {max_users_plot} users.')
        
        pre_filter_map = utils.plot_trajectories_in_context(tdf=tdf,
                                                            background=background_sonar, max_users=max_users_plot,
                                                            weight=8, hex_color='black', opacity=0.5)
        post_filter_map = utils.plot_trajectories_in_context(tdf=f_tdf,
                                                             background=pre_filter_map,
                                                             max_users=max_users_plot)

        display(post_filter_map)
    
    return f_tdf

In [158]:
f_tdf_night = filter_and_report(tdf=tdf_night, max_speed_kmh=max_speed_filter)

Number of points in the original tdf: 1594243
Number of points in the filtered tdf: 1244622
Number of filtered points: 349621
Parameters:
{'filter': {'function': 'filter', 'max_speed_kmh': 10, 'include_loops': False, 'speed_kmh': 5.0, 'max_loop': 6, 'ratio_max': 0.25}}

Trajectories pre-filter (black shadow lines) and post-filter (colored solid lines) 
for the first 10 users.


#### Examining a single example

I will use a sample user to quickly examine what is happening pre-process and post-process for a single example.

In [None]:
def examine_single_example(tdf_pre, tdf_pos, example_idx=0, given_uid=None, background_sonar='night', show_diff_table=False):

    # Selecting the user's trajectory
    if given_uid is None:
        example_uid = tdf_pre['uid'].unique()[example_idx]
    else:
        example_uid = given_uid

    tdf_pre_example = tdf_pre[tdf_pre['uid']==example_uid]
    tdf_pos_example = tdf_pos[tdf_pos['uid']==example_uid]
    
    print(f'Period of the selected example (pre-process): {tdf_pre_example.datetime.min()} - {tdf_pre_example.datetime.max()}')
    print(f'Period of the selected example (post-process): {tdf_pos_example.datetime.min()} - {tdf_pos_example.datetime.max()}')

    # Drawing the map for comparing
    print('\nMap of the single example')
    map_example_pre = utils.plot_trajectories_in_context(tdf=tdf_pre_example,
                                                 background=background_sonar,
                                                 weight=8, hex_color='black', opacity=0.5)
    
    map_example_pos = utils.plot_trajectories_in_context(tdf=tdf_pos_example,
                                               background=map_example_pre,
                                               hex_color='red')
    
    display(map_example_pos)

    # Showing which examples were filtered
    if show_diff_table:
        merged = tdf_pre_example.merge(tdf_pos_example, indicator=True, how='outer')
        diff_df = merged[merged['_merge'] == 'left_only']
        
        print(f'\nThere are {len(diff_df)} filtered points for the single example:')
        display(diff_df)

In [160]:
examine_single_example(tdf_pre=tdf_night, tdf_pos=f_tdf_night, show_diff_table=True)

Period of the selected example (pre-process): 2024-06-15 22:38:47+02:00 - 2024-06-16 04:19:41+02:00
Period of the selected example (post-process): 2024-06-15 22:38:47+02:00 - 2024-06-16 04:19:41+02:00

Map of the single example



There are 119 filtered points for the single example:


Unnamed: 0,uid,macaddr_randomized,tid,timestamp_ap,lat,lng,rssi,confidence_factor,vendor_name,stage,h3_cell,observations_user_night,timespan_minutes_night,num_distinct_stage_night,minutes_per_stage,datetime,_merge
8,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484588,41.353044,2.129161,-82.0,48.0,,NA-Entrada,8d394461e80eb7f,586,340.9,7,48.7,2024-06-15 22:49:48+02:00,left_only
9,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484588,41.353044,2.129161,,48.0,,NA-Entrada,8d394461e80eb7f,586,340.9,7,48.7,2024-06-15 22:49:48+02:00,left_only
16,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484664,41.353426,2.128963,-78.0,56.0,,NA-Entrada,8d394461e82a67f,586,340.9,7,48.7,2024-06-15 22:51:04+02:00,left_only
17,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484664,41.353426,2.128963,,56.0,,NA-Entrada,8d394461e82a67f,586,340.9,7,48.7,2024-06-15 22:51:04+02:00,left_only
22,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484697,41.353044,2.129161,-82.0,48.0,,NA-Entrada,8d394461e80eb7f,586,340.9,7,48.7,2024-06-15 22:51:37+02:00,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718499626,41.354252,2.132867,,64.0,,SonarPub,8d394461e86a17f,586,340.9,7,48.7,2024-06-16 03:00:26+02:00,left_only
532,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718499637,41.354259,2.132685,-74.0,104.0,,SonarPub,8d394461e86ac7f,586,340.9,7,48.7,2024-06-16 03:00:37+02:00,left_only
539,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718499685,41.354395,2.132435,-87.0,96.0,,SonarPub,8d394461e86e7bf,586,340.9,7,48.7,2024-06-16 03:01:25+02:00,left_only
542,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718499693,41.354546,2.132480,-72.0,136.0,,SonarPub,8d394461e86e03f,586,340.9,7,48.7,2024-06-16 03:01:33+02:00,left_only


#### Metrics after filtering

##### Time between observations (filtered)

In [161]:
tdf_night_sample_rate_filtered_noise = duckdb_conn.sql("""
                                            SELECT 
                                                *,
                                                (timestamp_ap - LAG(timestamp_ap) OVER (
                                                    PARTITION BY uid, macaddr_randomized, tid
                                                    ORDER BY timestamp_ap
                                                )) / 60.0 AS time_diff_minutes
                                            FROM 
                                                f_tdf_night
                                            ORDER BY 
                                                uid, macaddr_randomized, tid, timestamp_ap
                                            """)
tdf_night_sample_rate_filtered_noise

┌──────────────────────────────────────────────────────────────────┬────────────────────┬───────┬──────────────┬────────────────────┬────────────────────┬────────┬───────────────────┬─────────────┬─────────────────┬─────────────────┬─────────────────────────┬────────────────────────┬──────────────────────────┬───────────────────┬──────────────────────────┬─────────────────────┐
│                               uid                                │ macaddr_randomized │  tid  │ timestamp_ap │        lat         │        lng         │  rssi  │ confidence_factor │ vendor_name │      stage      │     h3_cell     │ observations_user_night │ timespan_minutes_night │ num_distinct_stage_night │ minutes_per_stage │         datetime         │  time_diff_minutes  │
│                             varchar                              │       int64        │ int64 │    int64     │       double       │       double       │ double │      double       │   varchar   │     varchar     │     varchar     │     

In [162]:
tdf_night_sample_rate_filtered_noise_df = tdf_night_sample_rate_filtered_noise.to_df()

In [163]:
"""# Creating a histogram
fig = px.histogram(
    tdf_night_sample_rate_filtered_noise_df,
    x="time_diff_minutes",
    color="macaddr_randomized",
    nbins=5000,
    title="Time difference between observations (Sónar by Night)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Time difference between observations (minutes)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'night_sample_rate_hist_filt_noise.html'))"""

'# Creating a histogram\nfig = px.histogram(\n    tdf_night_sample_rate_filtered_noise_df,\n    x="time_diff_minutes",\n    color="macaddr_randomized",\n    nbins=5000,\n    title="Time difference between observations (Sónar by Night)",\n    category_orders={"macaddr_randomized": [0, 1]})\n\n# Updating the layout\nfig.update_layout(\n    xaxis_title="Time difference between observations (minutes)",\n    yaxis_title="Observations",\n    title_font_size=16,\n    xaxis_title_font_size=14,\n    yaxis_title_font_size=14)\n\n# Showing the plot\nfig.write_html(os.path.join(path_interactive_plots,\'night_sample_rate_hist_filt_noise.html\'))'

In [164]:
print('Distribution of the time difference between observations (in minutes):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_night_sample_rate_filtered_noise_df
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_night_sample_rate_filtered_noise_df
                """)


Distribution of the time difference between observations (in minutes):


┌────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬──────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬────────────────────────┐
│ macaddr_randomized │ min_time_diff_minutes │ p05_time_diff_minutes │ p10_time_diff_minutes │ p25_time_diff_minutes │ median_time_diff_minutes │ p75_time_diff_minutes │ p90_time_diff_minutes │ p95_time_diff_minutes │ max_time_diff_minutes │ mean_time_diff_minutes │
│      varchar       │        double         │        double         │        double         │        double         │          double          │        double         │        double         │        double         │        double         │         double         │
├────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────┼───────────────────────┼───────────────────────┼──────

In [165]:
del tdf_night_sample_rate_filtered_noise_df

##### Jump lengths (filtered)

In [166]:
jump_lengths_night_filtered = f_tdf_night.groupby(by='macaddr_randomized').apply(jump_lengths, merge=True).reset_index()
jump_lengths_night_filtered.rename(columns={0:'jump_lengths_km'}, inplace=True)
jump_lengths_night_filtered = jump_lengths_night_filtered.explode('jump_lengths_km').reset_index(drop=True)
jump_lengths_night_filtered['jump_lengths_km'] = jump_lengths_night_filtered['jump_lengths_km'].astype(float)

jump_lengths_night_filtered['jump_lengths_m'] = round(1000 * jump_lengths_night_filtered['jump_lengths_km'], 2)
del jump_lengths_night_filtered['jump_lengths_km']

100%|██████████| 453/453 [00:00<00:00, 473.93it/s]
100%|██████████| 2342/2342 [00:06<00:00, 382.08it/s]


In [167]:
print('Distribution of the jump lengths (in meters):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_night_filtered
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_night_filtered
                """)


Distribution of the jump lengths (in meters):


┌────────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬─────────────────────┬──────────────────┬──────────────────┬────────────────────┬──────────────────┐
│ macaddr_randomized │ min_jump_lengths │ p05_jump_lengths │ p10_jump_lengths │ p25_jump_lengths │ median_jump_lengths │ p75_jump_lengths │ p90_jump_lengths │  p95_jump_lengths  │ max_jump_lengths │
│      varchar       │      double      │      double      │      double      │      double      │       double        │      double      │      double      │       double       │      double      │
├────────────────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────────┼──────────────────┼──────────────────┼────────────────────┼──────────────────┤
│ 0                  │              0.0 │              0.0 │              0.0 │              0.0 │                0.15 │             8.56 │             25.9 │  47.57949999999997 │            362.7 │
│ 1  

In [168]:
del jump_lengths_night_filtered

### Trajectory compression / simplification

The goal of trajectory compression is to reduce the number of points while preserving the trajectory structure. I use `compression.compress` from scikit-mobility. All points within a radius of `spatial_radius_km` kilometers from a given initial point are compressed into a single point that has the median coordinates of all points and the time of the initial point. It compresses contiguous points.

I have H3 cells with resolution 13, which have an area of $43.87 m^2$. A circle with the same area would have a radius of about 0.004 km (4 meters).

Also, I saw that half of the data consists of jumps of less than 1.46 m, and there are many repeated points. However, 25% of the data consists of jumps of lengths between 1.46 and 9.88 meters. Applying compression with 1.46 meters helps greatly to reduce the size of the data and prevent overloading the analysis with redundant information, while mantaining high granularity (a radius of less than 4 meters). In case I needed to further reduce the dataset, I can do it afterwards.   


In [169]:
radius_km_compression = 0.00146

In [170]:
# Defining a function to compress the trajectory dataframe and show the difference for the first examples
def compress_and_report(tdf, spatial_radius_km, draw_plot=True, background_sonar='night', max_users_plot=10):
    c_tdf = compression.compress(tdf, spatial_radius_km=spatial_radius_km)

    print(f'Number of points in the previous tdf: {len(tdf)}')
    print(f'Number of points in the compressed tdf: {len(c_tdf)}')
    print(f'Number of compressed (discarded) points: {(len(tdf) - len(c_tdf))}')
    print(f'Parameters:\n{c_tdf.parameters}')

    if draw_plot:
        print(f'\nTrajectories pre-compression (black shadow lines) and post-compression (colored solid lines) \nfor the first {max_users_plot} users.')
        
        pre_compression_map = utils.plot_trajectories_in_context(tdf=tdf,
                                                           background=background_sonar, max_users=max_users_plot,
                                                           weight=8, hex_color='black', opacity=0.5)
        post_compression_map = utils.plot_trajectories_in_context(tdf=c_tdf,
                                                                  background=pre_compression_map,
                                                                  max_users=max_users_plot)

        display(post_compression_map)
    
    return c_tdf

In [171]:
cf_tdf_night = compress_and_report(tdf=f_tdf_night, spatial_radius_km=radius_km_compression)

Number of points in the previous tdf: 1244622
Number of points in the compressed tdf: 630782
Number of compressed (discarded) points: 613840
Parameters:
{'filter': {'function': 'filter', 'max_speed_kmh': 10, 'include_loops': False, 'speed_kmh': 5.0, 'max_loop': 6, 'ratio_max': 0.25}, 'compress': {'function': 'compress', 'spatial_radius_km': 0.00146}}

Trajectories pre-compression (black shadow lines) and post-compression (colored solid lines) 
for the first 10 users.


Checking the single example for the compression process.

In [172]:
examine_single_example(tdf_pre=f_tdf_night, tdf_pos=cf_tdf_night, show_diff_table=True)

Period of the selected example (pre-process): 2024-06-15 22:38:47+02:00 - 2024-06-16 04:19:41+02:00
Period of the selected example (post-process): 2024-06-15 22:38:47+02:00 - 2024-06-16 04:19:41+02:00

Map of the single example



There are 262 filtered points for the single example:


Unnamed: 0,uid,macaddr_randomized,tid,timestamp_ap,lat,lng,rssi,confidence_factor,vendor_name,stage,h3_cell,observations_user_night,timespan_minutes_night,num_distinct_stage_night,minutes_per_stage,datetime,_merge
5,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484563,41.353365,2.128970,,64.0,,NA-Entrada,8d394461e82a6ff,586,340.9,7,48.7,2024-06-15 22:49:23+02:00,left_only
6,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484575,41.353331,2.128956,-84.0,56.0,,NA-Entrada,8d394461e80cd7f,586,340.9,7,48.7,2024-06-15 22:49:35+02:00,left_only
8,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484581,41.353339,2.128948,-53.0,56.0,,NA-Entrada,8d394461e80cd7f,586,340.9,7,48.7,2024-06-15 22:49:41+02:00,left_only
11,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484612,41.352924,2.129256,,32.0,,NA-Entrada,8d394461e808dbf,586,340.9,7,48.7,2024-06-15 22:50:12+02:00,left_only
12,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718484635,41.352940,2.129253,-80.0,24.0,,NA-Entrada,8d394461e808dbf,586,340.9,7,48.7,2024-06-15 22:50:35+02:00,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718503000,41.356051,2.131009,-77.0,80.0,,SonarCar,8d394461e94883f,586,340.9,7,48.7,2024-06-16 03:56:40+02:00,left_only
487,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718503139,41.356073,2.130682,-58.0,56.0,,SonarCar,8d394461e94c7bf,586,340.9,7,48.7,2024-06-16 03:58:59+02:00,left_only
489,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718503262,41.355850,2.131029,,24.0,,SonarCar,8d394461e9480bf,586,340.9,7,48.7,2024-06-16 04:01:02+02:00,left_only
490,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2,1718503287,41.355850,2.131029,-66.0,24.0,,SonarCar,8d394461e9480bf,586,340.9,7,48.7,2024-06-16 04:01:27+02:00,left_only


#### Metrics after compression

##### Time between observations (compression)

In [173]:
tdf_night_sample_rate_compression = duckdb_conn.sql("""
                                            SELECT 
                                                *,
                                                (timestamp_ap - LAG(timestamp_ap) OVER (
                                                    PARTITION BY uid, macaddr_randomized, tid
                                                    ORDER BY timestamp_ap
                                                )) / 60.0 AS time_diff_minutes
                                            FROM 
                                                cf_tdf_night
                                            ORDER BY 
                                                uid, macaddr_randomized, tid, timestamp_ap
                                            """)
tdf_night_sample_rate_compression

┌──────────────────────────────────────────────────────────────────┬────────────────────┬───────┬──────────────┬────────────────────┬────────────────────┬────────┬───────────────────┬─────────────┬────────────┬─────────────────┬─────────────────────────┬────────────────────────┬──────────────────────────┬───────────────────┬──────────────────────────┬─────────────────────┐
│                               uid                                │ macaddr_randomized │  tid  │ timestamp_ap │        lat         │        lng         │  rssi  │ confidence_factor │ vendor_name │   stage    │     h3_cell     │ observations_user_night │ timespan_minutes_night │ num_distinct_stage_night │ minutes_per_stage │         datetime         │  time_diff_minutes  │
│                             varchar                              │       int64        │ int64 │    int64     │       double       │       double       │ double │      double       │   varchar   │  varchar   │     varchar     │          int64     

In [174]:
tdf_night_sample_rate_compression_df = tdf_night_sample_rate_compression.to_df()

In [175]:
"""# Creating a histogram
fig = px.histogram(
    tdf_night_sample_rate_compression_df,
    x="time_diff_minutes",
    color="macaddr_randomized",
    nbins=5000,
    title="Time difference between observations (Sónar by Night). Compressed",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Time difference between observations (minutes)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'night_sample_rate_hist_compressed_1_5m.html'))"""

'# Creating a histogram\nfig = px.histogram(\n    tdf_night_sample_rate_compression_df,\n    x="time_diff_minutes",\n    color="macaddr_randomized",\n    nbins=5000,\n    title="Time difference between observations (Sónar by Night). Compressed",\n    category_orders={"macaddr_randomized": [0, 1]})\n\n# Updating the layout\nfig.update_layout(\n    xaxis_title="Time difference between observations (minutes)",\n    yaxis_title="Observations",\n    title_font_size=16,\n    xaxis_title_font_size=14,\n    yaxis_title_font_size=14)\n\n# Showing the plot\nfig.write_html(os.path.join(path_interactive_plots,\'night_sample_rate_hist_compressed_1_5m.html\'))'

In [176]:
print('Distribution of the time difference between observations (in minutes):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_night_sample_rate_compression_df
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_night_sample_rate_compression_df
                """)


Distribution of the time difference between observations (in minutes):


┌────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬──────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬────────────────────────┐
│ macaddr_randomized │ min_time_diff_minutes │ p05_time_diff_minutes │ p10_time_diff_minutes │ p25_time_diff_minutes │ median_time_diff_minutes │ p75_time_diff_minutes │ p90_time_diff_minutes │ p95_time_diff_minutes │ max_time_diff_minutes │ mean_time_diff_minutes │
│      varchar       │        double         │        double         │        double         │        double         │          double          │        double         │        double         │        double         │        double         │         double         │
├────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────┼───────────────────────┼───────────────────────┼──────

In [177]:
del tdf_night_sample_rate_compression_df

Now the observations are a little more separated in terms of time, but still, there is a small granularity (75% of the observations are less than one minute apart and 95% of the observations are at most 7 minutes apart)

##### Jump lengths (compressed)

In [178]:
jump_lengths_night_compressed = cf_tdf_night.groupby(by='macaddr_randomized').apply(jump_lengths, merge=True).reset_index()
jump_lengths_night_compressed.rename(columns={0:'jump_lengths_km'}, inplace=True)
jump_lengths_night_compressed = jump_lengths_night_compressed.explode('jump_lengths_km').reset_index(drop=True)
jump_lengths_night_compressed['jump_lengths_km'] = jump_lengths_night_compressed['jump_lengths_km'].astype(float)

jump_lengths_night_compressed['jump_lengths_m'] = round(1000 * jump_lengths_night_compressed['jump_lengths_km'], 2)
del jump_lengths_night_compressed['jump_lengths_km']

100%|██████████| 453/453 [00:00<00:00, 666.62it/s]
100%|██████████| 2342/2342 [00:04<00:00, 519.37it/s]


In [179]:
print('Distribution of the jump lengths (in meters):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_night_compressed
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_night_compressed
                """)


Distribution of the jump lengths (in meters):


┌────────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬─────────────────────┬──────────────────┬───────────────────┬───────────────────┬──────────────────┐
│ macaddr_randomized │ min_jump_lengths │ p05_jump_lengths │ p10_jump_lengths │ p25_jump_lengths │ median_jump_lengths │ p75_jump_lengths │ p90_jump_lengths  │ p95_jump_lengths  │ max_jump_lengths │
│      varchar       │      double      │      double      │      double      │      double      │       double        │      double      │      double       │      double       │      double      │
├────────────────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────────┼──────────────────┼───────────────────┼───────────────────┼──────────────────┤
│ 0                  │              0.0 │             1.95 │             2.55 │             4.92 │               11.24 │            24.58 │ 54.23800000000003 │ 97.27399999999993 │            362.7 │
│ 1  

In [180]:
"""# Creating a histogram
fig = px.histogram(
    jump_lengths_night_compressed,
    x="jump_lengths_m",
    color="macaddr_randomized",
    nbins=5000,
    title="Jump lenght distribution (Sónar by Night)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Jump lenght (meters)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'night_jump_lengths_hist_compressed.html_1_5m'))"""

'# Creating a histogram\nfig = px.histogram(\n    jump_lengths_night_compressed,\n    x="jump_lengths_m",\n    color="macaddr_randomized",\n    nbins=5000,\n    title="Jump lenght distribution (Sónar by Night)",\n    category_orders={"macaddr_randomized": [0, 1]})\n\n# Updating the layout\nfig.update_layout(\n    xaxis_title="Jump lenght (meters)",\n    yaxis_title="Observations",\n    title_font_size=16,\n    xaxis_title_font_size=14,\n    yaxis_title_font_size=14)\n\n# Showing the plot\nfig.write_html(os.path.join(path_interactive_plots,\'night_jump_lengths_hist_compressed.html_1_5m\'))'

In [181]:
"""# Creating a violin plot
fig = px.violin(
    jump_lengths_night_compressed,
    y='jump_lengths_m',
    color='macaddr_randomized',  
    box=True,  
    points='outliers',  
    title="Distribution of users' duration in a fixed H3 cell (Sónar by day)")

# Updating the layout
fig.update_layout(
    xaxis_title='Mac Address type',
    yaxis_title='Jump lenght (meters)',
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

fig.write_html(os.path.join(path_interactive_plots,'night_jump_lengths_violin_compressed_1_5m.html'))"""

'# Creating a violin plot\nfig = px.violin(\n    jump_lengths_night_compressed,\n    y=\'jump_lengths_m\',\n    color=\'macaddr_randomized\',  \n    box=True,  \n    points=\'outliers\',  \n    title="Distribution of users\' duration in a fixed H3 cell (Sónar by day)")\n\n# Updating the layout\nfig.update_layout(\n    xaxis_title=\'Mac Address type\',\n    yaxis_title=\'Jump lenght (meters)\',\n    title_font_size=16,\n    xaxis_title_font_size=14,\n    yaxis_title_font_size=14)\n\nfig.write_html(os.path.join(path_interactive_plots,\'night_jump_lengths_violin_compressed_1_5m.html\'))'

### Stay points detection

Because of the small scale of my setting and the difference of sizes between zones (the bars are quite small in comparison to other areas), it does not make sense to extract stay locations based on a fxed radius. A radius too large would put together observations that belong to different stages, and a radius too small discards many of the points in the trajectories and leaves only points within a really constrained area. For that reason, I do not compute stay locations.

### Final trajectory shapes

I will export the filtered and the compressed tables in case I need them afterwards in different situations. Since the trajectory preprocessing did not change the structure of the columns, some formatting can be applied to both tables in the same way.

In [182]:
f_tdf_night.columns

Index(['uid', 'macaddr_randomized', 'tid', 'timestamp_ap', 'lat', 'lng',
       'rssi', 'confidence_factor', 'vendor_name', 'stage', 'h3_cell',
       'observations_user_night', 'timespan_minutes_night',
       'num_distinct_stage_night', 'minutes_per_stage', 'datetime'],
      dtype='object')

In [183]:
# Dictionary for renaming the column to distinguish the ones that are metrics 
# based on the raw trajectories (they might be recomputed afterwards) 
renaming_cols_dict_night = {'stage':'stage_original',
                            'h3_cell':'h3_cell_original',
                            'observations_user_night':'observations_user_night_original',
                            'timespan_minutes_night':'timespan_minutes_night_original',
                            'num_distinct_stage_night':'num_distinct_stage_night_original',
                            'minutes_per_stage':'minutes_per_stage_original'}

# List for rearranging the columns
final_columns_night = ['uid', 'macaddr_randomized', 
                       'tid', 
                       'datetime', 'timestamp_ap', # I include both formats of the date in case I need them for quick calculations
                       'lat', 'lng',
                       'vendor_name',
                       'h3_cell_original', 
                       'stage_original',
                       'observations_user_night_original',
                       'timespan_minutes_night_original',
                       'num_distinct_stage_night_original',
                       'minutes_per_stage_original']

Formatting the filtered and the compressed trajectories.

In [184]:
# Rename the columns for the filtered trajectory
f_tdf_night.rename(columns=renaming_cols_dict_night,
                    inplace=True)

# Rearrange the columns for the filtered trajectory
f_tdf_night = f_tdf_night[final_columns_night]

print(f'Number of points filtered trajectory table shape (Sónar by Night): {f_tdf_night.shape}\n')
print(f'Number of points filtered trajectory by type of MAC address (Sónar by Night): {f_tdf_night.groupby("macaddr_randomized").size()}\n')

print(f'\nFiltered trajectory table columns (Sónar by Night): \n{f_tdf_night.columns}')

Number of points filtered trajectory table shape (Sónar by Night): (1244622, 14)

Number of points filtered trajectory by type of MAC address (Sónar by Night): macaddr_randomized
0     134855
1    1109767
dtype: int64


Filtered trajectory table columns (Sónar by Night): 
Index(['uid', 'macaddr_randomized', 'tid', 'datetime', 'timestamp_ap', 'lat',
       'lng', 'vendor_name', 'h3_cell_original', 'stage_original',
       'observations_user_night_original', 'timespan_minutes_night_original',
       'num_distinct_stage_night_original', 'minutes_per_stage_original'],
      dtype='object')


In [185]:
# Rename the columns for the compressed trajectory
cf_tdf_night.rename(columns=renaming_cols_dict_night,
                    inplace=True)

# Rearrange the columns for the compressed trajectory
cf_tdf_night = cf_tdf_night[final_columns_night]

print(f'Number of points compressed trajectory table shape (Sónar by Night): {cf_tdf_night.shape}\n')
print(f'Number of points compressed trajectory by type of MAC address (Sónar by Night): {cf_tdf_night.groupby("macaddr_randomized").size()}\n')

print(f'\nCompressed trajectory table columns (Sónar by Night): \n{cf_tdf_night.columns}')

Number of points compressed trajectory table shape (Sónar by Night): (630782, 14)

Number of points compressed trajectory by type of MAC address (Sónar by Night): macaddr_randomized
0     57590
1    573192
dtype: int64


Compressed trajectory table columns (Sónar by Night): 
Index(['uid', 'macaddr_randomized', 'tid', 'datetime', 'timestamp_ap', 'lat',
       'lng', 'vendor_name', 'h3_cell_original', 'stage_original',
       'observations_user_night_original', 'timespan_minutes_night_original',
       'num_distinct_stage_night_original', 'minutes_per_stage_original'],
      dtype='object')


## Sónar by Day process

### Reading the trajectories

In [186]:
locations_day_filtered = duckdb_conn.read_csv(os.path.join(path_locations_split_filtered,r'wifi_traces_day_filtered.csv'))
locations_day_filtered

┌──────────────────────────────────────────────────────────────────┬────────────────────┬──────────────┬─────────────────┬────────────────────┬────────────────────┬────────────────────┬─────────┬────────┬───────────────────┬─────────────────────────────────────┬────────────┬───────────┬────────────────────┬─────────────────┬───────────────────────┬──────────────────────┬───────────────────────────┬────────────────────────┬───────────────────────┬─────────────────────────┬──────────────────┬───────────────────┐
│                        anonymized_macaddr                        │ macaddr_randomized │ timestamp_ap │     h3_cell     │  client_latitude   │  client_longitude  │   location_name    │  slug   │  rssi  │ confidence_factor │             vendor_name             │ sonar_type │ label_day │       stage        │ floor_num_added │ observations_user_day │ timespan_minutes_day │ num_distinct_h3_cells_day │ num_distinct_stage_day │ observations_per_cell │ observations_per_minute │ minutes

The reduced, split and filtered connections-events tables files should represent raw trajectories (before the specific preprocessing for trajectories is performed). Each of them contain the uid (`anonymized_macaddr`), the latitude and longitude (`client_latitude`, `client_longitude`), and the datetime (`timestamp`). 

In this case, since there are multple floors and the preprocessing steps are based on the latitude and longitude, I need to create a new `label_day_floor_change_id` for the "mini-trajectories" that occur when a user changes the floor. This way, the trajectories on each floor are processed independently and, in the end, I can assemble the trajectories back together and sort them by time, obtaining the processed trajectory for each day.

### Removing the duplicates based on anonymized_macaddr and timestamp_ap

However, I had detected that there are duplicates for each user and timestamp (a device should not be in two different places at the same time). I planned on cleaning these observations based on their locations with the trajectory filtering, but for the Sónar by day this is unfeasible since I need to assign a trajectory id for each switch of floor and process it independently.  If I did not remove the duplicates before doing this, I would get inconsistent results and some duplicates (the ones tha occur simultaneisly with floor changes) would remain.

I remove the duplicates based based on the `anonymized_macaddr` and `timestamp_ap` columns, keeping the records with highest `confidence_factor`. This is the best guess I can have based on the information I was given and the documentation of the Cisco firehose product (which I was referred to when asked about this column).

In [187]:
# Dropping the duplicates and selecting only the locations columns (the information per user will be updated accordingly)
locations_day_filtered = duckdb_conn.sql("""
                                    SELECT DISTINCT ON (anonymized_macaddr, timestamp_ap)
                                         anonymized_macaddr, macaddr_randomized, 
                                         timestamp_ap, 
                                         h3_cell, client_latitude, client_longitude, 
                                         location_name, slug, 
                                         rssi, confidence_factor, 
                                         vendor_name, 
                                         sonar_type, 
                                         label_day, 
                                         stage, floor_num_added 
                                    FROM locations_day_filtered
                                    ORDER BY anonymized_macaddr, timestamp_ap, confidence_factor DESC;""")

locations_day_filtered.columns

['anonymized_macaddr',
 'macaddr_randomized',
 'timestamp_ap',
 'h3_cell',
 'client_latitude',
 'client_longitude',
 'location_name',
 'slug',
 'rssi',
 'confidence_factor',
 'vendor_name',
 'sonar_type',
 'label_day',
 'stage',
 'floor_num_added']

Extracting the new information per user.

In [188]:
info_per_user_day_updated = duckdb_conn.sql("""
                                                SELECT vendor_name,
                                                        macaddr_randomized,
                                                        anonymized_macaddr,
                                                        label_day,
                                                        COUNT(*) AS observations_user_day,
                                                        (MAX(timestamp_ap) - MIN(timestamp_ap))/60 AS timespan_minutes_day,
                                                        COUNT(DISTINCT h3_cell) AS num_distinct_h3_cells_day,
                                                        COUNT(DISTINCT stage) AS num_distinct_stage_day
                                                FROM locations_day_filtered
                                                GROUP BY vendor_name, macaddr_randomized, anonymized_macaddr, label_day
                                                ORDER BY label_day, anonymized_macaddr, timespan_minutes_day, observations_user_day""").to_df()

Updating the derived metrics for interpretation and adding the minutes_per_stage column.

In [189]:
info_per_user_day_updated['observations_per_cell'] = info_per_user_day_updated['observations_user_day'] / info_per_user_day_updated['num_distinct_h3_cells_day']
info_per_user_day_updated['observations_per_minute'] = info_per_user_day_updated['observations_user_day'] / info_per_user_day_updated['timespan_minutes_day']
info_per_user_day_updated['minutes_per_cell'] = info_per_user_day_updated['timespan_minutes_day'] / info_per_user_day_updated['num_distinct_h3_cells_day']
info_per_user_day_updated['minutes_per_stage'] = info_per_user_day_updated['timespan_minutes_day'] / info_per_user_day_updated['num_distinct_stage_day']

In [190]:
# Rounding up the other columns to make a lighter table
info_per_user_day_updated.loc[:, info_per_user_day_updated.dtypes == 'float64'] = info_per_user_day_updated.loc[:, info_per_user_day_updated.dtypes == 'float64'].round(2)

In [191]:
# Query to join the information per user to the locations table
locations_day_filtered = duckdb_conn.sql(f"""
                                           SELECT 
                                                ld.*, 
                                                iuu.observations_user_day, 
                                                iuu.timespan_minutes_day, 
                                                iuu.num_distinct_h3_cells_day,
                                                iuu.num_distinct_stage_day,
                                                iuu.observations_per_cell,
                                                iuu.observations_per_minute,
                                                iuu.minutes_per_cell,
                                                iuu.minutes_per_stage
                                           FROM locations_day_filtered ld
                                            INNER JOIN info_per_user_day_updated iuu 
                                            ON ld.anonymized_macaddr = iuu.anonymized_macaddr
                                                AND ld.macaddr_randomized = iuu.macaddr_randomized
                                                AND ld.label_day = iuu.label_day""")

locations_day_filtered.columns

['anonymized_macaddr',
 'macaddr_randomized',
 'timestamp_ap',
 'h3_cell',
 'client_latitude',
 'client_longitude',
 'location_name',
 'slug',
 'rssi',
 'confidence_factor',
 'vendor_name',
 'sonar_type',
 'label_day',
 'stage',
 'floor_num_added',
 'observations_user_day',
 'timespan_minutes_day',
 'num_distinct_h3_cells_day',
 'num_distinct_stage_day',
 'observations_per_cell',
 'observations_per_minute',
 'minutes_per_cell',
 'minutes_per_stage']

Printing the statistical summary as reference.

In [192]:
stats_summary_day = info_per_user_day_updated.describe(percentiles=[.05,.1, .25, .5, .75, .9, .95]).round(2)
stats_summary_day

Unnamed: 0,macaddr_randomized,label_day,observations_user_day,timespan_minutes_day,num_distinct_h3_cells_day,num_distinct_stage_day,observations_per_cell,observations_per_minute,minutes_per_cell,minutes_per_stage
count,4636.0,4636.0,4636.0,4636.0,4636.0,4636.0,4636.0,4636.0,4636.0,4636.0
mean,0.76,2.0,688.29,328.86,57.44,4.23,9.54,2.57,20.82,109.03
std,0.43,0.8,933.76,185.9,54.03,2.31,6.22,3.25,34.14,109.33
min,0.0,1.0,8.0,31.02,2.0,1.0,1.56,0.02,0.5,5.49
5%,0.0,1.0,19.0,67.44,4.0,1.0,3.67,0.06,1.42,20.66
10%,0.0,1.0,27.0,97.98,5.0,2.0,4.31,0.09,1.74,26.53
25%,1.0,1.0,71.0,177.63,12.0,2.0,5.41,0.27,2.75,41.86
50%,1.0,2.0,289.5,307.74,38.5,4.0,7.62,1.09,7.34,70.44
75%,1.0,3.0,955.0,455.81,93.0,6.0,11.67,3.72,23.11,132.51
90%,1.0,3.0,1935.5,593.33,138.0,8.0,17.27,7.52,56.46,237.92


In [193]:
print(f'Number of records Sónar by day (after removing duplicates based on anonymized_macaddr and timestamp_ap):')
display(duckdb_conn.sql("""
                        SELECT macaddr_randomized, COUNT(*) AS total_records
                        FROM locations_day_filtered
                        GROUP BY macaddr_randomized

                        UNION ALL

                        SELECT 'Total' AS macaddr_randomized, COUNT(*) AS total_records
                        FROM locations_day_filtered
                        """))


print(f'Unique macadresses Sónar by day (after removing duplicates based on anonymized_macaddr and timestamp_ap):')
display(duckdb_conn.sql("""
                        SELECT macaddr_randomized, COUNT(DISTINCT anonymized_macaddr) AS unique_user_count
                        FROM locations_day_filtered
                        GROUP BY macaddr_randomized

                        UNION ALL

                        SELECT 'Total' AS macaddr_randomized, COUNT(DISTINCT anonymized_macaddr) AS unique_user_count
                        FROM locations_day_filtered
                        """))

Number of records Sónar by day (after removing duplicates based on anonymized_macaddr and timestamp_ap):


┌────────────────────┬───────────────┐
│ macaddr_randomized │ total_records │
│      varchar       │     int64     │
├────────────────────┼───────────────┤
│ 0                  │        196322 │
│ 1                  │       2994591 │
│ Total              │       3190913 │
└────────────────────┴───────────────┘

Unique macadresses Sónar by day (after removing duplicates based on anonymized_macaddr and timestamp_ap):


┌────────────────────┬───────────────────┐
│ macaddr_randomized │ unique_user_count │
│      varchar       │       int64       │
├────────────────────┼───────────────────┤
│ 0                  │               769 │
│ 1                  │              1985 │
│ Total              │              2754 │
└────────────────────┴───────────────────┘

### Creating the label_day_floor_change_id

As mentioned before, I create a new `label_day_floor_change_id` for the "mini-trajectories" that occur when a user changes the floor.

In [194]:
locations_floor_change_id = duckdb_conn.sql("""
                                            WITH floor_changes AS (
                                                SELECT 
                                                    *,
                                                    -- Detect when the floor changes for each user on a given day
                                                    CASE 
                                                        WHEN floor_num_added != LAG(floor_num_added) OVER (
                                                            PARTITION BY anonymized_macaddr, label_day
                                                            ORDER BY timestamp_ap
                                                        ) THEN 1
                                                        ELSE 0
                                                    END AS floor_change_flag
                                                FROM locations_day_filtered
                                            )
                                            SELECT 
                                                *,
                                        
                                                -- Create trajectory ID by counting floor changes for each user and label_day
                                                SUM(floor_change_flag) OVER (
                                                    PARTITION BY anonymized_macaddr, label_day
                                                    ORDER BY timestamp_ap
                                                    --ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
                                                ) AS floor_change_id
                                        
                                            FROM floor_changes
                                            ORDER BY anonymized_macaddr, label_day, timestamp_ap
                                            """)


locations_day_filtered = duckdb_conn.sql(f"""
                                            SELECT 
                                                {', '.join(locations_day_filtered.columns)},

                                                -- Concatenating the floor_change_id and the label_day into the final label_day_floor_change_id
                                                CONCAT(label_day, '_', CAST(floor_change_id AS INTEGER)) AS label_day_floor_change_id
                                                
                                                FROM locations_floor_change_id
                                            ORDER BY anonymized_macaddr, label_day, timestamp_ap""")

locations_day_filtered.columns

['anonymized_macaddr',
 'macaddr_randomized',
 'timestamp_ap',
 'h3_cell',
 'client_latitude',
 'client_longitude',
 'location_name',
 'slug',
 'rssi',
 'confidence_factor',
 'vendor_name',
 'sonar_type',
 'label_day',
 'stage',
 'floor_num_added',
 'observations_user_day',
 'timespan_minutes_day',
 'num_distinct_h3_cells_day',
 'num_distinct_stage_day',
 'observations_per_cell',
 'observations_per_minute',
 'minutes_per_cell',
 'minutes_per_stage',
 'label_day_floor_change_id']

### Creating the TrajDataFrames

In [195]:
# I first read the data as a pandas dataframe (to manage the timestamp clearly)
tdf_day = duckdb_conn.sql("""
                             SELECT anonymized_macaddr, macaddr_randomized,
                                label_day_floor_change_id,
                                label_day,
                                floor_num_added,
                                timestamp_ap,
                                client_latitude,
                                client_longitude,
                                rssi,
                                confidence_factor,
                                vendor_name,
                                stage, 
                                h3_cell,
                                observations_user_day,
                                timespan_minutes_day,
                                num_distinct_stage_day,
                                minutes_per_stage
                             FROM locations_day_filtered
                             """).to_df()
tdf_day['datetime'] = pd.to_datetime(tdf_day['timestamp_ap'], unit='s')
tdf_day['datetime'] = tdf_day['datetime'].dt.tz_localize('UTC').dt.tz_convert('Europe/Madrid')  

# Converting the standard dataframe into a TrajectoryDataFrame
# I will consider each day and floor change as a different trajectory from the same user
tdf_day = skmob.TrajDataFrame(
    tdf_day, 
    longitude='client_longitude', latitude='client_latitude',
    datetime='datetime', 
    user_id='anonymized_macaddr',
    trajectory_id='label_day_floor_change_id').sort_values(by=['uid', 'label_day', 'datetime'])

tdf_day.head()

Unnamed: 0,uid,macaddr_randomized,tid,label_day,floor_num_added,timestamp_ap,lat,lng,rssi,confidence_factor,vendor_name,stage,h3_cell,observations_user_day,timespan_minutes_day,num_distinct_stage_day,minutes_per_stage,datetime
0,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2_0,2,0,1718379949,41.373153,2.151547,,88.0,,SonarVillage,8d394461ca7267f,1017,256.27,4,64.07,2024-06-14 17:45:49+02:00
1,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2_0,2,0,1718379979,41.373153,2.151547,,88.0,,SonarVillage,8d394461ca7267f,1017,256.27,4,64.07,2024-06-14 17:46:19+02:00
2,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2_0,2,0,1718380031,41.373153,2.151547,,88.0,,SonarVillage,8d394461ca7267f,1017,256.27,4,64.07,2024-06-14 17:47:11+02:00
3,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2_0,2,0,1718380040,41.373153,2.151547,-58.0,88.0,,SonarVillage,8d394461ca7267f,1017,256.27,4,64.07,2024-06-14 17:47:20+02:00
4,00154bc5831501b8bd95273b1181d9330c3bf5f34b1961...,1,2_0,2,0,1718380055,41.373153,2.151547,-87.0,48.0,,SonarVillage,8d394461ca7267f,1017,256.27,4,64.07,2024-06-14 17:47:35+02:00


In [196]:
print(f'Period tdf_day for day 1: {tdf_day.loc[tdf_day["label_day"]==1].datetime.min()} - {tdf_day.loc[tdf_day["label_day"]==1].datetime.max()}')
print(f'Period tdf_day for day 2: {tdf_day.loc[tdf_day["label_day"]==2].datetime.min()} - {tdf_day.loc[tdf_day["label_day"]==2].datetime.max()}')
print(f'Period tdf_day for day 3: {tdf_day.loc[tdf_day["label_day"]==3].datetime.min()} - {tdf_day.loc[tdf_day["label_day"]==3].datetime.max()}')

Period tdf_day for day 1: 2024-06-13 09:30:01+02:00 - 2024-06-14 00:59:56+02:00
Period tdf_day for day 2: 2024-06-14 09:30:00+02:00 - 2024-06-14 23:59:56+02:00
Period tdf_day for day 3: 2024-06-15 10:00:00+02:00 - 2024-06-15 23:59:59+02:00


### Visualizing the raw trajectories

Visualizing the raw trajectories

In [197]:
utils.plot_trajectories_in_context(tdf=tdf_day, background='day', max_users=10)

Raw trajectories with only non-randomized MAC adresses:

In [198]:
tdf_non_random_day = tdf_day.loc[tdf_day['macaddr_randomized']==0]
utils.plot_trajectories_in_context(tdf=tdf_non_random_day, background='day', max_users=10)

Raw trajectories with only randomized MAC adresses:

In [199]:
tdf_random_day = tdf_day.loc[tdf_day['macaddr_randomized']==1]
utils.plot_trajectories_in_context(tdf=tdf_random_day, background='day', max_users=10)

### Checking some metrics before the preprocessing

The time between of observations and the distance between observations give a better understanding of the data, which is useful for the definition of the preprocessing steps that are related with the trajectory shapes (not the user-based filters performed in the previous script).

#### Checking the time between observations

I need to check the time difference between observations of the same user and the same day/night. I can do it on the original table since I have not performed any cleaning steps yet.

In [200]:
tdf_day_sample_rate = duckdb_conn.sql("""
                                            SELECT 
                                                *,
                                                (timestamp_ap - LAG(timestamp_ap) OVER (
                                                    PARTITION BY uid, macaddr_randomized, label_day
                                                    ORDER BY timestamp_ap
                                                )) / 60.0 AS time_diff_minutes
                                            FROM 
                                                tdf_day
                                            ORDER BY 
                                                uid, macaddr_randomized, label_day, timestamp_ap
                                            """)
tdf_day_sample_rate

┌──────────────────────────────────────────────────────────────────┬────────────────────┬─────────┬───────────┬─────────────────┬──────────────┬────────────────────┬───────────────────┬────────┬───────────────────┬─────────────┬──────────────┬─────────────────┬───────────────────────┬──────────────────────┬────────────────────────┬───────────────────┬──────────────────────────┬──────────────────────┐
│                               uid                                │ macaddr_randomized │   tid   │ label_day │ floor_num_added │ timestamp_ap │        lat         │        lng        │  rssi  │ confidence_factor │ vendor_name │    stage     │     h3_cell     │ observations_user_day │ timespan_minutes_day │ num_distinct_stage_day │ minutes_per_stage │         datetime         │  time_diff_minutes   │
│                             varchar                              │       int64        │ varchar │   int64   │      int64      │    int64     │       double       │      double       │ double

In [201]:
tdf_day_sample_rate_df = tdf_day_sample_rate.to_df()

In [202]:
# Creating a histogram
fig = px.histogram(
    tdf_day_sample_rate_df,
    x="time_diff_minutes",
    color="macaddr_randomized",
    nbins=5000,
    title="Time difference between observations (Sónar by Day)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Time difference between observations (minutes)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'tdf_day_sample_rate_hist_raw.html'))

In [203]:
print('Distribution of the time difference between observations (in minutes):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_day_sample_rate_df
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_day_sample_rate_df
                """)


Distribution of the time difference between observations (in minutes):


┌────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬──────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬────────────────────────┐
│ macaddr_randomized │ min_time_diff_minutes │ p05_time_diff_minutes │ p10_time_diff_minutes │ p25_time_diff_minutes │ median_time_diff_minutes │ p75_time_diff_minutes │ p90_time_diff_minutes │ p95_time_diff_minutes │ max_time_diff_minutes │ mean_time_diff_minutes │
│      varchar       │        double         │        double         │        double         │        double         │          double          │        double         │        double         │        double         │        double         │         double         │
├────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────┼───────────────────────┼───────────────────────┼──────

In [204]:
del tdf_day_sample_rate

Most of the observations are less than 2 minutes apart, so there is a small time-granularity (except from exceptional cases). This means that itis safe to asume a sequential behavior on the trajectories and there are few very sparse cases.

#### Checking the raw jump lengths

The distance between observations of the raw data is also insightful.

In [205]:
jump_lengths_day = tdf_day.groupby(by='macaddr_randomized').apply(jump_lengths, merge=True).reset_index()
jump_lengths_day.rename(columns={0:'jump_lengths_km'}, inplace=True)
jump_lengths_day = jump_lengths_day.explode('jump_lengths_km').reset_index(drop=True)
jump_lengths_day['jump_lengths_km'] = jump_lengths_day['jump_lengths_km'].astype(float)

jump_lengths_day['jump_lengths_m'] = round(1000 * jump_lengths_day['jump_lengths_km'], 2)
del jump_lengths_day['jump_lengths_km']

100%|██████████| 769/769 [00:01<00:00, 472.58it/s]
100%|██████████| 1985/1985 [00:14<00:00, 139.20it/s]


In [206]:
print('Distribution of the jump lengths (in meters):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_day
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_day
                """)


Distribution of the jump lengths (in meters):


┌────────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬─────────────────────┬──────────────────┬──────────────────┬───────────────────┬──────────────────┐
│ macaddr_randomized │ min_jump_lengths │ p05_jump_lengths │ p10_jump_lengths │ p25_jump_lengths │ median_jump_lengths │ p75_jump_lengths │ p90_jump_lengths │ p95_jump_lengths  │ max_jump_lengths │
│      varchar       │      double      │      double      │      double      │      double      │       double        │      double      │      double      │      double       │      double      │
├────────────────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────────┼──────────────────┼──────────────────┼───────────────────┼──────────────────┤
│ 0                  │              0.0 │              0.0 │              0.0 │              0.0 │                 0.0 │              9.3 │            31.39 │ 54.79399999999994 │           296.42 │
│ 1       

In [207]:
"""# Creating a histogram
fig = px.histogram(
    jump_lengths_day,
    x="jump_lengths_m",
    color="macaddr_randomized",
    nbins=5000,
    title="Jump lenght distribution (Sónar by Day)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Jump length (meters)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'day_jump_lengths_hist_raw.html'))"""

'# Creating a histogram\nfig = px.histogram(\n    jump_lengths_day,\n    x="jump_lengths_m",\n    color="macaddr_randomized",\n    nbins=5000,\n    title="Jump lenght distribution (Sónar by Day)",\n    category_orders={"macaddr_randomized": [0, 1]})\n\n# Updating the layout\nfig.update_layout(\n    xaxis_title="Jump length (meters)",\n    yaxis_title="Observations",\n    title_font_size=16,\n    xaxis_title_font_size=14,\n    yaxis_title_font_size=14)\n\n# Showing the plot\nfig.write_html(os.path.join(path_interactive_plots,\'day_jump_lengths_hist_raw.html\'))'

Much of the data are "repeated points", and half of the points are less than 2.87 meters apart (less than the radius of a circle with the same area as the H3 cells).

In [208]:
del jump_lengths_day

### Noise Filtering

I am using the `filter` function, from scikit-mobility. This method filters out a point if the speed from the previous point is higher than the parameter `max_speed_kmh`, which is by default set to 500km/h. In this case I work with 10 km/h, which is quite conservative since it this speed is above the [average running speed of men](https://thatrunningthing.com/what-is-the-average-running-speed-of-a-human/).

In [209]:
max_speed_filter = 10

In [210]:
f_tdf_day = filter_and_report(tdf=tdf_day, max_speed_kmh=max_speed_filter, background_sonar='day')

Number of points in the original tdf: 3190913
Number of points in the filtered tdf: 2345995
Number of filtered points: 844918
Parameters:
{'filter': {'function': 'filter', 'max_speed_kmh': 10, 'include_loops': False, 'speed_kmh': 5.0, 'max_loop': 6, 'ratio_max': 0.25}}

Trajectories pre-filter (black shadow lines) and post-filter (colored solid lines) 
for the first 10 users.


#### Examining a single example

I will use a sample user to quickly examine what is happening pre-process and post-process for a single example.

In [211]:
# I extract a single day to avoid confusions


examine_single_example(tdf_pre = tdf_day.loc[tdf_day['label_day']==1], tdf_pos = f_tdf_day.loc[f_tdf_day['label_day']==1], 
                       show_diff_table=True, background_sonar='day', given_uid='00a80eab88bf619626833a80edf9539ab0a103b84ff6de567d3ad94b01e76b12')

Period of the selected example (pre-process): 2024-06-13 11:58:14+02:00 - 2024-06-13 20:26:43+02:00
Period of the selected example (post-process): 2024-06-13 11:58:14+02:00 - 2024-06-13 20:26:43+02:00

Map of the single example



There are 308 filtered points for the single example:


Unnamed: 0,uid,macaddr_randomized,tid,label_day,floor_num_added,timestamp_ap,lat,lng,rssi,confidence_factor,vendor_name,stage,h3_cell,observations_user_day,timespan_minutes_day,num_distinct_stage_day,minutes_per_stage,datetime,_merge
20,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_13,1,0,1718277342,41.373540,2.151893,-82.0,144.0,,SonarVillage,8d394461ca739bf,1115,508.48,7,72.64,2024-06-13 13:15:42+02:00,left_only
41,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_14,1,1,1718277941,41.372270,2.152093,-77.0,128.0,,Project Area,8d394461ca52aff,1115,508.48,7,72.64,2024-06-13 13:25:41+02:00,left_only
68,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_15,1,0,1718283991,41.373378,2.152000,-74.0,96.0,,SonarVillage,8d394461ca7303f,1115,508.48,7,72.64,2024-06-13 15:06:31+02:00,left_only
69,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_15,1,0,1718283995,41.373499,2.151721,-83.0,96.0,,SonarVillage,8d394461ca706bf,1115,508.48,7,72.64,2024-06-13 15:06:35+02:00,left_only
80,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_15,1,0,1718284064,41.373478,2.151719,-73.0,104.0,,SonarVillage,8d394461ca706bf,1115,508.48,7,72.64,2024-06-13 15:07:44+02:00,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_3,1,0,1718274862,41.373536,2.152237,-67.0,96.0,,SonarVillage,8d394461ca73a7f,1115,508.48,7,72.64,2024-06-13 12:34:22+02:00,left_only
1058,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_3,1,0,1718274864,41.373536,2.152237,-68.0,96.0,,SonarVillage,8d394461ca73a7f,1115,508.48,7,72.64,2024-06-13 12:34:24+02:00,left_only
1059,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_3,1,0,1718274866,41.373614,2.152150,-74.0,80.0,,SonarVillage,8d394461ca714ff,1115,508.48,7,72.64,2024-06-13 12:34:26+02:00,left_only
1063,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_3,1,0,1718274882,41.373569,2.151996,-75.0,96.0,,SonarVillage,8d394461ca7387f,1115,508.48,7,72.64,2024-06-13 12:34:42+02:00,left_only


#### Metrics after filtering

##### Time between observations (filtered)

In [212]:
tdf_day_sample_rate_filtered_noise = duckdb_conn.sql("""
                                            SELECT 
                                                *,
                                                (timestamp_ap - LAG(timestamp_ap) OVER (
                                                    PARTITION BY uid, macaddr_randomized, label_day
                                                    ORDER BY timestamp_ap
                                                )) / 60.0 AS time_diff_minutes
                                            FROM 
                                                f_tdf_day
                                            ORDER BY 
                                                uid, macaddr_randomized, label_day, timestamp_ap
                                            """)
tdf_day_sample_rate_filtered_noise

┌──────────────────────────────────────────────────────────────────┬────────────────────┬─────────┬───────────┬─────────────────┬──────────────┬────────────────────┬───────────────────┬────────┬───────────────────┬─────────────┬──────────────┬─────────────────┬───────────────────────┬──────────────────────┬────────────────────────┬───────────────────┬──────────────────────────┬─────────────────────┐
│                               uid                                │ macaddr_randomized │   tid   │ label_day │ floor_num_added │ timestamp_ap │        lat         │        lng        │  rssi  │ confidence_factor │ vendor_name │    stage     │     h3_cell     │ observations_user_day │ timespan_minutes_day │ num_distinct_stage_day │ minutes_per_stage │         datetime         │  time_diff_minutes  │
│                             varchar                              │       int64        │ varchar │   int64   │      int64      │    int64     │       double       │      double       │ double │

In [246]:
tdf_day_sample_rate_filtered_noise_df = tdf_day_sample_rate_filtered_noise.to_df()

In [None]:
"""# Creating a histogram
fig = px.histogram(
    tdf_day_sample_rate_filtered_noise_df,
    x="time_diff_minutes",
    color="macaddr_randomized",
    nbins=5000,
    title="Time difference between observations (Sónar by Day)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Time difference between observations (minutes)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'day_sample_rate_hist_filt_noise.html'))"""

In [215]:
print('Distribution of the time difference between observations (in minutes):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_day_sample_rate_filtered_noise_df
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_day_sample_rate_filtered_noise_df
                """)


Distribution of the time difference between observations (in minutes):


┌────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬──────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬────────────────────────┐
│ macaddr_randomized │ min_time_diff_minutes │ p05_time_diff_minutes │ p10_time_diff_minutes │ p25_time_diff_minutes │ median_time_diff_minutes │ p75_time_diff_minutes │ p90_time_diff_minutes │ p95_time_diff_minutes │ max_time_diff_minutes │ mean_time_diff_minutes │
│      varchar       │        double         │        double         │        double         │        double         │          double          │        double         │        double         │        double         │        double         │         double         │
├────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────┼───────────────────────┼───────────────────────┼──────

In [250]:
del tdf_day_sample_rate_filtered_noise_df

##### Jump lengths (filtered)

In [217]:
jump_lengths_day_filtered = f_tdf_day.groupby(by='macaddr_randomized').apply(jump_lengths, merge=True).reset_index()
jump_lengths_day_filtered.rename(columns={0:'jump_lengths_km'}, inplace=True)
jump_lengths_day_filtered = jump_lengths_day_filtered.explode('jump_lengths_km').reset_index(drop=True)
jump_lengths_day_filtered['jump_lengths_km'] = jump_lengths_day_filtered['jump_lengths_km'].astype(float)

jump_lengths_day_filtered['jump_lengths_m'] = round(1000 * jump_lengths_day_filtered['jump_lengths_km'], 2)
del jump_lengths_day_filtered['jump_lengths_km']

100%|██████████| 769/769 [00:03<00:00, 199.59it/s]
100%|██████████| 1985/1985 [00:22<00:00, 89.26it/s] 


In [218]:
print('Distribution of the jump lengths (in meters):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_day_filtered
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_day_filtered
                """)


Distribution of the jump lengths (in meters):


┌────────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬─────────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┐
│ macaddr_randomized │ min_jump_lengths │ p05_jump_lengths │ p10_jump_lengths │ p25_jump_lengths │ median_jump_lengths │ p75_jump_lengths │ p90_jump_lengths │ p95_jump_lengths │ max_jump_lengths │
│      varchar       │      double      │      double      │      double      │      double      │       double        │      double      │      double      │      double      │      double      │
├────────────────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┤
│ 0                  │              0.0 │              0.0 │              0.0 │              0.0 │                 0.0 │             5.45 │            20.49 │            39.88 │           296.42 │
│ 1            

In [219]:
del jump_lengths_day_filtered

### Trajectory compression / simplification

The goal of trajectory compression is to reduce the number of points while preserving the trajectory structure. I use `compression.compress` from scikit-mobility. All points within a radius of `spatial_radius_km` kilometers from a given initial point are compressed into a single point that has the median coordinates of all points and the time of the initial point. It compresses contiguous points.

I have H3 cells with resolution 13, which have an area of $43.87 m^2$. A circle with the same area would have a radius of about 0.004 km (4 meters).

Also, I saw that half of the data consists of jumps of less than 0.9 m, and there are many repeated points. However, 25% of the data consists of jumps of lengths between 0.9 and 7.36 meters. Applying compression with 1 meter helps greatly to reduce the size of the data and prevent overloading the analysis with redundant information, while mantaining high granularity (a radius of less than 4 meters). In case I needed to further reduce the dataset, I can do it afterwards.   


In [220]:
radius_km_compression = 0.0009

In [221]:
cf_tdf_day = compress_and_report(tdf=f_tdf_day, spatial_radius_km=radius_km_compression, background_sonar='day')

Number of points in the previous tdf: 2345995
Number of points in the compressed tdf: 1191053
Number of compressed (discarded) points: 1154942
Parameters:
{'filter': {'function': 'filter', 'max_speed_kmh': 10, 'include_loops': False, 'speed_kmh': 5.0, 'max_loop': 6, 'ratio_max': 0.25}, 'compress': {'function': 'compress', 'spatial_radius_km': 0.0009}}

Trajectories pre-compression (black shadow lines) and post-compression (colored solid lines) 
for the first 10 users.


Checking the single example for the compression process.

In [222]:
examine_single_example(tdf_pre = f_tdf_day.loc[f_tdf_day['label_day']==1], tdf_pos = cf_tdf_day.loc[cf_tdf_day['label_day']==1], 
                       show_diff_table=True, background_sonar='day', given_uid='00a80eab88bf619626833a80edf9539ab0a103b84ff6de567d3ad94b01e76b12')

Period of the selected example (pre-process): 2024-06-13 11:58:14+02:00 - 2024-06-13 20:26:43+02:00
Period of the selected example (post-process): 2024-06-13 11:58:14+02:00 - 2024-06-13 20:19:47+02:00

Map of the single example



There are 411 filtered points for the single example:


Unnamed: 0,uid,macaddr_randomized,tid,label_day,floor_num_added,timestamp_ap,lat,lng,rssi,confidence_factor,vendor_name,stage,h3_cell,observations_user_day,timespan_minutes_day,num_distinct_stage_day,minutes_per_stage,datetime,_merge
1,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_0,1,1,1718272696,41.372222,2.152224,-60.0,104.0,,Project Area,8d394461ca53cbf,1115,508.48,7,72.64,2024-06-13 11:58:16+02:00,left_only
3,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_1,1,0,1718272698,41.372107,2.151926,-60.0,32.0,,NA-lounge+d,8d394461ca520bf,1115,508.48,7,72.64,2024-06-13 11:58:18+02:00,left_only
4,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_1,1,0,1718272700,41.372107,2.151926,-72.0,32.0,,NA-lounge+d,8d394461ca520bf,1115,508.48,7,72.64,2024-06-13 11:58:20+02:00,left_only
5,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_1,1,0,1718272704,41.372106,2.151925,-67.0,72.0,,NA-lounge+d,8d394461ca520bf,1115,508.48,7,72.64,2024-06-13 11:58:24+02:00,left_only
6,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_1,1,0,1718272741,41.372106,2.151925,-66.0,48.0,,NA-lounge+d,8d394461ca520bf,1115,508.48,7,72.64,2024-06-13 11:59:01+02:00,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_5,1,0,1718277259,41.373540,2.151893,,144.0,,SonarVillage,8d394461ca739bf,1115,508.48,7,72.64,2024-06-13 13:14:19+02:00,left_only
827,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_5,1,0,1718277261,41.373540,2.151893,-77.0,144.0,,SonarVillage,8d394461ca739bf,1115,508.48,7,72.64,2024-06-13 13:14:21+02:00,left_only
830,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_7,1,0,1718277270,41.373540,2.151893,-76.0,144.0,,SonarVillage,8d394461ca739bf,1115,508.48,7,72.64,2024-06-13 13:14:30+02:00,left_only
831,00a80eab88bf619626833a80edf9539ab0a103b84ff6de...,1,1_7,1,0,1718277276,41.373540,2.151893,-93.0,144.0,,SonarVillage,8d394461ca739bf,1115,508.48,7,72.64,2024-06-13 13:14:36+02:00,left_only


#### Metrics after compression

##### Time between observations (compression)

In [223]:
tdf_day_sample_rate_compression = duckdb_conn.sql("""
                                            SELECT 
                                                *,
                                                (timestamp_ap - LAG(timestamp_ap) OVER (
                                                    PARTITION BY uid, macaddr_randomized, label_day
                                                    ORDER BY timestamp_ap
                                                )) / 60.0 AS time_diff_minutes
                                            FROM 
                                                cf_tdf_day
                                            ORDER BY 
                                                uid, macaddr_randomized, label_day, timestamp_ap
                                            """)
tdf_day_sample_rate_compression

┌──────────────────────────────────────────────────────────────────┬────────────────────┬─────────┬───────────┬─────────────────┬──────────────┬────────────────────┬────────────────────┬────────┬───────────────────┬─────────────────────────────────────┬──────────────┬─────────────────┬───────────────────────┬──────────────────────┬────────────────────────┬───────────────────┬──────────────────────────┬─────────────────────┐
│                               uid                                │ macaddr_randomized │   tid   │ label_day │ floor_num_added │ timestamp_ap │        lat         │        lng         │  rssi  │ confidence_factor │             vendor_name             │    stage     │     h3_cell     │ observations_user_day │ timespan_minutes_day │ num_distinct_stage_day │ minutes_per_stage │         datetime         │  time_diff_minutes  │
│                             varchar                              │       int32        │ varchar │   int32   │      int32      │    int32     │

In [251]:
tdf_day_sample_rate_compression_df = tdf_day_sample_rate_compression.to_df()

In [None]:
'''# Creating a histogram
fig = px.histogram(
    tdf_day_sample_rate_compression_df,
    x="time_diff_minutes",
    color="macaddr_randomized",
    nbins=5000,
    title="Time difference between observations (Sónar by Day). Compressed",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Time difference between observations (minutes)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
#fig.write_html(os.path.join(path_interactive_plots,'day_sample_rate_hist_compressed_1m.html'))'''

In [226]:
print('Distribution of the time difference between observations (in minutes):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_day_sample_rate_compression_df
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(time_diff_minutes) AS min_time_diff_minutes,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY time_diff_minutes) AS p05_time_diff_minutes,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY time_diff_minutes) AS p10_time_diff_minutes,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY time_diff_minutes) AS p25_time_diff_minutes,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY time_diff_minutes) AS median_time_diff_minutes,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY time_diff_minutes) AS p75_time_diff_minutes,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY time_diff_minutes) AS p90_time_diff_minutes,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY time_diff_minutes) AS p95_time_diff_minutes,
                    MAX(time_diff_minutes) AS max_time_diff_minutes,
                    AVG(time_diff_minutes) AS mean_time_diff_minutes
                FROM tdf_day_sample_rate_compression_df
                """)


Distribution of the time difference between observations (in minutes):


┌────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬──────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬────────────────────────┐
│ macaddr_randomized │ min_time_diff_minutes │ p05_time_diff_minutes │ p10_time_diff_minutes │ p25_time_diff_minutes │ median_time_diff_minutes │ p75_time_diff_minutes │ p90_time_diff_minutes │ p95_time_diff_minutes │ max_time_diff_minutes │ mean_time_diff_minutes │
│      varchar       │        double         │        double         │        double         │        double         │          double          │        double         │        double         │        double         │        double         │         double         │
├────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────┼───────────────────────┼───────────────────────┼──────

In [253]:
del tdf_day_sample_rate_compression_df

Now the observations are a little more separated in terms of time, but still, there is a small granularity (75% of the observations are less than one minute apart and 95% of the observations are at most 7 minutes apart)

##### Jump lengths (compressed)

In [228]:
jump_lengths_day_compressed = cf_tdf_day.groupby(by='macaddr_randomized').apply(jump_lengths, merge=True).reset_index()
jump_lengths_day_compressed.rename(columns={0:'jump_lengths_km'}, inplace=True)
jump_lengths_day_compressed = jump_lengths_day_compressed.explode('jump_lengths_km').reset_index(drop=True)
jump_lengths_day_compressed['jump_lengths_km'] = jump_lengths_day_compressed['jump_lengths_km'].astype(float)

jump_lengths_day_compressed['jump_lengths_m'] = round(1000 * jump_lengths_day_compressed['jump_lengths_km'], 2)
del jump_lengths_day_compressed['jump_lengths_km']

100%|██████████| 769/769 [00:03<00:00, 243.54it/s]
100%|██████████| 1985/1985 [00:17<00:00, 114.13it/s]


In [229]:
print('Distribution of the jump lengths (in meters):')
duckdb_conn.sql("""
                SELECT macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_day_compressed
                GROUP BY macaddr_randomized
                
                UNION ALL

                SELECT 'General' AS macaddr_randomized,
                    MIN(jump_lengths_m) AS min_jump_lengths,
                    PERCENTILE_CONT(0.05) WITHIN GROUP(ORDER BY jump_lengths_m) AS p05_jump_lengths,
                    PERCENTILE_CONT(0.10) WITHIN GROUP(ORDER BY jump_lengths_m) AS p10_jump_lengths,
                    PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY jump_lengths_m) AS p25_jump_lengths,
                    PERCENTILE_CONT(0.50) WITHIN GROUP(ORDER BY jump_lengths_m) AS median_jump_lengths,
                    PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY jump_lengths_m) AS p75_jump_lengths,
                    PERCENTILE_CONT(0.90) WITHIN GROUP(ORDER BY jump_lengths_m) AS p90_jump_lengths,
                    PERCENTILE_CONT(0.95) WITHIN GROUP(ORDER BY jump_lengths_m) AS p95_jump_lengths,
                    MAX(jump_lengths_m) AS max_jump_lengths
                FROM jump_lengths_day_compressed
                """)


Distribution of the jump lengths (in meters):


┌────────────────────┬──────────────────┬──────────────────┬────────────────────┬──────────────────┬─────────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┐
│ macaddr_randomized │ min_jump_lengths │ p05_jump_lengths │  p10_jump_lengths  │ p25_jump_lengths │ median_jump_lengths │ p75_jump_lengths │ p90_jump_lengths │ p95_jump_lengths │ max_jump_lengths │
│      varchar       │      double      │      double      │       double       │      double      │       double        │      double      │      double      │      double      │      double      │
├────────────────────┼──────────────────┼──────────────────┼────────────────────┼──────────────────┼─────────────────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┤
│ 0                  │              0.0 │             1.25 │ 1.7000000000000002 │             3.56 │                8.98 │            21.09 │            47.66 │            71.55 │           296.42 │
│ 1  

In [None]:
'''# Creating a histogram
fig = px.histogram(
    jump_lengths_day_compressed,
    x="jump_lengths_m",
    color="macaddr_randomized",
    nbins=5000,
    title="Jump lenght distribution (Sónar by Day)",
    category_orders={"macaddr_randomized": [0, 1]})

# Updating the layout
fig.update_layout(
    xaxis_title="Jump lenght (meters)",
    yaxis_title="Observations",
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

# Showing the plot
fig.write_html(os.path.join(path_interactive_plots,'day_jump_lengths_hist_compressed.html_1m'))'''

In [None]:
"""# Creating a violin plot
fig = px.violin(
    jump_lengths_day_compressed,
    y='jump_lengths_m',
    color='macaddr_randomized',  
    box=True,  
    points='outliers',  
    title="Distribution of users' duration in a fixed H3 cell (Sónar by day)")

# Updating the layout
fig.update_layout(
    xaxis_title='Mac Address type',
    yaxis_title='Jump lenght (meters)',
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14)

fig.write_html(os.path.join(path_interactive_plots,'day_jump_lengths_violin_compressed_1m.html'))"""

### Stay points detection

Because of the small scale of my setting and the difference of sizes between zones (the bars are quite small in comparison to other areas), it does not make sense to extract stay locations based on a fxed radius. A radius too large would put together observations that belong to different stages, and a radius too small discards many of the points in the trajectories and leaves only points within a really constrained area. For that reason, I do not compute stay locations.

### Renaming columns for clarity on the TID

In Sónar by day, since I create a trajectory_id with `label_day` and `floor_change_id`, for obtaining the ordered sequence of stops and providing clarity, I rename the columns an reorder the dataframe. This way, as in Sónar by day, each combination of `anonymized_macaddr` and `label_day` end up being a trajectory. 

I do this for both the filtered and the compressed trajectories.

In [232]:
# Process for the filtered trajectory
f_tdf_day_renamed = f_tdf_day.rename(columns={'tid':'label_day_floor_change_id'})
        
f_tdf_day_renamed = skmob.TrajDataFrame(
    f_tdf_day_renamed, 
    longitude='lng', latitude='lat',
    datetime='datetime', 
    user_id='uid',
    trajectory_id='label_day')

f_tdf_day_renamed = f_tdf_day_renamed.sort_values(by=['uid', 'tid', 'datetime'])

In [233]:
# Process for the compressed trajectory
cf_tdf_day_renamed = cf_tdf_day.rename(columns={'tid':'label_day_floor_change_id'})
        
cf_tdf_day_renamed = skmob.TrajDataFrame(
    cf_tdf_day_renamed, 
    longitude='lng', latitude='lat',
    datetime='datetime', 
    user_id='uid',
    trajectory_id='label_day')

cf_tdf_day_renamed = cf_tdf_day_renamed.sort_values(by=['uid', 'tid', 'datetime'])

### Final trajectory shape

I will export the filtered and the compressed tables in case I need them afterwards in different situations. Since the trajectory preprocessing did not change the structure of the columns, some formatting can be applied to both tables in the same way.

In [234]:
f_tdf_day_renamed.columns

Index(['uid', 'macaddr_randomized', 'label_day_floor_change_id', 'tid',
       'floor_num_added', 'timestamp_ap', 'lat', 'lng', 'rssi',
       'confidence_factor', 'vendor_name', 'stage', 'h3_cell',
       'observations_user_day', 'timespan_minutes_day',
       'num_distinct_stage_day', 'minutes_per_stage', 'datetime'],
      dtype='object')

In [235]:
# Dictionary for renaming the column to distinguish the ones that are metrics 
# based on the raw trajectories (they might be recomputed afterwards) 
renaming_cols_dict_day = {'stage':'stage_original',
                          'h3_cell':'h3_cell_original',
                          'observations_user_day':'observations_user_day_original',
                          'timespan_minutes_day':'timespan_minutes_day_original',
                          'num_distinct_stage_day':'num_distinct_stage_day_original',
                          'minutes_per_stage':'minutes_per_stage_original'}

# List for rearranging the columns
final_columns_day = ['uid', 'macaddr_randomized',
                     'tid',
                     'floor_num_added','label_day_floor_change_id',
                     'datetime', 'timestamp_ap', # I include both formats of the date in case I need them for quick calculations
                     'lat', 'lng',
                     'vendor_name',
                     'h3_cell_original',
                     'stage_original',
                     'observations_user_day_original',
                     'timespan_minutes_day_original',
                     'num_distinct_stage_day_original',
                     'minutes_per_stage_original']

Formatting the filtered and the compressed trajectories.

In [236]:
# Renaming the columns
f_tdf_day_renamed.rename(columns=renaming_cols_dict_day,
                          inplace=True)

f_tdf_day_renamed = f_tdf_day_renamed[final_columns_day]

print(f'Number of points filtered trajectory table shape (Sónar by Day): {f_tdf_day_renamed.shape}\n')
print(f'Number of points filtered trajectory by type of MAC address (Sónar by Day): {f_tdf_day_renamed.groupby("macaddr_randomized").size()}\n')

print(f'Filtered trajectory table columns (Sónar by Day): \n{f_tdf_day_renamed.columns}')

Number of points filtered trajectory table shape (Sónar by Day): (2345995, 16)

Number of points filtered trajectory by type of MAC address (Sónar by Day): macaddr_randomized
0     169841
1    2176154
dtype: int64

Filtered trajectory table columns (Sónar by Day): 
Index(['uid', 'macaddr_randomized', 'tid', 'floor_num_added',
       'label_day_floor_change_id', 'datetime', 'timestamp_ap', 'lat', 'lng',
       'vendor_name', 'h3_cell_original', 'stage_original',
       'observations_user_day_original', 'timespan_minutes_day_original',
       'num_distinct_stage_day_original', 'minutes_per_stage_original'],
      dtype='object')


In [237]:
# Renaming the columns
cf_tdf_day_renamed.rename(columns=renaming_cols_dict_day,
                          inplace=True)

cf_tdf_day_renamed = cf_tdf_day_renamed[final_columns_day]

print(f'Number of points compressed trajectory table shape (Sónar by Day): {cf_tdf_day_renamed.shape}\n')
print(f'Number of points compressed trajectory by type of MAC address (Sónar by Day): {cf_tdf_day_renamed.groupby("macaddr_randomized").size()}\n')

print(f'Compressed trajectory table columns (Sónar by Day): \n{cf_tdf_day_renamed.columns}')

Number of points compressed trajectory table shape (Sónar by Day): (1191053, 16)

Number of points compressed trajectory by type of MAC address (Sónar by Day): macaddr_randomized
0      66727
1    1124326
dtype: int64

Compressed trajectory table columns (Sónar by Day): 
Index(['uid', 'macaddr_randomized', 'tid', 'floor_num_added',
       'label_day_floor_change_id', 'datetime', 'timestamp_ap', 'lat', 'lng',
       'vendor_name', 'h3_cell_original', 'stage_original',
       'observations_user_day_original', 'timespan_minutes_day_original',
       'num_distinct_stage_day_original', 'minutes_per_stage_original'],
      dtype='object')


## Writing the processed trajectories

- Sónar by Night

In [238]:
# Filtered trajectory
f_tdf_night.to_csv(os.path.join(path_preprocessed_trajectories,'tdf_night_preprocessed_filtered.csv'),index=False)

In [239]:
# Compressed trajectory
cf_tdf_night.to_csv(os.path.join(path_preprocessed_trajectories,'tdf_night_preprocessed_compressed.csv'),index=False)

- Sónar by day

In [240]:
# Filtered trajectory
f_tdf_day_renamed.to_csv(os.path.join(path_preprocessed_trajectories,'tdf_day_preprocessed_filtered.csv'),index=False)

In [241]:
# Compressed trajectory
cf_tdf_day_renamed.to_csv(os.path.join(path_preprocessed_trajectories,'tdf_day_preprocessed_compressed.csv'),index=False)