## Trajectory Data Analysis Gut Check Notebook

Based on `scikit-mobility`: this notebook takes a folder of .gpx files, converts them into a `TrajDataFrame`, and performs a full-scope analysis of individual and collective measures from GPS data, as well as creating comparison visualizations.

The point of this notebook is to pass in GPS data from known events and evaluate how well our current processing approach captures, detects, and labels those critical signals.

Built in the `skmob-dev` env with Python 3.11.6.


observations notes

- TODO: Need to set a minimum day count in order to confidently predict home_location
- TODO: Need to determine what % of GPS data is found between 7pm and 7am
- TODO: Consider using Frequent locations measure rather than home? You can see how HOME is misleading when calculated using time-span in RZR Ride to Triple Nickel for Breakfast - 2023-10-01.gpx


In [None]:
# imports
import os
import json
import pandas as pd
import numpy as np
import geopandas as gpd
import movingpandas as mpd

from tzfpy import get_tz

from gpxcsv import gpxtolist
import h3

import folium
from IPython.display import display, HTML
import holoviews as hv
from holoviews import dim, opts
import hvplot.pandas
import plotly.express as px
from keplergl import KeplerGl
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()

import skmob
from skmob.preprocessing import detection, clustering, compression, filtering
from skmob.measures.individual import home_location
from skmob.tessellation import tilers

from itables import init_notebook_mode, show
from dataprep.eda import create_report
from dataprep.eda import plot
from dataprep.eda import plot_diff

import warnings
warnings.filterwarnings('ignore')

# notebook extensions
init_notebook_mode(all_interactive=False)
hv.extension()

In [None]:
# preprocessing functions

def memory_saved(device_id, raw_tdf, compressed_tdf):
    """Return the megabytes saved by compressing a trajectory frame.

    Each frame's shallow footprint (``memory_usage(deep=False)``) is converted
    to MB and rounded to two decimals *before* subtracting, so the result is
    the difference of the two rounded figures.

    Parameters
    ----------
    device_id
        Not used by the computation; kept in the signature for callers.
    raw_tdf
        Frame before compression (anything exposing ``memory_usage``).
    compressed_tdf
        Frame after compression.

    Returns
    -------
    float
        ``raw_MB - compressed_MB``.
    """
    bytes_per_mb = 1024**2

    def _rounded_mb(frame):
        # Shallow usage only: object columns count pointer size, not payload.
        footprint = frame.memory_usage(deep=False).sum()
        return (footprint / bytes_per_mb).round(decimals=2)

    return _rounded_mb(raw_tdf) - _rounded_mb(compressed_tdf)

# get local timezone from lat/long coordinates
def apply_get_tz(row):
    """Look up the timezone for one row's coordinates.

    Reads ``row['longitude']`` and ``row['latitude']``, casts both to float
    and passes them to ``get_tz`` in (longitude, latitude) order.
    """
    lng = float(row['longitude'])
    lat = float(row['latitude'])
    return get_tz(lng, lat)

# convert timestamp_utc to local timezone
def convert_timezone(row):
    """Convert one row's ``timestamp_utc`` to the timezone of its coordinates.

    The zone comes from ``get_tz(longitude, latitude)``; the conversion uses
    the timestamp's ``tz_convert`` method.
    """
    lng = float(row['longitude'])
    lat = float(row['latitude'])
    local_zone = get_tz(lng, lat)
    utc_stamp = row['timestamp_utc']
    return utc_stamp.tz_convert(local_zone)

# Define the function to determine the period of the day
def get_period_of_day(timestamp):
    """Map a timestamp's hour to a coarse period-of-day label.

    Parameters
    ----------
    timestamp
        Any object with an integer ``hour`` attribute.

    Returns
    -------
    str
        ``'early_am'`` for hours 0-5, ``'morning'`` for 6-11, ``'lunch'`` for
        12, ``'afternoon'`` for 13-17, and ``'evening'`` for everything else.
    """
    hour = timestamp.hour
    # (exclusive upper bound, label) pairs, checked in ascending order.
    upper_bounds = (
        (6, 'early_am'),
        (12, 'morning'),
        (13, 'lunch'),
        (18, 'afternoon'),
    )
    for upper, label in upper_bounds:
        if 0 <= hour < upper:
            return label
    return 'evening'

# find how longitude/latitude is represented in input data, rename to longitude/latitude
def rename_longitude_latitude(df):
    """Normalise coordinate column names to ``longitude`` / ``latitude``.

    The first recognised alias found in ``df`` is renamed to the canonical
    name. If the canonical column already exists, or once one alias has been
    renamed, remaining aliases are left untouched so the frame never ends up
    with two columns sharing the same name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    A warning is printed (no exception is raised) when either canonical
    column is still missing after the renaming attempt.
    """
    aliases = {
        'longitude': ['longitude', 'lon', 'long', 'lng'],
        'latitude': ['latitude', 'lat', 'Lat'],
    }

    for canonical, candidates in aliases.items():
        if canonical in df.columns:
            continue
        match = next((name for name in candidates if name in df.columns), None)
        if match is not None:
            df.rename(columns={match: canonical}, inplace=True)

    # Warn only when a coordinate column is still missing after renaming.
    if 'longitude' not in df.columns or 'latitude' not in df.columns:
        print("Warning: Could not find longitude/latitude columns in the DataFrame.")

    return df

#-------------------------------------------LOAD GPX, CSV, BIGQUERY or GCS DATA-------------------------------------------------------
def gpx_load(gpx_file):
    """Load a GPX file into the minimal frame used by the preprocessing step.

    Parameters
    ----------
    gpx_file : str
        Path to the ``.gpx`` file; its base name is stored in ``source``.

    Returns
    -------
    pandas.DataFrame
        Columns ``device_id``, ``longitude``, ``latitude``, ``timestamp_utc``
        and ``source``.

    Raises
    ------
    KeyError
        If the parsed GPX lacks ``time``, ``rcid`` or coordinate columns.

    Notes
    -----
    The previous version called ``DataFrame.drop`` with a fixed list that
    included columns it had just renamed away (``lon``, ``rcid``, ``type``,
    ``name``). ``drop`` raises ``KeyError`` on missing labels by default, and
    the final column selection already discards everything else, so the drop
    was removed.
    """
    gpx_df = pd.DataFrame(gpxtolist(gpx_file))
    gpx_df = rename_longitude_latitude(gpx_df)
    gpx_df = gpx_df.rename(columns={'rcid': 'device_id'})
    gpx_df['source'] = os.path.basename(gpx_file)
    # utc=True handles both offset-aware and naive input; tz_convert alone
    # raises on naive timestamps.
    gpx_df['timestamp_utc'] = pd.to_datetime(gpx_df['time'], utc=True)
    return gpx_df[['device_id', 'longitude', 'latitude', 'timestamp_utc', 'source']]

def csv_load(csv_file):
    """Load a CSV of GPS points into the minimal frame used by preprocessing.

    Parameters
    ----------
    csv_file : str
        Path to a CSV containing ``device_id``, ``timestamp_utc`` and
        longitude/latitude columns under any alias recognised by
        ``rename_longitude_latitude``.

    Returns
    -------
    pandas.DataFrame
        Columns ``device_id``, ``longitude``, ``latitude``, ``timestamp_utc``
        and ``source``.

    Notes
    -----
    ``timestamp_utc`` is parsed with ``utc=True`` so it is always
    timezone-aware. Downstream code calls ``tz_convert`` on it, which raises
    for naive timestamps. The former ``timestamp_local`` column was dropped
    because it was computed and then discarded by the column selection.
    """
    csv_df = pd.read_csv(csv_file)
    csv_df = rename_longitude_latitude(csv_df)
    csv_df['timestamp_utc'] = pd.to_datetime(csv_df['timestamp_utc'], utc=True)
    csv_df['source'] = os.path.basename(csv_file)
    return csv_df[['device_id', 'longitude', 'latitude', 'timestamp_utc', 'source']]

def bigquery_load(query):
    """Placeholder for a BigQuery loader; does nothing and returns ``None``."""
    # ...  (logic to load data from BigQuery)
    return None

def gcs_bucket_load(bucket_name, file_path):
    """Placeholder for a GCS bucket loader; does nothing and returns ``None``."""
    # ...  (logic to load data from GCS bucket)
    return None

#-------------------------------------PREPROCESSING SPATIOTEMPORAL FEATURES----------------------------------------------------------------------------------
def preprocess_spatiotemporal_features(filepath, file_type):
    """Load a GPS file and derive temporal, spatial and identity features.

    Parameters
    ----------
    filepath : str
        Path of the file to load.
    file_type : str
        ``'gpx'`` or ``'csv'``. BigQuery/GCS sources are not wired up yet.

    Returns
    -------
    pandas.DataFrame
        One row per GPS point, sorted by ``timestamp_local``. Identity and
        coordinate columns come first, derived temporal features in the
        middle, and altitude / H3 / bookkeeping columns last.

    Raises
    ------
    ValueError
        If ``file_type`` is not supported.

    Notes
    -----
    * ``timestamp_iso`` is listed as a trailing output column but is not
      currently generated. Output columns are filtered to those that exist so
      the final selection no longer raises ``KeyError``.
    * NOTE(review): the ``.dt`` accessors below assume every row resolves to
      the same timezone; a track crossing zones would likely leave
      ``timestamp_local`` as an object column. Confirm against real data.
    * NOTE(review): ``h3.geo_to_h3`` appears to be the h3-py 3.x name; confirm
      the installed version.
    """
    if file_type == 'gpx':
        df = gpx_load(filepath)
    elif file_type == 'csv':
        df = csv_load(filepath)
    # 'bigquery' and 'gcs' sources are placeholders (see bigquery_load /
    # gcs_bucket_load) and are intentionally not dispatched here yet.
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    if df is not None and not df.empty:
        device_load_df = df.copy()
    else:
        device_load_df = pd.read_csv(filepath)  # fallback to CSV

    # temporal pre-processing features (not all are meant to be kept - EDA only for now)
    device_load_df['timezone'] = device_load_df.apply(apply_get_tz, axis=1)
    device_load_df['timestamp_local'] = device_load_df.apply(convert_timezone, axis=1)
    local_ts = device_load_df['timestamp_local']
    # tz_localize(None) strips the zone but keeps the local wall-clock time.
    device_load_df['datetime_local'] = local_ts.dt.tz_localize(None, ambiguous="infer", nonexistent='raise')
    device_load_df['datetime_index'] = local_ts.dt.tz_localize(None, ambiguous="infer", nonexistent='raise')
    device_load_df['min_of_day'] = local_ts.dt.hour * 60 + local_ts.dt.minute
    device_load_df['hour_of_day'] = local_ts.dt.hour
    device_load_df['period_of_day'] = local_ts.apply(get_period_of_day)
    device_load_df['day_of_month'] = local_ts.dt.day
    device_load_df['day_of_year'] = local_ts.dt.day_of_year
    device_load_df['date'] = local_ts.dt.date
    device_load_df['local_time'] = local_ts.dt.strftime('%I:%M %p')
    device_load_df['day_of_week'] = local_ts.dt.day_of_week
    device_load_df['day_of_week_name'] = local_ts.dt.day_name()
    device_load_df['is_workday'] = local_ts.dt.day_of_week.between(0, 4)
    device_load_df['is_weekend'] = local_ts.dt.weekday >= 5
    # between() is inclusive on both ends: hours 9 through 16.
    device_load_df['is_business_hours'] = local_ts.dt.hour.between(9, 16)
    device_load_df['month_name'] = local_ts.dt.month_name()
    device_load_df['month'] = local_ts.dt.month
    device_load_df['quarter'] = local_ts.dt.quarter

    # spatial pre-processing features
    device_load_df['h3_cell_index_10'] = device_load_df.apply(
        lambda row: h3.geo_to_h3(row['latitude'], row['longitude'], 10), axis=1
    )
    # Synthetic "altitude" values built from day-of-month and time-of-day.
    device_load_df['altitude1_minOverlap'] = (device_load_df['day_of_month'] * 240) + (device_load_df['min_of_day'])
    device_load_df['altitude2_hourOverlap'] = (device_load_df['day_of_month'] * 240) + (device_load_df['hour_of_day'] * 10)

    # identity pre-processing features
    device_load_df['trajectory_id'] = device_load_df.apply(
        lambda x: f"{x['date'].strftime('%Y%m%d')}#{x['device_id']}", axis=1
    )
    device_load_df['semantic_label'] = [""] * len(device_load_df)

    # formatting output dataframe columns
    first_columns = ['trajectory_id', 'device_id', 'longitude', 'latitude', 'timestamp_local']
    last_columns = ['altitude1_minOverlap', 'altitude2_hourOverlap', 'h3_cell_index_10',
                    'semantic_label', 'timezone', 'timestamp_iso', 'source']
    reserved = set(first_columns + last_columns)
    other_columns = [col for col in device_load_df.columns if col not in reserved]
    # Keep only trailing columns that were actually produced.
    present_last = [col for col in last_columns if col in device_load_df.columns]
    ordered_output = first_columns + other_columns + present_last

    # Non in-place sort avoids mutating a column-selected view of the frame.
    preprocessing_output = device_load_df[ordered_output].sort_values(by='timestamp_local')

    return preprocessing_output

In [None]:
# data visualization functions

#adds stay_locations to scikit-mobility plot()
def add_stops_to_map(stay_locations_df, feature_group, color='red', icon='circle-stop'):
    """Add one folium marker per stay location to ``feature_group``.

    Parameters
    ----------
    stay_locations_df : pandas.DataFrame
        Frame with ``lat`` and ``lng`` columns, one row per stop.
    feature_group
        Folium container each marker is attached to via ``add_to``.
    color : str
        Marker colour passed to ``folium.Icon``.
    icon : str
        Icon name passed to ``folium.Icon``.
    """
    for _, stop in stay_locations_df.iterrows():
        position = (stop['lat'], stop['lng'])
        pin = folium.Icon(icon=icon, color=color)
        marker = folium.Marker(location=position, icon=pin)
        marker.add_to(feature_group)

In [None]:
#alt mobility functions

from math import sqrt, sin, cos, pi, asin, pow, ceil, log
from scipy.stats import entropy

def radius_of_gyration2(user_gdf):
    """
    Radius of gyration of a single individual.

    The value is the root-mean-square of ``earth_distance`` between every
    visit and the arithmetic mean of the visits' (lat, lon) coordinates.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        visits of one individual, with ``lat`` and ``lon`` columns

    Returns
    -------
    rg: float
        the radius of gyration, in the unit returned by ``earth_distance``
        (kilometers according to the original documentation)

    See also
    --------
    radii_of_gyration

    References
    ----------
    .. [1] Gonzalez, M. C., Hidalgo, C. A. and Barabasi, A.-L.
       "Understanding individual human mobility patterns."
       Nature 453, no. 7196 (2008): 779--782.

    .. [2] Pappalardo, L., Rinzivillo, S., Qu, Z., Pedreschi, D., Giannotti, F.
       "Understanding the patterns of car travel."
       European Physics Journal Special Topics 215, no. 61 (2013).

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> radius_of_gyration2(data[data.user == 0])
    18.05107972735531
    """
    coords = user_gdf[['lat', 'lon']].values
    centroid = np.mean(coords, axis=0)

    squared_distances = []
    for lat, lon in coords:
        squared_distances.append(earth_distance((lat, lon), centroid) ** 2.0)

    return np.sqrt(np.mean(squared_distances))

def radii_of_gyration(gdf):
    """
    Compute the radius of gyration of every individual in a mobility dataset.

    Visits are grouped by the ``user`` column and ``radius_of_gyration2`` is
    applied to each group. The previous version called ``radius_of_gyration``,
    a name this notebook neither defines nor imports, while the sibling
    ``*_many`` helpers all call their ``...2`` single-user counterpart.

    Parameters
    ----------
    gdf: pandas DataFrame
        visits of all individuals, with ``user``, ``lat`` and ``lon`` columns

    Returns
    -------
    radii: pandas Series
        radius of gyration per user, indexed by ``user``

    See also
    --------
    radius_of_gyration2

    References
    ----------
    .. [1] Gonzalez, M. C., Hidalgo, C. A. and Barabasi, A.-L.
       "Understanding individual human mobility patterns."
       Nature 453, no. 7196 (2008): 779--782.

    .. [2] Pappalardo, L., Rinzivillo, S., Qu, Z., Pedreschi, D., Giannotti, F.
       "Understanding the patterns of car travel."
       European Physics Journal Special Topics 215, no. 61 (2013).

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> radii_of_gyration(data)
    user
    0    18.051080
    1    11.758258
    2    23.358970
    3     9.823990
    4    17.611092
    dtype: float64
    """
    radii = gdf.groupby('user').apply(radius_of_gyration2)
    return radii

def random_entropy2(user_gdf):
    """
    Random entropy of a single individual.

    Defined as ``log2(N)`` where ``N`` is the number of distinct
    (``lat``, ``lon``) pairs among the individual's visits, i.e. the entropy
    obtained if every visited location were equally likely.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        visits of one individual, with ``lat`` and ``lon`` columns

    Returns
    -------
    e: float
        the random entropy of the individual

    See also
    --------
    random_entropies, uncorrelated_entropy2, uncorrelated_entropies2

    References
    ----------
    .. [1] Song, C., Qu, Z., Blumm, N. and Barabási, A.-L.
       "Limits of Predictability in Human Mobility."
       Science 327, no. 5968 (2010): 1018-1021.

    .. [2] Eagle, N. and Pentland, A. S.
       "Eigenbehaviors: identifying structure in routine."
       Behavioral Ecology and Sociobiology 63, no. 7 (2009): 1057--1066.

    .. [3] Lu, X., Wetter, E., Bharti, N., Tatem, A. J. and Bengtsson, L.
       "Approaching the limit of predictability in human mobility."
       Scientific Reports 3 (2013): 2923.

    .. [4] Sinatra, R. and Szell, M.
       "Entropy and the Predictability of Online Life."
       Entropy 16, no. 1 (2014): 543-556.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> random_entropy2(data[data.user == 0])
    6.832890014164741
    """
    visits_by_location = user_gdf.groupby(['lat', 'lon'])
    distinct_count = len(visits_by_location)
    return np.log2(distinct_count)

def random_entropies(gdf):
    """
    Random entropy of every individual in the mobility dataset.

    Groups the visits by the ``user`` column and applies ``random_entropy2``
    to each group.

    Parameters
    ----------
    gdf: pandas DataFrame
        visits of all individuals, with ``user``, ``lat`` and ``lon`` columns

    Returns
    -------
    entropies: pandas Series
        random entropy per user, indexed by ``user``

    See also
    --------
    random_entropy2, uncorrelated_entropy2, uncorrelated_entropies2

    References
    ----------
    .. [1] Song, C., Qu, Z., Blumm, N. and Barabási, A.-L.
       "Limits of Predictability in Human Mobility."
       Science 327, no. 5968 (2010): 1018-1021.

    .. [2] Eagle, N. and Pentland, A. S.
       "Eigenbehaviors: identifying structure in routine."
       Behavioral Ecology and Sociobiology 63, no. 7 (2009): 1057--1066.

    .. [3] Lu, X., Wetter, E., Bharti, N., Tatem, A. J. and Bengtsson, L.
       "Approaching the limit of predictability in human mobility."
       Scientific Reports 3 (2013): 2923.

    .. [4] Sinatra, R. and Szell, M.
       "Entropy and the Predictability of Online Life."
       Entropy 16, no. 1 (2014): 543-556.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> random_entropies(data).head()
    user
    0    6.832890
    1    6.409391
    2    6.807355
    3    6.686501
    4    6.599913
    dtype: float64
    """
    per_user = gdf.groupby('user')
    return per_user.apply(random_entropy2)

def uncorrelated_entropy2(user_gdf, normalize=False):
    """
    Compute the temporal-uncorrelated entropy of the individual, in bits.

    The entropy is ``-sum(p_j * log2(p_j))`` where ``p_j`` is the fraction of
    the individual's visits made to location ``j`` (a distinct
    (``lat``, ``lon``) pair). It characterises the heterogeneity of the
    visitation pattern.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        the individual's visits, with ``lat`` and ``lon`` columns

    normalize: boolean
        if True, divide the entropy by log2(N), where N is the number of
        distinct locations visited, giving a value in [0, 1]. With a single
        distinct location the normalised entropy is defined as 0.0.

    Returns
    -------
    e: float
        the temporal-uncorrelated entropy of the individual

    Notes
    -----
    Earlier revisions computed the entropy in nats (scipy's default base)
    while normalising by log2(N), and derived N from the flattened set of
    coordinate scalars rather than from distinct (lat, lon) pairs. Both are
    corrected here, so un-normalised values are now expressed in bits.

    See also
    --------
    uncorrelated_entropies2, random_entropy2, random_entropies

    References
    ----------
    .. [1] Song, C., Qu, Z., Blumm, N. and Barabási, A.-L.
       "Limits of Predictability in Human Mobility."
       Science 327, no. 5968 (2010): 1018-1021.

    .. [2] Eagle, N. and Pentland, A. S.
       "Eigenbehaviors: identifying structure in routine."
       Behavioral Ecology and Sociobiology 63, no. 7 (2009): 1057--1066.

    .. [3] Lu, X., Wetter, E., Bharti, N., Tatem, A. J. and Bengtsson, L.
       "Approaching the limit of predictability in human mobility."
       Scientific Reports 3 (2013): 2923.

    .. [4] Sinatra, R. and Szell, M.
       "Entropy and the Predictability of Online Life."
       Entropy 16, no. 1 (2014): 543-556.
    """
    n = len(user_gdf)
    location_groups = user_gdf.groupby(by=['lat', 'lon']).groups
    probs = [1.0 * len(group) / n for group in location_groups.values()]
    # base=2 keeps the entropy in the same unit as the log2(N) normaliser.
    e = entropy(probs, base=2)
    if normalize:
        n_locations = len(probs)
        # log2(1) == 0: a single location carries no uncertainty.
        e = e / np.log2(n_locations) if n_locations > 1 else 0.0
    return e
    
def uncorrelated_entropies2(gdf, normalize=False):
    """
    Temporal-uncorrelated entropy of every individual in the mobility dataset.

    Groups the visits by the ``user`` column and applies
    ``uncorrelated_entropy2`` to each group, forwarding ``normalize``.

    Parameters
    ----------
    gdf: pandas DataFrame
        visits of all individuals, with ``user``, ``lat`` and ``lon`` columns

    normalize: boolean
        forwarded to ``uncorrelated_entropy2``; if True each entropy is
        divided by log2(N), N being that individual's number of distinct
        locations

    Returns
    -------
    entropies: pandas Series
        temporal-uncorrelated entropy per user, indexed by ``user``

    See also
    --------
    uncorrelated_entropy2, random_entropy2, random_entropies

    References
    ----------
    .. [1] Song, C., Qu, Z., Blumm, N. and Barabási, A.-L.
       "Limits of Predictability in Human Mobility."
       Science 327, no. 5968 (2010): 1018-1021.

    .. [2] Eagle, N. and Pentland, A. S.
       "Eigenbehaviors: identifying structure in routine."
       Behavioral Ecology and Sociobiology 63, no. 7 (2009): 1057--1066.

    .. [3] Lu, X., Wetter, E., Bharti, N., Tatem, A. J. and Bengtsson, L.
       "Approaching the limit of predictability in human mobility."
       Scientific Reports 3 (2013): 2923.

    .. [4] Sinatra, R. and Szell, M.
       "Entropy and the Predictability of Online Life."
       Entropy 16, no. 1 (2014): 543-556.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> uncorrelated_entropies2(data).head()
    user
    0    3.905487
    1    3.514337
    2    3.853106
    3    3.709166
    4    3.582786
    dtype: float64
    """
    def _entropy_for(user_visits):
        return uncorrelated_entropy2(user_visits, normalize=normalize)

    return gdf.groupby('user').apply(_entropy_for)

def jump_lengths(user_gdf):
    """
    Compute the geographic distances traveled by the individual between
    consecutive visits.

    Visits are ordered by the ``datetime`` column; the k-th jump is
    ``earth_distance`` between visit k+1 and visit k.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        the individual's visits, with ``datetime``, ``lat`` and ``lon`` columns

    Returns
    -------
    length: numpy array
        the vector of jump lengths (one fewer than the number of visits;
        empty when there are fewer than two visits), in the unit returned by
        ``earth_distance`` (kilometers according to the original
        documentation)

    Notes
    -----
    The previous implementation indexed the "previous" point as
    ``lats_lons[i - 1]`` while enumerating ``lats_lons[1:]``. That paired each
    visit with the one two steps earlier, and the first visit with the last
    one via index -1. Points are now paired strictly consecutively.

    See also
    --------
    jump_lengths_many

    References
    ----------
    .. [1] Brockmann, D., Hufnagel, L. and Geisel, T.
       "The scaling laws of human travel."
       Nature 439 (2006): 462.

    .. [2] Gonzalez, M. C., Hidalgo, C. A. and Barabasi, A.-L.
       "Understanding individual human mobility patterns."
       Nature 453, no. 7196 (2008): 779--782.

    .. [3] Pappalardo, L., Rinzivillo, S., Qu, Z., Pedreschi, D., Giannotti, F.
       "Understanding the patterns of car travel."
       European Physics Journal Special Topics 215, no. 61 (2013).
    """
    lats_lons = user_gdf.sort_values(by='datetime')[['lat', 'lon']].values
    lengths = np.array([
        earth_distance((lat, lon), previous)
        for previous, (lat, lon) in zip(lats_lons[:-1], lats_lons[1:])
    ])
    return lengths

def jump_lengths_many(gdf):
    """
    Jump lengths of every individual in the mobility dataset.

    Groups the visits by the ``user`` column and applies ``jump_lengths`` to
    each group.

    Parameters
    ----------
    gdf: pandas DataFrame
        visits of all individuals, with a ``user`` column plus the columns
        ``jump_lengths`` needs

    Returns
    -------
    pandas Series
        one array of jump lengths per user, indexed by ``user``

    See also
    --------
    jump_lengths

    References
    ----------
    .. [1] Brockmann, D., Hufnagel, L. and Geisel, T.
       "The scaling laws of human travel."
       Nature 439 (2006): 462.

    .. [2] Gonzalez, M. C., Hidalgo, C. A. and Barabasi, A.-L.
       "Understanding individual human mobility patterns."
       Nature 453, no. 7196 (2008): 779--782.

    .. [3] Pappalardo, L., Rinzivillo, S., Qu, Z., Pedreschi, D., Giannotti, F.
       "Understanding the patterns of car travel."
       European Physics Journal Special Topics 215, no. 61 (2013).

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> jump_lengths_many(data).head()
    user
    0    [0.0, 0.0, 33.15340098717961, 32.6727597773293...
    1    [0.0, 0.0, 3.1626703486900634, 9.8263943453601...
    2    [27.303614356305776, 37.59233024186675, 29.102...
    3    [16.9500600069531, 4.1246632643642425, 17.0889...
    4    [33.389542372252535, 0.0, 30.009274526065294, ...
    dtype: object
    """
    per_user = gdf.groupby('user')
    return per_user.apply(jump_lengths)
    
def maximum_distance(user_gdf):
    """
    Largest single jump traveled by the individual.

    This is the maximum of the array returned by ``jump_lengths``.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        visits of one individual

    Returns
    -------
    max_distance: float
        the longest jump length of the individual

    See also
    --------
    jump_lengths, jump_lengths_many, maximum_distances

    References
    ----------
    .. [1] Williams, N. E., Thomas, T. A., Dunbar, M., Eagle, N. and Dobra, A.
       "Measures of Human Mobility Using Mobile Phone Records Enhanced with GIS Data."
       CoRR abs/1408.5420 (2014).

    .. [2] Lu, X., Bengtsson, L. and Holme, P.
       "Predictability of population displacement after the 2010 haiti earthquake."
       National Academy of Sciences 109, no. 29 (2012): 11576--11581.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> maximum_distance(data[data.user == 0])
    75.48321896587721
    """
    return jump_lengths(user_gdf).max()

def maximum_distances(gdf):
    """
    Largest single jump of every individual in the mobility dataset.

    Groups the visits by the ``user`` column and applies ``maximum_distance``
    to each group.

    Parameters
    ----------
    gdf: pandas DataFrame
        visits of all individuals, with a ``user`` column

    Returns
    -------
    max_distances: pandas Series
        maximum jump length per user, indexed by ``user``

    See also
    --------
    maximum_distance, jump_lengths, jump_lengths_many

    References
    ----------
    .. [1] Williams, N. E., Thomas, T. A., Dunbar, M., Eagle, N. and Dobra, A.
       "Measures of Human Mobility Using Mobile Phone Records Enhanced with GIS Data."
       CoRR abs/1408.5420 (2014).

    .. [2] Lu, X., Bengtsson, L. and Holme, P.
       "Predictability of population displacement after the 2010 haiti earthquake."
       National Academy of Sciences 109, no. 29 (2012): 11576--11581.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> maximum_distances(data)
    user
    0    75.483219
    1    73.040543
    2    90.039642
    3    44.066343
    4    77.102435
    dtype: float64
    """
    per_user = gdf.groupby('user')
    return per_user.apply(maximum_distance)
 
def straight_line_distance(user_gdf):
    """
    Total straight-line distance traveled by an individual.

    This is the sum of the array returned by ``jump_lengths``.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        visits of one individual

    Returns
    -------
    straight_distance: float
        the summed jump lengths of the individual

    See also
    --------
    straight_line_distances, jump_lengths, jump_lengths_many, maximum_distance, maximum_distances

    References
    ----------
    .. [1] Williams, N. E., Thomas, T. A., Dunbar, M., Eagle, N. and Dobra, A.
       "Measures of Human Mobility Using Mobile Phone Records Enhanced with GIS Data."
       CoRR abs/1408.5420 (2014).

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> straight_line_distance(data[data.user == 0])
    9599.607024062618
    """
    return jump_lengths(user_gdf).sum()

def straight_line_distances(gdf):
    """
    Total straight-line distance of every individual in the mobility dataset.

    Groups the visits by the ``user`` column and applies
    ``straight_line_distance`` to each group.

    Parameters
    ----------
    gdf: pandas DataFrame
        visits of all individuals, with a ``user`` column

    Returns
    -------
    straight_distances: pandas Series
        summed jump lengths per user, indexed by ``user``

    See also
    --------
    straight_line_distance, maximum_distance, maximum_distances, jump_lengths, jump_lengths_many

    References
    ----------
    .. [1] Williams, N. E., Thomas, T. A., Dunbar, M., Eagle, N. and Dobra, A.
       "Measures of Human Mobility Using Mobile Phone Records Enhanced with GIS Data."
       CoRR abs/1408.5420 (2014).

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> straight_line_distances(data)
    user
    0     9599.607024
    1     5642.534652
    2    13352.200529
    3     4691.082059
    4     9413.354220
    dtype: float64
    """
    per_user = gdf.groupby('user')
    return per_user.apply(straight_line_distance)
    
def waiting_times(user_gdf):
    """
    Compute the waiting times (or inter-times) between the movements of an individual.

    Visits are ordered by the ``datetime`` column and the difference between
    each visit and the previous one is returned.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        the individual's visits, with a ``datetime`` column holding datetimes
        or values parseable by ``pandas.to_datetime``

    Returns
    -------
    wtimes: numpy array
        waiting times as ``timedelta64[s]``; one fewer than the number of
        visits (empty when there are fewer than two visits)

    Notes
    -----
    The column is parsed with ``pandas.to_datetime`` instead of
    ``astype('datetime64')``, because a unit-less ``datetime64`` cast is
    rejected by pandas 2.x. The leading NaT produced by ``diff`` is dropped
    positionally with ``iloc``.

    See also
    --------
    waiting_times_many

    References
    ----------
    .. [1] Song, C., Koren, T., Wang, P. and Barabasi, A.-L.
       "Modelling the scaling properties of human mobility."
       Nature Physics 6, no. 10 (2010): 818--823.

    .. [2] Pappalardo, L., Rinzivillo, S., Simini, F.
       "Human Mobility Modelling: exploration and preferential return meet the gravity model."
       Procedia Computer Science, Volume 83, 2016, Pages 934-939
       http://dx.doi.org/10.1016/j.procs.2016.04.188.

    .. [3] Pappalardo, L. and Simini, F.
       "Data-driven generation of spatio-temporal routines in human mobility."
       Data Min. Knowl. Discov. 32, no. 3 (2018): 787-829.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> waiting_times(data[data.user == 0])[:10]
    array([2459, 2619, 1830, 2484, 1833,  420, 3711, 4949, 4184, 3535],
      dtype='timedelta64[s]')
    """
    times = pd.to_datetime(user_gdf.sort_values(by='datetime')['datetime'])
    # The first diff is NaT (no predecessor), so skip it.
    wtimes = times.diff().iloc[1:].values.astype('timedelta64[s]')
    return wtimes

def waiting_times_many(gdf):
    """
    Waiting times (inter-times) of every individual in the mobility dataset.

    Groups the visits by the ``user`` column and applies ``waiting_times`` to
    each group.

    Parameters
    ----------
    gdf: pandas DataFrame
        visits of all individuals, with ``user`` and ``datetime`` columns

    Returns
    -------
    w_times: pandas Series
        one array of waiting times per user, indexed by ``user``

    See also
    --------
    waiting_times

    References
    ----------
    .. [1] Song, C., Koren, T., Wang, P. and Barabasi, A.-L.
       "Modelling the scaling properties of human mobility."
       Nature Physics 6, no. 10 (2010): 818--823.

    .. [2] Pappalardo, L., Rinzivillo, S., Simini, F.
       "Human Mobility Modelling: exploration and preferential return meet the gravity model."
       Procedia Computer Science, Volume 83, 2016, Pages 934-939
       http://dx.doi.org/10.1016/j.procs.2016.04.188.

    .. [3] Pappalardo, L. and Simini, F.
       "Data-driven generation of spatio-temporal routines in human mobility."
       Data Min. Knowl. Discov. 32, no. 3 (2018): 787-829.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> waiting_times_many(data)[:10]
    user
    0    [2459 seconds, 2619 seconds, 1830 seconds, 248...
    1    [4049 seconds, 6429 seconds, 4054 seconds, 174...
    2    [750 seconds, 2014 seconds, 2016 seconds, 2169...
    3    [1015 seconds, 245 seconds, 3907 seconds, 4535...
    4    [1527 seconds, 7131 seconds, 1093 seconds, 204...
    dtype: object
    """
    per_user = gdf.groupby('user')
    return per_user.apply(waiting_times)

def number_of_locations(user_gdf):
    """
    Number of distinct locations visited by the individual.

    A location is a distinct (``lat``, ``lon``) pair among the visits.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        visits of one individual, with ``lat`` and ``lon`` columns

    Returns
    -------
    n_locs: int
        the number of distinct locations visited by the individual

    See also
    --------
    number_of_locations_many

    References
    ----------
    .. [1] Gonzalez, M. C., Hidalgo, C. A. and Barabasi, A.-L.
       "Understanding individual human mobility patterns."
       Nature 453, no. 7196 (2008): 779--782.

    .. [2] Pappalardo, L., Rinzivillo, S., Qu, Z., Pedreschi, D., Giannotti, F.
       "Understanding the patterns of car travel."
       European Physics Journal Special Topics 215, no. 61 (2013).

    .. [3] Williams, N. E., Thomas, T. A., Dunbar, M., Eagle, N. and Dobra, A.
       "Measures of Human Mobility Using Mobile Phone Records Enhanced with GIS Data."
       CoRR abs/1408.5420 (2014).

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> number_of_locations(data[data.user == 0])
    114
    """
    distinct_places = user_gdf.groupby(['lat', 'lon']).groups
    return len(distinct_places)

def number_of_locations_many(gdf):
    """
    Count the distinct locations visited by each individual.

    Parameters
    ----------
    gdf: pandas DataFrame
        the visits of all individuals; rows are grouped by the 'user' column

    Returns
    -------
    n_locs: pandas Series
        the number of distinct locations for each individual, indexed by user

    See also
    --------
    number_of_locations

    References
    ----------
    .. [1] Gonzalez, Marta C., Hidalgo, Cesar A. and Barabasi, Albert-Laszlo.
    "Understanding individual human mobility patterns."
    Nature 453 , no. 7196 (2008): 779--782.

    .. [2] Pappalardo, L., Rinzivillo, S., Qu, Z., Pedreschi, D., Giannotti, F.
    "Understanding the patterns of car travel."
    European Physics Journal Special Topics 215, no. 61 (2013).

    .. [3] Williams, Nathalie E., Thomas, Timothy A., Dunbar, Matthew, Eagle, Nathan and Dobra, Adrian.
    "Measures of Human Mobility Using Mobile Phone Records Enhanced with GIS Data."
    CoRR abs/1408.5420 (2014): .

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> number_of_locations_many(data)
    user
    0    114
    1     85
    2    112
    3    103
    4     97
    dtype: int64
    """
    visits_per_user = gdf.groupby('user')
    # Apply the single-user measure to every user's visits.
    return visits_per_user.apply(number_of_locations)


def origin_destination_matrix(gdf):
    """
    Compute an origin-destination matrix from the movements of the individuals
    in the mobility dataset.

    Each user's visits are ordered by 'datetime'; every pair of consecutive
    visits (previous point -> next point) adds one to the flux between those
    two (lat, lon) locations. Transitions are never formed across users.

    Parameters
    ----------
    gdf: pandas DataFrame
        the visits of the individuals; the 'user', 'datetime', 'lat' and 'lon'
        columns are read

    Returns
    -------
    loc2loc2flux: dict
        nested defaultdict ``{origin: {destination: {'weight': count}}}`` where
        origin and destination are (lat, lon) tuples

    References
    ----------
    .. [1] Calabrese, Francesco, Lorenzo, Giusy Di, Liu, Liang and Ratti, Carlo.
    "Estimating Origin-Destination Flows Using Mobile Phone Location Data."
    IEEE Pervasive Computing 10 , no. 4 (2011): 36-44.

    .. [2] Patrick Bonnel, Etienne Hombourger, Ana-Maria Olteanu-Raimond, Zbigniew Smoreda.
    "Passive Mobile Phone Dataset to Construct Origin-destination Matrix: Potentials and Limitations."
    Transportation Research Procedia 11 (2015): 381-398.

    Examples
    --------
    >>> visits = pd.DataFrame({
    ...     'user': [1, 1, 1],
    ...     'datetime': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
    ...     'lat': [0.0, 1.0, 2.0],
    ...     'lon': [0.0, 1.0, 2.0]})
    >>> od = origin_destination_matrix(visits)
    >>> od[(0.0, 0.0)][(1.0, 1.0)]['weight']
    1
    """
    loc2loc2weight = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    # Iterate the groups directly: the loop exists only for its side effect on
    # loc2loc2weight, so there is nothing for groupby.apply to combine.
    for _, user_gdf in gdf.sort_values(by='datetime').groupby('user'):
        lats_lons = user_gdf[['lat', 'lon']].values
        # Pair each point with its successor. The previous indexing
        # (lats_lons[i - 1] while enumerating lats_lons[1:]) used the LAST
        # point as the first origin and shifted every other origin by one.
        for origin, destination in zip(lats_lons[:-1], lats_lons[1:]):
            loc2loc2weight[tuple(origin)][tuple(destination)]['weight'] += 1

    return loc2loc2weight

def individual_mobility_network(user_gdf, as_networkx=False):
    """
    Compute the individual mobility network of an individual.

    Visits are ordered by 'datetime'; every pair of consecutive visits
    (previous point -> next point) adds one to the weight of the edge between
    those two (lat, lon) locations.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        the individual's visits; the 'datetime', 'lat' and 'lon' columns are read

    as_networkx: boolean
        if True return the network as a networkx graph object built with
        ``nx.from_dict_of_dicts``

    Returns
    -------
    loc2loc2weight: dict or networkx Graph object
        nested defaultdict ``{origin: {destination: {'weight': count}}}``, or
        the networkx graph built from it

    References
    ----------
    .. [1] Rinzivillo, Salvatore, Gabrielli, Lorenzo, Nanni, Mirco, Pappalardo, Luca, Pedreschi, Dino and Giannotti, Fosca.
    "The purpose of motion: Learning activities from Individual Mobility Networks."
    Proceedings of the 2014 IEEE International Conference on Data Science and Advanced Analytics (DSAA).

    .. [2] Bagrow, James P. and Lin, Yu-Ru.
    "Mesoscopic Structure and Social Aspects of Human Mobility."
    PLoS ONE 7 , no. 5 (2012): e37676.

    .. [3] Song, Chaoming, Qu, Zehui, Blumm, Nicholas and Barabási, Albert-László.
    "Limits of Predictability in Human Mobility."
    Science 327 , no. 5968 (2010): 1018-1021.

    Examples
    --------
    >>> visits = pd.DataFrame({
    ...     'datetime': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
    ...     'lat': [0.0, 1.0, 2.0],
    ...     'lon': [0.0, 1.0, 2.0]})
    >>> network = individual_mobility_network(visits)
    >>> network[(1.0, 1.0)][(2.0, 2.0)]['weight']
    1
    """
    loc2loc2weight = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    user_gdf = user_gdf.sort_values(by='datetime')
    lats_lons = user_gdf[['lat', 'lon']].values
    # Pair each point with its successor. The previous indexing
    # (lats_lons[i - 1] while enumerating lats_lons[1:]) used the LAST point
    # as the first origin and shifted every other origin by one.
    for origin, destination in zip(lats_lons[:-1], lats_lons[1:]):
        loc2loc2weight[tuple(origin)][tuple(destination)]['weight'] += 1

    if as_networkx:
        # NOTE(review): from_dict_of_dicts is called without create_using, so
        # edge direction may not be kept — confirm whether a DiGraph is wanted.
        return nx.from_dict_of_dicts(loc2loc2weight)
    return loc2loc2weight

def home(user_gdf, start_night='22:00', end_night='07:00'):
    """
    Compute the home location of an individual.

    The home is the (lat, lon) pair with the most visits whose time of day
    falls between `start_night` and `end_night`. When no visit falls in that
    window, the most visited location over all visits is returned instead.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        the individual's visits; the 'datetime', 'lat' and 'lon' columns are read

    start_night: str
        the starting hour for the night

    end_night: str
        the ending hour for the night

    Returns
    -------
    home: tuple
        the latitude and longitude coordinates of the home location

    Raises
    ------
    ValueError
        if `user_gdf` contains no visits at all

    See also
    --------
    homes

    References
    ----------
    .. [1] Csáji, Balázs Csanád, Browet, Arnaud, Traag, Vincent A., Delvenne, Jean-Charles, Huens, Etienne, Dooren, Paul Van, Smoreda, Zbigniew and Blondel, Vincent D.
    "Exploring the Mobility of Mobile Phone Users." CoRR abs/1211.6014 (2012).

    .. [2] Phithakkitnukoon, Santi, Smoreda, Zbigniew and Olivier, Patrick.
    "Socio-geography of human mobility: A study using longitudinal mobile phone data."
    PloS ONE 7 , no. 6 (2012): e39253.

    Examples
    --------
    >>> visits = pd.DataFrame({
    ...     'datetime': pd.to_datetime(['2020-01-01 23:00', '2020-01-02 01:00', '2020-01-02 02:00']),
    ...     'lat': [1.0, 1.0, 2.0],
    ...     'lon': [1.0, 1.0, 2.0]})
    >>> home(visits) == (1.0, 1.0)
    True
    """
    if len(user_gdf) == 0:
        raise ValueError("cannot compute a home location from an empty set of visits")

    timeline = user_gdf.set_index(pd.DatetimeIndex(user_gdf.datetime))
    night_visits = timeline.between_time(start_night, end_night)

    # Without any night-time record, fall back to the whole trajectory rather
    # than failing on an empty selection.
    candidates = night_visits if len(night_visits) > 0 else user_gdf

    # Most visited location wins. The previous ascending sort returned the
    # LEAST visited night-time location.
    visit_counts = candidates.groupby(['lat', 'lon']).size()
    lat, lon = visit_counts.idxmax()
    return (lat, lon)

def homes(gdf, start_night='22:00', end_night='07:00'):
    """
    Compute the home location of every individual in the dataset.

    Parameters
    ----------
    gdf: pandas DataFrame
        the visits of the individuals; rows are grouped by the 'user' column
        and each group is passed to `home`

    start_night: str
        the starting hour for the night

    end_night: str
        the ending hour for the night

    Returns
    -------
    homes: pandas Series
        for each user, the value returned by `home` for that user's visits

    See also
    --------
    home

    References
    ----------
    .. [1] Csáji, Balázs Csanád, Browet, Arnaud, Traag, Vincent A., Delvenne, Jean-Charles, Huens, Etienne, Dooren, Paul Van, Smoreda, Zbigniew and Blondel, Vincent D.
    "Exploring the Mobility of Mobile Phone Users." CoRR abs/1211.6014 (2012).

    .. [2] Phithakkitnukoon, Santi, Smoreda, Zbigniew and Olivier, Patrick.
    "Socio-geography of human mobility: A study using longitudinal mobile phone data."
    PloS ONE 7 , no. 6 (2012): e39253.

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> homes(data).head()
    user
    0         (45.836397158800004, 10.5902665407)
    1    (46.037479671199996, 10.970806546199999)
    2         (46.0616833352, 11.126817998699998)
    3              (45.8485821438, 10.9641220587)
    4              (46.1853864726, 10.7298832096)
    dtype: object
    """
    def _home_of(user_visits):
        # Forward the night window unchanged to the single-user measure.
        return home(user_visits, start_night=start_night, end_night=end_night)

    visits_per_user = gdf.groupby('user')
    return visits_per_user.apply(_home_of)

def earth_distance(lat_lng1, lat_lng2):
    """
    Compute the distance (in km) along earth between two lat/lon pairs.

    Both pairs are converted from degrees to radians, and the haversine
    formula is evaluated on a sphere of radius 6371.01 km.

    Parameters
    ----------
    lat_lng1: tuple
        the first lat/lon pair, in degrees

    lat_lng2: tuple
        the second lat/lon pair, in degrees

    Returns
    -------
    float
        the distance along earth in km
    """
    # degrees -> radians
    lat1, lng1 = (degrees * pi / 180 for degrees in lat_lng1)
    lat2, lng2 = (degrees * pi / 180 for degrees in lat_lng2)

    delta_lat = lat1 - lat2
    delta_lng = lng1 - lng2

    haversine = sin(delta_lat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(delta_lng / 2.0) ** 2
    central_angle = 2 * asin(sqrt(haversine))

    earth_radius_km = 6371.01  # spherical earth...
    return earth_radius_km * central_angle

In [None]:
#mobility profiler
def mobility_profile(user_gdf):
    """
    Compute a repertoire of individual mobility measures for the individual.

    Parameters
    ----------
    user_gdf: pandas DataFrame
        the individual's visits, passed unchanged to every measure

    Returns
    -------
    res: list
        one value per measure, in this order: home, number_of_locations,
        radius_of_gyration2, random_entropy2, uncorrelated_entropy2,
        maximum_distance, straight_line_distance
    """
    funcs = [home, number_of_locations, radius_of_gyration2, random_entropy2, uncorrelated_entropy2, maximum_distance, straight_line_distance]
    # Only the values are returned; the measure names that used to be
    # collected here were never used.
    return [func(user_gdf) for func in funcs]

def mobility_profiles(gdf):
    r"""
    Compute the mobility profile of every individual in the dataset.

    Rows are grouped by the 'user' column, `mobility_profile` is applied to
    each group, and the per-user lists become the rows of a DataFrame whose
    columns are named after the functions listed in `funcs` below.

    Parameters
    ----------
    gdf: pandas DataFrame
        the visits of the individuals

    Returns
    -------
    profiles: pandas DataFrame
        one row per user, one column per measure

    Examples
    --------
    >>> data = read_data('../datasets/depr_100agents.csv', user_index=3, datetime_index=2, lat_index=0, lon_index=1)
    >>> mobility_profiles(data).head()
    home  number_of_locations  \
    0       (45.836397158800004, 10.5902665407)                  114
    1  (46.037479671199996, 10.970806546199999)                   85
    2       (46.0616833352, 11.126817998699998)                  112
    3            (45.8485821438, 10.9641220587)                  103
    4            (46.1853864726, 10.7298832096)                   97

       radius_of_gyration  random_entropy  uncorrelated_entropy  maximum_distance  \
    0           18.051080        6.832890              3.905487         75.483219
    1           11.758258        6.409391              3.514337         73.040543
    2           23.358970        6.807355              3.853106         90.039642
    3            9.823990        6.686501              3.709166         44.066343
    4           17.611092        6.599913              3.582786         77.102435

       straight_line_distance
    0             9599.607024
    1             5642.534652
    2            13352.200529
    3             4691.082059
    4             9413.354220
    """
    # NOTE(review): the column names come from random_entropy and
    # uncorrelated_entropy, while mobility_profile computes the values with
    # random_entropy2 and uncorrelated_entropy2 — confirm which pair is intended.
    funcs = [home, number_of_locations, radius_of_gyration2, random_entropy, uncorrelated_entropy, maximum_distance, straight_line_distance]
    names = [func.__name__ for func in funcs]

    profiles_per_user = gdf.groupby('user').apply(mobility_profile)
    return pd.DataFrame(profiles_per_user.tolist(), columns=names)

In [None]:
#data quality checks

def calculate_data_hours_percentage(data, timestamp_col='timestamp_utc'):
    """
    Compute, for each calendar date, the percentage of the 24 hours that
    contain at least one record.

    The input frame is left untouched: the date and hour are derived on a
    scratch frame, so the caller's columns are neither converted nor
    overwritten.

    Parameters
    ----------
    data: pandas DataFrame
        the records to inspect

    timestamp_col: str
        name of the column holding the record timestamps; values are parsed
        with ``pd.to_datetime``

    Returns
    -------
    percentage_hours_with_data: pandas Series
        indexed by date, the share (0-100) of hours of that date with data
    """
    timestamps = pd.to_datetime(data[timestamp_col])

    # One row per record with just the two keys needed.
    observed = pd.DataFrame({'date': timestamps.dt.date, 'hour': timestamps.dt.hour})

    # Keep one row per (date, hour), then count the distinct hours per date.
    hours_with_data = observed.drop_duplicates().groupby('date').size()

    # Share of the 24 hours of each date that have at least one record.
    percentage_hours_with_data = (hours_with_data / 24) * 100

    return percentage_hours_with_data

---


### Load Data (start here)

can load from `.gpx`, `.csv`, BigQuery Table or GCS Bucket - all expected to be in `cleaned_data schema`. If RAW data, make sure to run `tlai_filter_gps_data()` & `tlai_stay_locations`.

`preprocess_spatiotemporal_features()` pipeline takes standardized, `cleaned_data schema` (or `raw_data` from TLai vendors) & adds temporal and spatial features which are a pre-requisite to wrangling `analytic_dataframes`.


In [None]:
# Metadata for the input files (clean_data or raw_data schemas).
# Every path below is DATA_DIR + DEVICE_ID + a fixed suffix.
DATA_DIR = '/Users/jonathancachat/Downloads/mobility-data-analysis/data/'

DEVICE_ID = 'e64b9fe5-f2f0-4614-bd34-df2c5d44034a'  # or FILENAME

# input files
GPX_FILE = f'{DATA_DIR}{DEVICE_ID}.gpx'
RAW_CSV_FILE = f'{DATA_DIR}{DEVICE_ID}-RAW_df.csv'
CLEANED_CSV_FILE = f'{DATA_DIR}{DEVICE_ID}-CLEANED_tdf.csv'

# output files
COMPRESSED_TRAJ_DATAFRAME_JSON = f'{DATA_DIR}{DEVICE_ID}-COMPRESSED_tdf.json'
COMPRESSED_TRAJ_DATAFRAME_CSV = f'{DATA_DIR}{DEVICE_ID}-COMPRESSED_tdf.csv'
PREPROCESSED_CSV_FILE = f'{DATA_DIR}{DEVICE_ID}-PREPROCESSED_features.csv'
ANALYTIC_CSV_FILE = f'{DATA_DIR}{DEVICE_ID}-ANALYTIC_export.csv'

In [None]:
# execute LOAD & PREPROCESSING pipeline on the raw CSV and persist the result
preprocessed_df = preprocess_spatiotemporal_features(RAW_CSV_FILE, "csv")

preprocessed_df.to_csv(PREPROCESSED_CSV_FILE, index=False)

# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
preprocessed_df.info()
#show(preprocessed_df, classes="display nowrap compact")

In [None]:
# dataprep.ai EDA report, restricted to the columns listed below
_report_columns = [
    'trajectory_id', 'device_id', 'lon', 'lat',
    'period_of_day', 'day_of_month', 'day_of_year', 'date',
    'day_of_week', 'day_of_week_name', 'month', 'quarter',
    'altitude1_minOverlap', 'altitude2_hourOverlap', 'h3_cell_index_10',
]
report = create_report(preprocessed_df[_report_columns])

# Open the generated report in the browser.
report.show_browser()

In [None]:
# Per-date percentage of hours that contain at least one record
percentage = calculate_data_hours_percentage(preprocessed_df)

# Header line first, then the Series itself on the following lines.
print("Percentage of hours with data:", percentage, sep="\n")


---

### SciKit-Mobility Quick Reference

tdf = `TrajDataFrame`

processing

- `filtering.filter(tdf, max_speed_kmh = 500.0, include_loops = False, speed_kmh = 5.0, max_loop = 6, ratio_max = 0.25)` - For each actor, filter out the noise or outlier points.
- `compression.compress(tdf, spatial_radius_km = 0.2)` - Reduce the number of points of each object in a ‘TrajDataFrame’ with median coordinates within a radius (Zheng 2015).
- `detection.stay_locations(tdf, stop_radius_factor = 0.5, minutes_for_a_stop = 20.0, spatial_radius_km = 0.2, leaving_time = True, no_data_for_minutes = 1e12, min_speed_kmh = None)` - Detect the stay locations of each object in a ‘TrajDataFrame’.
- `clustering.cluster(tdf, cluster_radius_km = 0.1, min_samples = 1)` - Cluster the stay locations of each object in a ‘TrajDataFrame’.

plots

- `tdf.plot_trajectory()` - Plot a trajectory.
- `tdf.plot_cluster()` - Plot a cluster.
- `tdf.plot_stops()` - Add stops to plot.
- `plot_diary(tdf, start_datetime=None, end_datetime=None, ax=None, legend=False)` - Plot a mobility diary of an actor in a `TrajDataFrame`. It requires a TrajDataFrame with clusters, output of preprocessing.clustering.cluster. The column constants.CLUSTER must be present.


In [None]:
# Wrap the preprocessed records in a TrajDataFrame, naming the source column
# for each role explicitly.
raw_tdf = skmob.TrajDataFrame(
    preprocessed_df,
    latitude='latitude',
    longitude='longitude',
    datetime='datetime_index',
    user_id='device_id',
    trajectory_id='trajectory_id',
    timestamp=True,
)

# Quick look at the raw trajectory on a dark basemap.
raw_tdf.plot_trajectory(tiles="Cartodb dark_matter", zoom=12, max_points=10000)

### Data Quality Checks


In [None]:
# Extract the earliest and latest timestamps
earliest_timestamp = raw_tdf['timestamp_local'].min()
latest_timestamp = raw_tdf['timestamp_local'].max()

# Calculate the time difference
time_difference = latest_timestamp - earliest_timestamp
duration_hours = time_difference.total_seconds() / 3600

# Check if the duration is more than 24 hours
if time_difference.total_seconds() > 24 * 3600:
    print("WARNING! The duration spans more than 24 hours.")

# Check if the data spans different days
if earliest_timestamp.date() != latest_timestamp.date():
    print("WARNING! The data spans across different days.")


## Summarize Track
print(" ")
print("------------------------------")
print("This dataset has {} records.".format(len(raw_tdf)))
print(f"from: {earliest_timestamp} to: {latest_timestamp}.")
print(" ")
# First-to-last row difference; equals the span above only when the rows are
# in chronological order.
print('time window = %s'
      %(raw_tdf.iloc[-1].datetime - raw_tdf.iloc[0].datetime))
print(" ")
print("That's {} hours".format(duration_hours))

# A single record, or records sharing one timestamp, gives a zero-length
# window; skip the rate instead of dividing by zero.
if duration_hours > 0:
    print(".....with an average of {} GPS samples per hour!".format(round(len(raw_tdf) / duration_hours)))
else:
    print(".....sampling rate not computed: all records share the same timestamp.")

In [None]:
# Time-series histograms of the raw records: by hour of day and by weekday
fig1 = px.histogram(raw_tdf, x='hour_of_day', nbins=24)
fig2 = px.histogram(raw_tdf, x='day_of_week_name', nbins=7)

# Titles and axis labels for each figure
_hourly_layout = {
    'title': 'raw GPS Data Distribution Over 24 Hours',
    'xaxis_title': 'Hour of Day',
    'yaxis_title': 'Row Count',
    'bargap': 0.2,
}
_daily_layout = {
    'title': 'raw GPS Data Distribution Over Days',
    'xaxis_title': 'Day Of Week',
    'yaxis_title': 'Row Count',
    'bargap': 0.2,
}
fig1.update_layout(**_hourly_layout)
fig2.update_layout(**_daily_layout)

# Display the plots
fig1.show()
fig2.show()

### Filter & Compress Trajectory


In [None]:
# Filter the raw trajectory, then compress it, and report the row counts
filtered_tdf = filtering.filter(
    raw_tdf,
    max_speed_kmh=400,
    include_loops=False,
    speed_kmh=3,
    max_loop=15,
    ratio_max=0.25,
)

compressed_tdf = compression.compress(filtered_tdf, spatial_radius_km=0.05)

print('RAW trajectory:\t%s' % len(raw_tdf))
print('FILTERED trajectory:\t%s' % len(filtered_tdf))
print('COMPRESSED trajectory:\t%s' % len(compressed_tdf))
print('Removed rows:\t\t\t%s' % (len(raw_tdf) - len(compressed_tdf)))

# Share of raw rows removed by filtering + compression, to two decimals.
percent_reduced = round(100 - (len(compressed_tdf) / len(raw_tdf) * 100), ndigits=2)

# Persist the compressed trajectory as JSON and as CSV.
skmob.io.file.write(compressed_tdf, COMPRESSED_TRAJ_DATAFRAME_JSON)
compressed_tdf.to_csv(COMPRESSED_TRAJ_DATAFRAME_CSV)

print(f"compressed trajectory is {percent_reduced}% smaller than raw trajectory")

#compressed_tdf.parameters

In [None]:
# Time-series histograms of the compressed records: by hour of day and by weekday
fig1 = px.histogram(compressed_tdf, x='hour_of_day', nbins=24)
fig2 = px.histogram(compressed_tdf, x='day_of_week_name', nbins=7)

# Titles and axis labels for each figure
_hourly_layout = {
    'title': 'Compressed GPS Data Distribution Over 24 Hours',
    'xaxis_title': 'Hour of Day',
    'yaxis_title': 'Row Count',
    'bargap': 0.2,
}
_daily_layout = {
    'title': 'Compressed GPS Data Distribution Over Days',
    'xaxis_title': 'Day Of Week',
    'yaxis_title': 'Row Count',
    'bargap': 0.2,
}
fig1.update_layout(**_hourly_layout)
fig2.update_layout(**_daily_layout)

# Display the plots
fig1.show()
fig2.show()

In [None]:
# dataprep.ai EDA diff report: raw vs compressed trajectories

# Both frames are cut down to the same columns so the diff lines up.
_comparison_columns = [
    'tid', 'uid', 'lng', 'lat', 'timestamp_local',
    'min_of_day', 'hour_of_day', 'period_of_day',
    'day_of_year', 'day_of_week', 'day_of_week_name',
]
raw_tdf_comparison = raw_tdf[_comparison_columns]
compressed_tdf_comparison = compressed_tdf[_comparison_columns]

_diff_report = plot_diff([raw_tdf_comparison, compressed_tdf_comparison])
_diff_report.show_browser()

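Convert the snippet below to a code cell if you want a Folium map that compares `raw_tdf`, `filtered_tdf`, and `compressed_tdf`. Run the "Home Location" cell further down first, because the snippet uses `raw_home_gps` and `compressed_home_gps`.

This is not done for all trajectories because of the size of the dataframe.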

```py
# Create a Folium map to compare raw_tdf, filtered_tdf, and compressed_tdf
map = folium.Map(location=raw_home_gps, tiles="Cartodb dark_matter", zoom_start=12)

raw_feature_group = folium.FeatureGroup(name="from RAW", show=True).add_to(map)
filtered_feature_group = folium.FeatureGroup(name="from FILTERED", show=False).add_to(map)
compressed_feature_group = folium.FeatureGroup(name="from COMPRESSED", show=False).add_to(map)

# timeseries color breakdown
period_colors = {
    'early_am': 'lightgray',
    'morning': 'orange',
    'lunch': 'lightgreen',
    'afternoon': 'lightblue',
    'evening': 'gray',
}

# Plotting RAW trajectory
raw_coords = [(row['lat'], row['lng']) for index, row in raw_tdf.iterrows()]
raw_periods = [row['period_of_day'] for index, row in raw_tdf.iterrows()]
for (start, end), period in zip(zip(raw_coords, raw_coords[1:]), raw_periods):
    period_color = period_colors.get(period, 'grey')  # Default to 'grey' if period not found
    raw_tooltip_text = f"{period}"  # Tooltip text
    raw_tooltip = folium.Tooltip(raw_tooltip_text)
    folium.PolyLine([start, end], color=period_color, weight=3, opacity=0.75, tooltip=raw_tooltip).add_to(raw_feature_group)
folium.Marker(raw_home_gps, icon=folium.Icon(icon='home', color='green')).add_to(raw_feature_group)

# Plotting FILTERED trajectory
# filtered_coords = [(row['lat'], row['lng']) for index, row in filtered_tdf.iterrows()]
# filtered_periods = [row['period_of_day'] for index, row in filtered_tdf.iterrows()]
# for (start, end), period in zip(zip(filtered_coords, filtered_coords[1:]), filtered_periods):
#     period_color = period_colors.get(period, 'grey')  # Default to 'grey' if period not found
#     filtered_tooltip_text = f"{period}"  # Tooltip text
#     filtered_tooltip = folium.Tooltip(filtered_tooltip_text)
#     folium.PolyLine([start, end], color=period_color, weight=3, opacity=0.75, tooltip=filtered_tooltip).add_to(filtered_feature_group)
# folium.Marker(filtered_home_gps, icon=folium.Icon(icon='home', color='green')).add_to(filtered_feature_group)

# Plotting COMPRESSED trajectory
compressed_coords = [(row['lat'], row['lng']) for index, row in compressed_tdf.iterrows()]
compressed_periods = [row['period_of_day'] for index, row in compressed_tdf.iterrows()]
for (start, end), period in zip(zip(compressed_coords, compressed_coords[1:]), compressed_periods):
    period_color = period_colors.get(period, 'grey')  # Default to 'grey' if period not found
    compressed_tooltip_text = f"{period}"  # Tooltip text
    compressed_tooltip = folium.Tooltip(compressed_tooltip_text)
    folium.PolyLine([start, end], color=period_color, weight=3, opacity=0.75, tooltip=compressed_tooltip).add_to(compressed_feature_group)
folium.Marker(compressed_home_gps, icon=folium.Icon(icon='home', color='green')).add_to(compressed_feature_group)

folium.LayerControl().add_to(map)

map
```


---

### Compare "Home" Location to 'frequent_location' & 'cluster'

**Home**

- `home_location()` can also be adapted to find a `work_location` or `night_life` location by changing the hour window. A prerequisite is to look up the timezone from the location and convert UTC to the local timezone before building the `TrajDataFrame`. In previous R&D I used: `filtered_home_location = home_location(filtered_tdf, start_night='20:00')`

**cluster**

- cluster POIs within H3 lvl 10 hexagon; radius corresponding to an area of 0.015 square kilometers is approximately 0.0691 km (69.1 meters). stops are clustered by spatial proximity using DBSCAN, `cluster` column is added (int) = cluster_id.
- cluster_id 0 is most visited, 1 the second most visited, etc.


In [None]:
# Home Location
start_nighttime = '20:00' #late evening
# NOTE(review): this was labelled "early morning", but 14:00 is 2 pm, which
# makes the night window 18 hours long — confirm the intended end of night.
end_nighttime = '14:00'

# home_location returns a DataFrame (per the skmob docs, one row per user), so
# the coordinates are taken from the first row as plain floats. Indexing the
# columns alone put whole Series into the tuple, which breaks with more than
# one row and relies on float(Series), deprecated in recent pandas.
raw_home_location = home_location(raw_tdf, start_night=start_nighttime, end_night=end_nighttime, show_progress=False)
raw_home_gps = (float(raw_home_location['lat'].iloc[0]), float(raw_home_location['lng'].iloc[0]))

compressed_home_location = home_location(compressed_tdf, start_night=start_nighttime, end_night=end_nighttime, show_progress=False)
compressed_home_gps = (float(compressed_home_location['lat'].iloc[0]), float(compressed_home_location['lng'].iloc[0]))

home_icon = folium.Icon(icon='home', color='blue')

In [None]:
# Stops / Stay Locations
stop_factor = 1
min_stop_duration = 20.0 #mins
spatial_radius = 0.05

# The same detection settings are applied to all three trajectories so the
# stop counts are comparable.
_stay_kwargs = dict(
    stop_radius_factor=stop_factor,
    minutes_for_a_stop=min_stop_duration,
    spatial_radius_km=spatial_radius,
    leaving_time=True,
)
raw_stay_locations = detection.stay_locations(raw_tdf, **_stay_kwargs)
filtered_stay_locations = detection.stay_locations(filtered_tdf, **_stay_kwargs)
compressed_stay_locations = detection.stay_locations(compressed_tdf, **_stay_kwargs)

print('Number of stops from RAW: %s' % len(raw_stay_locations))
print('Number of stops from FILTERED: %s' % len(filtered_stay_locations))
print('Number of stops from COMPRESSED: %s' % len(compressed_stay_locations))

# Only the stops detected on the compressed trajectory are exported.
STAY_LOCATIONS_CSV = f'{DATA_DIR}{DEVICE_ID}-COMPRESSED_tdf_STOP_Locations.csv'

compressed_stay_locations.to_csv(STAY_LOCATIONS_CSV)

#show(compressed_stay_locations, classes="display nowrap compact")

In [None]:
# 2D Geo Maps with HOME & STOPS (options)
# Each map shows the trajectory, a home marker and the detected stops.
raw_map = raw_tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Cartodb dark_matter', max_points=None, start_end_markers=True)
# Take the first row's coordinates as plain floats: indexing the columns alone
# put whole Series into the marker location.
home_from_raw = (float(raw_home_location['lat'].iloc[0]), float(raw_home_location['lng'].iloc[0]))
folium.Marker(home_from_raw, icon=folium.Icon(icon='home', color='green')).add_to(raw_map)
raw_map_with_stops = raw_stay_locations.plot_stops(map_f=raw_map)

#filtered_map = filtered_tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Cartodb dark_matter', max_points=None, start_end_markers=True)
#home_from_filtered = (filtered_home_location['lat'], filtered_home_location['lng'])
#folium.Marker(home_from_filtered, icon=folium.Icon(icon='home', color='green')).add_to(filtered_map)
#filtered_map_with_stops = filtered_stay_locations.plot_stops(map_f=filtered_map)

compressed_map = compressed_tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Cartodb dark_matter', max_points=None, start_end_markers=True)
home_from_compressed = (float(compressed_home_location['lat'].iloc[0]), float(compressed_home_location['lng'].iloc[0]))
folium.Marker(home_from_compressed, icon=folium.Icon(icon='home', color='green')).add_to(compressed_map)
compressed_map_with_stops = compressed_stay_locations.plot_stops(map_f=compressed_map)

# Convert Folium maps to HTML
raw_map_html = raw_map._repr_html_()

#filtered_map_html = filtered_map._repr_html_()
compressed_map_html = compressed_map._repr_html_()

# HTML template with maps side by side
html_template = f"""
<div style="display: flex; flex-direction: row; justify-content: center; width: 100%;">
    <div style="flex: 1; padding: 10px;">
        <h3>Raw GPS with Stops</h3>
        {raw_map_html}
    </div>
    <div style="flex: 1; padding: 10px;">
        <h3>Compressed GPS with Stops</h3>
        {compressed_map_html}
    </div>
</div>
"""


# Display in Jupyter Notebook
display(HTML(html_template))

In [None]:
## Clusters
cluster_radius = 0.05
min_samples = 1

# RAW GPS data
raw_cluster_df = clustering.cluster(raw_stay_locations, cluster_radius_km=cluster_radius, min_samples=min_samples)
start_datetime, end_datetime = raw_cluster_df['datetime'].min(), raw_cluster_df['datetime'].max()
#display(raw_cluster_df.sort_values(by=['cluster']))


# COMPRESSED GPS data
compressed_cluster_df = clustering.cluster(compressed_stay_locations, cluster_radius_km=cluster_radius, min_samples=min_samples)
# This overwrites the raw-based bounds assigned above; the compressed ones remain.
start_datetime, end_datetime = compressed_cluster_df['datetime'].min(), compressed_cluster_df['datetime'].max()
#display(compressed_cluster_df.sort_values(by=['cluster']))

# For raw, then compressed: rows per cluster id, then number of distinct ids.
for _cluster_df in (raw_cluster_df, compressed_cluster_df):
    _cluster_ids = _cluster_df['cluster']
    display(_cluster_ids.value_counts())
    display(_cluster_ids.nunique())

---

### Visualize Daily Movement Patterns & Timeline


In [None]:
# Create a Folium map centred on the home location
map = folium.Map(location=compressed_home_gps, tiles="Cartodb dark_matter", zoom_start=12)

# timeseries color breakdown
period_colors = {
    'early_am': 'lightgray',
    'morning': 'orange',
    'lunch': 'lightgreen',
    'afternoon': 'lightblue',
    'evening': 'gray',
}

# Unique days, taken by iterating the column so the keys have exactly the same
# type as the per-row values used for the lookup below.
unique_days = compressed_tdf['date'].drop_duplicates().tolist()

# One toggleable layer per day
day_feature_groups = {
    day: folium.FeatureGroup(name=f"Day {day}", show=False).add_to(map)
    for day in unique_days
}

# Plotting COMPRESSED trajectory.
# Consecutive points are paired by POSITION. The previous label lookup
# (.loc[index + 1]) only worked when the index was exactly 0..n-1.
points = list(zip(compressed_tdf['lat'], compressed_tdf['lng']))
segment_info = zip(compressed_tdf['period_of_day'], compressed_tdf['day_of_week_name'], compressed_tdf['date'])

# Each segment takes its colour, tooltip and day layer from its START point;
# the last point starts no segment.
for start, end, (period, day_of_week_name, day) in zip(points, points[1:], segment_info):
    period_color = period_colors.get(period, 'grey')  # Default to 'grey' if period not found
    compressed_tooltip_text = f"period_of_day: {period} \n day_of_week: {day_of_week_name}"  # Tooltip text
    compressed_tooltip = folium.Tooltip(compressed_tooltip_text)
    folium.PolyLine([start, end], color=period_color, weight=3, opacity=0.75, tooltip=compressed_tooltip).add_to(day_feature_groups[day])

folium.Marker(compressed_home_gps, icon=folium.Icon(icon='home', color='green')).add_to(map)

folium.LayerControl().add_to(map)
map

In [None]:
# Alternative to the per-day layers above: split the trajectory into time
# groups (groups = day) and overlay two of them on one map.
from skmob.utils import utils

groups = utils.group_df_by_time(
    compressed_tdf,
    offset_unit='hours',
    add_starting_location=True,
)

# Group 0 in red, group 6 in green, drawn on the same map.
_trace_style = dict(start_end_markers=True, weight=3)
map_f = groups[0].plot_trajectory(hex_color='red', **_trace_style)
map_f = groups[6].plot_trajectory(map_f=map_f, hex_color='green', **_trace_style)
map_f

In [None]:
#plot timeline starting 10/15

user = DEVICE_ID
# The start was '2024-12-15', which is AFTER the end date and contradicts the
# 10/15 stated above; the window now runs forward from 2023-10-15.
start_datetime = pd.to_datetime('2023-10-15 00:00:00')
end_datetime = pd.to_datetime('2023-10-22 00:00:00')

# Same window for both diaries so they can be compared directly.
raw_cluster_df.plot_diary(user, start_datetime=start_datetime, end_datetime=end_datetime, legend=True)
compressed_cluster_df.plot_diary(user, start_datetime=start_datetime, end_datetime=end_datetime, legend=True)

---

### 3D Trajectory plots

these may not work with large amount of data, attempt to do it with one day - 10/15


In [None]:
# Space-time cube: longitude and latitude on the ground plane, local time on
# the vertical axis, one coloured line per period of day.
fig = px.line_3d(
    compressed_tdf,
    x="lng",
    y="lat",
    z="timestamp_local",
    color='period_of_day',
)
fig.update_traces(line={'width': 5})

fig.show()

---

### SciKit-Mobility Measures and Attributes

#### Analytics that can be gotten from Trip breakdown

tdf = `TrajDataFrame`

measures & attributes

- `number_of_locations()` - Number of distinct locations visited by actor
- `home_location()` - location most visited by actor at nighttime.
- `max_distance_from_home()` - Maximum distance (in km) from home traveled by an actor
- `number_of_visits()` - number of visits to any location by actor
- `location_frequency()` - visitation frequency of each location of an actor
- `individual_mobility_network()` - Mobility network of an actor
- `recency_rank()` - list of locations ranked by recency for an actor
- `frequency_rank()` - list of location ranked by frequency for an actor
-
- `radius_of_gyration()` - characterize distance traveled by an actor.
- `k_radius_of_gyration()` - characterize distance traveled by an actor & their 'k' most frequently visited locations.
- `random_entropy()` - degree of predictability of an actor's location if each location is visited with equal probability.
- `uncorrelated_entropy()` - historical probability that a location was visited by actor
- `real_entropy()` - mobility entropy of an actor considering the order in which locations are visited.
- `jump_length()` - distance (km) traveled by an actor
- `maximum_distance()` - maximum distance (km) traveled by an actor
- `distance_straight_lines()` - sum of the distances traveled by an actor
- `waiting_times()` - intervals (in seconds) between the movements of an actor


In [None]:
from skmob.measures.individual import (
    number_of_locations,
    home_location,
    max_distance_from_home,
    number_of_visits,
    location_frequency,
    individual_mobility_network,
    recency_rank,
    frequency_rank,
)

# Show the number-of-locations measure for each processing stage, in the
# order raw -> filtered -> compressed.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(number_of_locations(_tdf, show_progress=False))

In [None]:
# Home location per processing stage, using a 22:00-07:00 night window.
_night_window = {"start_night": '22:00', "end_night": '07:00'}
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(home_location(_tdf, show_progress=False, **_night_window))

In [None]:
# Maximum distance from home for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(max_distance_from_home(_tdf, show_progress=False))

In [None]:
# Number of visits for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(number_of_visits(_tdf, show_progress=False))

In [None]:
# Location visitation frequency for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(location_frequency(_tdf, show_progress=False))

In [None]:
# Individual mobility network for each processing stage.
display(individual_mobility_network(raw_tdf, show_progress=False))
display(individual_mobility_network(filtered_tdf, show_progress=False))

# The compressed network was previously computed twice (once for display and
# once for the assignment) and rendered twice. Compute it once, keep it in
# `mobility_network` so it stays available after this cell, and show it once.
mobility_network = individual_mobility_network(compressed_tdf, show_progress=False)
display(mobility_network)

In [None]:
# Recency rank of locations for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(recency_rank(_tdf, show_progress=False))

In [None]:
# Frequency rank of locations for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(frequency_rank(_tdf, show_progress=False))

#### Effects on Mobility measures (profile of movement characteristics)

scikit-mobility has several movement-based measures that can be easily called

- `radius_of_gyration()` - characterize distance traveled by an actor. (interpreted as the typical distance traveled by user a when observed up to time t)
- `k_radius_of_gyration()` - characterize distance traveled by an actor & their 'k' most frequently visited locations.
- `random_entropy()` - degree of predictability of an actor's location if each location is visited with equal probability.
- `uncorrelated_entropy()` - historical probability that a location was visited by actor
- `real_entropy()` - mobility entropy of an actor considering the order in which locations are visited.
- `jump_lengths()` - distances (km) traveled by an actor between consecutive points
- `maximum_distance()` - maximum distance (km) traveled by an actor
- `distance_straight_line()` - sum of the distances traveled by an actor
- `waiting_times()` - intervals (in seconds) between the movements of an actor


In [None]:
from skmob.measures.individual import (
    radius_of_gyration,
    k_radius_of_gyration,
    random_entropy,
    uncorrelated_entropy,
    real_entropy,
    jump_lengths,
    maximum_distance,
    distance_straight_line,
    waiting_times,
)

# Radius of gyration for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(radius_of_gyration(_tdf, show_progress=False))

In [None]:
# k-radius of gyration: characteristic distance (in km) traveled by an
# individual between their k most frequent locations, per processing stage.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(k_radius_of_gyration(_tdf, show_progress=False))

In [None]:
# Random entropy: degree of predictability of an individual's whereabouts if
# each location is visited with equal probability, per processing stage.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(random_entropy(_tdf, show_progress=False))

In [None]:
# Uncorrelated entropy: based on the historical probability that a location
# was visited by an individual, per processing stage.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(uncorrelated_entropy(_tdf, show_progress=False))

In [None]:
# Real entropy: mobility entropy of an individual that also considers the
# order in which locations were visited. The progress bar is left on for
# this measure, unlike the other cells.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(real_entropy(_tdf, show_progress=True))

In [None]:
# Jump lengths: distances (in km) traveled by an individual, per stage.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(jump_lengths(_tdf, show_progress=False))

In [None]:
# Maximum distance traveled for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(maximum_distance(_tdf, show_progress=False))

In [None]:
# Straight-line distance traveled for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(distance_straight_line(_tdf, show_progress=False))

In [None]:
# Waiting times between movements for the raw, filtered and compressed data.
for _tdf in (raw_tdf, filtered_tdf, compressed_tdf):
    display(waiting_times(_tdf, show_progress=False))

## Alternative Approaches to Mobility Measures


## Radius of gyration ([to top](#top))

The _radius of gyration_ of an individual $u$ is the <u>characteristic traveled distance</u> during a period of time, and it is formally defined as:
$$r_g(u) = \sqrt{\frac{1}{N} \sum_i^N{w_i (r_i - r_{cm})^2}}$$
where $N$ is the number of visits of $u$, $w_i$ is the number of times $u$ visited location $i$, $r_i$ is the pair of coordinates of location $i$, $r_{cm}$ is the _center of mass_ (i.e., the average position) of $u$. The center of mass is easily defined as:
$$r_{cm} = \frac{1}{N} \sum_i^N r_i.$$


In [None]:
# Load the preprocessed feature table and rename the coordinate and timestamp
# columns to `lon`, `lat` and `datetime` before calling the alternative
# measure helpers below.
_features_csv = '/Users/jonathancachat/Downloads/google-mobility-data-analysis/data/4f4c0e64-1e4b-4f9b-a98b-69660fe23e12-PREPROCESSED_features.csv'
_column_aliases = {
    'longitude': 'lon',
    'latitude': 'lat',
    'timestamp_local': 'datetime',
}

data = pd.read_csv(_features_csv)
data.rename(columns=_column_aliases, inplace=True)

# Radius of gyration over the whole table (shown as the cell output).
radius_of_gyration2(data)

In [None]:
# Radius of gyration computed independently for each `hour_of_day` group.
# NOTE: the variable keeps its historical name `rog_perweek` even though the
# grouping key is `hour_of_day`.
rog_perweek = data.groupby(by='hour_of_day').apply(radius_of_gyration2)

trace = go.Scatter(
    # `Series.iteritems()` was removed in pandas 2.0; the group keys are
    # simply the index of the groupby result.
    x=list(rog_perweek.index),
    y=rog_perweek,
    text='km',
)
layout = go.Layout(
    title="Radius of gyration per hour_of_day (sequential)",
    xaxis=dict(title='hour_of_day'),
    yaxis=dict(title='ROG'),
)

data_to_plot = [trace]

fig = go.Figure(data=data_to_plot, layout=layout)
iplot(fig)

In [None]:
# Distinct hours present in the data, in ascending order. Iterating a plain
# `set` gives no ordering guarantee, and the cumulative slices below are only
# meaningful when built in increasing hour order.
_hour_values = sorted(data['hour_of_day'].dropna().unique())

# incremental_data[j] holds every row whose hour_of_day is <= _hour_values[j].
# It stays a plain list because later cells enumerate it.
incremental_data = [data[data['hour_of_day'] <= _hour] for _hour in _hour_values]

# Key each cumulative radius of gyration by the actual hour value (not by
# list position + 1) so the x-axis matches its `hour_of_day` label.
incremental_rog_hour_of_day = {
    _hour: radius_of_gyration2(_subset)
    for _hour, _subset in zip(_hour_values, incremental_data)
}

trace = go.Scatter(
    x=list(incremental_rog_hour_of_day.keys()),
    y=list(incremental_rog_hour_of_day.values()),
    text='km',
)
layout = go.Layout(
    # The title previously said "per month" although the data is cumulative
    # over hour_of_day.
    title="Radius of gyration per hour_of_day (incremental)",
    xaxis=dict(title='hour_of_day'),
    yaxis=dict(title='ROG'),
)

data_to_plot = [trace]

fig = go.Figure(data=data_to_plot, layout=layout)
iplot(fig)

In [None]:
# Distinct days of the month present in the data, in ascending order.
# Iterating a plain `set` gives no ordering guarantee, and the cumulative
# slices below are only meaningful when built in increasing day order.
_day_values = sorted(data['day_of_month'].dropna().unique())

# incremental_data_day_of_month[j] holds every row whose day_of_month is
# <= _day_values[j].
incremental_data_day_of_month = [
    data[data['day_of_month'] <= _day] for _day in _day_values
]

# Key each cumulative radius of gyration by the actual day value (not by
# list position + 1) so the x-axis matches its `day_of_month` label.
incremental_rog_day_of_month = {
    _day: radius_of_gyration2(_subset)
    for _day, _subset in zip(_day_values, incremental_data_day_of_month)
}

trace = go.Scatter(
    x=list(incremental_rog_day_of_month.keys()),
    y=list(incremental_rog_day_of_month.values()),
    text='km',
)
layout = go.Layout(
    # The title previously said "per week" although the data is cumulative
    # over day_of_month.
    title="Radius of gyration per day_of_month (incremental)",
    xaxis=dict(title='day_of_month'),
    yaxis=dict(title='ROG'),
)

data_to_plot = [trace]

fig = go.Figure(data=data_to_plot, layout=layout)
iplot(fig)

## Mobility Entropy

The _uncorrelated entropy_ of an individual $u$ measures the predictability of their movements over a period of time. It is formally defined as:
$$S_{unc} = -\sum_{i=1}^{N} p_i \log_2 p_i $$
where $N$ is the number of distinct locations visited by $u$ and $p_i$ is $u$'s probability of visiting location $i$. We can also normalize the uncorrelated entropy by dividing it by $\log_2 N$.


In [None]:
# Entropy measures computed over the full table in one go.
_whole_uncorrelated = uncorrelated_entropy2(data)
print(f'Uncorrelated entropy of the whole trajectory: {_whole_uncorrelated!s}')

_whole_random = random_entropy2(data)
print(f'Random entropy of the whole trajectory: {_whole_random!s}')

In [None]:
# Uncorrelated entropy by hour_of_day, two ways:
#   * sequential  - each hour's rows on their own
#   * incremental - all rows up to and including each hour
# NOTE: the `*_month` variable names are historical; the grouping key here is
# `hour_of_day`.
entr_permonth = data.groupby(by='hour_of_day').apply(uncorrelated_entropy2)

# Build the cumulative slices directly from sorted hour values and key them
# by the hour itself. Keying by list position + 1 shifted the incremental
# trace relative to the sequential one, whose x values are the real hours.
_hour_values = sorted(data['hour_of_day'].dropna().unique())
incremental_entr_month = {
    _hour: uncorrelated_entropy2(data[data['hour_of_day'] <= _hour])
    for _hour in _hour_values
}

trace_1 = go.Scatter(
    # `Series.iteritems()` was removed in pandas 2.0; the group keys are the
    # index of the groupby result.
    x=list(entr_permonth.index),
    y=entr_permonth,
    name='sequential',
    line=dict(color='red'),
)

trace_2 = go.Scatter(
    x=list(incremental_entr_month.keys()),
    y=list(incremental_entr_month.values()),
    name='incremental',
    line=dict(color='blue'),
)

data_to_plot = [trace_1, trace_2]

fig = go.Figure(data=data_to_plot)
iplot(fig)

In [None]:
# Uncorrelated entropy by day_of_month, two ways:
#   * sequential  - each day's rows on their own
#   * incremental - all rows up to and including each day
# NOTE: the `*_month` variable names are historical; the grouping key here is
# `day_of_month`.
entr_permonth = data.groupby(by='day_of_month').apply(uncorrelated_entropy2)

# The incremental trace previously reused `incremental_data`, which is sliced
# by hour_of_day, so it was compared against a day_of_month sequential trace.
# Build the cumulative slices over day_of_month instead, keyed by the day
# value so both traces share the same x-axis.
_day_values = sorted(data['day_of_month'].dropna().unique())
incremental_entr_month = {
    _day: uncorrelated_entropy2(data[data['day_of_month'] <= _day])
    for _day in _day_values
}

trace_1 = go.Scatter(
    # `Series.iteritems()` was removed in pandas 2.0; the group keys are the
    # index of the groupby result.
    x=list(entr_permonth.index),
    y=entr_permonth,
    name='sequential',
    line=dict(color='red'),
)

trace_2 = go.Scatter(
    x=list(incremental_entr_month.keys()),
    y=list(incremental_entr_month.values()),
    name='incremental',
    line=dict(color='blue'),
)

data_to_plot = [trace_1, trace_2]

fig = go.Figure(data=data_to_plot)
iplot(fig)

# Profiling

## Home location

The most common way to detect the "home" location of an individual is to select the location where the user is most frequently present during nighttime.


In [None]:
# Detect the home location and show it on a map.
home_loc = list(home(data))
print(home_loc, '\n')

# The first two entries are used as the map/marker position.
# NOTE(review): assumes home() yields latitude then longitude - confirm.
_home_point = [home_loc[0], home_loc[1]]

m = folium.Map(location=_home_point, zoom_start=16)
icon = folium.Icon(icon='home', color='red')
_home_marker = folium.Marker([home_loc[0], home_loc[1]], icon=icon)
_home_marker.add_to(m)

# The last expression renders the map as the cell output.
m

In [None]:
# One-row summary table built from the first seven entries of the mobility
# profile, in positional order.
profile = mobility_profile(data)

_profile_labels = (
    'Home location',
    'Tot number of locations',
    'Radius of gyration (km)',
    'Random entropy',
    'Uncorrelated entropy',
    'Maximum travelled distance (km)',
    'Straight line distance (km)',
)
_profile_row = {
    _label: profile[_position]
    for _position, _label in enumerate(_profile_labels)
}
# Wrap the home location in a list so it fills a single cell and gives the
# frame its one row; the remaining scalar values are broadcast to that row.
_profile_row['Home location'] = [_profile_row['Home location']]

pd.DataFrame(_profile_row)