In [1]:
import geopandas as gp
from shapely import Point, LineString
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

In [7]:
# Load the trajectories that are going to be privatized
raw_full_trip_gdf = gp.read_file(
    "../data/geolife/geolife_smooth.geojson",
    geometry='geometry',
)

In [8]:
# Derive the start point (SP) and end point (EP) of every trajectory.
# The data loader performs the same steps, so they are replicated here to obtain the same data format.

# First coordinate of each linestring -> TRIP_SP, last coordinate -> TRIP_EP
raw_full_trip_gdf['TRIP_SP'] = raw_full_trip_gdf.geometry.apply(
    lambda line: Point(line.coords[0])
)
raw_full_trip_gdf['TRIP_EP'] = raw_full_trip_gdf.geometry.apply(
    lambda line: Point(line.coords[-1])
)

In [9]:
def cut_epz(row, rad_l=100, rad_h=300, rng=None):
    """Truncate a trajectory with privacy zones around its start and end point.

    Two radii are drawn independently and uniformly from the integers
    ``rad_l..rad_h`` (both inclusive). ``row.TRIP_SP`` and ``row.TRIP_EP`` are
    buffered with these radii, and every coordinate of ``row.geometry`` that
    lies within either buffer is removed, wherever it occurs along the
    trajectory.

    Args:
        row: Row of a GeoDataFrame exposing ``geometry`` (the trajectory
            linestring) and the point attributes ``TRIP_SP`` and ``TRIP_EP``.
        rad_l (int): Smallest possible zone radius, in the units of the
            geometry's CRS (metres for a projected metric CRS).
        rad_h (int): Largest possible zone radius (inclusive), same units.
        rng: Optional ``numpy.random.Generator`` used to draw the radii. When
            ``None`` (default) NumPy's global random state is used, exactly as
            before. NOTE(review): a fixed, published seed makes the radii
            reproducible by third parties, which may weaken the privacy zones.

    Returns:
        LineString: The remaining coordinates in their original order, or an
        empty ``LineString`` when fewer than two coordinates survive or the
        input geometry is missing/empty.
    """
    # Draw the two radii independently; the upper bound is exclusive, hence +1
    draw = np.random.randint if rng is None else rng.integers
    sp_rad = draw(rad_l, rad_h + 1)
    ep_rad = draw(rad_l, rad_h + 1)

    # Privacy zones around start and end point
    sp_zone = row.TRIP_SP.buffer(sp_rad)
    ep_zone = row.TRIP_EP.buffer(ep_rad)

    trajectory = row.geometry
    if trajectory is None or trajectory.is_empty:
        return LineString()

    # Keep only coordinates outside both zones (each Point is built once)
    kept = []
    for xy in trajectory.coords:
        candidate = Point(xy)
        if not (candidate.within(sp_zone) or candidate.within(ep_zone)):
            kept.append(xy)

    # A LineString needs at least two coordinates. Handle the truncated case
    # explicitly instead of relying on a blanket `except Exception`, so that
    # unrelated errors are no longer silently turned into empty geometries.
    if len(kept) < 2:
        return LineString()
    return LineString(kept)
    
    

In [10]:
# Replace each trajectory by its privacy-truncated version (row-wise, with progress bar).
# Note: this overwrites the 'geometry' column that cut_epz reads, so re-running
# the cell truncates the already truncated geometries again.
raw_full_trip_gdf['geometry'] = raw_full_trip_gdf.progress_apply(cut_epz, axis=1)

raw_full_trip_gdf.head(20)

  0%|          | 0/17783 [00:00<?, ?it/s]

Unnamed: 0,traj_id,start_t,end_t,length,direction,user_id,geometry,TRIP_SP,TRIP_EP
0,20000101231219,2000-01-02T07:12:19,2000-01-02T07:15:23,443.398681,358.214883,163,LINESTRING EMPTY,POINT (442546.261 4426779.605),POINT (442532.455 4427222.593)
1,20070412093132,2007-04-12T17:31:32,2007-04-12T19:33:40,9802.018225,242.62347,142,"LINESTRING (442826.716 4425303.400, 442828.155...",POINT (442820.019 4425112.000),POINT (437309.430 4422258.445)
2,20070412101853,2007-04-12T18:18:53,2007-04-12T18:23:15,80.616455,17.416475,161,LINESTRING EMPTY,POINT (442813.259 4425257.326),POINT (442837.382 4425334.223)
3,20070412102116,2007-04-12T18:21:16,2007-04-12T22:56:56,2420.113159,128.610649,163,"LINESTRING (443308.707 4425206.510, 443360.357...",POINT (442801.448 4425351.582),POINT (444061.288 4424345.481)
4,20070412102325,2007-04-12T18:23:25,2007-04-12T18:26:25,44.396423,18.679239,161,LINESTRING EMPTY,POINT (442823.214 4425292.322),POINT (442837.430 4425334.373)
5,20070412134621,2007-04-12T21:46:21,2007-04-12T22:35:34,2661.942461,244.474504,97,"LINESTRING (442535.753 4425246.565, 442443.128...",POINT (442816.646 4425237.874),POINT (441071.212 4424404.391)
6,20070413005306,2007-04-13T08:53:06,2007-04-13T15:18:02,6667.328541,309.547213,163,"LINESTRING (444126.086 4424546.772, 444127.455...",POINT (444067.120 4424298.872),POINT (442900.227 4425262.400)
7,20070413013238,2007-04-13T09:32:38,2007-04-13T15:18:32,11432.857404,58.673134,142,"LINESTRING (437592.443 4422282.379, 437597.585...",POINT (437545.051 4422020.334),POINT (442877.125 4425265.711)
8,20070413105648,2007-04-13T18:56:48,2007-04-13T23:02:47,7043.462379,193.975423,161,"LINESTRING (442558.074 4425172.720, 442550.904...",POINT (442801.836 4425332.311),POINT (441459.455 4419938.461)
9,20070413150314,2007-04-13T23:03:14,2007-04-13T23:05:58,312.616713,286.721666,161,LINESTRING EMPTY,POINT (441713.513 4419861.455),POINT (441458.913 4419937.944)


In [11]:
# Create geojson without the helper columns.
# The columns are dropped on a copy used only for the export: the in-place drop
# made this cell fail with a KeyError when re-run and removed TRIP_SP / TRIP_EP,
# which the example cells further down still read from raw_full_trip_gdf.
raw_full_trip_gdf.drop(columns=['TRIP_SP', 'TRIP_EP']).to_file(
    "../data/geolife/geolife_smooth_private.geojson", driver='GeoJSON'
)

  pd.Int64Index,


In [5]:
# Sanity check / example of the privacy zone creation, using a fixed radius instead of a random one

EPZ_RADIUS = 200

raw_full_trip_gdf['START_EPZ'] = raw_full_trip_gdf.TRIP_SP.apply(
    lambda start_pt: start_pt.buffer(EPZ_RADIUS)
)
raw_full_trip_gdf['END_EPZ'] = raw_full_trip_gdf.TRIP_EP.apply(
    lambda end_pt: end_pt.buffer(EPZ_RADIUS)
)

# Verify the radius: the distance between exterior vertices 0 and 32 of the first
# start zone should equal the diameter (2*r)
# (assumes these two vertices are opposite each other on the buffer ring - TODO confirm)
from scipy.spatial import distance

distance.euclidean(
    raw_full_trip_gdf.START_EPZ[0].exterior.coords[0],
    raw_full_trip_gdf.START_EPZ[0].exterior.coords[32],
)

In [246]:
# Example with plots: original vs. privatized geometry of a single trajectory.
# The id column of this dataset is 'traj_id' (see the frame displayed above);
# the former 'TRIP_ID' column does not exist here and made the queries fail.
# NOTE(review): the comparison assumes traj_id is stored as a number - confirm.
EXAMPLE_TRAJ_ID = 20070412093132

test = raw_full_trip_gdf.head(100).copy()
test['geo_private'] = test.apply(cut_epz, axis=1)

# Filter empty linestrings after privatization (0 points left)
test['NR_POINTS'] = test.geo_private.apply(lambda x: len(x.coords))
test = test.query('NR_POINTS > 0').copy()

# Base map: geometry column of the example trajectory
m = test.query('traj_id == @EXAMPLE_TRAJ_ID')[['geometry']].explore()

# Overlay: privatized geometry of the same trajectory in purple
gp.GeoDataFrame(
    test[['traj_id', 'geo_private']].dropna(),
    geometry='geo_private',
    crs='epsg:32650',
).query('traj_id == @EXAMPLE_TRAJ_ID').explore(m=m, color='purple')