In [1]:
import geopandas as gp
import movingpandas as mpd
from datetime import timedelta
from pyproj import CRS
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import logging
logging.basicConfig(filename='log_read_freemove.log', level=logging.DEBUG)

In [None]:
print('Reading data...')
raw_full_trip_gdf = gp.read_file("W:/Master-Thesis-Repository/data/freemove_dlr_data/raw_full_trip.geojson")
print('Data read.')

In [3]:
def write_geojson(gdf, path):
    assert isinstance(gdf, gp.GeoDataFrame)
    gdf.to_file(path, driver='GeoJSON')



In [4]:
print('removing points outside valid lon and lat range...')
# Remove points that fall outside the valid lon and lat range (-90 to 90 for latitude and -180 to 180 for longitude)
raw_full_trip_gdf = raw_full_trip_gdf[(raw_full_trip_gdf.lat >= -90) & (raw_full_trip_gdf.lat <= 90) & (raw_full_trip_gdf.lon >= -180) & (raw_full_trip_gdf.lon <= 180)]
print('points removed.')

removing points outside valid lon and lat range...
points removed.


In [5]:
print('Dropping NA values...')
len_before = len(raw_full_trip_gdf)
logging.info('Rows before dropping NA values: {}'.format(len(raw_full_trip_gdf)))
raw_full_trip_gdf.dropna(inplace=True)
logging.info('Rows after dropping NA values: {}'.format(len(raw_full_trip_gdf)))
logging.info('Rows dropped: {}'.format(len_before - len(raw_full_trip_gdf)))
print('NA values dropped.')

Dropping NA values...
NA values dropped.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  geolife_raw_gdf.dropna(inplace=True)


In [6]:
print('creating trajectory collection...')
# Create trajectory collection
raw_full_trip_collection = mpd.TrajectoryCollection(raw_full_trip_gdf, traj_id_col='traj_id', obj_id_col ='user', t='time', x='lon', y='lat')
print('trajectory collection created.')

logging.info(f'This is a test log of traj id: {raw_full_trip_collection.trajectories[0].id}')
logging.info(f'Number of trajectories in data: {len(raw_full_trip_collection.trajectories)}')

creating trajectory collection...
trajectory collection created.


In [12]:
# Convert to EPSG:32650 for China (Beijing)
def convert_epsg(traj):
    result = traj.to_crs(CRS(32650))
    result.obj_id = traj.obj_id
    return result
print('converting to EPSG:32650 for China (Beijing)...')
raw_full_trip_collection.trajectories = Parallel(n_jobs=-2, verbose=10)(delayed(convert_epsg)(traj) for traj in raw_full_trip_collection.trajectories)
print('converted to EPSG:32650 for China (Beijing).')

converting to EPSG:32650 for China (Beijing)...


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-2)]: Batch computation too fast (0.1987s.) Setting batch_size=2.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-2)]: Done  47 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-2)]: Batch computation too fast (0.1292s.) Setting batch_size=4.
[Parallel(n_jobs=-2)]: Done  67 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-2)]: Batch computation too fast (0.1960s.) Setting batch_size=8.
[Parallel(n_jobs=-2)]: Done 109 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-2)]: Done 189 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-2)]: Done 309 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-2)]: Done 429 tasks      | elapsed:  

converted to EPSG:32650 for China (Beijing).


In [13]:
# Split trajectories

def split_traj(traj, MAX_DIAMETER=100, MIN_DURATION=timedelta(minutes=15), MIN_LENGTH=500):
    try:
        split = mpd.StopSplitter(traj).split(max_diameter=MAX_DIAMETER, min_duration=MIN_DURATION, min_length=MIN_LENGTH)
        for i in range(len(split.trajectories)):
            split.trajectories[i].obj_id = traj.obj_id
        return split.trajectories
    except BaseException as e:
        print(e, 'Error at traj: ', traj.id)
        logging.warning(f'{e} Error at traj: {traj.id}')

        return []

print('splitting trajectories...')
# split_trajs = Parallel(n_jobs=4, verbose=10)(delayed(split_traj)(traj) for traj in geolife_raw_collection.trajectories)
split_trajs = []
for traj in tqdm(raw_full_trip_collection.trajectories):
    try:
        split_trajs.append(split_traj(traj))
    except BaseException as e:
        print(e, 'Error at traj: ', traj.id)
        continue
    
split_trajs = [traj for sublist in split_trajs for traj in sublist]
freemove_splitted_collection = mpd.TrajectoryCollection(split_trajs)
print('trajectories split.')

splitting trajectories...


 94%|█████████▎| 16632/17784 [12:20:06<2:22:24,  7.42s/it] 

IllegalArgumentException: CGAlgorithmsDD::orientationIndex encountered NaN/Inf numbers Error at traj:  20110911000506


100%|██████████| 17784/17784 [13:06:40<00:00,  2.65s/it]   


trajectories split.


In [14]:


# Smooth trajectories
def smooth_traj(traj, PROCESS_NOISE_STD=0.1, MEASUREMENT_NOISE_STD=10):
    try:
        result = mpd.KalmanSmootherCV(traj).smooth(process_noise_std=PROCESS_NOISE_STD, measurement_noise_std=MEASUREMENT_NOISE_STD)
        result.obj_id = traj.obj_id
        return result
    except BaseException as e:
        print(e, 'Error at traj: ', traj.id)
        logging.warning(f'{e} Error at traj: {traj.id}')
        return traj

print('smoothing trajectories...')
# geolife_splitted_smooth_collection = Parallel(n_jobs=4, verbose=10)(delayed(smooth_traj)(traj) for traj in geolife_splitted_collection.trajectories)
freemove_splitted_smooth_collection = []
for traj in tqdm(freemove_splitted_collection.trajectories):
    try:
        freemove_splitted_smooth_collection.append(smooth_traj(traj))
    except BaseException as e:
        print(e, 'Error at traj: ', traj.id)
        continue

freemove_splitted_smooth_collection = mpd.TrajectoryCollection(freemove_splitted_smooth_collection)
print('trajectories smoothed.')

smoothing trajectories...


100%|██████████| 26668/26668 [5:05:55<00:00,  1.45it/s]   


trajectories smoothed.


In [15]:
# Generalize trajectories
def generalize_traj(traj, TOLERANCE=1.0):
    try:
        result = mpd.DouglasPeuckerGeneralizer(traj).generalize(tolerance=TOLERANCE)
        result.obj_id = traj.obj_id
        return result
    except BaseException as e:
        print(e, 'Error at traj: ', traj.id)
        logging.warning(f'{e} Error at traj: {traj.id}')
        return traj

# Douglas-Peucker generalization for non-smoothed trajectories
print('generalizing trajectories... (1)')
logging.info('generalizing trajectories... (1)')
# geolife_splitted_generalized_collection = Parallel(n_jobs=4, verbose=10)(delayed(generalize_traj)(traj) for traj in geolife_splitted_collection.trajectories)
freemove_splitted_generalized_collection = []
for traj in tqdm(freemove_splitted_collection.trajectories):
    try:
        freemove_splitted_generalized_collection.append(generalize_traj(traj))
    except BaseException as e:
        print(e, 'Error at traj: ', traj.id)
        continue
freemove_splitted_generalized_collection = mpd.TrajectoryCollection(freemove_splitted_generalized_collection)
print('trajectories generalized. (1)')
logging.info('trajectories generalized. (1)')

generalizing trajectories... (1)


100%|██████████| 26668/26668 [1:25:47<00:00,  5.18it/s]   


trajectories generalized. (1)


In [16]:
# Douglas-Peucker generalization for smoothed trajectories
print('generalizing trajectories... (2)')
logging.info('generalizing trajectories... (2)')
# geolife_splitted_smooth_generalized_collection = Parallel(n_jobs=4, verbose=10)(delayed(generalize_traj)(traj) for traj in geolife_splitted_smooth_collection.trajectories)
freemove_splitted_smooth_generalized_collection = []
for traj in tqdm(freemove_splitted_smooth_collection.trajectories):
    try:
        freemove_splitted_smooth_generalized_collection.append(generalize_traj(traj))
    except BaseException as e:
        print(e, 'Error at traj: ', traj.id)
        continue

freemove_splitted_smooth_generalized_collection = mpd.TrajectoryCollection(freemove_splitted_smooth_generalized_collection)
print('trajectories generalized. (2)')
logging.info('trajectories generalized. (2)')

generalizing trajectories... (2)


100%|██████████| 26668/26668 [1:07:59<00:00,  6.54it/s]  


trajectories generalized. (2)


In [17]:
def convert_trajcollection_to_gdf(trajcollection):
    gdfs = []
    for traj in tqdm(trajcollection.trajectories):
        traj_gdf = traj.to_traj_gdf()
        traj_gdf['user_id'] = traj.obj_id
        gdfs.append(traj_gdf)

    gdf = gp.GeoDataFrame(pd.concat(gdfs), crs='EPSG:32650')

    return gp.GeoDataFrame(pd.concat(gdfs), crs='EPSG:32650')



In [18]:
logging.info('converting to gdf...')
freemove_splitted = convert_trajcollection_to_gdf(freemove_splitted_collection)
freemove_splitted_smooth = convert_trajcollection_to_gdf(freemove_splitted_smooth_collection)
freemove_splitted_generalized = convert_trajcollection_to_gdf(freemove_splitted_generalized_collection)
freemove_splitted_smooth_generalized = convert_trajcollection_to_gdf(freemove_splitted_smooth_generalized_collection)
logging.info('converted to gdf.')

100%|██████████| 26668/26668 [07:40<00:00, 57.94it/s] 
100%|██████████| 26668/26668 [07:16<00:00, 61.14it/s] 
100%|██████████| 26668/26668 [03:08<00:00, 141.45it/s]
100%|██████████| 26668/26668 [01:48<00:00, 245.46it/s]


In [19]:

def convert_timezone(gdf):
    # Convert timezone to Asia/Shanghai
    gdf = gdf.copy()
    gdf['start_t'] = gdf['start_t'].dt.tz_localize('GMT').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    gdf['end_t'] = gdf['end_t'].dt.tz_localize('GMT').dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)
    return gdf


# In[ ]:

logging.info('converting timezone...')
freemove_splitted = convert_timezone(freemove_splitted)
freemove_splitted_smooth = convert_timezone(freemove_splitted_smooth)
freemove_splitted_generalized = convert_timezone(freemove_splitted_generalized)
freemove_splitted_smooth_generalized = convert_timezone(freemove_splitted_smooth_generalized)
logging.info('converted timezone.')

In [20]:
# Write gdf to pickle file to load fast for further processing
logging.info('writing to pickle...')
write_geojson(freemove_splitted, 'W:/Master-Thesis-Repository/data/freemove_dlr_data/freemove_splitted.geojson')
write_geojson(freemove_splitted_smooth, 'W:/Master-Thesis-Repository/data/freemove_dlr_data/freemove_splitted_smooth.geojson')
write_geojson(freemove_splitted_generalized, 'W:/Master-Thesis-Repository/data/freemove_dlr_data/freemove_splitted_generalized.geojson')
write_geojson(freemove_splitted_smooth_generalized, 'W:/Master-Thesis-Repository/data/freemove_dlr_data/freemove_splitted_smooth_generalized.geojson')
logging.info('written to pickle.')


  pd.Int64Index,
