In [5]:
import numpy as np
import os, glob, copy
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from geopy.distance import distance
import json

In [11]:
# Gather every raw .npy trajectory file in the data directory, in sorted order
filepath = '/xxxxxx/'
qry = f'{filepath}*.npy'
files = sorted(glob.glob(qry))

In [8]:
def compute_total_distance(traj):
    """
    Add up the lengths of all consecutive segments of a trajectory.

    The coordinate order of every point is reversed before it is passed to
    geopy's ``distance``, and each segment length is read in metres (``.m``).
    NOTE(review): the reversal presumably converts (lng, lat) storage into
    the (lat, lng) order geopy expects -- confirm against the data source.

    Parameters:
        traj (np.ndarray): trajectory, shape (N, 2)

    Returns:
        float: total distance; 0 when the trajectory has fewer than two points
    """
    total = 0
    for nxt in range(1, len(traj)):
        # segment between point nxt-1 and point nxt
        total += distance(traj[nxt - 1][::-1], traj[nxt][::-1]).m
    return total

def traj_wraping(x, length=200):
    """
    Warp a trajectory to a fixed number of points.

    The first two columns are linearly interpolated at ``length`` evenly
    spaced fractional positions along the original point index. The third
    column (``road`` in the code) is not interpolated: each output point
    takes the value of the source point at the floor of its fractional
    position.

    Parameters:
        x (np.ndarray): trajectory, shape (N, 3) -- two interpolated
            columns followed by one per-point road column
        length (int): number of points in the warped trajectory

    Returns:
        np.ndarray: warped trajectory, shape (length, 3)

    Raises:
        ValueError: if ``x`` contains no points
    """
    len_x = len(x)
    if len_x == 0:
        raise ValueError("cannot warp an empty trajectory")

    if length > 1:
        # fractional source positions running from 0 to len_x - 1 inclusive
        time_steps = np.arange(length) * (len_x - 1) / (length - 1)
    else:
        # length == 1 would divide by zero above; sample the first point instead
        time_steps = np.zeros(max(length, 0))

    # floor of the fractional position selects the source row for the road column
    road_index = time_steps.astype(np.int32)
    source_positions = np.arange(len_x)

    warped_coords = np.array(
        [np.interp(time_steps, source_positions, x[:, col]) for col in range(2)]
    )
    warped_road = x[:, 2][road_index]
    return np.concatenate((warped_coords.T, warped_road[:, None]), axis=1)


In [9]:
def divide_grids(boundary, grids_num=12):
    """
    Split the bounding box into ``grids_num`` equal intervals per axis.

    The returned values are the lower edges of each interval: the first edge
    equals the axis minimum and the axis maximum itself is excluded.

    ``np.linspace(..., endpoint=False)`` is used instead of ``np.arange`` with
    a floating-point step, because ``np.arange`` may return one element more
    than expected when the step is not exactly representable.

    Parameters:
        boundary (dict): bounding box with the keys 'lati_min', 'lati_max',
            'long_min' and 'long_max'
        grids_num (int): number of intervals per axis (should be positive)

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(latgrids, longrids)``, each an
        ascending array of exactly ``grids_num`` lower edges
    """
    lati_min, lati_max = boundary['lati_min'], boundary['lati_max']
    long_min, long_max = boundary['long_min'], boundary['long_max']
    # Exactly grids_num evenly spaced lower edges per axis
    latgrids = np.linspace(lati_min, lati_max, grids_num, endpoint=False)
    longrids = np.linspace(long_min, long_max, grids_num, endpoint=False)
    return latgrids, longrids

def count_points_in_grid(lng, lati, longrids, latgrids, grids_num=12):
    """
    Calculate the flat index of the grid cell that contains a point.

    The row is the last entry of ``latgrids`` that is <= ``lati`` and the
    column is the last entry of ``longrids`` that is <= ``lng``. Both are
    clipped to ``[0, grids_num - 1]`` so that a point outside the boundary is
    assigned to the nearest edge cell instead of producing a negative or
    aliased cell id.

    Parameters:
        lng: longitude of the point (scalar or array)
        lati: latitude of the point (scalar or array)
        longrids (np.ndarray): ascending lower edges of the longitude cells
        latgrids (np.ndarray): ascending lower edges of the latitude cells
        grids_num (int): number of cells per axis

    Returns:
        integer cell id ``row * grids_num + column`` in
        ``[0, grids_num ** 2 - 1]`` (same shape as the inputs)
    """
    last_cell = grids_num - 1
    lati_index = np.clip(np.searchsorted(latgrids, lati, side='right') - 1, 0, last_cell)
    long_index = np.clip(np.searchsorted(longrids, lng, side='right') - 1, 0, last_cell)
    return lati_index * grids_num + long_index

# Example content of boundary.json:
# {"lati_max": 30.73391319203361,
# "long_max": 104.13132306846771,
# "lati_min": 30.6517184514858,
# "long_min": 104.03535429444587}
# NOTE(review): the example boundary above (lat ~30.7, lng ~104.1) does not
# contain the hard-coded mean below (lng ~108.9, lat ~34.2) -- confirm that the
# boundary file and the mean/std belong to the same dataset.
# Mean and std of the trajectory coordinates, in (longitude, latitude) order
mean = np.array([108.9462081441436, 34.24473806539756])
std = np.array([2.30826803e-02, 1.97827013e-02])

# load the boundary; the context manager closes the file handle after reading
with open('xxx/boundary.json') as boundary_file:
    boundary = json.load(boundary_file)
grids_num = 12
latgrids, longrids = divide_grids(boundary, grids_num)

In [10]:
def extract_feature(trips, threshold=120, length=200, max_step_distance=200):
    '''
    Extract head attributes and a fixed-length trajectory for every valid trip.

    A trip is kept only if it has at least ``threshold`` points and none of
    the values in column 4 of its point array exceeds ``max_step_distance``
    (trips with anomalous GPS distances are dropped).

    Each trip is indexed as ``trip[0]`` (header; entries 2, 3 and 4 are read
    as trip_time, departure_time and trip_distance) and ``trip[1]`` (point
    array; columns 0-2 form the trajectory, column 4 the per-point distance).

    Head attributes, in the order they are emitted:
    departure_time, trip_distance, trip_time, trip_length, avg_dis, avg_speed, s_id, e_id
    Trajectory:
    Length-warped trajectory, default length is 200

    Relies on the module-level ``longrids``, ``latgrids`` and ``grids_num``
    to compute the start/end grid ids.

    Parameters:
        trips: iterable of trips as described above
        threshold (int): minimum number of points for a trip to be kept
        length (int): number of points of every warped trajectory
        max_step_distance (float): largest accepted value in column 4

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(trips_attrs, trips_trajs)`` with
        shapes (M, 8) and (M, length, 3). When no trip is kept the arrays are
        empty but keep these trailing dimensions, so the results of several
        calls can still be stacked.
    '''
    trips_attrs, trips_trajs = [], []
    for trip in trips:
        header, points = trip[0], trip[1]
        if len(points) < threshold:
            continue
        if np.max(points[:, 4]) > max_step_distance:  # remove the trips with anomalous GPS distance
            continue
        trip_time, departure_time, trip_distance = header[2], header[3], header[4]
        trip_length = len(points)
        traj = points[:, :3].astype(np.float64)
        avg_dis, avg_speed = trip_distance / trip_length, trip_distance / trip_time
        start_loc, end_loc = traj[0], traj[-1]
        s_id = count_points_in_grid(start_loc[0], start_loc[1], longrids, latgrids, grids_num)
        e_id = count_points_in_grid(end_loc[0], end_loc[1], longrids, latgrids, grids_num)
        trips_attrs.append(
            [departure_time, trip_distance, trip_time, trip_length, avg_dis, avg_speed, s_id, e_id]
        )
        trips_trajs.append(traj_wraping(traj, length))
    if not trips_attrs:
        # np.array([]) would have shape (0,), which cannot be stacked with (M, 8) results
        return np.empty((0, 8)), np.empty((0, length, 3))
    return np.array(trips_attrs), np.array(trips_trajs)

In [None]:
all_traj_head, all_trajs = [], []
# setting the number of files to be processed
files_number = 40
# Slicing (instead of indexing files[i]) avoids an IndexError when fewer than
# files_number files were found.
selected_files = files[:files_number]
if not selected_files:
    raise FileNotFoundError("no trajectory files found; check `filepath`")
for file_path in tqdm(selected_files):
    # NOTE: allow_pickle=True can execute arbitrary code while loading --
    # only use it on trusted files.
    data = np.load(file_path, allow_pickle=True)
    Traj_head, Trajs = extract_feature(data, threshold=120, length=200)
    if len(Traj_head) == 0:
        # a file without any valid trip contributes nothing and would break vstack
        continue
    all_traj_head.append(Traj_head)
    all_trajs.append(Trajs)
all_traj_head = np.vstack(all_traj_head)
all_trajs = np.vstack(all_trajs)

In [None]:
# normalize the data
# Head columns 1..5 (trip_distance, trip_time, trip_length, avg_dis, avg_speed)
# are z-scored; departure_time (column 0) and the grid ids (columns 6, 7) are
# left unchanged.
# NOTE: this cell modifies the arrays in place -- re-running it without
# re-running the previous cell normalizes the data twice.
h_mean = np.mean(all_traj_head[:, 1:6], axis=0)
h_std = np.std(all_traj_head[:, 1:6], axis=0)
all_traj_head[:, 1:6] = (all_traj_head[:, 1:6] - h_mean) / h_std
# Every trajectory point has three columns, while mean/std only describe the
# two coordinate columns; normalizing the whole array would fail to broadcast,
# so only the coordinates are normalized and the third column is kept as is.
all_trajs[..., :2] = (all_trajs[..., :2] - mean) / std
# save data
np.save('all_traj_head.npy', all_traj_head)
np.save('all_trajs.npy', all_trajs)