In [37]:
import pandas as pd
import numpy as np

df = pd.read_csv('/Users/bean/Documents/Doctorate/1Research/MapMetadata/mapmetadata/data_partiton_test.csv')

---

### Code for DataPartition

In [64]:
import pandas as pd
import numpy as np

import datetime
from sklearn.cluster import KMeans
import osmnx as ox

class DataPartition:
    def __init__(self, df):

        # CONFIGS
        self.max_time_diff = pd.Timedelta(minutes=2)
        self.k = 4
        self.network_type = 'drive'

        # DATASET
        self.traj_ids = df['traj_id'].to_list()
        self.timestamps = self.get_timestamp_list(col=df['timestamp'])
        self.latitudes = df['latitude'].to_list()
        self.longitudes = df['longitude'].to_list()

        self.num_points = len(df)

        # INDEXES
        self.time_group_idx, self.time_idx_exp, self.speeds = self.get_time_group_idx()
        self.space_group_mbrs, self.space_group_idx = self.get_space_groups()

        # DATA SUBSETS
            # lat/long together
        # self.base_df = np.stack([self.longitudes, self.latitudes, self.speeds]).transpose()
        # self.points_in_time_groups = np.split(self.base_df, self.time_group_idx)
            # lat/long separate
        self.lats_in_time_groups = np.split(self.latitudes, self.time_group_idx)
        self.lons_in_time_groups = np.split(self.longitudes, self.time_group_idx)

    def print_group_indices(self):

        print(f"time group index: {self.time_group_idx}\n")
        print(f"time index expanded: {self.time_idx_exp}\n")
        print(f"space group MBRs: {self.space_group_mbrs}\n")
        print(f"space group index: {self.space_group_idx}\n")

        return

    def get_timestamp_list(self, col):
        """Convert timestamps to datetime objects"""
        dt = pd.to_datetime(col, errors = 'coerce')
        return dt.to_list()

    def get_time_group_idx(self):
        """
        Return:
            List of index intervals defining each time group
                All points in a time group are within the self.max_time_diff time difference
                All points in a time group are from the same trajectory
            Estimated speed at each point
                The last two points are given the same speed
        """
        time_group_intervals = []
        time_intervals_expanded = []
        time_diff = []
        dist_diff = []
        s = 0
        group_time_diff = pd.Timedelta(minutes=0)
        prev_id = self.traj_ids[0]
        prev_lat = self.latitudes[0]
        prev_lon = self.longitudes[0]
        prev_timestamp = self.timestamps[0]

        for i in range(1, (self.num_points-1)):

            cur_id = self.traj_ids[i]
            cur_lat = self.latitudes[i]
            cur_lon = self.longitudes[i]
            cur_timestamp = self.timestamps[i]

            if cur_id == prev_id:
                time_diff.append((cur_timestamp - prev_timestamp).total_seconds())
                dist_diff.append(round(np.linalg.norm(np.array([cur_lon, cur_lat], dtype=np.float32) - np.array([prev_lon, prev_lat], dtype=np.float32)), 2))

                group_time_diff = cur_timestamp - self.timestamps[s]
                if (group_time_diff > self.max_time_diff):
                    time_group_intervals.append((s,i))
                    time_intervals_expanded.append(np.arange(s,i))
                    s = i
            else:
                # last two points of prev trajectory get same speed
                time_diff.append(time_diff[-1])
                dist_diff.append(dist_diff[-1])
                prev_id = cur_id
            
            prev_lat = cur_lat
            prev_lon = cur_lon
            prev_timestamp = cur_timestamp

        # last index - check if ids of last two points are the same, add correct ids to lists
        last_i = self.num_points - 1    # == i + 1
        cur_id = self.traj_ids[last_i]

        if cur_id != prev_id:   # corner case - last point not in same trajectory as penultimate point
            # add last group from loop
            time_group_intervals.append((s, i))
            time_intervals_expanded.append(np.arange(s, last_i))    # need last_i == i + 1 because we want [s, i]
            time_diff.append(time_diff[-1])
            dist_diff.append(dist_diff[-1])

            # add last point
            time_group_intervals.append(last_i, last_i)
            time_intervals_expanded.append(np.arange(last_i, self.num_points))
            time_diff.append(None)
            dist_diff.append(None)

        else:   # standard case - last point in same trajectory as penultimate point
            time_group_intervals.append((s, last_i))
            time_intervals_expanded.append(np.arange(s, self.num_points))

            cur_lat = self.latitudes[last_i]
            cur_lon = self.longitudes[last_i]
            time_diff.append((self.timestamps[last_i] - self.timestamps[i]).total_seconds())
            dist_diff.append(round(np.linalg.norm(np.array([cur_lon, cur_lat], dtype=np.float32) - np.array([prev_lon, prev_lat], dtype=np.float32)), 2))

        indices = [end for _, end in time_group_intervals[:-1]]  #TODO: make this the time_group_idx if we only need these vals   

        dt = [d/t for d, t in zip(dist_diff,time_diff)]
        dt.append(dt[-1])   # last two points have the same value

        return indices, time_intervals_expanded, dt
    
    def get_extrema(self):
        """
        CALLED BY GET_SPACE_GROUPS

        Get the min/max lat/long of the first and last points in a time group
        Return [min lat, max lat] , [min_long, max_long]
        """
        # indices = [end for _, end in self.time_group_idx[:-1]]  #TODO: make this the time_group_idx if we only need these vals   

        # [time group 1 [first point lat/lon, last point lat/lon], time group 2 [ first point, last point], ...]
        first_last_lats = np.matrix([[l[0], l[-1]] for l in np.split(self.latitudes, self.time_group_idx)])
        first_last_lons = np.matrix([[l[0], l[-1]] for l in np.split(self.longitudes, self.time_group_idx)])
        
        sorted_lats = np.asarray(np.sort(first_last_lats, axis=1))
        sorted_lons = np.asarray(np.sort(first_last_lons, axis=1))

        return sorted_lats, sorted_lons

    def get_bbox(self, lat, lon, labels, i):
        """
        CALLED BY GET_SPACE_GROUPS

        Given a space group label, get all the lat/lon values [time group values] that are in that space group
            (lat/lon are list of [min val, max val] of each time group, and you mask by space group label i)
        Return (min_long, min_lat, max_long, max_lat) == (left, bottom, right, top)
        """
        g_lat = np.matrix([row for row, l in zip(lat, labels) if l == i])
        g_lon = np.matrix([row for row, l in zip(lon, labels) if l == i])
        return (np.min(g_lon), np.min(g_lat), np.max(g_lon), np.max(g_lat))

    def get_space_groups(self):
        """
        NOTES:  Number of space groups is the number of road networks
                We want to get the smallest number of road networks that are small enough
                to make map matching fast, so we cluster all the MBRs into k groups

        Return:
            space_groups: mbr of each space group (there are k groups)
                mbr = [min_long, min_lat, max_long, max_lat] == [left, bottom, right, top]
            labels: list of corresponding space group index for each time group
                index is the index of the space group each time group belongs in
        """
        sorted_lats, sorted_lons = self.get_extrema()
        mbrs = np.hstack([sorted_lats, sorted_lons])

        centroids = [( (x1+x2)/2, (y1+y2)/2 ) for y1, y2, x1, x2 in mbrs]
        kmeans = KMeans(n_clusters = self.k).fit(centroids)
        labels = kmeans.labels_

        space_groups = []
        for i in range(self.k):
            space_groups.append(self.get_bbox(lat=sorted_lats, lon=sorted_lons, labels=labels, i=i))

        return space_groups, labels
    
    def get_road_network(self, mbr):
        """
        CALLED BY MAP_MATCH

        Return OSM road network within the given space group mbr
        """
        return ox.graph_from_bbox(bbox=mbr, network_type=self.network_type, retain_all=True, truncate_by_edge=True, simplify=False)

    def get_all_space_group_points(self, i):
        """
        CALLED BY MAP_MATCH

        Given a space group label
        Return all the point coordinates that correspond to that space group as a list
            need to return lat and long separately because osm.nearest_edges takes them separately
            if it is no longer needed, can do them all at once:
                points_in_space_group = [time_group for time_group, space_idx in zip(self.points_in_time_groups, self.space_group_idx) if space_idx == i]
                return np.vstack(points_in_space_group).tolist()
        """
        lats_in_space_group = np.hstack([time_group for time_group, space_idx in zip(self.lats_in_time_groups, self.space_group_idx) if space_idx == i]).tolist()
        lons_in_space_group = np.hstack([time_group for time_group, space_idx in zip(self.lons_in_time_groups, self.space_group_idx) if space_idx == i]).tolist()
        indices = np.hstack([time_idx for time_idx, space_idx in zip(self.time_idx_exp, self.space_group_idx) if space_idx == i]).tolist()
        return lats_in_space_group, lons_in_space_group, indices

    def map_match(self):
        """
        CALLED BY GET_POINTS_INFO

        For each of the k space groups:
            get all points in that group, 
            map match all points using that road network
        Return the matching edges and distance from edge for each point
            in sorted order (sorted like original df: by traj_id and timestamp)
        """
        #TODO maybe rewrite with zip
        edges = []
        distances = []
        indices = []    # original point indices in order of space group
        for i in range(self.k):

            Y, X, all_t_idx_for_s = self.get_all_space_group_points(i)
            G = self.get_road_network(self.space_group_mbrs[i])
            e, d = ox.distance.nearest_edges(G, X, Y, return_dist=True)
            edges.append(e)
            distances.append(d)
            indices.append(all_t_idx_for_s)

        edges_list = np.hstack(edges).tolist()
        distance_list = np.hstack(distances).tolist()
        indices_list = np.hstack(indices).tolist()

        paired_e = list(zip(indices_list, edges_list))
        sorted_edges = [value for _, value in paired_e]

        paired_d = list(zip(indices_list, distance_list))
        sorted_distances = [value for _, value in paired_d]

        return sorted_edges, sorted_distances
    
    def get_point_info(self):
        """
        Return an np array with all the points in df 
            ordered by their trajectory ids and timestamp.

            Currently array only has trajector id, speed, edge, 
            and distance from edge of each point
        """
        # edges, distances = self.map_match()
        # base_df = np.stack([self.traj_ids, self.speeds, edges, distances]).transpose()
        # return base_df

        return np.stack([self.traj_ids, self.speeds]).transpose()

In [65]:
import time
start_time = time.time()

partition = DataPartition(df=df)

partition.print_group_indices()



time group index: [7, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 49, 62, 75, 86, 115, 116, 117]

time index expanded: [array([0, 1, 2, 3, 4, 5, 6]), array([ 7,  8,  9, 10, 11, 12, 13]), array([14, 15, 16]), array([17, 18, 19]), array([20, 21, 22]), array([23, 24, 25]), array([26, 27, 28]), array([29, 30, 31]), array([32, 33, 34]), array([35, 36, 37]), array([38, 39, 40]), array([41, 42, 43]), array([44, 45, 46]), array([47, 48]), array([49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]), array([62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]), array([75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85]), array([ 86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,
        99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
       112, 113, 114]), array([115]), array([116]), array([117, 118, 119])]

space group MBRs: [(np.float64(4.0), np.float64(2.0), np.float64(10.0), np.float64(5.0)), (np.float64(-2.0), np.float64(0.1), np.float64(5.0), np.float64(6.0)), (np.

Notes for *data_partition_test.csv*

**traj_id a**
* 16 total points
* timestamps go up by 20 seconds --> time group indices [(0,7), (7,16), (16,18)] time group lat split [3.5, 4.9]
* lat/long go up continuously

**traj_id b**
* 32 total points
* timestamps go up by 1 minute --> time group indices [(16,19), (19,22), (22,25), (25,28), (28,31), (31, 34), (34, 37), (37, 40), (40,43), (43, 46), (46, 47)]
* lat/long go up and down, have negative values

**traj_id c**
* 37 total points
* timestamps go up by 10 seconds --> time group indices should be equally spaced apart by 13 (12?) lat/long splits [(1,0.7), (5,0) ]

**traj_id d**
* 29 total points
* timestamps go up by 1 second --> time group indices are one interval

**traj_id e**
* 6 total points
* timestamps go up by more than 2 seconds except second to last one, should be 4 time groups



In [66]:
points = partition.get_point_info()

In [67]:
points

array([['a', '0.014'],
       ['a', '0.021'],
       ['a', '0.021'],
       ['a', '0.011'],
       ['a', '0.05'],
       ['a', '0.014'],
       ['a', '0.018000001'],
       ['a', '0.0425'],
       ['a', '0.011'],
       ['a', '0.05'],
       ['a', '0.014'],
       ['a', '0.021'],
       ['a', '0.021'],
       ['a', '0.011'],
       ['a', '0.05'],
       ['a', '0.05'],
       ['b', '0.0046666665'],
       ['b', '0.0069999998'],
       ['b', '0.0069999998'],
       ['b', '0.0036666666'],
       ['b', '0.016666668'],
       ['b', '0.0046666665'],
       ['b', '0.006'],
       ['b', '0.014166667'],
       ['b', '0.0036666666'],
       ['b', '0.016666668'],
       ['b', '0.0046666665'],
       ['b', '0.0069999998'],
       ['b', '0.0069999998'],
       ['b', '0.0036666666'],
       ['b', '0.016666668'],
       ['b', '0.17950001'],
       ['b', '0.033333335'],
       ['b', '0.016666668'],
       ['b', '0.016666668'],
       ['b', '0.016666668'],
       ['b', '0.016666668'],
       ['b', '0.0

In [69]:
points_df = pd.DataFrame(points)
points_df

Unnamed: 0,0,1
0,a,0.014
1,a,0.021
2,a,0.021
3,a,0.011
4,a,0.05
...,...,...
115,e,0.008294118
116,e,0.007724138
117,e,0.013043478
118,e,0.006666667
