In [1]:
import pandas as pd
from bisect import bisect_left, bisect_right
from pyproj import Transformer
import folium
from IPython.display import display
import numpy as np
import tqdm
from multiprocessing import Pool


# Base directory that the data files below are read from (relative path).
# Note: it already ends with "/" and the read calls add another "/" after it.
dir_path = "../LaDe/"

In [2]:
# Trajectory data from the "20s" sampling file (courier-level data).
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
df_trajectory = pd.read_pickle(f"{dir_path}/data_with_trajectory_20s/courier_detailed_trajectory_20s.pkl.xz", compression="xz")

In [3]:
# Convert the gps_time strings to pandas datetime objects.
# The strings carry no year, so 2021 is prepended as the base year before
# parsing with the "%Y-%m-%d %H:%M:%S" format.

df_trajectory["gps_datetime"] = pd.to_datetime(
    "2021-" + df_trajectory["gps_time"],
    format="%Y-%m-%d %H:%M:%S"
)

In [4]:
# Sort by courier (postman_id) and then by timestamp, so that each courier's
# rows are contiguous and chronological. A group of rows for any postman_id
# can then be identified by a pair (i, j).
df_trajectory_sorted = (
    df_trajectory.sort_values(["postman_id", "gps_datetime"])
)


In [5]:
# Preview the first rows of the sorted trajectory table.
df_trajectory_sorted.head()

Unnamed: 0,ds,postman_id,gps_time,lat,lng,gps_datetime
37652971,321,0008c2b6a2314db8715301b7eeeebc5a,03-21 00:00:02,1708151.0,1064734.0,2021-03-21 00:00:02
37652972,321,0008c2b6a2314db8715301b7eeeebc5a,03-21 00:00:32,1708151.0,1064734.0,2021-03-21 00:00:32
37652973,321,0008c2b6a2314db8715301b7eeeebc5a,03-21 00:01:02,1708151.0,1064734.0,2021-03-21 00:01:02
37652974,321,0008c2b6a2314db8715301b7eeeebc5a,03-21 00:01:32,1708151.0,1064734.0,2021-03-21 00:01:32
37652975,321,0008c2b6a2314db8715301b7eeeebc5a,03-21 00:02:02,1708151.0,1064734.0,2021-03-21 00:02:02


In [5]:
# Store the row position of every trajectory sample in `idx`.
# reset_index returns a NEW frame, so the result has to be assigned back;
# otherwise the pre-sort index labels survive and `idx` would not be the
# 0..n-1 position in the sorted table that the (i, j) ranges rely on.
df_trajectory_sorted = df_trajectory_sorted.reset_index(drop=True)
df_trajectory_sorted["idx"] = df_trajectory_sorted.index

In [6]:
# Delivery data: order-level records (finer-grained than the courier-level
# trajectory table); order_id is expected to be unique.
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
df_delivery = pd.read_pickle(f"{dir_path}/data_with_trajectory_20s/delivery_five_cities.pkl.xz", compression="xz")

In [7]:
# Parse the receipt and sign time strings into datetimes, prepending 2021 as
# the base year (the same base year used for the trajectory timestamps, so
# the two tables are comparable).
df_delivery["receipt_datetime"] = pd.to_datetime(
    "2021-" + df_delivery["receipt_time"],
    format="%Y-%m-%d %H:%M:%S"
)

df_delivery["sign_datetime"] = pd.to_datetime(
    "2021-" + df_delivery["sign_time"],
    format="%Y-%m-%d %H:%M:%S"
)

In [8]:
# Intermediate data structure used to look up trajectory indices.
# For every courier we keep the timestamps and the matching `idx` values of
# the sorted trajectory table, so a query does not have to scan the table:
# (query) -> (index range, i.e. from the i-th to the j-th row) -> rows i..j of
# the trajectory table.
trajectory_index_lookup = {}

# sort=False keeps the groups in the order they appear in the sorted frame.
for pid, g in df_trajectory_sorted.groupby("postman_id", sort=False):
    trajectory_index_lookup[pid] = {
        "time": g["gps_datetime"].values,
        "idx": g["idx"].values,
    }

In [9]:
# (i,j) represents a group of rows from ith row to jth row in trajectory_table
# Delivery table is order level data
# Here (i,j) identifies the group of rows that belong to the trajectory of that particular order.

# How we can proceed: 
# column i: We have receipt time, delivery_user_id 
# Find the first row with same postman_id value and gps_datetime above or equal to receipt time 
# 
# column j: We have sign_time, delivery_user_id 
# Find the last row with same postman_id value and gps_datetime below or equal to the sign_time
def find_i_j(row, lookup=None):
    """
    Locate the trajectory index range (i, j) that belongs to one order.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "delivery_user_id", "receipt_datetime" and
        "sign_datetime".
    lookup : dict, optional
        Maps a courier id to {"time": sorted timestamps, "idx": matching
        trajectory indices}. Defaults to the notebook-level
        ``trajectory_index_lookup``.

    Returns
    -------
    pd.Series
        ``[i, j]`` where i is the index of the first sample with
        time >= receipt time and j the index of the last sample with
        time <= sign time; ``[None, None]`` when the courier is unknown, a
        timestamp is missing, or no sample falls inside the window.
    """
    if lookup is None:
        lookup = trajectory_index_lookup

    pid = row["delivery_user_id"]
    receipt = row["receipt_datetime"]
    sign = row["sign_datetime"]

    if pid not in lookup:
        return pd.Series([None, None])

    # Every comparison against NaT is False, so bisect would quietly return
    # the courier's whole trajectory; treat missing timestamps as "no match".
    if pd.isna(receipt) or pd.isna(sign):
        return pd.Series([None, None])

    times = lookup[pid]["time"]
    idxs = lookup[pid]["idx"]

    # i: first row for which time >= receipt_time
    left = bisect_left(times, receipt)

    # j: last row for which time <= sign_time
    right = bisect_right(times, sign) - 1

    if left >= len(times) or right < 0 or left > right:
        return pd.Series([None, None])

    return pd.Series([idxs[left], idxs[right]])


In [None]:
# Adding column i and column j
# Here (i,j) identifies the group of rows that belong to the trajectory of that particular order.
# find_i_j is applied row by row (axis=1) and returns a two-element Series,
# whose two values are stored in the new columns "i" and "j".
df_delivery[["i", "j"]] = df_delivery.apply(find_i_j,axis=1) 

In [48]:
def time2min(dt):
    """
    Convert a datetime-like object to its minute of the day.

    Uses the ``hour``, ``minute`` and ``second`` attributes of ``dt``; the
    seconds contribute the fractional part, so the result is a float.
    """
    whole_minutes = dt.hour * 60 + dt.minute
    return whole_minutes + dt.second / 60

In [49]:

def euclidean_distance(x1, y1, x2, y2):
    """Return the straight-line distance between (x1, y1) and (x2, y2)."""
    dx = x2 - x1
    dy = y2 - y1
    return np.sqrt(dx**2 + dy**2)

In [62]:
def idx(df, col_name):
    """Return the position of ``col_name`` among the columns of ``df``."""
    return list(df.columns).index(col_name)

def split_trajectory(df):
    """
    Split the dataframe into per-courier, per-day trajectories.

    ``df`` is cut wherever 'delivery_user_id' or 'ds' differs from the
    previous row, so it should already be sorted such that rows of the same
    courier and day are adjacent.

    Returns a list of consecutive row slices of ``df`` in their original
    order; an empty list when ``df`` has no rows.
    """
    total = len(df)
    if total == 0:
        # nothing to split (the row-by-row version raised IndexError here)
        return []

    courier = df['delivery_user_id'].to_numpy()
    day = df['ds'].to_numpy()

    # True at position k when row k+1 starts a new courier id or a new date
    starts_new = (courier[1:] != courier[:-1]) | (day[1:] != day[:-1])

    starts = [0] + [int(pos) + 1 for pos in np.flatnonzero(starts_new)]
    stops = starts[1:] + [total]
    return [df[f:t] for f, t in zip(starts, stops)]

In [51]:
def dict_merge(dict_list=None):
    """
    Merge a sequence of dicts into one new dict.

    Later dicts win when a key occurs more than once. With no argument (or
    ``None``) an empty dict is returned. Raises AssertionError if an element
    is not a dict.
    """
    merged = {}
    if dict_list is None:
        return merged
    for dic in dict_list:
        assert isinstance(dic, dict), "object is not a dict!"
        # update in place: rebuilding with {**a, **b} re-copies everything
        # merged so far on every iteration
        merged.update(dic)
    return merged

In [52]:
def reindex(dic):
    """Map every key of ``dic`` to its 0-based position in key order."""
    return {key: position for position, key in enumerate(dic.keys())}

In [53]:
def chunk_list(lst, num_chunks):
    """
    Split ``lst`` into ``num_chunks`` consecutive pieces with
    ``np.array_split`` (pieces may differ in length by one).
    """
    pieces = np.array_split(lst, num_chunks)
    return pieces

In [54]:
def multi_thread_work(parameter_queue, function_name, thread_number=5):
    """
    Apply ``function_name`` to every item of ``parameter_queue`` in parallel.

    Despite the name, the work runs in a ``multiprocessing.Pool`` of
    ``thread_number`` worker processes. Results come back as a list in input
    order; an empty queue returns ``[]`` without starting a pool.
    """
    if parameter_queue:
        with Pool(thread_number) as workers:
            return workers.map(function_name, parameter_queue)
    return []


In [55]:
def courier_info(df, default_speed=5):
    """
    Build per-courier summary features.

    Parameters
    ----------
    df : DataFrame
        Needs the columns 'delivery_user_id', 'ds', 'dis_to_last_package'
        and 'time_to_last_package'.
    default_speed : number, optional
        Value stored in 'speed_avg_order' when a courier's total
        'time_to_last_package' is not positive (default 5).

    Returns
    -------
    (couriers, feature_dict)
        ``couriers`` is the sorted list of courier ids; ``feature_dict`` maps
        each feature name to a ``{courier_id: value}`` dict.
    """
    couriers = sorted(df['delivery_user_id'].unique())

    feature_names = [
        'index', 'id', 'order_sum', 'dis_sum', 'work_days',
        'order_avg_day', 'dis_avg_day',
        'time_avg_order', 'dis_avg_order', 'speed_avg_order'
    ]
    feature_dict = {name: {} for name in feature_names}

    # Group once; filtering the whole frame per courier costs
    # O(couriers x rows).
    by_courier = df.groupby('delivery_user_id', sort=False)

    for position, c in enumerate(couriers):
        c_df = by_courier.get_group(c)

        order_sum = c_df.shape[0]
        dis_sum = c_df['dis_to_last_package'].sum()
        work_days = c_df['ds'].nunique()
        total_time = c_df['time_to_last_package'].sum()

        feature_dict['index'][c] = position
        feature_dict['id'][c] = c
        feature_dict['order_sum'][c] = order_sum
        feature_dict['dis_sum'][c] = dis_sum
        feature_dict['work_days'][c] = work_days

        feature_dict['order_avg_day'][c] = order_sum / work_days
        feature_dict['dis_avg_day'][c] = dis_sum / work_days

        feature_dict['time_avg_order'][c] = c_df['time_to_last_package'].mean()
        feature_dict['dis_avg_order'][c] = c_df['dis_to_last_package'].mean()

        # no elapsed time recorded -> speed is undefined, use the fallback
        feature_dict['speed_avg_order'][c] = (
            dis_sum / total_time if total_time > 0 else default_speed
        )

    return couriers, feature_dict


In [56]:
# def make_aoi_dict():
#     print('start make aoi dict')
#     df = pd.read_csv("package_feature.csv", sep=',', encoding='utf-8')
#     aois = df['aoi_id'].value_counts().to_dict()
#     aoi_dict = reindex(aois)
#     df['aoi_id'] = df['aoi_id'].apply(lambda x: aoi_dict[x])
#     courier_l = split_trajectory(df)
#     aoi_nums = df["aoi_id"].nunique()
#     aoi_frequency_adj = np.zeros([aoi_nums, aoi_nums])
#     aoi_time_adj = np.zeros([aoi_nums, aoi_nums])
#     aoi_order_num = np.zeros([aoi_nums])
#     aoi_type = np.zeros([aoi_nums])
#     aoi_actime = np.zeros([aoi_nums])
#     aoi_lng = np.zeros([aoi_nums])
#     aoi_lat = np.zeros([aoi_nums])
#     pbar = tqdm(total=len(courier_l))


In [57]:
def process_courier_chunk(args):
    """
    Compute per-order timing and distance features for a chunk of groups.

    Parameters
    ----------
    args : dict
        "c_lst"   - list of DataFrames, one per courier/day group, each with
                    'order_id', 'receipt_datetime', 'sign_datetime',
                    'finish_time_minute' and 'j'.
        "df_traj" - trajectory table; 'lng' / 'lat' are read with
                    ``.loc[j]``, so its index labels must match the 'j'
                    values.

    Returns
    -------
    dict
        Keyed by ``(order_id, feature_name)`` for the features
        'accept_time_minute', 'time_to_last_package' and
        'dis_to_last_package'.
    """
    c_lst = args["c_lst"]
    df_traj = args["df_traj"]   # passed explicitly so workers do not rely on globals

    result = {}

    for c in c_lst:
        # positional index 0..n-1 so that n - 1 addresses the previous order
        c_v = c.reset_index(drop=True)

        for n, row in c_v.iterrows():

            # ----- time logic -----
            at = time2min(row["receipt_datetime"])
            # receipt and sign fall on different dates: shift the accept time
            # by one day (1440 minutes) relative to the sign day
            if row["sign_datetime"].date() != row["receipt_datetime"].date():
                at -= 1440

            o_id = row["order_id"]

            # ----- last package info -----
            if n == 0:
                # first order of the group: no previous package
                last_ft = row["finish_time_minute"]
                dis_to_last = 0
            else:
                prev_row = c_v.iloc[n - 1]
                last_ft = prev_row["finish_time_minute"]

                prev_j = prev_row["j"]
                curr_j = row["j"]

                if pd.isna(prev_j) or pd.isna(curr_j):
                    # Orders without a matched trajectory carry a missing j;
                    # int() would raise on it. With no known position the
                    # distance falls back to 0, like the first order.
                    dis_to_last = 0
                else:
                    prev_j = int(prev_j)
                    curr_j = int(curr_j)

                    last_x = df_traj.loc[prev_j, "lng"]
                    last_y = df_traj.loc[prev_j, "lat"]
                    curr_x = df_traj.loc[curr_j, "lng"]
                    curr_y = df_traj.loc[curr_j, "lat"]

                    dis_to_last = int(
                        euclidean_distance(last_x, last_y, curr_x, curr_y)
                    )

            # ----- update result dict -----
            result[(o_id, "accept_time_minute")] = at
            result[(o_id, "time_to_last_package")] = (
                row["finish_time_minute"] - last_ft
            )
            result[(o_id, "dis_to_last_package")] = dis_to_last

    return result


In [58]:
def list2str(x):
    """Join the items of ``x`` into a comma-separated string ("" if empty)."""
    if not len(x):
        return ""
    return ",".join(map(str, x))


In [59]:
def get_todo_kernel(args):
    """
    Compute the unfinished (todo) tasks seen at the moment each order ends.

    ``args["c_lst"]`` is a list of DataFrames with the columns 'order_idx',
    'accept_time_minute' and 'finish_time_minute'. For every order, the todo
    set is the orders of the same frame that were accepted before this order
    finished and whose 'order_idx' is larger than this order's.

    Returns a dict with the keys ``(order_idx, "todo_task")`` (comma-joined
    ids) and ``(order_idx, "todo_task_num")`` (their count).
    """
    todo = {}

    for group in args["c_lst"]:
        # fresh positional index so the three arrays line up row by row
        frame = group.reset_index(drop=True)

        idx_arr = frame["order_idx"].values
        accepted_at = frame["accept_time_minute"].values
        finished_at = frame["finish_time_minute"].values

        for current_id, current_finish in zip(idx_arr, finished_at):
            # accepted before this order finished AND listed after it
            pending = idx_arr[(accepted_at < current_finish) & (idx_arr > current_id)]

            todo[(current_id, "todo_task")] = list2str(pending)
            todo[(current_id, "todo_task_num")] = int(pending.shape[0])

    return todo


In [60]:
import math


def pre_process(df,thread_num=20):
    """
    Build the package-level and courier-level feature tables.

    Steps: drop duplicates and sort by (ds, delivery_user_id, sign_datetime);
    compute accept/finish minutes and time/distance to the previous package
    in parallel; number the orders ('order_idx') and collect each order's
    unfinished tasks; aggregate courier features; add grid and day indices.

    Parameters
    ----------
    df : DataFrame
        Order-level data with at least 'ds', 'delivery_user_id', 'order_id',
        'receipt_datetime', 'sign_datetime', 'j', 'lng' and 'lat'.
        NOTE(review): 'lng' / 'lat' are read from this frame — confirm the
        delivery table really has columns with these names.
    thread_num : int
        Number of worker processes (also used to size the work chunks).

    Returns
    -------
    (df, cou_df)
        The enriched package frame and the courier feature frame.

    Side effects: reads the notebook-level ``df_trajectory_sorted`` and
    writes 'package_feature.csv' and 'courier_feature.csv' to the current
    directory.
    """
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    df = df.sort_values(by=['ds', 'delivery_user_id', 'sign_datetime'])
    df['finish_time_minute'] = df["sign_datetime"].apply(time2min)
    courier_l = split_trajectory(df)
    n = len(courier_l)
    # at least 1, so range() below never gets a zero step when n == 0
    task_num = max(1, math.ceil(n / thread_num))
    args_lst = [
        {
            "c_lst": courier_l[i : min(i + task_num, n)],
            "df_traj": df_trajectory_sorted
        }
        for i in range(0, n, task_num)
    ]
    results = multi_thread_work(
        args_lst,
        process_courier_chunk,
        thread_num
    )
    result_dict = dict_merge(results)

    expand_column = ['accept_time_minute', 'time_to_last_package', 'dis_to_last_package']
    for col in expand_column:
        df[col] = df['order_id'].apply(lambda x: result_dict[(x, col)])
    print('Basic information expanded...')

    # running order number in sorted order; get_todo_kernel keys its result on it
    df.insert(0, 'order_idx', range(1, len(df) + 1))

    print('Get unfinished tasks ...')
    courier_l = split_trajectory(df)
    n =  len(courier_l)
    args_lst = [{'c_lst': courier_l[i: min(i + task_num, n)]} for i in range(0, n, task_num)]
    results = multi_thread_work(args_lst, get_todo_kernel, thread_num)
    result_dict = dict_merge(results)

    expand_column = ['todo_task', 'todo_task_num']
    for col in expand_column:
        # the todo results are keyed by 'order_idx' (there is no 'index' column)
        df[col] = df['order_idx'].apply(lambda x: result_dict[(x, col)])
    print('Get unfinished tasks done...')

    couriers, couriers_feature = courier_info(df)
    # courier features are keyed by 'delivery_user_id', the courier column of this frame
    df['dis_avg_day'] = [couriers_feature['dis_avg_day'][c] for c in df['delivery_user_id']]
    df['time_avg_order'] = [couriers_feature['time_avg_order'][c] for c in df['delivery_user_id']]

    df['relative_dis_to_last_package'] = df.apply(lambda r: r['dis_to_last_package'] / r['dis_avg_day'] * 100 if r['dis_avg_day'] !=0 else 0, axis=1)
    GRID = 20  # meters

    df['grid_x'] = (df['lng'] // GRID).astype(int)
    df['grid_y'] = (df['lat'] // GRID).astype(int)
    df['grid_id'] = df['grid_x'].astype(str) + "_" + df['grid_y'].astype(str)

    print('Features between adjacent tasks constructed...')

    # 0-based rank of every date among the distinct dates
    days = sorted(list(set(df['ds'])))
    day_rank = {d: k for k, d in enumerate(days)}
    df['days'] = [day_rank[d] for d in df['ds']]

    cou_df_dic = {}
    for fea, fea_dict in couriers_feature.items():
        fea_lst = [fea_dict[c] for c in couriers]
        cou_df_dic[fea] = fea_lst
    cou_df = pd.DataFrame(cou_df_dic)

    df.to_csv('package_feature.csv', index=False)
    cou_df.to_csv('courier_feature.csv', index=False)
    # make_aoi_dict()

    print('Data preprocessing is done...')
    return df, cou_df



In [None]:
# Keep the returned frames instead of letting the cell dump a tuple of two
# large DataFrames, then show only a small preview.
df_package_feature, df_courier_feature = pre_process(df_delivery, thread_num=10)
df_package_feature.head()