# 计算轨迹相似度

In [1]:
import torch

# Configs
# All artefacts (training set, pretrained checkpoints, trajectories) live
# under one data directory; the trailing slash is part of the prefix.
directory = 'tdrive-data/'
training_set_file = f'{directory}tdrive-r-train-ps-40'
sp_pretrain_model_path = f'{directory}sp_pretrain_model.pt'
ts_pretrain_model_path = f'{directory}ts_pretrain_model.pt'
sm_pretrain_model_path = f'{directory}semantic2vec.model'
at2vec_model_path = f'{directory}at2vec_model.pt'
trajectory_file = f'{directory}tdrive-chosen-fill-with-ts'

# Width of one temporal grid cell (presumably milliseconds, i.e. 10 minutes
# -- confirm against the timestamp unit of the data files).
ts_gap = 10 * 60 * 1000
# Number of spatial grid cells along each axis.
num_x_grids = num_y_grids = 200
# Lengths of the spatial, temporal and semantic parts of a point vector;
# a full point vector is their concatenation.
sp_len = ts_len = sm_len = 100
pt_len = sp_len + ts_len + sm_len
hidden_len = 256
sampled_tr_len = 40
complete_tr_len = 50

device = torch.device('cpu')

# Sentinels used as initial bounds when scanning for min/max values.
# NOTE: INT_MAX is deliberately kept as-is; it is much larger than 2**31 - 1.
INT_MAX = 214748364700000000000
FLOAT_MAX = 1E8

In [2]:
import math

# Grid counts are required to rebuild the models before loading their
# weights; they are filled in once the value ranges are known.
num_sp_grids = None
num_ts_grids = None

# Recover the (min, max) range of timestamps and coordinates from the
# training set. Each range starts inverted so the first record replaces both
# bounds.
ts_range = (INT_MAX, -INT_MAX)
x_range = (FLOAT_MAX, -FLOAT_MAX)
y_range = (FLOAT_MAX, -FLOAT_MAX)

with open(training_set_file) as f:
    for record in f:
        # Tab-separated record: column 1 is the timestamp, columns 2 and 3
        # are the x / y coordinates.
        columns = record.strip().split('\t')
        ts, x, y = int(columns[1]), float(columns[2]), float(columns[3])
        ts_range = (min(ts_range[0], ts), max(ts_range[1], ts))
        x_range = (min(x_range[0], x), max(x_range[1], x))
        y_range = (min(y_range[0], y), max(y_range[1], y))
        

def pair2spid(x_id: int, y_id: int, num_x_grids: int):
    """Flatten a 2-D grid coordinate into a single row-major cell id.

    Args:
        x_id: Cell index along the x axis.
        y_id: Cell index along the y axis.
        num_x_grids: Number of cells in one row (along the x axis).

    Returns:
        ``y_id * num_x_grids + x_id``.
    """
    row_offset = y_id * num_x_grids
    return row_offset + x_id

def sp2id(x: float, y: float,
          min_x: float, min_y: float,
          max_x: float, max_y: float,
          x_gap: float, y_gap: float):
    """
    Convert an (x, y) coordinate into its spatial grid token id.

    The coordinate is first clamped into [min_x, max_x] x [min_y, max_y].
    The original author assumes max_x and max_y are never actually hit:
    a coordinate lying exactly on the upper edge can produce a cell index
    equal to the cell count along that axis.

    Args:
        x, y: Coordinate to convert.
        min_x, min_y: Lower bounds of the gridded area.
        max_x, max_y: Upper bounds of the gridded area.
        x_gap, y_gap: Size of one grid cell along each axis.

    Returns:
        The token id only (row-major cell id as computed by pair2spid).
    """
    # Clamp to the bounding box: lower bound first, then upper bound.
    clamped_x = min(max_x, max(min_x, x))
    clamped_y = min(max_y, max(min_y, y))

    # Number of cells in one row, needed to flatten (column, row) into an id.
    cells_per_row = int(math.ceil((max_x - min_x) / x_gap))

    column = int(math.floor((clamped_x - min_x) / x_gap))
    row = int(math.floor((clamped_y - min_y) / y_gap))
    return pair2spid(column, row, cells_per_row)

print('ts_range', ts_range)

# Size of one spatial cell along each axis: the observed extent split evenly
# into the configured number of cells.
x_gap = (x_range[1] - x_range[0]) / num_x_grids
y_gap = (y_range[1] - y_range[0]) / num_y_grids

# Number of temporal cells needed to cover the observed timestamp range.
num_ts_grids = (ts_range[1] - ts_range[0]) // ts_gap + 1

# The spatial vocabulary size is taken to be the token id of the upper-right
# corner of the bounding box.
# NOTE(review): this value must match what the pretrained checkpoints were
# built with, so it is intentionally left unchanged.
num_sp_grids = sp2id(x=x_range[1], y=y_range[1],
                     min_x=x_range[0], min_y=y_range[0],
                     max_x=x_range[1], max_y=y_range[1],
                     x_gap=x_gap, y_gap=y_gap)

ts_range (1201930247000, 1202463545000)


In [3]:
from at2vec import PretrainModel, EncoderDecoder, BareDataset, get_mat
import torch
from gensim.models import Word2Vec
from functools import partial

# Load the pretrained models.
# Every checkpoint is deserialized with map_location=device so that files
# saved from a CUDA device can still be loaded on the device configured for
# this notebook (CPU), instead of failing during torch.load.

# Spatial and temporal embedding models; each checkpoint stores its weights
# under the 'model' key.
sp_model = PretrainModel(num_sp_grids, sp_len, device)
sp_model.load_state_dict(
    torch.load(sp_pretrain_model_path, map_location=device)['model'])
ts_model = PretrainModel(num_ts_grids, ts_len, device)
ts_model.load_state_dict(
    torch.load(ts_pretrain_model_path, map_location=device)['model'])

# Semantic (keyword) embeddings are a gensim Word2Vec model.
sm_model = Word2Vec.load(sm_pretrain_model_path)

# The encoder-decoder needs the three vocabulary sizes to rebuild its layers
# before the trained weights can be restored.
model = EncoderDecoder(sampled_tr_len, complete_tr_len, pt_len, hidden_len,
                       num_sp_grids, num_ts_grids, len(sm_model.wv), device)
state = torch.load(at2vec_model_path, map_location=device)
model.load_state_dict(state['model'])

<All keys matched successfully>

In [4]:
# load data
class TestDataset(torch.utils.data.Dataset):
    """Dataset that eagerly converts every raw trajectory into its vector form.

    The conversion runs once, at construction time, and the results are kept
    in memory in ``self.vectors``.
    """

    def __init__(self, bare_dataset, raw2tr):
        """
        Args:
            bare_dataset: Indexable, sized collection whose items are pairs;
                only the second element (the raw trajectory) is used.
            raw2tr: Callable mapping a raw trajectory to its converted form.
        """
        self.bare_dataset = bare_dataset

        def convert(sample):
            # The first element of each sample is ignored.
            _, raw = sample
            return raw2tr(raw)

        self.vectors = [convert(bare_dataset[position])
                        for position in range(len(bare_dataset))]

    def __len__(self):
        """Number of items, taken from the wrapped dataset."""
        return len(self.bare_dataset)

    def __getitem__(self, index):
        """
        Returns:
            (index, tr): the requested index and its precomputed trajectory.
        """
        return index, self.vectors[index]


def ts2id(ts: int, min_ts: int, max_ts: int, ts_gap: int):
    """Map a timestamp to the index of the temporal grid cell containing it.

    Args:
        ts: Timestamp to convert.
        min_ts: Lower bound of the covered range; maps to index 0.
        max_ts: Upper bound of the covered range.
        ts_gap: Width of one temporal cell, in the same unit as ``ts``.

    Returns:
        Zero-based cell index; timestamps outside [min_ts, max_ts] are
        clamped to the nearest bound first.
    """
    # Cap at the upper bound first, then raise to the lower bound.
    clamped = max(min_ts, min(max_ts, ts))
    elapsed = clamped - min_ts
    return int(elapsed // ts_gap)


def get_mat(tr, sp_model, ts_model, sm_model):
    """Build the per-point feature matrix of one trajectory.

    Relies on the module-level ``x_range``, ``y_range``, ``x_gap``, ``y_gap``,
    ``ts_range`` and ``ts_gap`` to turn raw values into grid token ids.

    Args:
        tr: pandas DataFrame with one row per trajectory point. Columns are
            read by position: 1 is the timestamp, 2 and 3 are the x / y
            coordinates, 4 is a comma-separated keyword string.
        sp_model: Model whose ``embed`` maps a spatial token id to a tensor.
        ts_model: Model whose ``embed`` maps a temporal token id to a tensor.
        sm_model: gensim Word2Vec model providing keyword vectors.

    Returns:
        A 2-D tensor with one row per point; each row is the spatial,
        temporal and semantic vectors concatenated in that order.
    """
    get_spid = partial(sp2id,
                       min_x=x_range[0], max_x=x_range[1],
                       min_y=y_range[0], max_y=y_range[1],
                       x_gap=x_gap, y_gap=y_gap)
    get_tsid = partial(
        ts2id, min_ts=ts_range[0], max_ts=ts_range[1], ts_gap=ts_gap)
    ts_col, xy_cols, sm_col = (tr.iloc[:, 1],
                               tr.iloc[:, 2:4],
                               tr.iloc[:, 4])

    # itertuples yields plain (x, y) tuples and avoids building a Series per
    # row as iterrows does.
    sp_vec = torch.stack(
        [sp_model.embed(get_spid(x, y))
         for x, y in xy_cols.itertuples(index=False, name=None)],
        dim=0)
    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that works on both old and new versions.
    ts_vec = torch.stack(
        [ts_model.embed(get_tsid(ts)) for _, ts in ts_col.items()],
        dim=0)

    # Semantics are more complicated: every point carries a keyword list.
    vec_set = []
    for _, sm in sm_col.items():
        # Spaces inside a keyword become '-', then the string is split on
        # commas to obtain this point's keywords.
        kws = sm.replace(' ', '-').split(',')
        # The normalized mean of all keyword vectors is the point's semantic
        # vector.
        avg_vec = torch.from_numpy(sm_model.wv.get_mean_vector(
            kws, pre_normalize=True, post_normalize=True))
        vec_set.append(avg_vec)
    sm_vec = torch.stack(vec_set, dim=0)

    # Concatenate along the feature axis: one row per point
    # (presumably sp_len + ts_len + sm_len columns -- confirm with the models).
    return torch.cat((sp_vec, ts_vec, sm_vec), dim=1)


# Read the raw trajectories and report how many were loaded.
bare_dataset = BareDataset(None, trajectory_file, update_ctx=False, ctx=None)
print(len(bare_dataset))

# Converter from a raw trajectory to its feature matrix, with the three
# pretrained embedding models bound in.
raw2tr = partial(get_mat,
                 sp_model=sp_model,
                 ts_model=ts_model,
                 sm_model=sm_model)

# Convert every trajectory once, up front.
dataset = TestDataset(bare_dataset=bare_dataset, raw2tr=raw2tr)

total lines: 294
6


In [5]:
import torch

# Use the first trajectory as the query and embed it once.
_, query_tr = dataset[0]
query_vec = model.get_rep_vector(query_tr)

# Print the distance between the query and every trajectory's representation
# vector (the query itself included); torch.dist defaults to the 2-norm.
for _, candidate_tr in dataset:
    candidate_vec = model.get_rep_vector(candidate_tr)
    print(torch.dist(query_vec, candidate_vec))

tensor(0.)
tensor(10.3989)
tensor(9.4192)
tensor(10.5366)
tensor(9.8658)
tensor(10.2114)
