In [1]:
using JSON
using Serialization
using DelimitedFiles
using Distances

In [2]:
# Load the project's helper definitions. The recorded result of this cell shows that
# the last definition in utils.jl is `knnPrecision`.
# NOTE(review): names used later without a visible import (SpatialRegion, createQueryDB,
# createTLabel, downsampling, ranksearch, h5open, mean) presumably come from here — confirm.
include("utils.jl")

knnPrecision (generic function with 1 method)

In [3]:
# Directory that every later cell reads from and writes to (city HDF5 file, serialized
# region parameters, query/db files, model checkpoint). Hard-coded absolute path; adjust
# it when running on another machine.
datapath = "/home/xiucheng/Github/t2vec/data"

"/home/xiucheng/Github/t2vec/data"

In [4]:
# Read the hyper-parameter file (path is relative to the current working directory)
# and keep the "region" section plus the two entries reused by later cells.
param = JSON.parsefile("../hyper-parameters.json")
regionps = param["region"]
cityname = regionps["cityname"]
# Passed twice to SpatialRegion below and used (as an Int) in the parameter file name.
cellsize = regionps["cellsize"]

100.0

In [5]:
# Describe the city grid from the "region" hyper-parameters. The same cell size is
# passed in both step positions (reported below as xstep and ystep).
region = SpatialRegion(
    cityname,
    regionps["minlon"], regionps["minlat"],
    regionps["maxlon"], regionps["maxlat"],
    cellsize, cellsize,
    regionps["minfreq"],  # minfreq
    40_000,               # maxvocab_size
    10,                   # k
    4,
)

println("Building spatial region with:
        cityname=$(region.name),
        minlon=$(region.minlon),
        minlat=$(region.minlat),
        maxlon=$(region.maxlon),
        maxlat=$(region.maxlat),
        xstep=$(region.xstep),
        ystep=$(region.ystep),
        minfreq=$(region.minfreq)")

# If a serialized region exists on disk it replaces the one built above;
# otherwise the freshly constructed region is kept and a notice is printed.
paramfile = "$datapath/$(region.name)-param-cell$(Int(cellsize))"
if !isfile(paramfile)
    println("Cannot find $paramfile")
else
    println("Reading parameter file from $paramfile")
    region = deserialize(paramfile)
    println("Loaded $paramfile into region")
end

Building spatial region with:
        cityname=porto,
        minlon=-8.735152,
        minlat=40.953673,
        maxlon=-8.156309,
        maxlat=41.307945,
        xstep=100.0,
        ystep=100.0,
        minfreq=100
Reading parameter file from /home/xiucheng/Github/t2vec/data/porto-param-cell100
Loaded /home/xiucheng/Github/t2vec/data/porto-param-cell100 into region


## Exp1 Similar search without downsampling and distorting

In [8]:
## create querydb
# Settings for experiment 1: start offset into the trajectory file and the
# number of query / database trajectories to draw.
prefix = "exp1"
do_split = true
start = 1_000_000 + 20_000
num_query, num_db = 1000, 100_000

# Every artifact of this experiment lives under `datapath` and shares `prefix`.
querydbfile, tfile, labelfile, vecfile = map(
    suffix -> joinpath(datapath, "$prefix$suffix"),
    ("-querydb.h5", "-trj.t", "-trj.label", "-trj.h5"),
)
vecfile

"/home/xiucheng/Github/t2vec/data/exp1-trj.h5"

In [9]:
# Experiment 1 leaves the trajectories untouched: the function passed in both
# function-argument positions returns its two inputs unchanged.
let passthrough = (x, y) -> (x, y)
    createQueryDB("$datapath/$cityname.h5", start, num_query, num_db,
                  passthrough, passthrough;
                  do_split=do_split, querydbfile=querydbfile)
end
# Derive the token and label files from the query/db file just written.
createTLabel(region, querydbfile; tfile=tfile, labelfile=labelfile)

101000

In [12]:
# Encode the experiment's trajectories with the trained model by shelling out to t2vec.py.
# NOTE(review): the output recorded below this cell shows `best_model_gen.pt` being
# loaded, while this line names `best_model.pt` — confirm which checkpoint is intended.
checkpoint = joinpath(datapath, "best_model.pt")
t2vec = `python t2vec.py -mode 2 -vocab_size 18864 -checkpoint $checkpoint -prefix $prefix`
println(t2vec)

# The command refers to `t2vec.py` by relative name, so it has to run from the
# repository root. The function form of `cd` switches there only for the duration of
# the call and restores the previous working directory afterwards — also when `run`
# throws (e.g. on a non-zero exit status) — so later relative paths keep working.
cd("/home/xiucheng/Github/t2vec/") do
    run(t2vec)
end
pwd()

`python t2vec.py -mode 2 -vocab_size 18864 -checkpoint /home/xiucheng/Github/t2vec/data/best_model_gen.pt -prefix exp1`
Namespace(batch=128, bidirectional=True, bucketsize=[(20, 30), (30, 30), (30, 50), (50, 50), (50, 70), (70, 70), (70, 100), (100, 100)], checkpoint='/home/xiucheng/Github/t2vec/data/best_model_gen.pt', criterion_name='NLL', cuda=True, data='/home/xiucheng/Github/t2vec/data', discriminative_w=0.1, dist_decay_speed=0.8, dropout=0.2, embedding_size=256, epochs=15, generator_batch=32, hidden_size=256, knearestvocabs=None, learning_rate=0.001, max_grad_norm=5.0, max_length=200, max_num_line=20000000, mode=2, num_layers=3, prefix='exp1', pretrained_embedding=None, print_freq=50, save_freq=1000, start_iteration=0, t2vec_batch=256, use_discriminative=False, vocab_size=18864)
=> loading checkpoint '/home/xiucheng/Github/t2vec/data/best_model_gen.pt'
0: Encoding 256 trjs...
100: Encoding 2

"/home/xiucheng/Github/t2vec/experiment"

In [13]:
## load vectors and labels
vecs = h5open(f -> read(f["layer3"]), vecfile, "r")
label = readdlm(labelfile, Int)

# The first `num_query` columns / labels are the queries, the rest is the database.
# Each column is stored as its own vector.
query = [vecs[:, i] for i in 1:num_query]
db = [vecs[:, i] for i in (num_query + 1):size(vecs, 2)]
queryLabel, dbLabel = label[1:num_query], label[num_query+1:end];

# without discriminative loss
# Rank search is repeated on growing prefixes of the database.
dbsizes = collect(20_000:20_000:100_000)
for dbsize in dbsizes
    ranks = ranksearch(query, queryLabel, db[1:dbsize], dbLabel[1:dbsize], euclidean)
    println("mean rank: $(mean(ranks)) with dbsize: $dbsize")
end
# Reference figures kept in the source; they differ from the output recorded
# below this cell (presumably from a different run or checkpoint — confirm):
# mean rank: 2.135 with dbsize: 20000
# mean rank: 3.132 with dbsize: 40000
# mean rank: 4.244 with dbsize: 60000
# mean rank: 5.553 with dbsize: 80000
# mean rank: 6.662 with dbsize: 100000

mean rank: 2.214 with dbsize: 20000
mean rank: 3.317 with dbsize: 40000
mean rank: 4.532 with dbsize: 60000
mean rank: 6.022 with dbsize: 80000
mean rank: 7.224 with dbsize: 100000


## Exp2 Similar search with downsampling

### create querydb

In [115]:
# Settings for experiment 2. `rate` is handed to `downsampling` in the next cell and
# is encoded in the file prefix as the integer 10*rate (e.g. 0.6 -> "r6").
rate = 0.6
prefix = "exp2-r$(Int(10rate))"
do_split = true
start = 1_000_000 + 20_000
num_query, num_db = 1000, 100_000

# Every artifact of this experiment lives under `datapath` and shares `prefix`.
querydbfile, tfile, labelfile, vecfile = map(
    suffix -> joinpath(datapath, "$prefix$suffix"),
    ("-querydb.h5", "-trj.t", "-trj.label", "-trj.h5"),
)
vecfile

"/home/xiucheng/Github/t2vec/data/exp2-r6-trj.h5"

In [116]:
# An earlier revision of this cell looped over rate in [0.2, 0.3, 0.4, 0.5] and wrote
# one set of "-r<10*rate>" querydb/.t/.label/.h5 files per rate. It now processes the
# single `rate` chosen in the previous cell; rerun both cells to cover another rate.
#
# The same downsampling closure is supplied in both function-argument positions.
let thin = (x, y) -> downsampling(x, y, rate)
    createQueryDB("$datapath/$cityname.h5", start, num_query, num_db,
                  thin, thin;
                  do_split=do_split, querydbfile=querydbfile)
end
# Derive the token and label files from the query/db file just written.
createTLabel(region, querydbfile; tfile=tfile, labelfile=labelfile)

101000

In [119]:
# Encode the experiment's trajectories with the trained model by shelling out to t2vec.py.
checkpoint = joinpath(datapath, "best_model_gen.pt")
t2vec = `python t2vec.py -mode 2 -vocab_size 18864 -checkpoint $checkpoint -prefix $prefix`
println(t2vec)

# The command refers to `t2vec.py` by relative name, so it has to run from the
# repository root. The function form of `cd` switches there only for the duration of
# the call and restores the previous working directory afterwards — also when `run`
# throws (e.g. on a non-zero exit status) — so later relative paths keep working.
# NOTE(review): the directory in the output recorded below this cell comes from a
# different checkout than the path used here; the hard-coded root may need updating.
cd("/home/xiucheng/Github/t2vec/") do
    run(t2vec)
end
pwd()

`python t2vec.py -mode 2 -vocab_size 18864 -checkpoint /home/xiucheng/Github/t2vec/data/best_model_gen.pt -prefix exp2-r6`
Namespace(batch=128, bidirectional=True, bucketsize=[(20, 30), (30, 30), (30, 50), (50, 50), (50, 70), (70, 70), (70, 100), (100, 100)], checkpoint='/home/xiucheng/Github/t2vec/data/best_model_gen.pt', criterion_name='NLL', cuda=True, data='/home/xiucheng/Github/t2vec/data', discriminative_w=0.1, dist_decay_speed=0.8, dropout=0.2, embedding_size=256, epochs=15, generator_batch=32, hidden_size=256, knearestvocabs=None, learning_rate=0.001, max_grad_norm=5.0, max_length=200, max_num_line=20000000, mode=2, num_layers=3, prefix='exp2-r6', pretrained_embedding=None, print_freq=50, save_freq=1000, start_iteration=0, t2vec_batch=256, use_discriminative=False, vocab_size=18864)
=> loading checkpoint '/home/xiucheng/Github/t2vec/data/best_model_gen.pt'
0: Encoding 256 trjs...
100: Enco

"/home/xiucheng/Dropbox/code/t2vec+/experiment"

In [120]:
# Load the encoded trajectories ("layer3" dataset) and their labels.
vecs = h5open(f -> read(f["layer3"]), vecfile, "r")
label = readdlm(labelfile, Int)

# The first `num_query` columns / labels are the queries, the rest is the database.
# Each column is stored as its own vector.
query = [vecs[:, i] for i in 1:num_query]
db = [vecs[:, i] for i in (num_query + 1):size(vecs, 2)]
queryLabel, dbLabel = label[1:num_query], label[num_query+1:end];

# without discriminative loss
# Single rank search over the first `dbsize` database entries.
dbsize = 100_000
ranks = ranksearch(query, queryLabel, db[1:dbsize], dbLabel[1:dbsize], euclidean)
println("mean rank: $(mean(ranks)) with dbsize: $dbsize")

mean rank: 16.451 with dbsize: 100000
