-
Notifications
You must be signed in to change notification settings - Fork 44
/
SpatialRegionTools.jl
368 lines (335 loc) · 11.8 KB
/
SpatialRegionTools.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
#module SpatialRegionTools
using StatsBase:rle
using HDF5
using DataStructures
using NearestNeighbors
include("utils.jl")
#export SpatialRegion, makeVocab!, gps2vocab, saveKNearestVocabs,
# trip2seq, seq2trip, createTrainVal
const UNK = 3
"""
example:
region = SpatialRegion("porto",
-8.735152, 40.953673,
-8.156309, 41.307945,
cellsize, cellsize,
50, # minfreq
50_000, # maxvocab_size
5, # k
4) # vocab_start
"""
mutable struct SpatialRegion
name::String
## bounding box
minlon::Float64
minlat::Float64
maxlon::Float64
maxlat::Float64
minx::Float64
miny::Float64
maxx::Float64
maxy::Float64
xstep::Float64
ystep::Float64
numx::Int
numy::Int
minfreq::Int
maxvocab_size::Int
k::Int
## the number of hitting
cellcount
## hot cells
hotcell::Vector{Int}
## hot cell kdtree
hotcell_kdtree
## map a hot cell into the vocabulary id
hotcell2vocab::Dict{Int, Int}
## map the vocabulary id into its hot cell
vocab2hotcell::Dict{Int, Int}
## vocab start
vocab_start::Int
## vocabulary size
vocab_size::Int
## whether indices have been built
built::Bool
function SpatialRegion(name::String,
minlon::Float64,
minlat::Float64,
maxlon::Float64,
maxlat::Float64,
xstep::Float64,
ystep::Float64,
minfreq::Int,
maxvocab_size::Int,
k::Int,
vocab_start::Int)
minx, miny = lonlat2meters(minlon, minlat)
maxx, maxy = lonlat2meters(maxlon, maxlat)
numx = round(maxx - minx, digits=6) / xstep
numx = convert(Int, ceil(numx))
numy = round(maxy - miny, digits=6) / ystep
numy = convert(Int, ceil(numy))
new(name,
minlon, minlat, maxlon, maxlat,
minx, miny, maxx, maxy,
xstep, ystep,
numx, numy, minfreq, maxvocab_size, k,
Accumulator{Int, Int}(),
Int[],
Any,
Dict(),
Dict(),
vocab_start,
vocab_start,
false)
end
end
#function gps2cell(region::SpatialRegion, lon::Float64, lat::Float64)
# lonoffset = round(lon - region.minlon, 6) / region.hstep
# latoffset = round(lat - region.minlat, 6) / region.vstep
# lonoffset = convert(Int64, floor(lonoffset))
# latoffset = convert(Int64, floor(latoffset))
# latoffset * region.numlon + lonoffset
#end
#
#function cell2gps(region::SpatialRegion, cell::Int)
# latoffset = div(cell, region.numlon)
# lonoffset = mod(cell, region.numlon)
# lat = region.minlat + (latoffset + 0.5) * region.vstep
# lon = region.minlon + (lonoffset + 0.5) * region.hstep
# lon, lat
#end
#cell2gps(region, 0)
#gps2cell(region, (126.48349, 45.6468)...)
"""
Web Mercator coordinate to cell id
"""
function coord2cell(region::SpatialRegion, x::Float64, y::Float64)
xoffset = round(x - region.minx, digits=6) / region.xstep
yoffset = round(y - region.miny, digits=6) / region.ystep
xoffset = convert(Int, floor(xoffset))
yoffset = convert(Int, floor(yoffset))
yoffset * region.numx + xoffset
end
"""
Cell id to Web Mercator coordinate
"""
function cell2coord(region::SpatialRegion, cell::Int)
yoffset = div(cell, region.numx)
xoffset = mod(cell, region.numx)
y = region.miny + (yoffset + 0.5) * region.ystep
x = region.minx + (xoffset + 0.5) * region.xstep
x, y
end
function gps2cell(region::SpatialRegion, lon::Float64, lat::Float64)
x, y = lonlat2meters(lon, lat)
coord2cell(region, x, y)
end
function cell2gps(region::SpatialRegion, cell::Int)
x, y = cell2coord(region, cell)
meters2lonlat(x, y)
end
#lon, lat = -8.735152, 40.953673
#x, y = lonlat2meters(lon, lat)
#coord2cell(region, x, y+300)
#cell2coord(region, 10)
function inregion(region::SpatialRegion, lon::Float64, lat::Float64)
lon >= region.minlon && lon < region.maxlon &&
lat >= region.minlat && lat < region.maxlat
end
function inregion(region::SpatialRegion, trip::Matrix{Float64})
for i = 1:size(trip, 2)
inregion(region, trip[:, i]...) || return false
end
true
end
#inregion(region, -8.503, 41.35)
#inregion(region, [-8.503 -8.5; 41.01 41.01])
"""
Build the vocabulary from the raw trajectories stored in the hdf5 file.
For a trip (trajectory), each point lies in a column when reading with Julia
while it lies in a row if reading with Python.
"""
function makeVocab!(region::SpatialRegion, trjfile::String)
region.cellcount = Accumulator{Int, Int}()
## useful for evaluating the size of region bounding box
num_out_region = 0
## scan all trips (trajectories)
h5open(trjfile, "r") do f
num = read(attrs(f)["num"])
for i = 1:num
trip = read(f["/trips/$i"])
for p = 1:size(trip, 2)
lon, lat = trip[:, p]
if !inregion(region, lon, lat)
num_out_region += 1
else
cell = gps2cell(region, lon, lat)
push!(region.cellcount, cell)
end
end
i % 100_000 == 0 && println("Processed $i trips")
#i >= 8_000 && break
end
end
## filter out all hot cells
max_num_hotcells = min(region.maxvocab_size, length(region.cellcount))
topcellcount = sort(collect(region.cellcount), by=last, rev=true)[1:max_num_hotcells]
println("Cell count at max_num_hotcells:$(max_num_hotcells) is $(last(topcellcount[end]))")
#region.hotcell =
# filter((k,v)->v >= region.minfreq, Dict(topcellcount))|>keys|>collect
region.hotcell = filter(p -> last(p) >= region.minfreq, topcellcount) .|> first
## build the map between cell and vocab id
region.hotcell2vocab = Dict([(cell, i-1+region.vocab_start)
for (i, cell) in enumerate(region.hotcell)])
#region.vocab2hotcell = map(reverse, region.hotcell2vocab)
region.vocab2hotcell = Dict(last(p) => first(p) for p in region.hotcell2vocab)
## vocabulary size
region.vocab_size = region.vocab_start + length(region.hotcell)
## build the hot cell kdtree to facilitate search
coord = hcat(map(x->collect(cell2coord(region, x)), region.hotcell)...)
region.hotcell_kdtree = KDTree(coord)
region.built = true
num_out_region
end
function knearestHotcells(region::SpatialRegion, cell::Int, k::Int)
@assert region.built == true "Build index for region first"
coord = cell2coord(region, cell) |> collect
idxs, dists = knn(region.hotcell_kdtree, coord, k)
region.hotcell[idxs], dists
end
function nearestHotcell(region::SpatialRegion, cell::Int)
@assert region.built == true "Build index for region first"
hotcell, _ = knearestHotcells(region, cell, 1)
hotcell[1]
end
"""
k-nearest vocabs and corresponding distances for each vocab.
This is used in training for KLDiv loss.
"""
function saveKNearestVocabs(region::SpatialRegion)
V = zeros(Int, region.k, region.vocab_size)
D = zeros(Float64, region.k, region.vocab_size)
for vocab in 0:region.vocab_start-1
V[:, vocab+1] .= vocab
D[:, vocab+1] .= 0.0
end
for vocab in region.vocab_start:region.vocab_size-1
cell = region.vocab2hotcell[vocab]
kcells, dists = knearestHotcells(region, cell, region.k)
kvocabs = map(x->region.hotcell2vocab[x], kcells)
V[:, vocab+1] .= kvocabs
D[:, vocab+1] .= dists
end
cellsize = Int(region.xstep)
file = joinpath("../data", region.name * "-vocab-dist-cell$(cellsize).h5")
h5open(file, "w") do f
f["V"], f["D"] = V, D
end
println("Saved cell distance into $file")
nothing
end
"""
Return the vocab id for a cell in the region.
If the cell is not hot cell, the function will first search its nearest
hotcell and return the corresponding vocab id.
"""
function cell2vocab(region::SpatialRegion, cell::Int)
@assert region.built == true "Build index for region first"
if haskey(region.hotcell2vocab, cell)
return region.hotcell2vocab[cell]
else
hotcell = nearestHotcell(region, cell)
return region.hotcell2vocab[hotcell]
end
end
"""
Mapping a gps point to the vocab id in the vocabulary consists of hot cells,
each hot cell has an unique vocab id (hotcell2vocab).
If the point falls out of the region, `UNK` will be returned.
If the point falls into the region, but out of the hot cells its nearest hot cell
will be used.
"""
function gps2vocab(region::SpatialRegion, lon::Float64, lat::Float64)
inregion(region, lon, lat) || return UNK
cell2vocab(region, gps2cell(region, lon, lat))
end
function trip2seq(region::SpatialRegion, trip::Matrix{Float64})
seq = Int[]
for i in 1:size(trip, 2)
lon, lat = trip[:, i]
push!(seq, gps2vocab(region, lon, lat))
end
seq |> rle |> first
end
function seq2trip(region::SpatialRegion, seq::Vector{Int})
trip = zeros(Float64, 2, length(seq))
for i = 1:length(seq)
cell = get(region.vocab2hotcell, seq[i], -1)
cell == -1 && error("i=$i is out of vocabulary")
lon, lat = cell2gps(region, cell)
trip[:, i] = [lon, lat]
end
trip
end
"""
Create training and validation dataset
createTrainVal(region, "porto.h5", downsampling, 50, 10)
"""
function createTrainVal(region::SpatialRegion,
trjfile::String,
injectnoise::Function,
ntrain::Int,
nval::Int;
nsplit=5,
min_length=20,
max_length=100)
seq2str(seq) = join(map(string, seq), " ") * "\n"
h5open(trjfile, "r") do f
trainsrc, traintrg = open("../data/train.src", "w"), open("../data/train.trg", "w")
validsrc, validtrg = open("../data/val.src", "w"), open("../data/val.trg", "w")
for i = 1:ntrain+nval
trip = f["/trips/$i"] |> read
min_length <= size(trip, 2) <= max_length || continue
trg = trip2seq(region, trip) |> seq2str
noisetrips = injectnoise(trip, nsplit)
srcio, trgio = i <= ntrain ? (trainsrc, traintrg) : (validsrc, validtrg)
for noisetrip in noisetrips
## here: feel weird
#src = noisetrip |> trip2seq |> seq2str
src = trip2seq(region, noisetrip) |> seq2str
write(srcio, src)
write(trgio, trg)
end
i % 100_000 == 0 && println("Scaned $i trips...")
#i >= 8_000 && break
end
close(trainsrc), close(traintrg), close(validsrc), close(validtrg)
end
nothing
end
function saveregion(region::SpatialRegion, paramfile::String)
save(paramfile,
# JLD cannot handle Accumulator correctly
#"cellcount", region.cellcount.map,
"hotcell", region.hotcell,
"hotcell2vocab", region.hotcell2vocab,
"vocab2hotcell", region.vocab2hotcell,
"hotcell_kdtree", region.hotcell_kdtree,
"vocab_size", region.vocab_size)
nothing
end
function loadregion!(region::SpatialRegion, paramfile::String)
jldopen(paramfile, "r") do f
#region.cellcount = read(f, "cellcount")
region.hotcell = read(f, "hotcell")
region.hotcell2vocab = read(f, "hotcell2vocab")
region.vocab2hotcell = read(f, "vocab2hotcell")
region.hotcell_kdtree = read(f, "hotcell_kdtree")
region.vocab_size = read(f, "vocab_size")
region.built = true
end
nothing
end
#end # module
#createTrainVal(region, "porto.h5", downsampling, 50, 10)