# optimizer.py
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import numpy as np
from scipy.sparse import lil_matrix
from scipy.sparse.linalg import spsolve as sparse_solve

# from .math_utils import *
from math_utils import *


class Variable:
    """A parameter block together with its slice in the packed parameter vector."""

    def __init__(self, arg, param_slice):
        self.arg = arg
        self.param_slice = param_slice
        self.backup_arg = None

    def backup(self):
        self.backup_arg = self.arg.data()

    def restore(self):
        self.arg.set_data(self.backup_arg)


class BlockDB:
    """Registry of parameter blocks, deduplicated by object identity.

    Each distinct block gets a contiguous slice of the packed parameter
    vector; e.g. two 3-parameter blocks registered in order get slices
    0:3 and 3:6, and re-registering a block returns its existing slice.
    """

    def __init__(self):
        self.blocks = {}
        self.tot_data_size = 0
        self.tot_param_size = 0

    def add(self, arg):
        uid = id(arg)  # python object id
        if uid not in self.blocks:
            plen = param_len(arg)
            pslice = slice(self.tot_param_size, self.tot_param_size + plen)
            self.blocks[uid] = Variable(arg, pslice)
            self.tot_param_size += plen
            return pslice
        return self.blocks[uid].param_slice


class Cost:
    """A residual term: a function, its argument blocks, and their slices."""

    def __init__(self, func, args, param_slices, elim=True):
        self.func = func
        self.args = args
        self.param_slices = param_slices
        self.elim = elim


class Optimizer:
    MOODS = {
        "excited": "ヽ(^@^)ノ",
        "worried": " (~@~メ) ",
        "crying": " (╥﹏╥) ",
        "shrug": "¯\\_(ツ)_/¯",
        "impatient": "(゚@゚;≡;゚@゚)",
    }

    def __init__(self):
        self.mult_damping = 3e-2
        # self.add_damping = 1e-4
        self.elims = BlockDB()
        self.survs = BlockDB()
        self.costs = []
        # amir added
        self.initial_cameras_locations = []

    # add a residual; the first argument is the block to Schur-eliminate
    def add_elim_cost(self, func, *args):
        param_slices = [
            self.elims.add(args[0]),
            *(self.survs.add(arg) for arg in args[1:]),
        ]
        self.costs.append(Cost(func, args, param_slices, elim=True))

    # add a residual with no Schur-eliminated parameter
    def add_cost(self, func, *args):
        param_slices = [self.survs.add(arg) for arg in args]
        self.costs.append(Cost(func, args, param_slices, elim=False))
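
    # For example (hypothetical residuals; any callables compatible with
    # Jet.compute_first_order work):
    #   opt = Optimizer()
    #   opt.add_elim_cost(reprojection_residual, point, camera)  # point eliminated
    #   opt.add_cost(pose_prior, camera)                         # kept in reduced system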

    # Create matrices for the blocks of the Hessian, to apply the Schur
    # elimination trick. Schur elimination lets us eliminate a subset of
    # parameters from the linear system; the math is explained below. From the
    # optimization point of view we compute the step on cameras by solving a
    # reduced problem: we turn the quadratic model of the cost on
    # points+cameras into a quadratic model on cameras only, assuming the
    # points will be placed at their best possible positions. After solving on
    # cameras we go back and compute the step on points.
    # This process is also called marginalization (probability jargon: if the
    # Hessian is the precision = covariance^{-1} matrix, we marginalize away
    # some variables and obtain the precision matrix of the remaining subset.
    # On the covariance side this is easy, we would just take a block of the
    # matrix; since we work with the inverse we must perform a small
    # computation to get the "inverse of the block of the inverse"). It can be
    # useful in general to understand how the constraints map onto a subset of
    # the parameters: for instance we can get a camera-camera edge by
    # marginalizing away the points and some neighboring cameras (be aware
    # that inaccuracies arise if we marginalize the same variables multiple
    # times... so this cannot be used to simplify a complex problem without
    # any loss).
    #
    # We want to solve H * x = b, where b is the gradient and H the (damped)
    # Hessian. Splitting the camera (X, surv) and point (P, elim) parts we
    # want to solve
    #   | A   B |   | deltaX |   | errorX |
    #   |       | * |        | = |        |
    #   | B^T D |   | deltaP |   | errorP |
    #
    # The key observation is that D is block-diagonal (and in real optimizers
    # it's kept as a list of blocks), so it's very easy to invert, one block
    # at a time (inversion of a dense matrix is O(n^3), so many small blocks
    # are harmless compared to one large dense matrix).
    #
    # The equation above can be rewritten as
    #   | A   * deltaX + B * deltaP = errorX
    #   | B^T * deltaX + D * deltaP = errorP
    # Since we can invert D we can solve the second equation for deltaP:
    #   deltaP = D^{-1}(errorP - B^T * deltaX)
    # and substituting this value of deltaP into the first:
    #   A * deltaX + B * D^{-1}(errorP - B^T * deltaX) = errorX
    # The only unknown is deltaX! Therefore we can collect it on one side:
    #   (A - B * D^{-1} * B^T) * deltaX = errorX - B * D^{-1} * errorP
    # The matrix S = (A - B * D^{-1} * B^T) is the Schur complement.
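    #
    # A tiny numeric illustration of this identity (hypothetical values; kept
    # as a comment so nothing runs at class-definition time):
    #   >>> A = np.array([[4.0, 1.0], [1.0, 3.0]])
    #   >>> B = np.array([[1.0, 0.0], [0.0, 2.0]])
    #   >>> D = np.diag([2.0, 5.0])  # block-diagonal in general
    #   >>> S = A - B @ np.linalg.inv(D) @ B.T
    #   >>> errX, errP = np.array([1.0, 2.0]), np.array([0.5, 0.5])
    #   >>> dX = np.linalg.solve(S, errX - B @ np.linalg.inv(D) @ errP)
    #   >>> dP = np.linalg.inv(D) @ (errP - B.T @ dX)
    # The pair (dX, dP) matches solving the full 4x4 system directly.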
    def setup(self):
        self.deltaP = np.empty(self.elims.tot_param_size, dtype=np.float64)
        self.deltaX = np.empty(self.survs.tot_param_size, dtype=np.float64)
        self.errorP = np.empty(self.elims.tot_param_size, dtype=np.float64)
        self.errorX = np.empty(self.survs.tot_param_size, dtype=np.float64)
        # lil_matrix is efficient for incremental construction; we convert to
        # csr below once the sparsity pattern is fixed
        self.A = lil_matrix((self.survs.tot_param_size, self.survs.tot_param_size))
        self.D = lil_matrix((self.elims.tot_param_size, self.elims.tot_param_size))
        self.B = lil_matrix((self.survs.tot_param_size, self.elims.tot_param_size))
        # sweep the costs once to mark the sparsity pattern of A, B and D (one
        # entry per interacting pair of blocks, on the order of
        # num_points * num_cameras in bundle adjustment); the actual values
        # are filled in later by compute_gradient_hessian
        for res in self.costs:
            for i, s_i in enumerate(res.param_slices):
                iSurv = i >= (1 if res.elim else 0)
                for j in range(i + 1):  # j = 0, 1, ..., i
                    s_j = res.param_slices[j]
                    jSurv = j >= (1 if res.elim else 0)
                    if jSurv:
                        # TODO: use a solver requiring only the lower half
                        self.A[s_i, s_j] = 1.0
                        self.A[s_j, s_i] = 1.0
                    elif iSurv:
                        self.B[s_i, s_j] = 1.0
                    else:
                        self.D[s_i, s_j] = 1.0
        self.A = self.A.tocsr()
        self.D = self.D.tocsr()
        self.invD = self.D.copy()
        self.B = self.B.tocsr()

    def compute_cost(self):
        cost = 0
        for res in self.costs:
            err = res.func(*res.args)
            if isinstance(err, Vec):
                cost += err.squared_norm() * 0.5  # cost contribution
            else:  # assume a plain number
                cost += err * err * 0.5
        return cost

    def compute_gradient(self):
        self.errorP.fill(0)
        self.errorX.fill(0)
        cost = 0
        for res in self.costs:
            err, pders = Jet.compute_first_order(res.func, *res.args)
            cost += (err.T @ err)[0, 0] * 0.5  # cost contribution
            for i, s_i in enumerate(res.param_slices):
                iSurv = i >= (1 if res.elim else 0)
                # gradient contribution: J_i^T * err
                (self.errorX if iSurv else self.errorP)[s_i] += (
                    err.T @ pders[i]
                ).reshape(-1)
        return cost

    def compute_gradient_hessian(self, calc_hessian_flag=True):
        self.errorP.fill(0)
        self.errorX.fill(0)
        self.A.data.fill(0)
        self.B.data.fill(0)
        self.D.data.fill(0)
        cost = 0
        for res in self.costs:
            err, pders = Jet.compute_first_order(res.func, *res.args)
            cost += (err.T @ err)[0, 0] * 0.5  # cost contribution
            for i, s_i in enumerate(res.param_slices):
                iSurv = i >= (1 if res.elim else 0)
                # gradient contribution: J_i^T * err
                (self.errorX if iSurv else self.errorP)[s_i] += (
                    err.T @ pders[i]
                ).reshape(-1)
                if calc_hessian_flag:
                    for j in range(i + 1):  # j = 0, 1, ..., i
                        s_j = res.param_slices[j]
                        jSurv = j >= (1 if res.elim else 0)
                        # Gauss-Newton Hessian block: J_i^T * J_j
                        block = pders[i].T @ pders[j]
                        if jSurv:
                            # TODO: use a solver requiring only the lower half
                            self.A[s_i, s_j] += block
                            if i != j:
                                self.A[s_j, s_i] += block.T
                        elif iSurv:
                            self.B[s_i, s_j] += block
                        else:
                            self.D[s_i, s_j] += block
        return cost

    def optimize(self):
        self.setup()
        print(
            "Baby-Optimizer is up and toddling its baby optimization-steps!\n"
            "\n"
            ' ,=""=,\n'
            " c , _,{\n"
            " /\\ @ ) __\n"
            " / ^~~^\\ <=.,__/ '}=\n"
            "(_/ ,, ,,) \\_ _>_/~\n"
            " ~\\_(/-\\)'-,_,_,_,-'(_)-(_)\n"
        )
        # main loop
        iteration = 0
        last_problematic_iteration = 0
        last_improvement = 0
        while True:
            cost = self.compute_gradient_hessian()
            # apply damping to D and compute invD, inverting one block at a time
            for arg_info in self.elims.blocks.values():
                s = arg_info.param_slice
                D_block = self.D[s, s]
                D_damped_block = D_block.todense() + np.diag(
                    D_block.diagonal() * self.mult_damping  # + self.add_damping
                )
                self.D[s, s] = D_damped_block
                self.invD[s, s] = np.linalg.inv(D_damped_block)
            # apply damping to A
            self.A.setdiag(
                self.A.diagonal() * (1 + self.mult_damping)  # + self.add_damping
            )
            # compute the Schur complement
            S = self.A - (self.B @ (self.invD @ self.B.T))
            # compute the error for the reduced linear system
            reducedErrorX = self.errorX - self.B @ (self.invD @ self.errorP)
            self.deltaX = sparse_solve(S, reducedErrorX)
            self.deltaP = self.invD @ (self.errorP - self.B.T @ self.deltaX)
            # expected cost improvement according to the quadratic model: for
            # delta = H^{-1} b the decrease b.delta - 0.5 * delta.H.delta
            # simplifies to 0.5 * b.delta
            expected_cost_improvement = 0.5 * (
                np.dot(self.deltaX, self.errorX) + np.dot(self.deltaP, self.errorP)
            )
            for b in self.survs.blocks.values():
                b.backup()
                apply_parametrized_step(b.arg, -self.deltaX[b.param_slice])
            for b in self.elims.blocks.values():
                b.backup()
                apply_parametrized_step(b.arg, -self.deltaP[b.param_slice])
            new_cost = self.compute_gradient()
            # same quadratic-model prediction evaluated at the new point
            # (diagnostic only, currently unused)
            new_expected_cost_improvement = 0.5 * (
                np.dot(self.deltaX, self.errorX) + np.dot(self.deltaP, self.errorP)
            )
            cost_improvement = cost - new_cost
            improvement_ratio = cost_improvement / expected_cost_improvement
            if new_cost > cost:  # cost got worse? that's BAD!
                self.mult_damping = self.mult_damping * 2
                # self.add_damping = self.add_damping * 3
                mood = Optimizer.MOODS["crying"]
                last_problematic_iteration = iteration
                # restore the previous state
                for b in self.survs.blocks.values():
                    b.restore()
                for b in self.elims.blocks.values():
                    b.restore()
            else:  # cost improved? check how well it matches the quadratic model
                # at least some degree of agreement with the quadratic model
                if improvement_ratio > 0.4:
                    self.mult_damping = self.mult_damping * 0.5
                    # self.add_damping = self.add_damping * 0.7
                    mood = Optimizer.MOODS["excited"]
                else:
                    self.mult_damping = self.mult_damping * 2
                    # self.add_damping = self.add_damping * 1.5
                    mood = Optimizer.MOODS["worried"]
            # print status
            rel_change_percentage = (new_cost - cost) * 100.0 / cost
            print(
                f"{mood} [{iteration}]"
                + f" {cost:.03e} -> {new_cost:.03e} ({rel_change_percentage:.02f}%)"
                + f" q-rel-\u03b4: {improvement_ratio:.02f}"
                + f" \u03bb: {self.mult_damping:.02e}"
            )
            # check the stopping conditions (they are all sloppy...)
            if new_cost < cost * 0.999:
                last_improvement = iteration
            if (
                iteration >= last_improvement + 3
                and iteration <= last_problematic_iteration + 2
            ) or new_cost < 1e-15:
                mood = Optimizer.MOODS["shrug"]
                print(f"{mood} converged or no improvement")
                break
            iteration += 1
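

# A self-contained numpy sketch of one damped Gauss-Newton step on a toy
# linear least-squares problem, mirroring the multiplicative damping scheme
# used in optimize() above. Everything here is illustrative and independent of
# the Optimizer class.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    J = rng.standard_normal((10, 3))  # Jacobian of the residuals
    b = np.arange(10.0)               # targets
    x = np.array([1.0, -2.0, 0.5])    # current parameters
    r = J @ x - b                     # residuals at x
    H = J.T @ J                       # Gauss-Newton Hessian
    g = J.T @ r                       # gradient of 0.5 * ||r||^2
    mult_damping = 3e-2
    # same scheme as optimize(): scale the Hessian diagonal by (1 + damping)
    H_damped = H + np.diag(H.diagonal() * mult_damping)
    step = np.linalg.solve(H_damped, g)
    new_r = J @ (x - step) - b
    print(f"cost: {0.5 * r @ r:.3e} -> {0.5 * new_r @ new_r:.3e}")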