# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
AbstractInitializer
The abstract base class for all initializers.
To define a new initializer, it is
enough to derive a new type, and implement one or more of the following methods:
_init_weight(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
_init_bias(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
_init_gamma(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
_init_beta(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
Or, if full behavior customization is needed, override the following function
init(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
"""
abstract type AbstractInitializer end
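
# A minimal sketch (illustrative only; `ConstantInitializer` is not part of this file)
# of how a custom initializer could be derived: define a subtype of
# `AbstractInitializer` and implement one or more of the `_init_*` methods, and the
# name-based dispatch in `init` below picks it up automatically.
#
#   struct ConstantInitializer <: AbstractInitializer
#     value :: Float64
#   end
#   _init_weight(self :: ConstantInitializer, name :: Base.Symbol, array :: NDArray) =
#     (array[:] = self.value)
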
function init(self :: T, name :: Base.Symbol, array :: NDArray) where T<:AbstractInitializer
  strname = string(name)
  if startswith(strname, "upsampling")
    _init_bilinear(self, name, array)
  elseif startswith(strname, "stn_loc") && endswith(strname, "weight")
    _init_zero(self, name, array)
  elseif startswith(strname, "stn_loc") && endswith(strname, "bias")
    _init_loc_bias(self, name, array)
  elseif endswith(strname, "bias")
    _init_bias(self, name, array)
  elseif endswith(strname, "gamma")
    _init_gamma(self, name, array)
  elseif endswith(strname, "beta")
    _init_beta(self, name, array)
  elseif endswith(strname, "weight")
    _init_weight(self, name, array)
  elseif endswith(strname, "moving_mean")
    _init_zero(self, name, array)
  elseif endswith(strname, "moving_var")
    _init_zero(self, name, array)
  else
    _init_default(self, name, array)
  end
end
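
# For example, a parameter named `:fc1_weight` would be routed to `_init_weight`,
# while `:bn1_gamma` and `:bn1_moving_mean` would go to `_init_gamma` and
# `_init_zero`, respectively (the parameter names here are hypothetical).
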
function _init_loc_bias(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
  @assert size(array) == (6,)
  # identity affine transform for the spatial transformer localization bias
  array[:] = [1.0, 0, 0, 0, 1.0, 0]
end
function _init_bilinear(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
  @assert ndims(array) == 4
  W, H, C, N = size(array) # Inverse of NCHW layout
  filter = Base.zeros(eltype(array), W, H)

  @assert H == W

  # Build a bilinear interpolation kernel, as commonly used to initialize
  # upsampling (deconvolution) layers.
  f = ceil(Int, W / 2) # factor
  c = (2 * f - 1 - f % 2) / (2 * f) # center
  for x in 0:(W-1)
    for y in 0:(H-1)
      filter[x+1, y+1] = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
    end
  end

  @nd_as_jl rw=array begin
    for i in 1:N
      for j in 1:C
        array[:, :, j, i] = filter
      end
    end
  end
end
function _init_bias(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
  array[:] = 0
end

function _init_gamma(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
  array[:] = 1
end

function _init_beta(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
  array[:] = 0
end

function _init_zero(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
  array[:] = 0
end

function _init_default(self :: AbstractInitializer, name :: Base.Symbol, array :: NDArray)
  error("Do not know how to init $name")
end
"""
UniformInitializer
Initialize weights according to a uniform distribution within the provided scale.
"""
struct UniformInitializer <: AbstractInitializer
scale :: AbstractFloat
end
"""
UniformInitializer(scale=0.07)
Construct a `UniformInitializer` with the specified scale.
"""
UniformInitializer() = UniformInitializer(0.07)
_init_weight(i::UniformInitializer, name::Symbol, x::NDArray) =
rand!(x, low = -i.scale, high = i.scale)
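
# Usage sketch (illustrative): `UniformInitializer(0.1)` draws each weight uniformly
# from [-0.1, 0.1]; the zero-argument constructor defaults to a scale of 0.07.
#
#   init = UniformInitializer(0.1)
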
"""
NormalInitializer
Initialize weights according to a univariate Gaussian distribution.
"""
struct NormalInitializer <: AbstractInitializer
μ :: AbstractFloat
σ :: AbstractFloat
end
"""
NormalInitializer(; mu=0, sigma=0.01)
Construct a `NormalInitializer` with mean `mu` and variance `sigma`.
"""
NormalInitializer(; mu=0, sigma=0.01) = NormalInitializer(mu, sigma)
_init_weight(i::NormalInitializer, name::Symbol, x::NDArray) =
randn!(x, μ = i.μ, σ = i.σ)
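
# Usage sketch (illustrative): `NormalInitializer(mu = 0, sigma = 0.01)` draws each
# weight from a Gaussian with mean 0 and standard deviation 0.01.
#
#   init = NormalInitializer(sigma = 0.01)
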
@enum XavierDistribution xv_uniform xv_normal
@enum XavierRegularization xv_avg xv_in xv_out
"""
XavierInitializer
The initializer documented in the paper [Bengio and Glorot 2010]: *Understanding
the difficulty of training deep feedforward neuralnetworks*.
There are several different version of the XavierInitializer used in the wild.
The general idea is that the variance of the initialization distribution is controlled
by the dimensionality of the input and output. As a distribution one can either choose
a normal distribution with μ = 0 and σ² or a uniform distribution from -σ to σ.
Several different ways of calculating the variance are given in the literature or are
used by various libraries.
* [Bengio and Glorot 2010]: `mx.XavierInitializer(distribution = mx.xv_uniform, regularization = mx.xv_avg, magnitude = 1)`
* [K. He, X. Zhang, S. Ren, and J. Sun 2015]: `mx.XavierInitializer(distribution = mx.xv_gaussian, regularization = mx.xv_in, magnitude = 2)`
* caffe_avg: `mx.XavierInitializer(distribution = mx.xv_uniform, regularization = mx.xv_avg, magnitude = 3)`
"""
struct XavierInitializer <: AbstractInitializer
distribution :: XavierDistribution
regularization :: XavierRegularization
magnitude :: Float64
end
XavierInitializer(; distribution = xv_uniform, regularization = xv_avg, magnitude = 3.0) =
XavierInitializer(distribution, regularization, magnitude)
function _init_weight(self :: XavierInitializer, name :: Base.Symbol, array :: NDArray)
  dims    = size(array)
  fan_in  = prod(dims[2:end])
  fan_out = dims[1]

  if self.regularization == xv_avg
    factor = (fan_in + fan_out) / 2
  elseif self.regularization == xv_in
    factor = fan_in
  elseif self.regularization == xv_out
    factor = fan_out
  end

  σ = √(self.magnitude / factor)

  if self.distribution == xv_uniform
    rand!(array, low = -σ, high = σ)
  elseif self.distribution == xv_normal
    randn!(array; μ = 0.0, σ = σ)
  end
end
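
# Worked example (illustrative): for a fully-connected weight of size (128, 256),
# i.e. fan_out = 128 and fan_in = 256, the default settings (xv_uniform, xv_avg,
# magnitude = 3.0) give factor = (256 + 128) / 2 = 192 and σ = √(3 / 192) = 0.125,
# so weights are drawn uniformly from [-0.125, 0.125].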