from numpy.typing import ArrayLike

from tricycle import TRICYCLE_CONTEXT
from tricycle.functions import Sigmoid
from tricycle.initialisers import init_xavier
from tricycle.layers import Dense, Layer
from tricycle.optimisers import Optimiser
from tricycle.tensor import Tensor
from tricycle.unary import UnaryMax


class ReLU(Layer):
    def forward(self, x: Tensor):
        return UnaryMax()(x, 0)
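
# A numpy-only sanity check of the definition above (a sketch, kept as a
# comment so nothing runs on import; `np` is plain numpy, not part of
# Tricycle's API):
#
#     import numpy as np
#     x = np.array([-2.0, 0.0, 3.0])
#     np.maximum(x, 0)  # -> array([0., 0., 3.]), i.e. ReLU(x) = max(x, 0)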


class Swish(Layer):
    """
    A Swish activation function. Note: because we have omitted the bias,
    this is equivalent to the SiLU activation function:

    swish(x) = x * sigmoid(x) = x / (1 + e^-x)
    """

    def backward(self, grad: Tensor):
        xp = grad.xp

        # Exponents tend to overflow/underflow when using 16 bit precision
        # so we need to switch to 32 bit
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        # d/dx [x * sigmoid(x)] = (1 + e^-x + x * e^-x) / (1 + e^-x)^2
        exp = xp.exp(-self._input)
        numerator = 1 + exp + self._input * exp
        denominator = (1 + exp) ** 2
        coef = numerator / denominator

        if TRICYCLE_CONTEXT.use_mixed_precision:
            coef = coef.astype(xp.float16)

        return Tensor(grad * coef)

    def forward(self, tensor: Tensor):
        xp = tensor.xp
        self._input = tensor.array

        # Exponents tend to overflow/underflow when using 16 bit precision
        # so we need to switch to 32 bit
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        # compute from the (possibly upcast) input so the 32 bit switch
        # actually applies to the exponent
        out = self._input / (1 + xp.exp(-self._input))

        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float16)
            out = out.astype(xp.float16)

        return Tensor(
            out, args=(tensor,), back_fns=(self.backward,), name="swish"
        )
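
# A finite-difference sketch (numpy only, kept as a comment) checking the
# gradient coefficient used in Swish.backward; `swish` and `dswish` are
# hypothetical helpers for illustration, not part of Tricycle:
#
#     import numpy as np
#     swish = lambda x: x / (1 + np.exp(-x))
#     dswish = lambda x: (1 + np.exp(-x) + x * np.exp(-x)) / (1 + np.exp(-x)) ** 2
#     x, eps = np.linspace(-4, 4, 9), 1e-6
#     numeric = (swish(x + eps) - swish(x - eps)) / (2 * eps)
#     assert np.allclose(numeric, dswish(x), atol=1e-4)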


class GeLU(Layer):
    """
    A GeLU activation function.

    Because the exact version uses erf, which involves an integral,
    we provide a fast tanh approximation of the function:

    gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    """

    CONST_1 = 0.7978845608028654  # sqrt(2 / pi)
    CONST_2 = 0.044715

    def __init__(self, *args, approximate: bool = False, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE: only the tanh approximation is implemented, so this flag is
        # currently unused
        self.approximate = approximate

    def backward(self, grad: Tensor):
        xp = grad.xp

        # Hyperbolic trig functions (cosh and tanh) use exponents under the
        # hood, which can overflow/underflow when using 16 bit precision, so
        # we need to switch to 32 bit precision
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        # d/dx [0.5 * x * (1 + tanh(u))] = 0.5 * (1 + tanh(u) + x * u' / cosh(u)^2)
        # where u = CONST_1 * (x + CONST_2 * x^3), factored differently below
        inner = (
            self.CONST_1 * self._input * (1 + self.CONST_2 * self._input**2)
        )
        coef = (
            self.CONST_1
            * self._input
            * (1 + self.CONST_2 * 3 * self._input**2)
        )
        left = xp.tanh(inner)
        cosh = xp.cosh(inner)
        right = coef / (cosh * cosh)

        if TRICYCLE_CONTEXT.use_mixed_precision:
            left = left.astype(xp.float16)
            right = right.astype(xp.float16)

        self._grad = 0.5 * (1 + left + right) * grad.array
        result = Tensor(
            self._grad,
            is_batched=grad.is_batched,
            requires_grad=grad.requires_grad,
        )
        result.name = "gelu_back"
        return result

    def forward(self, tensor: Tensor):
        xp = tensor.xp
        self._input = tensor.array

        # Tanh tends to overflow/underflow when using 16 bit precision
        # so we need to switch to 32 bit
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        inner = self.CONST_1 * (self._input + self.CONST_2 * self._input**3)
        result = self._input * 0.5 * (1 + xp.tanh(inner))

        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float16)
            result = result.astype(xp.float16)

        result = Tensor(
            result,
            is_batched=tensor.is_batched,
            requires_grad=tensor.requires_grad,
        )
        result.name = "gelu"
        result.args = (tensor,)
        result.back_fns = (self.backward,)
        return result
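
# A numpy-only sketch (kept as a comment) comparing the tanh approximation
# above with the exact erf-based GeLU; `gelu_exact` and `gelu_tanh` are
# hypothetical names for illustration:
#
#     import numpy as np
#     from scipy.special import erf
#     gelu_exact = lambda x: 0.5 * x * (1 + erf(x / np.sqrt(2)))
#     gelu_tanh = lambda x: 0.5 * x * (
#         1 + np.tanh(0.7978845608028654 * (x + 0.044715 * x**3))
#     )
#     x = np.linspace(-5, 5, 1001)
#     np.abs(gelu_exact(x) - gelu_tanh(x)).max()  # small, on the order of 1e-3 or less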


class GLU(Layer):
    """
    A gated linear unit: the input is projected to twice its size, split in
    half, and one half is multiplied by the sigmoid of the other:

    GLU(x) = left * sigmoid(right), where [left, right] = Wx
    """

    linear: Dense

    def __init__(self, size: int, initialiser=init_xavier, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.linear = Dense(size, 2 * size, initialiser)
        self.layers = [self.linear]
        self.sigmoid = Sigmoid()

    def forward(self, x: Tensor):
        x = self.linear(x)
        left, right = x.split(2)
        return left * self.sigmoid(right)

    def update(self, optimiser: Optimiser):
        self.linear.update(optimiser)

    def zero_grad(self):
        self.linear.zero_grad()

    def to_gpu(self):
        self.linear.to_gpu()

    def from_gpu(self):
        self.linear.from_gpu()
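
# A minimal usage sketch (kept as a comment because the exact Tensor
# construction and split axis are assumptions about Tricycle's API rather
# than documented guarantees):
#
#     import numpy as np
#     glu = GLU(size=4)
#     out = glu(Tensor(np.ones(4, dtype=np.float32)))
#     # layer instances are used as callables, as in GLU.forward above;
#     # `out` should have the same shape as the input (2 * size, halved by split)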