-
Notifications
You must be signed in to change notification settings - Fork 237
/
load_boost.py
226 lines (178 loc) · 7.62 KB
/
load_boost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
"""
Provides functions to temporarily boost autoscaler results.
Useful during big service bounces or failovers to preemptively increase capacity.
This works by setting a temporary multiplier on the initial measured load.
The resulting increased capacity is guaranteed until the end of the boost.
If usage gets higher, the pool will behave normally and scale up.
The default duration of the boost is 40 minutes and the default factor is 1.5.
"""
import logging
from collections import namedtuple
from datetime import datetime
from time import time as get_time
from kazoo.client import KazooClient
from kazoo.exceptions import NoNodeError
from paasta_tools.clusterman import get_clusterman_metrics
from paasta_tools.utils import load_system_paasta_config
from paasta_tools.utils import ZookeeperPool
# get_clusterman_metrics() returns a pair; only the first element is used in
# this module. set_boost_factor() checks it for truthiness before using it, so
# it is presumably falsy when clusterman metrics are unavailable - TODO confirm.
clusterman_metrics, __ = get_clusterman_metrics()
# Multiplier applied to the measured load when the caller does not give one.
DEFAULT_BOOST_FACTOR = 1.5
# Default boost length, in minutes (default of `duration_minutes`).
DEFAULT_BOOST_DURATION = 40
# Factors below this value are rejected by set_boost_factor().
MIN_BOOST_FACTOR = 1.0
# Factors above this value are clamped down to it by set_boost_factor().
MAX_BOOST_FACTOR = 3.0
# Durations (in minutes) above this value are clamped down to it.
MAX_BOOST_DURATION = 240
# Snapshot of the boost state stored in ZooKeeper:
#   end_time      - compared against time.time() values to decide if the boost is active
#   boost_factor  - multiplier applied to the load when the boost is activated
#   expected_load - boosted load computed at activation; 0 means "not computed yet"
BoostValues = namedtuple("BoostValues", ["end_time", "boost_factor", "expected_load"])
log = logging.getLogger(__name__)
# Attach a no-op handler to this module's logger.
log.addHandler(logging.NullHandler())
def get_zk_cluster_boost_path(region: str, pool: str) -> str:
    """Build the ZooKeeper path under which a cluster pool's boost is stored.

    The result has the form ``/paasta_cluster_autoscaler/<region>/<pool>/boost``.
    """
    return "/paasta_cluster_autoscaler/{}/{}/boost".format(region, pool)
def get_boosted_load(zk_boost_path: str, current_load: float) -> float:
    """Return the load the autoscaler should use, with any active boost applied.

    While a boost is active, the first call computes
    ``current_load * boost_factor``, stores it in ZooKeeper as the expected
    load, and later calls reuse that stored value. The returned value is never
    lower than ``current_load``. Once the boost has ended, the stored expected
    load is reset to 0 and ``current_load`` is returned unchanged.

    Any exception is logged and swallowed, and ``current_load`` is returned, so
    that a boost problem never blocks the autoscaler.
    """
    try:
        now = get_time()

        with ZookeeperPool() as zk:
            boost = get_boost_values(zk_boost_path, zk)
            expected_load_path = zk_boost_path + "/expected_load"

            if now >= boost.end_time:
                # Boost is over. A non-zero expected load means it has only
                # just finished, so clear the leftover value.
                if boost.expected_load > 0:
                    zk.set(expected_load_path, "0".encode("utf-8"))
                return current_load

            if boost.expected_load != 0:
                # Boost already activated earlier: reuse the stored target.
                expected_load = boost.expected_load
            else:
                # First call since the boost was set: compute and persist the target.
                expected_load = current_load * boost.boost_factor
                log.debug(
                    f"Activating boost, storing expected load: {expected_load} in ZooKeeper"
                )
                zk.ensure_path(expected_load_path)
                zk.set(expected_load_path, str(expected_load).encode("utf-8"))

            # The boost only ever raises the load; real usage above it wins.
            return max(current_load, expected_load)
    except Exception as e:
        # Deliberately broad: never let a boost failure break autoscaling.
        log.error(f"get_boost failed with: {e}")
        return current_load
def get_boost_factor(zk_boost_path: str) -> float:
    """Return the boost factor stored under ``zk_boost_path`` if a boost is active.

    A boost is active while the current time is strictly before its stored end
    time. When no boost is active, 1.0 is returned.
    """
    now = get_time()
    with ZookeeperPool() as zk:
        boost = get_boost_values(zk_boost_path, zk)
        boost_is_active = now < boost.end_time
        return boost.boost_factor if boost_is_active else 1.0
def get_boost_values(zk_boost_path: str, zk: KazooClient) -> BoostValues:
    """Read the boost state stored under ``zk_boost_path``.

    The ``end_time``, ``factor`` and ``expected_load`` child nodes are read in
    that order and parsed as floats. If any of them does not exist, the
    non-boost defaults (end_time=0, boost_factor=1.0, expected_load=0) are
    returned instead.
    """

    def _read_float(node_name: str) -> float:
        # zk.get() returns a tuple whose first element is the raw node data.
        raw = zk.get(zk_boost_path + node_name)[0]
        return float(raw.decode("utf-8"))

    try:
        return BoostValues(
            end_time=_read_float("/end_time"),
            boost_factor=_read_float("/factor"),
            expected_load=_read_float("/expected_load"),
        )
    except NoNodeError:
        # If we can't read boost values from zookeeper, report "no boost".
        return BoostValues(end_time=0, boost_factor=1.0, expected_load=0)
def _send_boost_metrics(
    region: str,
    pool: str,
    factor: float,
    start_time: float,
    end_time: float,
    duration_minutes: int,
) -> None:
    """Write the boost_factor data points for a boost to clusterman metrics."""
    cluster = load_system_paasta_config().get_cluster()
    metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
        region_name=region, app_identifier=pool
    )
    with metrics_client.get_writer(clusterman_metrics.APP_METRICS) as writer:
        metrics_key = clusterman_metrics.generate_key_with_dimensions(
            "boost_factor", {"cluster": cluster, "pool": pool}
        )
        writer.send((metrics_key, start_time, factor))
        if duration_minutes > 0:
            # Record the return to the neutral factor at the end of the boost.
            writer.send((metrics_key, end_time, 1.0))


def set_boost_factor(
    zk_boost_path: str,
    region: str = "",
    pool: str = "",
    send_clusterman_metrics: bool = True,
    factor: float = DEFAULT_BOOST_FACTOR,
    duration_minutes: int = DEFAULT_BOOST_DURATION,
    override: bool = False,
) -> bool:
    """
    Set a boost factor for a path in zk

    Can be used to boost either cluster or service autoscalers.
    If using for cluster you must specify region, pool and set
    send_clusterman_metrics=True so that clusterman metrics are updated
    otherwise just zk_boost_path is enough.

    A factor above MAX_BOOST_FACTOR and a duration above MAX_BOOST_DURATION
    are clamped to those limits, with a warning.

    Returns False, without writing anything to ZooKeeper or to clusterman
    metrics, when the factor is below MIN_BOOST_FACTOR, or when a boost is
    already active and ``override`` is False. Otherwise the boost is written
    and the return value says whether reading it back from ZooKeeper yields
    exactly the values that were written.

    Any exception raised while writing the boost to ZooKeeper is logged and
    re-raised.
    """
    if factor < MIN_BOOST_FACTOR:
        log.error(f"Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}")
        return False

    if factor > MAX_BOOST_FACTOR:
        log.warning(
            "Boost factor {} does not sound reasonable. Defaulting to {}".format(
                factor, MAX_BOOST_FACTOR
            )
        )
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning(
            "Boost duration of {} minutes is too much. Falling back to {}.".format(
                duration_minutes, MAX_BOOST_DURATION
            )
        )
        duration_minutes = MAX_BOOST_DURATION

    current_time = get_time()
    end_time = current_time + 60 * duration_minutes

    zk_end_time_path = zk_boost_path + "/end_time"
    zk_factor_path = zk_boost_path + "/factor"
    zk_expected_load_path = zk_boost_path + "/expected_load"

    with ZookeeperPool() as zk:
        if not override and current_time < get_boost_values(zk_boost_path, zk).end_time:
            log.error("Boost already active. Not overriding.")
            return False

        # Report metrics only once the boost is known to be accepted, so that a
        # rejected request does not record a boost that never took effect. The
        # metrics are still sent before the ZooKeeper write, so a metrics
        # failure leaves ZooKeeper untouched.
        if clusterman_metrics and send_clusterman_metrics:
            _send_boost_metrics(
                region=region,
                pool=pool,
                factor=factor,
                start_time=current_time,
                end_time=end_time,
                duration_minutes=duration_minutes,
            )

        try:
            zk.ensure_path(zk_end_time_path)
            zk.ensure_path(zk_factor_path)
            zk.ensure_path(zk_expected_load_path)
            zk.set(zk_end_time_path, str(end_time).encode("utf-8"))
            zk.set(zk_factor_path, str(factor).encode("utf-8"))
            # 0 marks the expected load as "not computed yet"; it is filled in
            # by the first get_boosted_load() call made while the boost is active.
            zk.set(zk_expected_load_path, "0".encode("utf-8"))
        except Exception:
            log.error("Error setting the boost in Zookeeper")
            raise

        log.info(
            "Load boost: Set capacity boost factor {} at path {} until {}".format(
                factor, zk_boost_path, datetime.fromtimestamp(end_time).strftime("%c")
            )
        )

        # Let's check that this factor has been properly written to zk
        return get_boost_values(zk_boost_path, zk) == BoostValues(
            end_time=end_time, boost_factor=factor, expected_load=0
        )
def clear_boost(
    zk_boost_path: str,
    region: str = "",
    pool: str = "",
    send_clusterman_metrics: bool = True,
) -> bool:
    """Cancel any boost stored under ``zk_boost_path``.

    This overwrites the current boost with a factor of 1 and a duration of 0
    minutes, overriding a boost that is still active. Returns whatever
    set_boost_factor() returns for that request.
    """
    # A factor of 1 with a zero duration ends at the moment it is set.
    neutral_boost = dict(factor=1, duration_minutes=0, override=True)
    return set_boost_factor(
        zk_boost_path,
        region=region,
        pool=pool,
        send_clusterman_metrics=send_clusterman_metrics,
        **neutral_boost,
    )