This repository has been archived by the owner on Jan 14, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
upgrader.go
349 lines (321 loc) · 12.9 KB
/
upgrader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
package dcosupgrade
import (
"encoding/json"
"fmt"
"strings"
"github.com/Azure/dcos-engine/pkg/acsengine"
"github.com/Azure/dcos-engine/pkg/operations"
)
// agentAttr is the "attributes" object of an agent entry as decoded from JSON.
type agentAttr struct {
	// OS is the agent's "os" attribute; runUpgrade treats the value "Windows"
	// as a Windows agent and everything else as a Linux agent.
	OS string `json:"os"`
	// PublicIP is the optional "public_ip" attribute.
	PublicIP string `json:"public_ip,omitempty"`
}
// agentInfo is a single agent entry as decoded from JSON.
type agentInfo struct {
	// ID is the agent's "id" field.
	ID string `json:"id"`
	// Hostname is the agent's "hostname" field; upgradeLinuxAgent uses it as
	// the ssh/scp target when reaching the agent from the master.
	Hostname string `json:"hostname"`
	// Attributes is the agent's "attributes" object.
	Attributes agentAttr `json:"attributes"`
}
// agentList is the top-level JSON document that runUpgrade decodes from the
// output of "curl http://leader.mesos:5050/slaves".
type agentList struct {
	// Agents is the array stored under the "slaves" key.
	Agents []*agentInfo `json:"slaves"`
}
// dcosVersion mirrors the JSON fields this package reads from a node's
// /opt/mesosphere/etc/dcos-version.json file. Only Version is consulted by the
// code in this file (to detect nodes that are already upgraded).
type dcosVersion struct {
	// Version is the "version" field.
	Version string `json:"version,omitempty"`
	// DcosImageCommit is the "dcos-image-commit" field.
	DcosImageCommit string `json:"dcos-image-commit,omitempty"`
	// BootstrapID is the "bootstrap-id" field.
	BootstrapID string `json:"bootstrap-id,omitempty"`
	// DcosVariant is the "dcos-variant" field.
	DcosVariant string `json:"dcos-variant,omitempty"`
}
// bootstrapUpgradeScript is the bash script template that is run on the Linux
// bootstrap node to prepare an upgrade.
//
// runUpgrade substitutes three placeholders before the script is shipped:
// CURR_VERSION (the current DC/OS version), NEW_VERSION (the target version)
// and BOOTSTRAP_URL (where dcos_generate_config.sh is downloaded from).
//
// The script stages ip-detect and config.NEW_VERSION.yaml under
// /opt/azure/dcos/upgrade/NEW_VERSION/genconf, rewrites dns_search from
// /etc/resolv.conf, downloads and runs dcos_generate_config.sh with
// --generate-node-upgrade-script, (re)starts an nginx container serving
// genconf/serve on port 8086, and finally prints a
// "Setting up bootstrap node completed. Node upgrade script URL ..." line,
// which upgradeBootstrapNode parses to obtain the node upgrade script URL.
//
// Every "$" is written as "\$" because upgradeBootstrapNode writes the script
// through an unquoted shell here-document ("cat << END"), where an unescaped
// "$" would be expanded by the shell on the master instead of being written to
// the file.
var bootstrapUpgradeScript = `#!/bin/bash
source /opt/azure/containers/provision_source.sh
echo "Setting up bootstrap node"
rm -rf /opt/azure/dcos/upgrade/NEW_VERSION
mkdir -p /opt/azure/dcos/upgrade/NEW_VERSION/genconf
cp /opt/azure/dcos/genconf/ip-detect /opt/azure/dcos/upgrade/NEW_VERSION/genconf/ip-detect
cp config.NEW_VERSION.yaml /opt/azure/dcos/upgrade/NEW_VERSION/genconf/config.yaml
dns=\$(grep search /etc/resolv.conf | cut -d " " -f 2)
sed -i "/dns_search:/c dns_search: \$dns" /opt/azure/dcos/upgrade/NEW_VERSION/genconf/config.yaml
cd /opt/azure/dcos/upgrade/NEW_VERSION/
retrycmd_if_failure 10 10 120 curl -fsSL -o ./dcos_generate_config.sh BOOTSTRAP_URL
bash ./dcos_generate_config.sh --generate-node-upgrade-script CURR_VERSION | tee /opt/azure/dcos/upgrade/NEW_VERSION/log
process=\$(docker ps -f ancestor=nginx -q)
if [ ! -z "\$process" ]; then
echo "Stopping nginx service \$process"
docker kill \$process
fi
echo "Starting nginx service"
docker run -d -p 8086:80 -v \$PWD/genconf/serve:/usr/share/nginx/html:ro nginx
docker ps
grep 'Node upgrade script URL' /opt/azure/dcos/upgrade/NEW_VERSION/log | awk -F ': ' '{print \$2}' | cat > /opt/azure/dcos/upgrade/NEW_VERSION/upgrade_url
upgrade_url=\$(cat /opt/azure/dcos/upgrade/NEW_VERSION/upgrade_url)
if [ -z \${upgrade_url} ]; then
rm -f /opt/azure/dcos/upgrade/NEW_VERSION/upgrade_url
echo "Failed to set up bootstrap node. Please try again"
exit 1
else
echo "Setting up bootstrap node completed. Node upgrade script URL \${upgrade_url}"
fi
`
// nodeUpgradeScript is the bash script template that is run on each Linux
// master and agent to upgrade that node.
//
// runUpgrade substitutes NEW_VERSION (the target DC/OS version) and
// UPGRADE_SCRIPT_URL (the node upgrade script URL reported by the bootstrap
// node). The script downloads dcos_node_upgrade.sh into
// /opt/azure/dcos/upgrade/NEW_VERSION and executes it. retrycmd_if_failure is
// not defined here; it is presumably provided by the sourced
// provision_source.sh.
var nodeUpgradeScript = `#!/bin/bash
source /opt/azure/containers/provision_source.sh
echo "Starting node upgrade"
mkdir -p /opt/azure/dcos/upgrade/NEW_VERSION
cd /opt/azure/dcos/upgrade/NEW_VERSION
retrycmd_if_failure 10 10 120 curl -fsSL -o ./dcos_node_upgrade.sh UPGRADE_SCRIPT_URL
bash ./dcos_node_upgrade.sh
`
// runUpgrade upgrades the DC/OS cluster described by uc.ClusterTopology to the
// orchestrator version recorded in its data model.
//
// Every remote command is issued over SSH as "azureuser" to the master FQDN on
// port 2200. The steps, in order, are:
//  1. fetch the agent list from http://leader.mesos:5050/slaves;
//  2. copy the cluster SSH key to the master as .ssh/id_rsa_cluster so later
//     steps can ssh/scp from the master to the other nodes;
//  3. prepare the Linux bootstrap node and, when at least one agent reports
//     OS "Windows", the Windows bootstrap node;
//  4. upgrade the masters, then each agent in the order it was listed.
//
// It returns an error if LinuxBootstrapProfile is unset, if Windows agents
// exist but WindowsBootstrapProfile is unset, if the agent list cannot be
// decoded, or as soon as any remote step fails. Nodes upgraded before the
// failure are left upgraded.
func (uc *UpgradeCluster) runUpgrade() error {
	const windowsOS = "Windows"

	props := uc.ClusterTopology.DataModel.Properties
	orchestrator := props.OrchestratorProfile
	if orchestrator.LinuxBootstrapProfile == nil {
		return fmt.Errorf("LinuxBootstrapProfile is not set")
	}
	newVersion := orchestrator.OrchestratorVersion
	masterDNS := acsengine.FormatAzureProdFQDN(props.MasterProfile.DNSPrefix, uc.ClusterTopology.DataModel.Location)

	// get the agents
	strOut, strErr, err := operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, "curl -fsSL http://leader.mesos:5050/slaves")
	if err != nil {
		// Remote stderr is data, not a format string: a stray '%' in it must
		// not be interpreted by the logger.
		uc.Logger.Errorf("%s", strErr)
		return err
	}
	agents := &agentList{}
	if err = json.Unmarshal([]byte(strOut), agents); err != nil {
		return err
	}

	// The Windows bootstrap node is only needed when a Windows agent exists.
	var hasWindowsAgents bool
	for _, agent := range agents.Agents {
		if agent.Attributes.OS == windowsOS {
			hasWindowsAgents = true
			break
		}
	}

	masterCount := props.MasterProfile.Count
	bootstrapIP := orchestrator.LinuxBootstrapProfile.StaticIP
	uc.Logger.Infof("masterDNS:%s masterCount:%d", masterDNS, masterCount)
	uc.Logger.Infof("bootstrapIP:%s", bootstrapIP)

	var winBootstrapIP string
	if hasWindowsAgents {
		if orchestrator.WindowsBootstrapProfile == nil {
			return fmt.Errorf("WindowsBootstrapProfile is not set")
		}
		winBootstrapIP = orchestrator.WindowsBootstrapProfile.StaticIP
		uc.Logger.Infof("Windows bootstrapIP:%s", winBootstrapIP)
	}

	// copy SSH key to master
	uc.Logger.Infof("Copy SSH key to master")
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, fmt.Sprintf("cat << END > .ssh/id_rsa_cluster\n%s\nEND\n", string(uc.SSHKey)))
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return err
	}
	// set SSH key permissions
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, "chmod 600 .ssh/id_rsa_cluster")
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return err
	}

	// upgrade bootstrap node
	// CURR_VERSION is substituted before NEW_VERSION, matching the template's
	// placeholder set; each placeholder is replaced everywhere it occurs.
	bootstrapScript := strings.Replace(bootstrapUpgradeScript, "CURR_VERSION", uc.CurrentDcosVersion, -1)
	bootstrapScript = strings.Replace(bootstrapScript, "NEW_VERSION", newVersion, -1)
	bootstrapScript = strings.Replace(bootstrapScript, "BOOTSTRAP_URL", orchestrator.LinuxBootstrapProfile.BootstrapURL, -1)

	upgradeScriptURL, err := uc.upgradeBootstrapNode(masterDNS, bootstrapIP, bootstrapScript)
	if err != nil {
		return err
	}
	uc.Logger.Infof("upgradeScriptURL %s", upgradeScriptURL)

	if hasWindowsAgents {
		var winUpgradeScriptURL string
		winUpgradeScriptURL, err = uc.upgradeWindowsBootstrapNode(masterDNS, winBootstrapIP, newVersion)
		if err != nil {
			return err
		}
		uc.Logger.Infof("winUpgradeScriptURL %s", winUpgradeScriptURL)
		if err = uc.createWindowsAgentScript(masterDNS, winUpgradeScriptURL, newVersion); err != nil {
			return err
		}
	}

	nodeScript := strings.Replace(nodeUpgradeScript, "NEW_VERSION", newVersion, -1)
	nodeScript = strings.Replace(nodeScript, "UPGRADE_SCRIPT_URL", upgradeScriptURL, -1)

	// upgrade master nodes
	// This also writes node_upgrade.sh on the first master, which
	// upgradeLinuxAgent later copies to each Linux agent.
	if err = uc.upgradeMasterNodes(masterDNS, masterCount, nodeScript); err != nil {
		return err
	}

	// upgrade agent nodes
	for _, agent := range agents.Agents {
		if agent.Attributes.OS == windowsOS {
			err = uc.upgradeWindowsAgent(masterDNS, agent)
		} else {
			err = uc.upgradeLinuxAgent(masterDNS, agent)
		}
		if err != nil {
			return err
		}
	}
	return nil
}
// upgradeBootstrapNode prepares the Linux bootstrap node for the upgrade and
// returns the node upgrade script URL it reports.
//
// masterDNS is the master FQDN used as the SSH jump host (port 2200),
// bootstrapIP is the address of the bootstrap node as reachable from the
// master, and bootstrapScript is the fully substituted bootstrap script.
//
// The script and the bootstrap config (config.<version>.yaml, produced by
// acsengine.GetDCOSBootstrapConfig) are first written on the master through
// shell here-documents, then copied to the bootstrap node with scp, and the
// script is run there with sudo. The URL is taken from the first output line
// that starts with "Setting up bootstrap node completed. Node upgrade script
// URL".
//
// It returns an error if any remote command fails or if no such line with a
// non-empty URL is found in the script output.
func (uc *UpgradeCluster) upgradeBootstrapNode(masterDNS, bootstrapIP, bootstrapScript string) (string, error) {
	// copy bootstrap script to master
	uc.Logger.Infof("Copy bootstrap script to master")
	_, strErr, err := operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, fmt.Sprintf("cat << END > bootstrap_upgrade.sh\n%s\nEND\n", bootstrapScript))
	if err != nil {
		// Remote stderr is data, not a format string: a stray '%' in it must
		// not be interpreted by the logger.
		uc.Logger.Errorf("%s", strErr)
		return "", err
	}
	// set script permissions
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, "chmod 755 ./bootstrap_upgrade.sh")
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return "", err
	}

	// copy bootstrap config to master
	configFilename := fmt.Sprintf("config.%s.yaml", uc.DataModel.Properties.OrchestratorProfile.OrchestratorVersion)
	uc.Logger.Infof("Copy bootstrap config to master")
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, fmt.Sprintf("cat << END > %s\n%s\nEND\n",
		configFilename, acsengine.GetDCOSBootstrapConfig(uc.DataModel)))
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return "", err
	}

	// copy bootstrap script to the bootstrap node
	uc.Logger.Infof("Copy bootstrap script to the bootstrap node")
	cmd := fmt.Sprintf("scp -i .ssh/id_rsa_cluster -o ConnectTimeout=30 -o StrictHostKeyChecking=no bootstrap_upgrade.sh %s:", bootstrapIP)
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, cmd)
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return "", err
	}
	// copy bootstrap config to the bootstrap node
	uc.Logger.Infof("Copy bootstrap config to the bootstrap node")
	cmd = fmt.Sprintf("scp -i .ssh/id_rsa_cluster -o ConnectTimeout=30 -o StrictHostKeyChecking=no %s %s:", configFilename, bootstrapIP)
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, cmd)
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return "", err
	}

	// run bootstrap script
	uc.Logger.Infof("Run bootstrap upgrade script")
	cmd = fmt.Sprintf("ssh -i .ssh/id_rsa_cluster -o ConnectTimeout=30 -o StrictHostKeyChecking=no %s sudo ./bootstrap_upgrade.sh", bootstrapIP)
	strOut, strErr, err := operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, cmd)
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return "", err
	}
	uc.Logger.Info(strOut)

	// retrieve upgrade script URL
	// Only the first matching line is considered, even if its URL is empty.
	const prefix = "Setting up bootstrap node completed. Node upgrade script URL"
	var scriptURL string
	for _, line := range strings.Split(strOut, "\n") {
		if strings.HasPrefix(line, prefix) {
			scriptURL = strings.TrimSpace(strings.TrimPrefix(line, prefix))
			break
		}
	}
	if len(scriptURL) == 0 {
		return "", fmt.Errorf("Undefined upgrade script URL")
	}
	return scriptURL, nil
}
// upgradeMasterNodes upgrades the masters one at a time.
//
// masterDNS is the master FQDN, masterCount the number of masters, and
// nodeScript the fully substituted node upgrade script. Master i (0-based) is
// reached over SSH on port 2200+i of masterDNS.
//
// For each master the current dcos-version.json is read and logged. When the
// cluster's recorded current version differs from the target version and the
// master already reports the target version (a partially completed earlier
// upgrade), the master is skipped. Otherwise node_upgrade.sh is written on the
// master, made executable and run with sudo, and the resulting version file is
// logged. The post-upgrade version is logged only; it is not validated.
//
// It returns the first error from a remote command or from parsing the version
// file; masters processed before the failure are left as they are.
func (uc *UpgradeCluster) upgradeMasterNodes(masterDNS string, masterCount int, nodeScript string) error {
	const versionCmd = "cat /opt/mesosphere/etc/dcos-version.json"

	// run master upgrade script
	catCmd := fmt.Sprintf("cat << END > node_upgrade.sh\n%s\nEND\n", nodeScript)
	for i := 0; i < masterCount; i++ {
		uc.Logger.Infof("Upgrading master node #%d", i+1)
		port := 2200 + i

		// check current version
		strOut, strErr, err := operations.RemoteRun("azureuser", masterDNS, port, uc.SSHKey, versionCmd)
		if err != nil {
			// Remote stderr is data, not a format string: a stray '%' in it
			// must not be interpreted by the logger.
			uc.Logger.Errorf("%s", strErr)
			return err
		}
		uc.Logger.Infof("Current DCOS Version for %s:%d\n%s", masterDNS, port, strings.TrimSpace(strOut))
		dcosVer, err := getDCOSVersion(strOut)
		if err != nil {
			uc.Logger.Errorf("failed to parse dcos-version.json")
			return err
		}

		// partial upgrade case
		targetVersion := uc.ClusterTopology.DataModel.Properties.OrchestratorProfile.OrchestratorVersion
		if uc.CurrentDcosVersion != targetVersion && dcosVer.Version == targetVersion {
			uc.Logger.Infof("Master node is up-to-date. Skipping upgrade")
			continue
		}

		// copy script to the node
		uc.Logger.Infof("Copy script to master node")
		_, strErr, err = operations.RemoteRun("azureuser", masterDNS, port, uc.SSHKey, catCmd)
		if err != nil {
			uc.Logger.Errorf("%s", strErr)
			return err
		}
		// set script permissions
		_, strErr, err = operations.RemoteRun("azureuser", masterDNS, port, uc.SSHKey, "chmod 755 ./node_upgrade.sh")
		if err != nil {
			uc.Logger.Errorf("%s", strErr)
			return err
		}
		// run the script
		uc.Logger.Infof("Run script on master node")
		_, strErr, err = operations.RemoteRun("azureuser", masterDNS, port, uc.SSHKey, "sudo ./node_upgrade.sh")
		if err != nil {
			uc.Logger.Errorf("%s", strErr)
			return err
		}
		// check new version
		strOut, strErr, err = operations.RemoteRun("azureuser", masterDNS, port, uc.SSHKey, versionCmd)
		if err != nil {
			uc.Logger.Errorf("%s", strErr)
			return err
		}
		uc.Logger.Infof("New DCOS Version for %s:%d\n%s", masterDNS, port, strings.TrimSpace(strOut))
	}
	return nil
}
// upgradeLinuxAgent upgrades a single Linux agent.
//
// masterDNS is the master FQDN used as the SSH jump host (port 2200); agent
// identifies the node, and agent.Hostname is the ssh/scp target as seen from
// the master. The function relies on .ssh/id_rsa_cluster and node_upgrade.sh
// already being present in the remote user's home directory on that master.
//
// The agent's dcos-version.json is read and logged. When the cluster's
// recorded current version differs from the target version and the agent
// already reports the target version (a partially completed earlier upgrade),
// nothing else is done. Otherwise node_upgrade.sh is copied to the agent and
// run with sudo, and the resulting version file is logged. The post-upgrade
// version is logged only; it is not validated.
//
// It returns the first error from a remote command or from parsing the version
// file.
func (uc *UpgradeCluster) upgradeLinuxAgent(masterDNS string, agent *agentInfo) error {
	// Options shared by every hop from the master to the agent.
	const hopOpts = "-i .ssh/id_rsa_cluster -o ConnectTimeout=30 -o StrictHostKeyChecking=no"

	uc.Logger.Infof("Upgrading Linux agent %s", agent.Hostname)

	// check current version
	cmdCheckVersion := fmt.Sprintf("ssh %s %s cat /opt/mesosphere/etc/dcos-version.json", hopOpts, agent.Hostname)
	strOut, strErr, err := operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, cmdCheckVersion)
	if err != nil {
		// Remote stderr is data, not a format string: a stray '%' in it must
		// not be interpreted by the logger.
		uc.Logger.Errorf("%s", strErr)
		return err
	}
	uc.Logger.Infof("Current DCOS Version for %s\n%s", agent.Hostname, strings.TrimSpace(strOut))
	dcosVer, err := getDCOSVersion(strOut)
	if err != nil {
		uc.Logger.Errorf("failed to parse dcos-version.json")
		return err
	}

	// partial upgrade case
	targetVersion := uc.ClusterTopology.DataModel.Properties.OrchestratorProfile.OrchestratorVersion
	if uc.CurrentDcosVersion != targetVersion && dcosVer.Version == targetVersion {
		uc.Logger.Infof("Agent node is up-to-date. Skipping upgrade")
		return nil
	}

	// copy script to the node
	uc.Logger.Infof("Copy script to agent %s", agent.Hostname)
	cmd := fmt.Sprintf("scp %s node_upgrade.sh %s:", hopOpts, agent.Hostname)
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, cmd)
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return err
	}
	// run the script
	uc.Logger.Infof("Run script on agent %s", agent.Hostname)
	cmd = fmt.Sprintf("ssh %s %s sudo ./node_upgrade.sh", hopOpts, agent.Hostname)
	_, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, cmd)
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return err
	}
	// check new version
	strOut, strErr, err = operations.RemoteRun("azureuser", masterDNS, 2200, uc.SSHKey, cmdCheckVersion)
	if err != nil {
		uc.Logger.Errorf("%s", strErr)
		return err
	}
	uc.Logger.Infof("New DCOS Version for %s\n%s", agent.Hostname, strings.TrimSpace(strOut))
	return nil
}
// getDCOSVersion decodes data, the JSON text of a dcos-version.json file, into
// a dcosVersion. It returns a nil result together with the json.Unmarshal
// error when data is not valid JSON for that structure.
func getDCOSVersion(data string) (*dcosVersion, error) {
	parsed := new(dcosVersion)
	err := json.Unmarshal([]byte(data), parsed)
	if err != nil {
		return nil, err
	}
	return parsed, nil
}