forked from vitessio/vitess
-
Notifications
You must be signed in to change notification settings - Fork 0
/
restore.go
177 lines (155 loc) · 6.96 KB
/
restore.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// Copyright 2015, Google Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package tabletmanager
import (
"flag"
"fmt"
log "github.com/golang/glog"
"golang.org/x/net/context"
"github.com/youtube/vitess/go/mysqlconn/replication"
"github.com/youtube/vitess/go/vt/logutil"
"github.com/youtube/vitess/go/vt/mysqlctl"
"github.com/youtube/vitess/go/vt/topo/topoproto"
topodatapb "github.com/youtube/vitess/go/vt/proto/topodata"
)
// This file handles the initial backup restore upon startup.
// It is only enabled if restore_from_backup is set.
// Command-line flags controlling the restore performed at tablet startup.
var (
// restoreFromBackup is the boolean "restore_from_backup" flag (default
// false). It is not read anywhere in this part of the file.
restoreFromBackup = flag.Bool("restore_from_backup", false, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there")
// restoreConcurrency is the integer "restore_concurrency" flag (default 4).
// Its value is handed to mysqlctl.Restore by restoreDataLocked.
restoreConcurrency = flag.Int("restore_concurrency", 4, "(init restore parameter) how many concurrent files to restore at once")
)
// RestoreData is the exported entry point for restoring this tablet from a
// backup. It holds agent.actionMutex for the whole operation so that no RPC
// interferes, and delegates the actual work to restoreDataLocked.
//
// The result is nil when the restore worked or failed gracefully, and a
// non-nil error when the failure is non-recoverable.
func (agent *ActionAgent) RestoreData(ctx context.Context, logger logutil.Logger, deleteBeforeRestore bool) error {
	agent.actionMutex.Lock()
	defer agent.actionMutex.Unlock()

	restoreErr := agent.restoreDataLocked(ctx, logger, deleteBeforeRestore)
	return restoreErr
}
// restoreDataLocked does the actual work of RestoreData, which calls it with
// agent.actionMutex held.
//
// In order, it:
//   - switches the tablet type to RESTORE in the topology and refreshes the
//     agent's internal state;
//   - runs mysqlctl.Restore for the tablet's keyspace/shard directory;
//   - on a successful restore, sets up replication from the restored position
//     (see startReplication);
//   - writes the tablet type back (the original type, or the type parsed from
//     *initTabletType when the original type was BACKUP or RESTORE) and
//     refreshes the agent's internal state again.
//
// mysqlctl.ErrNoBackup and mysqlctl.ErrExistingDB are not treated as
// failures. Any other restore error is returned immediately, leaving the
// tablet with type RESTORE.
func (agent *ActionAgent) restoreDataLocked(ctx context.Context, logger logutil.Logger, deleteBeforeRestore bool) error {
	// Change type to RESTORE (using UpdateTabletFields so it's always
	// authorized), remembering the previous type so it can be put back later.
	var originalType topodatapb.TabletType
	if _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.TabletAlias, func(tablet *topodatapb.Tablet) error {
		originalType = tablet.Type
		tablet.Type = topodatapb.TabletType_RESTORE
		return nil
	}); err != nil {
		return fmt.Errorf("Cannot change type to RESTORE: %v", err)
	}

	// Let's update our internal state (stop query service and other things).
	if err := agent.refreshTablet(ctx, "restore from backup"); err != nil {
		return fmt.Errorf("failed to update state before restore: %v", err)
	}

	// Try to restore. Depending on the reason for failure, we may be ok.
	// If we're not ok, return an error and the agent will log.Fatalf,
	// causing the process to be restarted and the restore retried.

	// Record local metadata values based on the original type.
	localMetadata := agent.getLocalMetadataValues(originalType)
	tablet := agent.Tablet()
	dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard)
	pos, err := mysqlctl.Restore(ctx, agent.MysqlDaemon, dir, *restoreConcurrency, agent.hookExtraEnv(), localMetadata, logger, deleteBeforeRestore, topoproto.TabletDbName(tablet))
	switch err {
	case nil:
		// Starting from here we won't be able to recover if we get stopped
		// by a cancelled context. Thus we use the background context to get
		// through to the finish.

		// Reconnect to master.
		if err := agent.startReplication(context.Background(), pos, originalType); err != nil {
			return err
		}
	case mysqlctl.ErrNoBackup:
		// No-op, starting with empty database.
	case mysqlctl.ErrExistingDB:
		// No-op, assuming we've just restarted. Note the replication
		// reporter may restart replication at the next health check if it
		// thinks it should. We do not alter replication here.
	default:
		return fmt.Errorf("Can't restore backup: %v", err)
	}

	// If we had type BACKUP or RESTORE it's better to set our type to the
	// init_tablet_type to make result of the restore similar to completely
	// clean start from scratch.
	if (originalType == topodatapb.TabletType_BACKUP || originalType == topodatapb.TabletType_RESTORE) && *initTabletType != "" {
		initType, err := topoproto.ParseTabletType(*initTabletType)
		if err != nil {
			// Falling back to the original type is deliberate best-effort,
			// but report the problem instead of dropping it silently.
			logger.Warningf("Cannot parse init_tablet_type %q, keeping tablet type %v after restore: %v", *initTabletType, originalType, err)
		} else {
			originalType = initType
		}
	}

	// Change type back to original type if we're ok to serve.
	if _, err := agent.TopoServer.UpdateTabletFields(context.Background(), tablet.Alias, func(tablet *topodatapb.Tablet) error {
		tablet.Type = originalType
		return nil
	}); err != nil {
		return fmt.Errorf("Cannot change type back to %v: %v", originalType, err)
	}

	// Let's update our internal state (start query service and other things).
	if err := agent.refreshTablet(context.Background(), "after restore from backup"); err != nil {
		return fmt.Errorf("failed to update state after restore: %v", err)
	}

	return nil
}
// startReplication sets the slave position to pos and, when the shard record
// names a master other than this tablet, points MySQL at that master and
// issues START SLAVE.
//
// tabletType is only used for the agent.fixSemiSync call made before
// connecting to the master.
//
// It returns nil without starting replication (after logging a warning) when
// the shard has no master or when the master record still points to this
// tablet; in both cases the slave position has already been set.
func (agent *ActionAgent) startReplication(ctx context.Context, pos replication.Position, tabletType topodatapb.TabletType) error {
	// Set the position at which to resume from the master.
	positionCmds, err := agent.MysqlDaemon.SetSlavePositionCommands(pos)
	if err != nil {
		return err
	}
	if execErr := agent.MysqlDaemon.ExecuteSuperQueryList(ctx, positionCmds); execErr != nil {
		return fmt.Errorf("failed to set slave position: %v", execErr)
	}

	// Read the shard to find the current master, and its location.
	self := agent.Tablet()
	shardInfo, err := agent.TopoServer.GetShard(ctx, self.Keyspace, self.Shard)
	if err != nil {
		return fmt.Errorf("can't read shard: %v", err)
	}

	switch {
	case shardInfo.MasterAlias == nil:
		// We've restored, but there's no master. This is fine, since we've
		// already set the position at which to resume when we're later
		// reparented. If we had instead considered this fatal, all tablets
		// would crash-loop until a master appears, which would make it
		// impossible to elect a master.
		log.Warningf("Can't start replication after restore: shard %v/%v has no master.", self.Keyspace, self.Shard)
		return nil
	case topoproto.TabletAliasEqual(shardInfo.MasterAlias, self.Alias):
		// We used to be the master before we got restarted in an empty data
		// dir, and no other master has been elected in the meantime. This
		// shouldn't happen, so we'll let the operator decide which tablet
		// should actually be promoted to master.
		log.Warningf("Can't start replication after restore: master record still points to this tablet.")
		return nil
	}

	masterTablet, err := agent.TopoServer.GetTablet(ctx, shardInfo.MasterAlias)
	if err != nil {
		return fmt.Errorf("Cannot read master tablet %v: %v", shardInfo.MasterAlias, err)
	}

	// If using semi-sync, we need to enable it before connecting to master.
	if semiSyncErr := agent.fixSemiSync(tabletType); semiSyncErr != nil {
		return semiSyncErr
	}

	// Set master and start slave.
	masterCmds, err := agent.MysqlDaemon.SetMasterCommands(masterTablet.Hostname, int(masterTablet.PortMap["mysql"]))
	if err != nil {
		return fmt.Errorf("MysqlDaemon.SetMasterCommands failed: %v", err)
	}
	masterCmds = append(masterCmds, "START SLAVE")
	if execErr := agent.MysqlDaemon.ExecuteSuperQueryList(ctx, masterCmds); execErr != nil {
		return fmt.Errorf("failed to start replication: %v", execErr)
	}

	return nil
}
// getLocalMetadataValues builds the string key/value pairs that
// restoreDataLocked hands to mysqlctl.Restore as local metadata.
//
// The map always has the keys "Alias", "ClusterAlias" ("<keyspace>.<shard>"),
// "DataCenter" (the cell of the tablet alias) and "PromotionRule". The
// promotion rule is "neutral" when isMasterEligible(tabletType) reports true
// and "must_not" otherwise.
func (agent *ActionAgent) getLocalMetadataValues(tabletType topodatapb.TabletType) map[string]string {
	self := agent.Tablet()

	promotionRule := "must_not"
	if isMasterEligible(tabletType) {
		promotionRule = "neutral"
	}

	return map[string]string{
		"Alias":         topoproto.TabletAliasString(self.Alias),
		"ClusterAlias":  fmt.Sprintf("%s.%s", self.Keyspace, self.Shard),
		"DataCenter":    self.Alias.Cell,
		"PromotionRule": promotionRule,
	}
}