Minimal cluster recovery #149

Merged Jul 3, 2024 (14 commits)
40 changes: 40 additions & 0 deletions cluster/recover.go
@@ -0,0 +1,40 @@
package cluster

import (
"fmt"

"github.com/canonical/go-dqlite"
dqliteClient "github.com/canonical/go-dqlite/client"
)

// DqliteMember is the information that can be derived locally about a cluster
// member without access to the dqlite database.
type DqliteMember struct {
// dqlite.NodeInfo fields
DqliteID uint64 `json:"id" yaml:"id"`
Address string `json:"address" yaml:"address"`
Role string `json:"role" yaml:"role"`

Name string `json:"name" yaml:"name"`
}

// NodeInfo is used for interop with go-dqlite.
func (m DqliteMember) NodeInfo() (*dqlite.NodeInfo, error) {
var role dqliteClient.NodeRole
switch m.Role {
case "voter":
role = dqliteClient.Voter
case "stand-by":
role = dqliteClient.StandBy
case "spare":
role = dqliteClient.Spare
default:
return nil, fmt.Errorf("invalid dqlite role %q", m.Role)
}

return &dqlite.NodeInfo{
ID: m.DqliteID,
Role: role,
Address: m.Address,
}, nil
}
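For context, DqliteMember is the YAML-facing description of a member that the recovery flow round-trips, and NodeInfo() maps it onto go-dqlite's node type. A minimal sketch of using the conversion; the member values and the main scaffolding are illustrative only:

package main

import (
	"fmt"

	"github.com/canonical/microcluster/cluster"
)

func main() {
	// Illustrative member; the ID, name and address are placeholders.
	member := cluster.DqliteMember{
		DqliteID: 1,
		Name:     "c1",
		Address:  "127.0.0.1:9001",
		Role:     "voter",
	}

	// NodeInfo rejects any role other than "voter", "stand-by" or "spare".
	info, err := member.NodeInfo()
	if err != nil {
		fmt.Println("invalid role:", err)
		return
	}

	fmt.Printf("dqlite node %d at %s (role %v)\n", info.ID, info.Address, info.Role)
}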
143 changes: 141 additions & 2 deletions example/cmd/microctl/cluster_members.go
@@ -1,15 +1,48 @@
package main

import (
"bufio"
"context"
"fmt"
"io"
"os"
"sort"
"strconv"
"strings"

"github.com/canonical/lxd/shared"
cli "github.com/canonical/lxd/shared/cmd"
"github.com/canonical/lxd/shared/termios"
"github.com/spf13/cobra"
"golang.org/x/sys/unix"
"gopkg.in/yaml.v2"

"github.com/canonical/microcluster/client"
"github.com/canonical/microcluster/cluster"
"github.com/canonical/microcluster/microcluster"
)

const recoveryConfirmation = `You should only run this command if:
- A quorum of cluster members is permanently lost
- You are *absolutely* sure all microd instances are stopped
- This instance has the most up to date database

Do you want to proceed? (yes/no): `

const recoveryYamlComment = `# Member roles can be modified. Unrecoverable nodes should be given the role "spare".
#
# "voter" - Voting member of the database. A majority of voters is a quorum.
# "stand-by" - Non-voting member of the database; can be promoted to voter.
# "spare" - Not a member of the database.
#
# The edit is aborted if:
# - the number of members changes
# - the name of any member changes
# - the ID of any member changes
# - the address of any member changes
# - no changes are made
`

type cmdClusterMembers struct {
common *CmdControl
}
@@ -27,6 +60,9 @@ func (c *cmdClusterMembers) command() *cobra.Command {
var cmdList = cmdClusterMembersList{common: c.common}
cmd.AddCommand(cmdList.command())

var cmdRestore = cmdClusterEdit{common: c.common}
cmd.AddCommand(cmdRestore.command())

return cmd
}

@@ -36,6 +72,9 @@ func (c *cmdClusterMembers) run(cmd *cobra.Command, args []string) error {

type cmdClusterMembersList struct {
common *CmdControl

flagLocal bool
flagFormat string
}

func (c *cmdClusterMembersList) command() *cobra.Command {
@@ -45,6 +84,9 @@ func (c *cmdClusterMembersList) command() *cobra.Command {
RunE: c.run,
}

cmd.Flags().BoolVarP(&c.flagLocal, "local", "l", false, "display the locally available cluster info (no database query)")
cmd.Flags().StringVarP(&c.flagFormat, "format", "f", cli.TableFormatTable, "Format (csv|json|table|yaml|compact)")

return cmd
}

@@ -59,6 +101,10 @@ func (c *cmdClusterMembersList) run(cmd *cobra.Command, args []string) error {
return err
}

if c.flagLocal {
return c.listLocalClusterMembers(m)
}

var client *client.Client

// Get a local client connected to the unix socket if no address is specified.
@@ -74,7 +120,11 @@ func (c *cmdClusterMembersList) run(cmd *cobra.Command, args []string) error {
}
}

clusterMembers, err := client.GetClusterMembers(cmd.Context())
return c.listClusterMembers(cmd.Context(), client)
}

func (c *cmdClusterMembersList) listClusterMembers(ctx context.Context, client *client.Client) error {
clusterMembers, err := client.GetClusterMembers(ctx)
if err != nil {
return err
}
@@ -87,7 +137,24 @@ func (c *cmdClusterMembersList) run(cmd *cobra.Command, args []string) error {
header := []string{"NAME", "ADDRESS", "ROLE", "CERTIFICATE", "STATUS"}
sort.Sort(cli.SortColumnsNaturally(data))

return cli.RenderTable(cli.TableFormatTable, header, data, clusterMembers)
return cli.RenderTable(c.flagFormat, header, data, clusterMembers)
}

func (c *cmdClusterMembersList) listLocalClusterMembers(m *microcluster.MicroCluster) error {
members, err := m.GetDqliteClusterMembers()
if err != nil {
return err
}

data := make([][]string, len(members))
for i, member := range members {
data[i] = []string{strconv.FormatUint(member.DqliteID, 10), member.Name, member.Address, member.Role}
}

header := []string{"ID", "NAME", "ADDRESS", "ROLE"}
sort.Sort(cli.SortColumnsNaturally(data))

return cli.RenderTable(c.flagFormat, header, data, members)
}

type cmdClusterMemberRemove struct {
@@ -130,3 +197,75 @@ func (c *cmdClusterMemberRemove) run(cmd *cobra.Command, args []string) error {

return nil
}

type cmdClusterEdit struct {
common *CmdControl
}

func (c *cmdClusterEdit) command() *cobra.Command {
cmd := &cobra.Command{
Use: "edit",
Short: "Recover the cluster from this node if quorum is lost",
RunE: c.run,
}

return cmd
}

func (c *cmdClusterEdit) run(cmd *cobra.Command, args []string) error {
m, err := microcluster.App(microcluster.Args{StateDir: c.common.FlagStateDir, Verbose: c.common.FlagLogVerbose, Debug: c.common.FlagLogDebug})
if err != nil {
return err
}

members, err := m.GetDqliteClusterMembers()
if err != nil {
return err
}

membersYaml, err := yaml.Marshal(members)
if err != nil {
return err
}

var content []byte
if !termios.IsTerminal(unix.Stdin) {
content, err = io.ReadAll(os.Stdin)
if err != nil {
return err
}
} else {
reader := bufio.NewReader(os.Stdin)
fmt.Print(recoveryConfirmation)

input, _ := reader.ReadString('\n')
input = strings.TrimSuffix(input, "\n")

if strings.ToLower(input) != "yes" {
fmt.Println("Cluster edit aborted; no changes made")
return nil
}

content, err = shared.TextEditor("", append([]byte(recoveryYamlComment), membersYaml...))
if err != nil {
return err
}
}

newMembers := []cluster.DqliteMember{}
err = yaml.Unmarshal(content, &newMembers)
if err != nil {
return err
}

tarballPath, err := m.RecoverFromQuorumLoss(newMembers)
if err != nil {
return fmt.Errorf("cluster edit: %w", err)
}

fmt.Printf("Cluster changes applied; new database state saved to %s\n\n", tarballPath)
fmt.Printf("*Before* starting any cluster member, copy %s to %s on all remaining cluster members.\n\n", tarballPath, tarballPath)
fmt.Printf("microd will load this file during startup.\n")

return nil
}
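The same recovery flow can also be driven without the interactive editor. Below is a sketch built only from the API used above (microcluster.App, GetDqliteClusterMembers, RecoverFromQuorumLoss); the state directory, the surviving member name and the role assignments are placeholders, not a recommendation:

package main

import (
	"fmt"
	"log"

	"github.com/canonical/microcluster/microcluster"
)

func main() {
	m, err := microcluster.App(microcluster.Args{StateDir: "/var/lib/microd"})
	if err != nil {
		log.Fatal(err)
	}

	// Locally known membership; no running daemon or database access needed.
	members, err := m.GetDqliteClusterMembers()
	if err != nil {
		log.Fatal(err)
	}

	// Keep this node as a voter and mark every other member unrecoverable.
	for i := range members {
		if members[i].Name == "c1" {
			members[i].Role = "voter"
		} else {
			members[i].Role = "spare"
		}
	}

	tarballPath, err := m.RecoverFromQuorumLoss(members)
	if err != nil {
		log.Fatalf("cluster recovery: %v", err)
	}

	// Copy this tarball to the same path on all remaining members before
	// starting any of them, as the CLI output above instructs.
	fmt.Println("recovery database written to", tarballPath)
}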
73 changes: 62 additions & 11 deletions example/test/main.sh
@@ -1,5 +1,7 @@
#!/bin/bash

set -e

cluster_flags=()

if [ -n "${DEBUG:-}" ]; then
@@ -18,7 +20,7 @@ if [ -d "${test_dir}" ]; then
rm -r "${test_dir}"
fi

members=("c1" "c2" "c3")
members=("c1" "c2" "c3" "c4" "c5")

for member in "${members[@]}"; do
state_dir="${test_dir}/${member}"
@@ -30,27 +32,76 @@ done
# Ensure two daemons cannot start in the same state dir
! microd --state-dir "${test_dir}/c1" "${cluster_flags[@]}"

# Ensure only valid member names are used
# Ensure only valid member names are used for bootstrap
! microctl --state-dir "${test_dir}/c1" init "c/1" 127.0.0.1:9001 --bootstrap

microctl --state-dir "${test_dir}/c1" init "c1" 127.0.0.1:9001 --bootstrap

# Ensure only valid member names are used
# Ensure only valid member names are used for join
token_node2=$(microctl --state-dir "${test_dir}/c1" tokens add "c/2")
! microctl --state-dir "${test_dir}/c1" init "c/2" 127.0.0.1:9003 --token "${token_node2}"
! microctl --state-dir "${test_dir}/c1" init "c/2" 127.0.0.1:9002 --token "${token_node2}"

indx=2
for member in "${members[@]:1}"; do
token=$(microctl --state-dir "${test_dir}/c1" tokens add "${member}")

microctl --state-dir "${test_dir}/${member}" init "${member}" "127.0.0.1:900${indx}" --token "${token}"

indx=$((indx + 1))
done

token_node2=$(microctl --state-dir "${test_dir}/c1" tokens add "c2")
token_node3=$(microctl --state-dir "${test_dir}/c1" tokens add "c3")
# dqlite takes a while to form the cluster and assign roles to each node, and
# microcluster takes a while to update the core_cluster_members table
while [[ -n "$(microctl --state-dir "${test_dir}/c1" cluster list -f yaml | yq '.[] | select(.role == "PENDING")')" ]]; do
sleep 2
done

microctl --state-dir "${test_dir}/c2" init "c2" 127.0.0.1:9002 --token "${token_node2}"
microctl --state-dir "${test_dir}/c3" init "c3" 127.0.0.1:9003 --token "${token_node3}"
microctl --state-dir "${test_dir}/c1" cluster list

# Clean up
if [ -n "${CLUSTER_INSPECT:-}" ]; then
echo "Pausing to inspect... press enter when done"
read -r
fi

kill %1
kill %2
kill %3
for member in "${members[@]}"; do
microctl --state-dir "${test_dir}/${member}" shutdown
done

# The cluster doesn't always shut down right away; this is fine since we're
# doing recovery next
for jobnum in {1..5}; do
kill -9 %"${jobnum}"
done

microctl --state-dir "${test_dir}/c1" cluster list --local --format yaml |
yq '
sort_by(.name) |
.[0].role = "voter" |
.[1].role = "voter" |
.[2].role = "spare" |
.[3].role = "spare" |
.[4].role = "spare"' |
microctl --state-dir "${test_dir}/c1" cluster edit

cp "${test_dir}/c1/recovery_db.tar.gz" "${test_dir}/c2/"

for member in c1 c2; do
state_dir="${test_dir}/${member}"
microd --state-dir "${state_dir}" "${cluster_flags[@]}" > /dev/null 2>&1 &
done

# microcluster takes a long time to update the member roles in the core_cluster_members table
sleep 90

microctl --state-dir "${test_dir}/c1" cluster list

[[ $(microctl --state-dir "${test_dir}/c1" cluster list -f yaml | yq '.[] | select(.clustermemberlocal.name == "c1").role') == "voter" ]]
[[ $(microctl --state-dir "${test_dir}/c1" cluster list -f yaml | yq '.[] | select(.clustermemberlocal.name == "c2").role') == "voter" ]]
[[ $(microctl --state-dir "${test_dir}/c1" cluster list -f yaml | yq '.[] | select(.clustermemberlocal.name == "c3").role') == "spare" ]]
[[ $(microctl --state-dir "${test_dir}/c1" cluster list -f yaml | yq '.[] | select(.clustermemberlocal.name == "c4").role') == "spare" ]]
[[ $(microctl --state-dir "${test_dir}/c1" cluster list -f yaml | yq '.[] | select(.clustermemberlocal.name == "c5").role') == "spare" ]]

echo "Tests passed"

kill 0
6 changes: 6 additions & 0 deletions internal/daemon/daemon.go
@@ -27,6 +27,7 @@ import (
"github.com/canonical/microcluster/internal/db"
"github.com/canonical/microcluster/internal/endpoints"
"github.com/canonical/microcluster/internal/extensions"
"github.com/canonical/microcluster/internal/recover"
internalREST "github.com/canonical/microcluster/internal/rest"
internalClient "github.com/canonical/microcluster/internal/rest/client"
"github.com/canonical/microcluster/internal/rest/resources"
@@ -127,6 +128,11 @@ func (d *Daemon) Run(ctx context.Context, listenPort string, stateDir string, so
return fmt.Errorf("Control socket already present (%q); is another daemon already running?", d.os.ControlSocketPath())
}

err = recover.MaybeUnpackRecoveryTarball(d.os)
if err != nil {
return fmt.Errorf("Database recovery failed: %w", err)
}

d.extensionServers = extensionServers

err = d.init(listenPort, extensionsSchema, apiExtensions, hooks)
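On startup the daemon now attempts to apply any pending recovery state before initialising. The real implementation lives in internal/recover and is not part of this diff; the following is only a rough, self-contained sketch of the behaviour implied by this call and by the recovery_db.tar.gz used in the test script, with hypothetical path handling:

package main

import (
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
)

// maybeUnpackRecoveryTarball is a hypothetical stand-in for
// recover.MaybeUnpackRecoveryTarball: if recovery_db.tar.gz is present in the
// state directory, unpack it over the database directory and remove it so it
// is applied only once.
func maybeUnpackRecoveryTarball(stateDir string) error {
	tarball := filepath.Join(stateDir, "recovery_db.tar.gz")

	if _, err := os.Stat(tarball); errors.Is(err, os.ErrNotExist) {
		// Common case: nothing to recover.
		return nil
	} else if err != nil {
		return err
	}

	// The "database" subdirectory is an assumption for this sketch.
	databaseDir := filepath.Join(stateDir, "database")

	// Shelling out to tar keeps the sketch short; the real code may differ.
	if err := exec.Command("tar", "-xzf", tarball, "-C", databaseDir).Run(); err != nil {
		return fmt.Errorf("unpack recovery tarball: %w", err)
	}

	return os.Remove(tarball)
}

func main() {
	if err := maybeUnpackRecoveryTarball("/var/lib/microd"); err != nil {
		fmt.Fprintln(os.Stderr, "Database recovery failed:", err)
		os.Exit(1)
	}
}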