Implement the ceph osd destroy feature in ceph-deploy. #254

Closed
Changes from 1 commit
233 changes: 232 additions & 1 deletion ceph_deploy/osd.py
@@ -10,7 +10,7 @@
from cStringIO import StringIO

from ceph_deploy import conf, exc, hosts
from ceph_deploy.util import constants, system
from ceph_deploy.util import constants, system, paths
from ceph_deploy.cliutil import priority
from ceph_deploy.lib import remoto

@@ -388,6 +388,228 @@ def activate(args, cfg):
    distro.conn.exit()


def destroy(args):
    cluster = args.cluster

    for hostname, disk, journal in args.disk:
        if not hostname:
            raise RuntimeError('invalid hostname')
        LOG.debug(
            'Destroying osd id %s on host %s',
            args.osd_id, hostname
        )

        distro = hosts.get(hostname, username=args.username)
        LOG.info(
            'Distro info: %s %s %s',
            distro.name,
            distro.release,
            distro.codename
        )

        destroy_osd(distro, cluster, args.osd_id)


def destroy_osd(distro, cluster, osd_id):

    cluster_path = paths.osd.base(cluster)
    conn = distro.conn

    if not osd_id:
        LOG.info("(NOT IMPLEMENTED) remove all osds")
        raise NotImplementedError("A specific osd id must be given.")

    command = [
        'ceph',
        '--cluster={cluster}'.format(cluster=cluster),
        'osd',
        'tree',
        '--format=json',
    ]
Contributor
After looking at this for a bit, I have a few suggestions/requests. Starting here: I don't think we need to call ceph osd tree explicitly, because this osd.py source file already contains an osd_tree() function that can be re-used.

I also think it would be worthwhile to look at the code in osd_list(), specifically the part about gathering the osd tree from a monitor node. This code, as written, goes to each server hosting the OSD and calls "ceph osd tree". However, it is possible that OSD nodes do not have the cephx admin key that allows that command to work. It would be better to gather the osd tree once from a monitor node up in destroy(), before going into the loop.

Contributor Author

I completely agree with your opinion. It sounds more sensible to reuse the osd_tree() function.

OK, I think I will check osd_list() and osd_tree() first. So the better way would be to gather the osd tree once from a monitor node when destroy() is called, via osd_list()/osd_tree()?
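
For reference, a rough sketch of the refactor being discussed (hypothetical, not part of this change; it assumes osd_tree(conn, cluster) returns the parsed JSON tree the way osd_list() consumes it, and the mon.get_mon_initial_members() call copies the monitor-lookup pattern the reviewer points to in osd_list()):

    def destroy(args, cfg):
        cluster = args.cluster

        # Gather the osd tree once, from a monitor node that holds the
        # cephx admin key, instead of running "ceph osd tree" on every
        # OSD host.
        monitors = mon.get_mon_initial_members(args, error_on_empty=True, _cfg=cfg)
        distro = hosts.get(monitors[0], username=args.username)
        tree = osd_tree(distro.conn, cluster)
        distro.conn.exit()

        for hostname, disk, journal in args.disk:
            distro = hosts.get(hostname, username=args.username)
            # destroy_osd would then take the pre-fetched tree instead of
            # shelling out to "ceph osd tree" itself.
            destroy_osd(distro, cluster, args.osd_id, tree)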


    out, err, code = remoto.process.check(
        conn,
        command,
    )

    LOG.info('searching for the osd in the acting set...')
    des_osd_in_act_set(out, distro, osd_id, cluster_path)

    LOG.info('searching for the osd in the nonacting set...')
    des_osd_in_nonact_set(out, distro, osd_id, cluster_path)

    conn.exit()


def des_osd_in_act_set(out, distro, osd_id, cluster_path):
    osd_name = 'osd.%s' % osd_id
    conn = distro.conn
    found_in_node = False
    try:
        loaded_json = json.loads(''.join(out))
        for item in loaded_json['nodes']:
            if item[u'name'] == osd_name and item[u'type'] == u'osd':
                found_in_node = True
                LOG.info(
                    'Found the osd id %s!',
                    osd_id
                )
                takeout_osd(conn, osd_id)
                ret = stopping_osd(loaded_json, distro, osd_id)
                if ret:
                    removing_osd(conn, osd_id, cluster_path)
                    conn.exit()
                    sys.exit(0)
                else:
                    LOG.debug('CAN NOT STOP CEPH OSD %s', osd_id)
                    conn.exit()
                    sys.exit(1)
        if not found_in_node:
            LOG.info(
                'Could not find the suitable osd id %s in the current acting set, '
                'trying "stray"', osd_id
            )
            return False
    except ValueError:
        conn.exit()
        return False


def des_osd_in_nonact_set(out, distro, osd_id, cluster_path):
    osd_name = 'osd.%s' % osd_id
    conn = distro.conn
    found_in_stray = False
    try:
        loaded_json = json.loads(''.join(out))
        for item in loaded_json['stray']:
            if item[u'name'] == osd_name and item[u'type'] == u'osd':
                found_in_stray = True
                LOG.info(
                    'Found the osd id %s!',
                    osd_id
                )
                takeout_osd(conn, osd_id)
                ret = stopping_osd(loaded_json, distro, osd_id)
                if not ret:
                    LOG.debug('THIS OSD %s IS NOT UP', osd_id)
                removing_osd(conn, osd_id, cluster_path)
                conn.exit()
                sys.exit(0)
        if not found_in_stray:
            LOG.info(
                'Could not find the suitable osd id %s in "stray". ABANDON!',
                osd_id
            )
            return False
    except ValueError:
        conn.exit()
        return False


def takeout_osd(conn, osd_id):
    command = [
        'ceph',
        'osd',
        'out',
        osd_id,
    ]

    remoto.process.run(
        conn,
        command,
    )


def stopping_osd(loaded_json, distro, osd_id):

    osd_name = u'osd.%s' % osd_id
    conn = distro.conn

    for item in loaded_json['nodes']:
        if item[u'name'] == osd_name and item[u'status'] == u'down':
            LOG.info('OSD already down.')
            return True

    # With different distros, try to handle differently #
    if distro.normalized_name.startswith(('centos', 'red')):
        command = [
            '/etc/init.d/ceph',
            'stop',
            'osd.%s' % osd_id,
        ]
    else:
        command = [
            'stop',
            'ceph-osd',
            'id=%s' % osd_id,
        ]

    out, err, code = remoto.process.check(
        conn,
        command,
    )

    # Check the output first; empty output means the stop failed.
    if not out:
        return False
    if out[0] == 'ceph-osd stop/waiting':
        return True
    if len(out) > 1 and 'done' in out[1].split(' ')[-1]:
        return True
    return False


def removing_osd(conn, osd_id, cluster_path):
    command = [
        'ceph',
        'osd',
        'crush',
        'remove',
        'osd.%s' % osd_id,
    ]

    remoto.process.run(
        conn,
        command,
    )

    command = [
        'ceph',
        'auth',
        'del',
        'osd.%s' % osd_id,
    ]

    remoto.process.run(
        conn,
        command,
    )

    command = [
        'ceph',
        'osd',
        'rm',
        osd_id,
    ]

    remoto.process.run(
        conn,
        command,
    )

    umount_osd(conn, osd_id, cluster_path)


def umount_osd(conn, osd_id, cluster_path):
    # cluster_path comes from paths.osd.base(cluster); appending the osd id
    # yields the OSD's data mount point.
    command = [
        'umount',
        cluster_path + '%s' % osd_id,
    ]

    remoto.process.check(
        conn,
        command,
    )


def disk_zap(args):

for hostname, disk, journal in args.disk:
@@ -602,6 +824,8 @@ def osd(args):
        prepare(args, cfg, activate_prepared_disk=True)
    elif args.subcommand == 'activate':
        activate(args, cfg)
    elif args.subcommand == 'destroy':
        destroy(args)
    else:
        LOG.error('subcommand %s not implemented', args.subcommand)
        sys.exit(1)
Expand Down Expand Up @@ -706,6 +930,13 @@ def make(parser):
default='/etc/ceph/dmcrypt-keys',
help='directory where dm-crypt keys are stored',
)
parser.add_argument(
'--osd-id',
metavar='OSD_ID',
default=None,
# XXX: (NOT IMPLEMENT) w/o this option will destroy all OSD on HOST'
help='destroy specific osd id on HOST',
)
    parser.set_defaults(
        func=osd,
    )
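
With the subcommand and the --osd-id option wired up, the feature would presumably be invoked like the other osd subcommands; a hypothetical session (the hostname and osd id are made up, and since destroy() only reads the hostname from each HOST:DISK:JOURNAL tuple, the disk and journal parts can be omitted):

    ceph-deploy osd destroy ceph-node1 --osd-id 3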