Skip to content

Commit

Permalink
ceph-volume: work around phantom atari partitions
Browse files Browse the repository at this point in the history
See Rook issue rook/rook#7940 for full
information.

Ceph bluestore disks can sometimes appear as though they have "phantom"
Atari (AHDI) partitions created on them when they don't in reality. This
is due to a series of bugs in the Linux kernel when it is built with
Atari support enabled. This behavior does not appear for raw mode OSDs on
partitions, only on disks.

Changing the on-disk format of Bluestore OSDs comes with
backwards-compatibility challenges, and fixing the issue in the Kernel
could be years before users get a fix. Working around the Kernel issue
in ceph-volume is therefore the best place to fix the issue for Ceph.

To work around the issue in Ceph volume, there are two behaviors that need
adjusted:
1. `ceph-volume inventory` should not report that a partition is
   available if the parent device is a BlueStore OSD.
2. `ceph-volume raw list` should report parent disks if the disk is a
   BlueStore OSD and not report the disk's children, BUT it should still
   report children if the parent disk is not a BlueStore OSD.

Signed-off-by: Blaine Gardner <blaine.gardner@redhat.com>
  • Loading branch information
BlaineEXE committed Aug 5, 2021
1 parent 9d7e142 commit 9212420
Show file tree
Hide file tree
Showing 5 changed files with 351 additions and 73 deletions.
125 changes: 75 additions & 50 deletions src/ceph-volume/ceph_volume/devices/raw/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import logging
from textwrap import dedent
from ceph_volume import decorators, process
from ceph_volume.util import disk


logger = logging.getLogger(__name__)


def direct_report(devices):
"""
Other non-cli consumers of listing information will want to consume the
Expand All @@ -18,6 +20,40 @@ def direct_report(devices):
_list = List([])
return _list.generate(devices)

def _get_bluestore_info(dev):
out, err, rc = process.call([
'ceph-bluestore-tool', 'show-label',
'--dev', dev], verbose_on_failure=False)
if rc:
# ceph-bluestore-tool returns an error (below) if device is not bluestore OSD
# > unable to read label for <device>: (2) No such file or directory
# but it's possible the error could be for a different reason (like if the disk fails)
logger.debug('assuming device {} is not BlueStore; ceph-bluestore-tool failed to get info from device: {}\n{}'.format(dev, out, err))
return None
oj = json.loads(''.join(out))
if dev not in oj:
# should be impossible, so warn
logger.warning('skipping device {} because it is not reported in ceph-bluestore-tool output: {}'.format(dev, out))
return None
try:
if oj[dev]['description'] != 'main':
# ignore non-main devices, for now
logger.info('ignoring non-main device {}'.format(dev))
return None
whoami = oj[dev]['whoami']
return {
'type': 'bluestore',
'osd_id': int(whoami),
'osd_uuid': oj[dev]['osd_uuid'],
'ceph_fsid': oj[dev]['ceph_fsid'],
'device': dev
}
except KeyError as e:
# this will appear for devices that have a bluestore header but aren't valid OSDs
# for example, due to incomplete rollback of OSDs: https://tracker.ceph.com/issues/51869
logger.error('device {} does not have all BlueStore data needed to be a valid OSD: {}\n{}'.format(dev, out, e))
return None


class List(object):

Expand All @@ -27,64 +63,53 @@ def __init__(self, argv):
self.argv = argv

def generate(self, devs=None):
if not devs:
logger.debug('Listing block devices via lsblk...')
logger.debug('Listing block devices via lsblk...')

if devs is None or devs == []:
devs = []
# adding '--inverse' allows us to get the mapper devices list in that command output.
# not listing root devices containing partitions shouldn't have side effect since we are
# in `ceph-volume raw` context.
#
# example:
# running `lsblk --paths --nodeps --output=NAME --noheadings` doesn't allow to get the mapper list
# because the output is like following :
#
# $ lsblk --paths --nodeps --output=NAME --noheadings
# /dev/sda
# /dev/sdb
# /dev/sdc
# /dev/sdd
#
# the dmcrypt mappers are hidden because of the `--nodeps` given they are displayed as a dependency.
#
# $ lsblk --paths --output=NAME --noheadings
# /dev/sda
# |-/dev/mapper/ceph-3b52c90d-6548-407d-bde1-efd31809702f-sda-block-dmcrypt
# `-/dev/mapper/ceph-3b52c90d-6548-407d-bde1-efd31809702f-sda-db-dmcrypt
# /dev/sdb
# /dev/sdc
# /dev/sdd
#
# adding `--inverse` is a trick to get around this issue, the counterpart is that we can't list root devices if they contain
# at least one partition but this shouldn't be an issue in `ceph-volume raw` context given we only deal with raw devices.
# If no devs are given initially, we want to list ALL devices including children and
# parents. Parent disks with child partitions may be the appropriate device to return if
# the parent disk has a bluestore header, but children may be the most appropriate
# devices to return if the parent disk does not have a bluestore header.
out, err, ret = process.call([
'lsblk', '--paths', '--nodeps', '--output=NAME', '--noheadings', '--inverse'
'lsblk', '--paths', '--output=NAME', '--noheadings',
])
assert not ret
devs = out

result = {}
logger.debug('inspecting devices: {}'.format(devs))
for dev in devs:
logger.debug('Examining %s' % dev)
# bluestore?
out, err, ret = process.call([
'ceph-bluestore-tool', 'show-label',
'--dev', dev], verbose_on_failure=False)
if ret:
logger.debug('No label on %s' % dev)
continue
oj = json.loads(''.join(out))
if dev not in oj:
info = disk.lsblk(dev, abspath=True)
# Linux kernels built with CONFIG_ATARI_PARTITION enabled can falsely interpret
# bluestore's on-disk format as an Atari partition table. These false Atari partitions
# can be interpreted as real OSDs if a bluestore OSD was previously created on the false
# partition. See https://tracker.ceph.com/issues/52060 for more info. If a device has a
# parent, it is a child. If the parent is a valid bluestore OSD, the child will only
# exist if it is a phantom Atari partition, and the child should be ignored. If the
# parent isn't bluestore, then the child could be a valid bluestore OSD. If we fail to
# determine whether a parent is bluestore, we should err on the side of not reporting
# the child so as not to give a false negative.
if 'PKNAME' in info and info['PKNAME'] != "":
parent = info['PKNAME']
try:
if disk.has_bluestore_label(parent):
logger.warning(('ignoring child device {} whose parent {} is a BlueStore OSD.'.format(dev, parent),
'device is likely a phantom Atari partition. device info: {}'.format(info)))
continue
except OSError as e:
logger.error(('ignoring child device {} to avoid reporting invalid BlueStore data from phantom Atari partitions.'.format(dev),
'failed to determine if parent device {} is BlueStore. err: {}'.format(parent, e)))
continue

bs_info = _get_bluestore_info(dev)
if bs_info is None:
# None is also returned in the rare event that there is an issue reading info from
# a BlueStore disk, so be sure to log our assumption that it isn't bluestore
logger.info('device {} does not have BlueStore information'.format(dev))
continue
if oj[dev]['description'] != 'main':
# ignore non-main devices, for now
continue
whoami = oj[dev]['whoami']
result[oj[dev]['osd_uuid']] = {
'type': 'bluestore',
'osd_id': int(whoami),
'osd_uuid': oj[dev]['osd_uuid'],
'ceph_fsid': oj[dev]['ceph_fsid'],
'device': dev
}
result[bs_info['osd_uuid']] = bs_info

return result

@decorators.needs_root
Expand Down
234 changes: 234 additions & 0 deletions src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
import pytest
from mock.mock import patch
from ceph_volume.devices import raw

# Sample lsblk output is below that overviews the test scenario. (--json output for reader clarity)
# - sda and all its children are used for the OS
# - sdb is a bluestore OSD with phantom Atari partitions
# - sdc is an empty disk
# - sdd has 2 LVM device children
# > lsblk --paths --json
# {
# "blockdevices": [
# {"name": "/dev/sda", "maj:min": "8:0", "rm": "0", "size": "128G", "ro": "0", "type": "disk", "mountpoint": null,
# "children": [
# {"name": "/dev/sda1", "maj:min": "8:1", "rm": "0", "size": "487M", "ro": "0", "type": "part", "mountpoint": null},
# {"name": "/dev/sda2", "maj:min": "8:2", "rm": "0", "size": "1.9G", "ro": "0", "type": "part", "mountpoint": null},
# {"name": "/dev/sda3", "maj:min": "8:3", "rm": "0", "size": "125.6G", "ro": "0", "type": "part", "mountpoint": "/etc/hosts"}
# ]
# },
# {"name": "/dev/sdb", "maj:min": "8:16", "rm": "0", "size": "1T", "ro": "0", "type": "disk", "mountpoint": null,
# "children": [
# {"name": "/dev/sdb2", "maj:min": "8:18", "rm": "0", "size": "48G", "ro": "0", "type": "part", "mountpoint": null},
# {"name": "/dev/sdb3", "maj:min": "8:19", "rm": "0", "size": "6M", "ro": "0", "type": "part", "mountpoint": null}
# ]
# },
# {"name": "/dev/sdc", "maj:min": "8:32", "rm": "0", "size": "1T", "ro": "0", "type": "disk", "mountpoint": null},
# {"name": "/dev/sdd", "maj:min": "8:48", "rm": "0", "size": "1T", "ro": "0", "type": "disk", "mountpoint": null,
# "children": [
# {"name": "/dev/mapper/ceph--osd--block--1", "maj:min": "253:0", "rm": "0", "size": "512G", "ro": "0", "type": "lvm", "mountpoint": null},
# {"name": "/dev/mapper/ceph--osd--block--2", "maj:min": "253:1", "rm": "0", "size": "512G", "ro": "0", "type": "lvm", "mountpoint": null}
# ]
# }
# ]
# }

def _devices_side_effect():
return {
"/dev/sda": {},
"/dev/sda1": {},
"/dev/sda2": {},
"/dev/sda3": {},
"/dev/sdb": {},
"/dev/sdb2": {},
"/dev/sdb3": {},
"/dev/sdc": {},
"/dev/sdd": {},
"/dev/mapper/ceph--osd--block--1": {},
"/dev/mapper/ceph--osd--block--2": {},
}

def _lsblk_list_output():
return [
'/dev/sda',
'/dev/sda1',
'/dev/sda2',
'/dev/sda3',
'/dev/sdb',
'/dev/sdb2',
'/dev/sdb3',
'/dev/sdc',
'/dev/sdd',
'/dev/mapper/ceph--osd--block--1',
'/dev/mapper/ceph--osd--block--2',
]

# dummy lsblk output for device with optional parent output
def _lsblk_output(dev, parent=None):
if parent is None:
parent = ""
ret = 'NAME="{}" KNAME="{}" PKNAME="{}"'.format(dev, dev, parent)
return [ret] # needs to be in a list form

def _bluestore_tool_label_output_sdb():
return '''{
"/dev/sdb": {
"osd_uuid": "sdb-uuid",
"size": 1099511627776,
"btime": "2021-07-23T16:02:22.809186+0000",
"description": "main",
"bfm_blocks": "268435456",
"bfm_blocks_per_key": "128",
"bfm_bytes_per_block": "4096",
"bfm_size": "1099511627776",
"bluefs": "1",
"ceph_fsid": "sdb-fsid",
"kv_backend": "rocksdb",
"magic": "ceph osd volume v026",
"mkfs_done": "yes",
"osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==",
"ready": "ready",
"require_osd_release": "16",
"whoami": "0"
}
}'''

def _bluestore_tool_label_output_sdb2():
return '''{
"/dev/sdb2": {
"osd_uuid": "sdb2-uuid",
"size": 1099511627776,
"btime": "2021-07-23T16:02:22.809186+0000",
"description": "main",
"bfm_blocks": "268435456",
"bfm_blocks_per_key": "128",
"bfm_bytes_per_block": "4096",
"bfm_size": "1099511627776",
"bluefs": "1",
"ceph_fsid": "sdb2-fsid",
"kv_backend": "rocksdb",
"magic": "ceph osd volume v026",
"mkfs_done": "yes",
"osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==",
"ready": "ready",
"require_osd_release": "16",
"whoami": "2"
}
}'''

def _bluestore_tool_label_output_dm_okay():
return '''{
"/dev/mapper/ceph--osd--block--1": {
"osd_uuid": "lvm-1-uuid",
"size": 549751619584,
"btime": "2021-07-23T16:04:37.881060+0000",
"description": "main",
"bfm_blocks": "134216704",
"bfm_blocks_per_key": "128",
"bfm_bytes_per_block": "4096",
"bfm_size": "549751619584",
"bluefs": "1",
"ceph_fsid": "lvm-1-fsid",
"kv_backend": "rocksdb",
"magic": "ceph osd volume v026",
"mkfs_done": "yes",
"osd_key": "AQCU6Ppgz+UcIRAAh6IUjtPjiXBlEXfwO8ixzw==",
"ready": "ready",
"require_osd_release": "16",
"whoami": "2"
}
}'''

def _process_call_side_effect(command, **kw):
if "lsblk" in command:
if "/dev/" in command[-1]:
dev = command[-1]
if dev == "/dev/sda1" or dev == "/dev/sda2" or dev == "/dev/sda3":
return _lsblk_output(dev, parent="/dev/sda"), '', 0
if dev == "/dev/sdb2" or dev == "/dev/sdb3":
return _lsblk_output(dev, parent="/dev/sdb"), '', 0
if dev == "/dev/sda" or dev == "/dev/sdb" or dev == "/dev/sdc" or dev == "/dev/sdd":
return _lsblk_output(dev), '', 0
if "mapper" in dev:
return _lsblk_output(dev, parent="/dev/sdd"), '', 0
pytest.fail('dev {} needs behavior specified for it'.format(dev))
if "/dev/" not in command:
return _lsblk_list_output(), '', 0
pytest.fail('command {} needs behavior specified for it'.format(command))

if "ceph-bluestore-tool" in command:
if "/dev/sdb" in command:
# sdb is a bluestore OSD
return _bluestore_tool_label_output_sdb(), '', 0
if "/dev/sdb2" in command:
# sdb2 is a phantom atari partition that appears to have some valid bluestore info
return _bluestore_tool_label_output_sdb2(), '', 0
if "/dev/mapper/ceph--osd--block--1" in command:
# dm device 1 is a valid bluestore OSD (the other is corrupted/invalid)
return _bluestore_tool_label_output_dm_okay(), '', 0
# sda and children, sdb's children, sdc, sdd, dm device 2 all do NOT have bluestore OSD data
return [], 'fake No such file or directory error', 1
pytest.fail('command {} needs behavior specified for it'.format(command))

def _has_bluestore_label_side_effect(disk_path):
if "/dev/sda" in disk_path:
return False # disk and all children are for the OS
if disk_path == "/dev/sdb":
return True # sdb is a valid bluestore OSD
if disk_path == "/dev/sdb2":
return True # sdb2 appears to be a valid bluestore OSD even though it should not be
if disk_path == "/dev/sdc":
return False # empty disk
if disk_path == "/dev/sdd":
return False # has LVM subdevices
if disk_path == "/dev/mapper/ceph--osd--block--1":
return True # good OSD
if disk_path == "/dev/mapper/ceph--osd--block--2":
return False # corrupted
pytest.fail('device {} needs behavior specified for it'.format(disk_path))

class TestList(object):

@patch('ceph_volume.util.device.disk.get_devices')
@patch('ceph_volume.util.disk.has_bluestore_label')
@patch('ceph_volume.process.call')
def test_raw_list(self, patched_call, patched_bluestore_label, patched_get_devices):
raw.list.logger.setLevel("DEBUG")
patched_call.side_effect = _process_call_side_effect
patched_bluestore_label.side_effect = _has_bluestore_label_side_effect
patched_get_devices.side_effect = _devices_side_effect

result = raw.list.List([]).generate()
assert len(result) == 2

sdb = result['sdb-uuid']
assert sdb['osd_uuid'] == 'sdb-uuid'
assert sdb['osd_id'] == 0
assert sdb['device'] == '/dev/sdb'
assert sdb['ceph_fsid'] == 'sdb-fsid'
assert sdb['type'] == 'bluestore'

lvm1 = result['lvm-1-uuid']
assert lvm1['osd_uuid'] == 'lvm-1-uuid'
assert lvm1['osd_id'] == 2
assert lvm1['device'] == '/dev/mapper/ceph--osd--block--1'
assert lvm1['ceph_fsid'] == 'lvm-1-fsid'
assert lvm1['type'] == 'bluestore'

@patch('ceph_volume.util.device.disk.get_devices')
@patch('ceph_volume.util.disk.has_bluestore_label')
@patch('ceph_volume.process.call')
def test_raw_list_with_OSError(self, patched_call, patched_bluestore_label, patched_get_devices):
def _has_bluestore_label_side_effect_with_OSError(device_path):
if device_path == "/dev/sdd":
raise OSError('fake OSError')
return _has_bluestore_label_side_effect(device_path)

raw.list.logger.setLevel("DEBUG")
patched_call.side_effect = _process_call_side_effect
patched_bluestore_label.side_effect = _has_bluestore_label_side_effect_with_OSError
patched_get_devices.side_effect = _devices_side_effect

result = raw.list.List([]).generate()
assert len(result) == 1
assert 'sdb-uuid' in result

0 comments on commit 9212420

Please sign in to comment.