Permalink
Cannot retrieve contributors at this time
Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign upbtrfs-dev-docs/tree-items.txt
Go to file| Tree items | |
| ========== | |
| Data is held in the various b-trees in the form of items. Each item has a | |
| custom structure dictated by its purpose. Items are indexed by keys and | |
| the key for each respective item has custom meanings of its 3 data members. | |
| [refer to table below for mappings from keys to items and the meaning of each | |
| respective field] | |
| Keys | |
| ---- | |
| As explained in [trees.txt] there is a generic btrfs_item struct which maps | |
| to the specific btrfs_*_item structs. The table below lists the various data | |
| that the embedded btrfs_key in btrfs_item holds for each item type. | |
| ROOT item | |
| --------- | |
| Every tree in BTRFS is represented by a root item in the root tree. Those trees | |
| can either be one of the default trees as described in trees.txt or a file tree, | |
| which represents a btrfs subvolume. | |
| Key: | |
| btrfs_key.objectid = either one of the well defined BTRFS_*_TREE_OBJECTID | |
| defines, if one of the main metadata trees, or the numerical id of the subvolume | |
| this root represents. | |
| btrfs_key.type = BTRFS_ROOT_ITEM_KEY | |
| btrfs_key.offset = 0 if objectid is one of the BTRFS_*_TREE_OBJECTID defines | |
| or if this is a subvolume and not a snapshot; if a snapshot, the transaction | |
| id when this snapshot was created. | |
| Item data: | |
| struct btrfs_root_item { | |
| /* Inode which represents the root */ | |
| struct btrfs_inode_item inode; | |
| /* Transaction id when this root item was created */ | |
| __le64 generation; | |
| /* For file tree BTRFS_FIRST_FREE_OBJECTID or 0 otherwise */ | |
| __le64 root_dirid; | |
| /* disk offset in bytes for the root node of the tree */ | |
| __le64 bytenr; | |
| /* always 0 (unused) */ | |
| __le64 byte_limit; | |
| /* Counts the number of bytes tree blocks belonging to this root take (unused) */ | |
| __le64 bytes_used; | |
| /* transid of the last transaction that created a snapshot of this tree */ | |
| __le64 last_snapshot; | |
| /* BTRFS_ROOT_SUBVOL_RDONLY - if this is a read-only subvolume | |
| * BTRFS_ROOT_SUBVOL_DEAD - [internal, in-memory only] | |
| */ | |
| __le64 flags; | |
| /* Can take either 0 or 1 */ | |
| __le32 refs; | |
| /* Contains key of last dropped item during subvolume removal or relocation. | |
| * Zeroed otherwise. | |
| */ | |
| struct btrfs_disk_key drop_progress; | |
| /* The tree level of the node described in drop_progress. */ | |
| __u8 drop_level; | |
| /* The height of the tree rooted at bytenr. */ | |
| __u8 level; | |
| /* | |
| * The following fields appear after subvol_uuids+subvol_times | |
| * were introduced. | |
| */ | |
| /* If equal to generation, indicates validity of the following fields. | |
| * If the root is modified using an older kernel, this field and generation will | |
| * become out of sync. | |
| */ | |
| __le64 generation_v2; | |
| /* the uuid of the filesystem */ | |
| __u8 uuid[BTRFS_UUID_SIZE]; | |
| /* uuid of the parent */ | |
| __u8 parent_uuid[BTRFS_UUID_SIZE]; | |
| /* the received uuid */ | |
| __u8 received_uuid[BTRFS_UUID_SIZE]; | |
| /* transaction id when an inode changes */ | |
| __le64 ctransid; | |
| /* transaction id when this tree was created */ | |
| __le64 otransid; | |
| /* transaction id when this volume was sent, non-zero for sent subvol */ | |
| __le64 stransid; | |
| /* transaction id when this volume was received */ | |
| __le64 rtransid; | |
| /* Time stamps for the above types of transaction id */ | |
| struct btrfs_timespec ctime; | |
| struct btrfs_timespec otime; | |
| struct btrfs_timespec stime; | |
| struct btrfs_timespec rtime; | |
| /* Reserved for future use */ | |
| __le64 reserved[8]; | |
| } | |
| ROOT REF item | |
| ------------- | |
| ROOT_REF items are created when either a snapshot or a subvolume is created. | |
| Their purpose is to allow for quick identification of the parent of either a | |
| subvolume or a snapshot. This item is used to represent both a forward | |
| reference (from parent -> child) and a back reference (child -> parent). On | |
| every subvol creation there are 2 such items created (one for backref and one | |
| for forward ref). Forward refs allow for quickly identifying the locations | |
| in a directory where subvolumes are rooted. This can be useful e.g. for recursive | |
| snapshotting. Root back refs allow lost snapshots to be recovered. The body of | |
| this item represents information about the direntry of the newly created | |
| subvolume. | |
| Key: | |
| key.objectid = if BTRFS_ROOT_REF_KEY: id of the subvolume of the parent | |
| If BTRFS_ROOT_BACKREF_KEY: id of newly created subvolume or snapshot. | |
| key.type = BTRFS_ROOT_REF_KEY or BTRFS_ROOT_BACKREF_KEY | |
| key.offset = if BTRFS_ROOT_REF_KEY: id of newly created subvolume or snapshot. | |
| if BTRFS_ROOT_BACKREF_KEY: id of the subvolume of the parent | |
| struct btrfs_root_ref { | |
| /* Inode of the parent directory of the dir entry */ | |
| __le64 dirid; | |
| /* the directory index this dir entry has in the parent */ | |
| __le64 sequence; | |
| /* Length of the directory entry referencing the root*/ | |
| __le16 name_len; | |
| /* Actual name string follows */ | |
| } | |
| INODE ITEM | |
| ---------- | |
| This structure contains the information typically associated with a UNIX-style | |
| inode's stat(2) data. | |
| Key: | |
| btrfs_key.type = BTRFS_INODE_ITEM_KEY [0x1] | |
| btrfs_key.objectid = inode number | |
| btrfs_key.offset = 0 | |
| Type of data item: struct btrfs_inode_item | |
| Item data: | |
| struct btrfs_inode_item { | |
| __le64 generation; /* nfs style generation number, essentially transid when the inode was created */ | |
| __le64 transid; /* transid that last touched this inode */ | |
| __le64 size; /* size of file in bytes */ | |
| __le64 nbytes; /* Size allocated to this file in bytes, 0 for dirs, Sum of all 'offset' fields for EXTENT_DATA items */ | |
| __le64 block_group; /* unused for ordinary inodes, otherwise contains the byte offset of blockgroup when used as a freespace inode */ | |
| __le32 nlink; /* stat.st_nlink, count of INODE_REF items entries for the inode, always set to 1 if this item is found outside of the file tree */ | |
| __le32 uid; | |
| __le32 gid; | |
| __le32 mode; | |
| __le64 rdev; | |
| __le64 flags; /* Various flags of the inode, see below */ | |
| __le64 sequence; /* Sequence number used for NFS compatibility. Initialized to 0 and incremented each time mtime value is changed. */ | |
| __le64 reserved[4]; /* Reserved for future use */ | |
| struct btrfs_timespec atime; | |
| struct btrfs_timespec ctime; | |
| struct btrfs_timespec mtime; | |
| struct btrfs_timespec otime; /* Timestamp of inode creation */ | |
| } | |
| Possible flags are: | |
| * BTRFS_INODE_NODATASUM - Don't perform checksum operations on this inode | |
| * BTRFS_INODE_NODATACOW - Don't perform CoW for data extents on this inode, | |
| when the reference count is 1. | |
| * BTRFS_INODE_READONLY - This is a special-purpose flag used only by the | |
| convert code. It makes the inode readonly but deletable | |
| * BTRFS_INODE_NOCOMPRESS - Do not compress this inode. This flag may be | |
| changed by the kernel as compression ratios change. If the compression ratio | |
| for data associated with an inode becomes undesirable, this flag will be | |
| set. It may be cleared if the data changes and the compression ratio is | |
| favorable again. | |
| * BTRFS_INODE_PREALLOC - Inode contains preallocated extents. This instructs | |
| the kernel to attempt to avoid CoWing those extents. | |
| * BTRFS_INODE_SYNC - Operations on this inode will be performed synchronously. | |
| * BTRFS_INODE_IMMUTABLE - Inode is read-only regardless of UNIX permissions or | |
| ownership. Attempts to modify this inode will result in EPERM being returned | |
| to the user. | |
| * BTRFS_INODE_APPEND - This inode is append-only. | |
| * BTRFS_INODE_NODUMP - This inode is not a candidate for dumping using the | |
| dump(8) program. [Accepted but not implemented] | |
| * BTRFS_INODE_NOATIME - Do not update atime when this inode is accessed. | |
| * BTRFS_INODE_DIRSYNC - Directory operations will be performed | |
| synchronously. | |
| * BTRFS_INODE_COMPRESS - Compression is enabled on this inode. | |
| CHUNK_ITEM | |
| ---------- | |
| The chunk items are used to describe the logical address space of the backing | |
| store. They are divided into data and metadata chunk items. When metadata has | |
| to be written first a chunk item with enough freespace has to be found and | |
| then it's used to write data. Similarly, when data is to be written a data | |
| chunk item is located and used. | |
| Key: | |
| key.type = BTRFS_CHUNK_ITEM_KEY | |
| key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID | |
| key.offset = starting logical address of the chunk. | |
| struct btrfs_chunk { | |
| /* size of this chunk in bytes */ | |
| __le64 length; | |
| /* objectid of the root referencing this chunk. Always set to EXTENT_ROOT */ | |
| __le64 owner; | |
| /* Replication stripe length */ | |
| __le64 stripe_len; | |
| /* Same flags as those in btrfs_block_group_item */ | |
| __le64 type; | |
| /* optimal io alignment for this chunk */ | |
| __le32 io_align; | |
| /* optimal io width for this chunk */ | |
| __le32 io_width; | |
| /* minimal io size for this chunk */ | |
| __le32 sector_size; | |
| /* 2^16 stripes is quite a lot, a second limit is the size of a single | |
| * item in the btree | |
| */ | |
| __le16 num_stripes; | |
| /* Number of replication sub stripes (applies only to RAID10) */ | |
| __le16 sub_stripes; | |
| /* The first of one or more stripes which map to device extents, this | |
| * is the first member of an array | |
| */ | |
| struct btrfs_stripe stripe; | |
| /* additional stripes go here */ | |
| } | |
| Stripes essentially map a logical (chunk) address to a physical (device) | |
| address. For every stripe in a chunk there is going to be a device extent item | |
| allocated for the respective device | |
| struct btrfs_stripe { | |
| /* Id of device this stripe applies to */ | |
| __le64 devid; | |
| /* Starting offset on the physical disk for this stripe */ | |
| __le64 offset; | |
| /* UUID of device */ | |
| __u8 dev_uuid[BTRFS_UUID_SIZE]; | |
| } | |
| DEVICE item | |
| ----------- | |
| Each constituent device in a btrfs filesystem is represented by a DEVICE item. | |
| It describes various characteristics of the underlying physical device such as | |
| sector size, io alignment, speeds and size. | |
| Key: | |
| key.objectid = BTRFS_DEV_ITEMS_OBJECTID | |
| key.type = BTRFS_DEV_ITEM_KEY | |
| key.offset = device id, incrementally increasing ids, starting at 1 | |
| struct btrfs_dev_item { | |
| /* the internal btrfs device id */ | |
| __le64 devid; | |
| /* size of the device */ | |
| __le64 total_bytes; | |
| /* bytes used */ | |
| __le64 bytes_used; | |
| /* optimal io alignment for this device */ | |
| __le32 io_align; | |
| /* optimal io width for this device */ | |
| __le32 io_width; | |
| /* minimal io size for this device */ | |
| __le32 sector_size; | |
| /* type and info about this device */ | |
| __le64 type; | |
| /* expected generation for this device */ | |
| __le64 generation; | |
| /* | |
| * starting byte of this partition on the device, | |
| * to allow for stripe alignment in the future | |
| */ | |
| __le64 start_offset; | |
| /* grouping information for allocation decisions */ | |
| __le32 dev_group; | |
| /* seek speed 0-100 where 100 is fastest */ | |
| __u8 seek_speed; | |
| /* bandwidth 0-100 where 100 is fastest */ | |
| __u8 bandwidth; | |
| /* btrfs generated uuid for this device */ | |
| __u8 uuid[BTRFS_UUID_SIZE]; | |
| /* uuid of FS who owns this device */ | |
| __u8 fsid[BTRFS_UUID_SIZE]; | |
| } | |
| DEVICE EXTENT item | |
| ------------------ | |
| Device extent items represent allocated space on every physical disk. This | |
| structure is used to map physical extents on an individual backing device to a | |
| chunk. This extent may be the only one for a particular chunk or one of | |
| several. | |
| Key: | |
| key.objectid = device id of the device this extent is allocated from | |
| key.type = BTRFS_DEV_EXTENT_KEY | |
| key.offset = address where this extent begins on disk | |
| struct btrfs_dev_extent { | |
| /* Object id of the chunk tree which owns this extent, always | |
| * BTRFS_CHUNK_TREE_OBJECTID | |
| */ | |
| __le64 chunk_tree; | |
| /* Object id of the chunk item which owns this extent, always | |
| * BTRFS_FIRST_CHUNK_TREE_OBJECTID | |
| */ | |
| __le64 chunk_objectid; | |
| /* Offset of the CHUNK_ITEM that references this extent. */ | |
| __le64 chunk_offset; | |
| /* length of this extent in bytes */ | |
| __le64 length; | |
| /* uuid of chunk tree, redundant way to check ownership */ | |
| __u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; | |
| } | |
| DEVICE STATS item | |
| ----------------- | |
| Every constituent device of a btrfs partition has a DEVICE STATS item associated | |
| with it. It stores IO stats in the device tree. The body of the item is a simple | |
| array of 64 bit counters. | |
| Key: | |
| key.objectid = BTRFS_DEV_STATS_OBJECTID | |
| key.type = BTRFS_PERSISTENT_ITEM_KEY | |
| key.offset = device id | |
| struct btrfs_dev_stats_item { | |
| /* Array of available statistics */ | |
| __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; | |
| } | |
| The available counterid are defined by the following enum: | |
| enum btrfs_dev_stat_values { | |
| /* disk I/O failure stats */ | |
| BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ | |
| BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ | |
| BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ | |
| /* stats for indirect indications for I/O failures */ | |
| BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or | |
| * contents is illegal: this is an | |
| * indication that the block was damaged | |
| * during read or write, or written to | |
| * wrong location or read from wrong | |
| * location */ | |
| BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not | |
| * been written */ | |
| BTRFS_DEV_STAT_VALUES_MAX | |
| }; | |
| DEV REPLACE item | |
| ---------------- | |
| When dev replace is initiated a dev replace item is inserted into the device | |
| tree. It describes the state of a device replace operation. When a filesystem | |
| is being mounted btrfs' code will check to see if such an item exists and | |
| populate its in-memory state with the values found in the item. There can only | |
| ever be at most one such item in the device tree. | |
| Key: | |
| key.objectid = 0 | |
| key.type = BTRFS_DEV_REPLACE_KEY | |
| key.offset = 0 | |
| struct btrfs_dev_replace_item { | |
| /* Id of the device we are replacing */ | |
| __le64 src_devid; | |
| __le64 cursor_left; | |
| __le64 cursor_right; | |
| __le64 cont_reading_from_srcdev_mode; | |
| /* Can be one of : | |
| * BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED - replace never initiated. | |
| * BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED - replace is running | |
| * BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED - replace has finished | |
| * BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED - replace has been cancelled. | |
| * BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED - replace has been suspended | |
| */ | |
| __le64 replace_state; | |
| /* Time replace was initiated */ | |
| __le64 time_started; | |
| /* Time replace stopped, either due to cancellation or finishing */ | |
| __le64 time_stopped; | |
| /* Write errors encountered while replacing device */ | |
| __le64 num_write_errors; | |
| /* Uncorrectable read errors encountered while replacing device */ | |
| __le64 num_uncorrectable_read_errors; | |
| } | |
| BLOCK GROUP item | |
| ---------------- | |
| While the extent tree defines the address space used for extent allocations | |
| for the entire file system, block groups allocate and define the parameters | |
| within that space. Every extent item or metadata item that describes an extent | |
| in use by the file system is apportioned from allocated block groups. Each | |
| block group can represent space used for system objects (e.g. the chunk tree | |
| and primary super block), metadata trees and items, or data extents. It is | |
| possible to combine metadata and data allocations within a single block group, | |
| though it is not recommended. This mixed allocation policy is typically only | |
| seen on filesystems smaller than approximately 10 GiB in size. | |
| Key: | |
| key.type = BTRFS_BLOCK_GROUP_ITEM_KEY | |
| key.objectid = starting logical address of chunk | |
| key.offset = size of chunk | |
| struct btrfs_block_group_item { | |
| /* Bytes used from this block group */ | |
| __le64 used; | |
| /* The object id of the chunk this block group represents. Always set | |
| to BTRFS_FIRST_CHUNK_TREE_OBJECTID */ | |
| __le64 chunk_objectid; | |
| /* Various flags, defining the allocation and replication policies */ | |
| __le64 flags; | |
| } | |
| Flags is a bitfield with following bits possible: | |
| Allocation policies: | |
| * BTRFS_BLOCK_GROUP_DATA - Whether this block group represents data | |
| * BTRFS_BLOCK_GROUP_SYSTEM - Whether this block group represents the system chunk | |
| * BTRFS_BLOCK_GROUP_METADATA - Whether this block group represents metadata | |
| Replication policies: | |
| * BTRFS_BLOCK_GROUP_RAID0 - Data in this block group is striped | |
| * BTRFS_BLOCK_GROUP_RAID1 - Data in this block group is mirrored | |
| * BTRFS_BLOCK_GROUP_DUP - Data in this block group is duplicated on the same drive | |
| * BTRFS_BLOCK_GROUP_RAID10 - Data in this block group is mirrored across striped devices (raid 1 + 0) | |
| * BTRFS_BLOCK_GROUP_RAID5 - Data in this block group is in Raid 5 mode | |
| * BTRFS_BLOCK_GROUP_RAID6 - Data in this block group is in Raid 6 mode | |
| FILE EXTENT DATA item | |
| --------------------- | |
| Every file in BTRFS is described by any number of file extent data items. | |
| This is similar to how other filesystems describe ranges of file data (e.g. | |
| xfs/ext4). In BTRFS a file extent can be in one of 3 states: | |
| * BTRFS_FILE_EXTENT_INLINE - an extent item who can hold all of the actual file | |
| data in its body is considered an inline extent. In such cases data is read | |
| directly from the body of the extent. | |
| * BTRFS_FILE_EXTENT_PREALLOC - a pre-allocated extent. These are ranges which | |
| have been reserved by using fallocate. | |
| * BTRFS_FILE_EXTENT_REG - regular file extent. Most of the extents of a file | |
| would be regular extents. Those extents hold information necessary to locate | |
| the actual data on disk. | |
| A file extent data item holds information about a particular logical region of | |
| a file. This is done through the disk_bytenr and disk_num_bytes members. File | |
| holes are represented by having disk_bytenr and disk_num_bytes equal 0 and the | |
| absence of relevant EXTENT ITEMs in the extent tree. | |
| Key: | |
| key.type = BTRFS_EXTENT_DATA_KEY | |
| key.objectid = inode number of the file this extent describes | |
| key.offset = starting offset within the file | |
| struct btrfs_file_extent_item { | |
| /* | |
| * transaction id that created this extent | |
| */ | |
| __le64 generation; | |
| /* | |
| * max number of bytes to hold this extent in ram | |
| * when we split a compressed extent we can't know how big | |
| * each of the resulting pieces will be. So, this is | |
| * an upper limit on the size of the extent in ram instead of | |
| * an exact limit. | |
| */ | |
| __le64 ram_bytes; | |
| /* Compression type. Can be one of: BTRFS_COMPRESS_NONE, | |
| * BTRFS_COMPRESS_ZLIB or BTRFS_COMPRESS_LZO | |
| */ | |
| __u8 compression; | |
| /* Encryption type. Currently unused */ | |
| __u8 encryption; | |
| /* Any other transformation which might have been applied to this extent. | |
| * Currently unused. | |
| */ | |
| __le16 other_encoding; | |
| /* Type of extent: BTRFS_FILE_EXTENT_INLINE, BTRFS_FILE_EXTENT_REG, | |
| * or BTRFS_FILE_EXTENT_PREALLOC. | |
| */ | |
| __u8 type; | |
| /* The following fields apply only to BTRFS_FILE_EXTENT_REG and | |
| * BTRFS_FILE_EXTENT_PREALLOC. | |
| */ | |
| /* Logical address of the start of the extent. Note: This is the | |
| * key.objectid of the corresponding EXTENT_ITEM. If this is an inline | |
| * extent, then the inline data will start at this point. | |
| */ | |
| __le64 disk_bytenr; | |
| /* Number of on-disk bytes of the extent (compressed). Note: This is the | |
| * key.offset for the corresponding EXTENT_ITEM | |
| */ | |
| __le64 disk_num_bytes; | |
| /* | |
| * the logical offset in file blocks (no csums) | |
| * this extent record is for. This allows a file extent to point | |
| * into the middle of an existing extent on disk, sharing it | |
| * between two snapshots (useful if some bytes in the middle of the | |
| * extent have changed). | |
| */ | |
| __le64 offset; | |
| /* | |
| * the logical number of file blocks (no csums included). This | |
| * always reflects the size uncompressed and without encoding. | |
| */ | |
| __le64 num_bytes; | |
| } | |
| EXTENT_ITEM | |
| ----------- | |
| EXTENT_ITEM items describe the space allocated for metadata tree nodes and | |
| leafs as well as data extents. The space is allocated from block groups that | |
| define the appropriate regions. In addition to functioning as basic allocation | |
| records, EXTENT_ITEM items also contain back references that can be used to | |
| repair the file system or resolve extent ownership back to a set of one or | |
| more file trees. Although EXTENT_ITEM items can be used to describe both DATA | |
| and tree block extents, newer file systems with the skinny metadata feature | |
| enabled at mkfs time, use the distinct METADATA_ITEM items to represent metadata | |
| extents instead. One extent record item exists for each extent allocated on a btrfs | |
| file system. Each item tracks the number of explicit references to the extent, | |
| records whether the extent contains file data or tree metadata and, if the | |
| latter, if the item contains a full back reference. It is followed by back | |
| reference records for each explicit reference held. | |
| [TODO: Document implied references] | |
| Key: | |
| key.objectid = logical address of the extent | |
| key.type = BTRFS_EXTENT_ITEM_KEY | |
| key.offset = length of extent | |
| struct btrfs_extent_item { | |
| /* Number of references of this extent */ | |
| __le64 refs; | |
| /* id of transaction that allocated this extent */ | |
| __le64 generation; | |
| /* Defines the type of data this extent holds; these flags also define | |
| * the type of data which follows this struct. | |
| */ | |
| __le64 flags; | |
| } | |
| * BTRFS_EXTENT_FLAG_DATA [0x1] - Flag to indicate that the following record refers | |
| to a data extent | |
| * BTRFS_EXTENT_FLAG_TREE_BLOCK [0x2] - Flag to indicate that the following record | |
| refers to a metadata tree block | |
| * BTRFS_BLOCK_FLAG_FULL_BACKREF [0x80] - Tree block back reference contains a | |
| full back reference. | |
| As stated previously the btrfs_extent_item is generally followed by multiple | |
| structures, based on the type of the data that the extent holds (metadata or | |
| data). Each backref begins with a btrfs_extent_inline_ref structure: | |
| struct btrfs_extent_inline_ref { | |
| /* The type of the backref that follows */ | |
| __u8 type; | |
| /* Dependent on the actual type of backref */ | |
| __le64 offset; | |
| } | |
| Backrefs are distinguished based on: | |
| 1. whether they contain data or metadata (as defined by btrfs_extent_item.flags) | |
| 2. whether the extent is shared or not (as defined by btrfs_extent_inline_ref.type) | |
| The type can take the following values: | |
| BTRFS_EXTENT_DATA_REF_KEY: For a file data extent that is indirect/normal. | |
| The content of this backref is a btrfs_extent_data_ref structure that overlaps | |
| with the btrfs_extent_inline_ref.offset field: | |
| struct btrfs_extent_data_ref { | |
| /* Subvolume tree id that references the extent */ | |
| __le64 root; | |
| /* inode number of the file referencing the extent in the given root */ | |
| __le64 objectid; | |
| /* Byte offset for the extent within the file in case of partial | |
| * sharing | |
| */ | |
| __le64 offset; | |
| /* reference count for the extent */ | |
| __le32 count; | |
| } | |
| BTRFS_SHARED_DATA_REF_KEY: For a file data extent that is shared among subvols/ | |
| snapshots. The content of this backref is the bytenr of the btree leaf | |
| containing the BTRFS_EXTENT_DATA_KEY item for this extent reference. This value | |
| overlays btrfs_extent_inline_ref.offset field. The btrfs_extent_inline_ref is | |
| also followed by the following btrfs_shared_data_ref struct: | |
| struct btrfs_shared_data_ref { | |
| /* reference count for the extent */ | |
| __le32 count; | |
| } | |
| Ext refs: | |
| In case an extent item doesn't have enough space to store all the references to | |
| it, the filesystem reverts to storing the so called extref. An extref is an | |
| additional item which is recorded in the extent tree. The key/item body depend | |
| on the kind of reference being tracked. | |
| For ordinary data extents it has the following format: | |
| key.objectid = logical bytenr of the extent being referenced | |
| key.type = BTRFS_EXTENT_DATA_REF_KEY | |
| key.offset = crc32c hash of [root, objectid, offset] | |
| Item body is `struct btrfs_extent_data_ref`, see above for detailed info. | |
| For shared data extents it's: | |
| key.objectid = logical bytenr of the extent being referenced | |
| key.type = BTRFS_SHARED_DATA_REF_KEY | |
| key.offset = the bytenr of the btree leaf containing the BTRFS_EXTENT_DATA_KEY | |
| item for this extent reference | |
| Item body is `struct btrfs_shared_data_ref` | |
| METADATA extent item | |
| -------------------- | |
| METADATA items exist only on filesystems which have enabled the "skinny metadata" | |
| feature during mkfs time. They describe the space allocated for metadata tree | |
| nodes and leafs. The space is allocated from block groups that define metadata | |
| regions. In addition to functioning as basic allocation records, METADATA_ITEM | |
| items also contain back references that can be used to repair the file system | |
| or resolve extent ownership back to a set of one or more file trees. | |
| Key: | |
| key.objectid = logical address of start of extent | |
| key.type = BTRFS_METADATA_ITEM_KEY | |
| key.offset = level of block in the btree that contains it | |
| The body composition of the METADATA ITEM is similar to that of the EXTENT_ITEM. | |
| First we have struct btrfs_extent_item, followed by btrfs_extent_inline_ref. | |
| However, there is a difference in the values stored in the btrfs_extent_inline_ref | |
| for a metadata item as follows: | |
| struct btrfs_extent_inline_ref { | |
| /* The type of the backref that follows: BTRFS_SHARED_BLOCK_REF_KEY | |
| * or BTRFS_TREE_BLOCK_REF_KEY | |
| */ | |
| __u8 type; | |
| /* If type is BTRFS_SHARED_BLOCK_REF_KEY: contains the logical address | |
| * for the parent metadata block. | |
| * | |
| * If type is BTRFS_TREE_BLOCK_REF_KEY: contains the objectid for the | |
| * B-tree root. | |
| */ | |
| __le64 offset; | |
| } | |
| CHECKSUM ITEM | |
| ------------- | |
| BTRFS (unless instructed otherwise) checksums every 4KiB of every file written. | |
| Those checksums are stored in checksum items. All the checksum item keys have | |
| identical values for their objectid and type fields, the only way to | |
| distinguish them is via their offset. It describes the starting offset on disk | |
| for which the checksums were calculated. Currently all checksums are CRC32C | |
| values and are 4 bytes long, but this might change in the future. Knowing this, the | |
| end of the byte range which this csum item describes can be derived with the | |
| following equations: | |
| file item size / 4 = number of checksums | |
| offset + (number of checksums * 4096) = end logical address | |
| For example if we have an offset of 50000, then the first checksum in this item | |
| would be for logical address 50000, the next one will be for logical address | |
| 54096, the one after that for 58192 and so on. | |
| This way it's possible to find the checksum for any piece of data in a file by | |
| simply finding such a csum item whose [start, end] offsets contain the | |
| [disk_bytenr, disk_bytenr + disk_num_bytes] for the respective file extent. | |
| Key: | |
| key.objectid = BTRFS_EXTENT_CSUM_OBJECTID | |
| key.type = BTRFS_EXTENT_CSUM_KEY | |
| key.offset = starting logical disk address of the checksummed region | |
| struct btrfs_csum_item { | |
| /* Start of a variable length sequence of checksum */ | |
| __u8 csum; | |
| } | |
| FREE SPACE INFO item (v2 cache only) | |
| ------------------------------------ | |
| Each block group is represented in the free space tree by a free space info | |
| item that stores accounting information: whether the free space for this block | |
| group is stored as bitmaps or extents and how many extents of free space exist | |
| for this block group (regardless of which format is being used in the tree). | |
| key.objectid = key.objectid of the block group this item represents (e.g. starting | |
| logical address of said group) | |
| key.type = BTRFS_FREE_SPACE_INFO_KEY | |
| key.offset = key.offset of the block group this item represents (e.g. size of the | |
| block group) | |
| struct btrfs_free_space_info { | |
| /* Number of free extents in this block group */ | |
| __le32 extent_count; | |
| /* flags, currently only BTRFS_FREE_SPACE_USING_BITMAPS */ | |
| __le32 flags; | |
| } | |
| FREE SPACE EXTENT item (v2 cache only) | |
| -------------------------------------- | |
| When prudent, the actual freespace in the freespace tree is described with | |
| keys that do not have any associated data with them, i.e. a plain key structure. | |
| They contain the start and length of the freespace extent. | |
| key.objectid = start address of free space | |
| key.type = BTRFS_FREE_SPACE_EXTENT_KEY | |
| key.offset = length of this extent | |
| FREE SPACE BITMAP item (v2 cache only) | |
| -------------------------------------- | |
| When a blockgroup becomes very fragmented it might be more space-efficient to | |
| store the free space information in free space bitmap items. Those items | |
| consist of a key and their associated data is just an array of bytes. Each bit | |
| in the bitmap represents the state of a single sectorsize worth of data. | |
| key.objectid = starting address of the region this item describes | |
| key.type = BTRFS_FREE_SPACE_BITMAP_KEY | |
| key.offset = length of the region described by this item | |
| FREE SPACE HEADER (v1 cache only) | |
| --------------------------------- | |
| The old (pre-freespace btree) free space cache uses a free space header item | |
| as a descriptor for the free space of a block group. Its contents consist of a | |
| btrfs_free_space_header struct. | |
| Key: | |
| key.objectid = BTRFS_FREE_SPACE_OBJECTID | |
| key.type = 0 | |
| key.offset = objectid (i.e. starting logical address) of the block group | |
| this free space header describes | |
| struct btrfs_free_space_header { | |
| /* disk key of free space inode */ | |
| struct btrfs_disk_key location; | |
| __le64 generation; | |
| /* Number of entries in the free space file */ | |
| __le64 num_entries; | |
| __le64 num_bitmaps; | |
| } | |
| The contents of the freespace file have the following layout | |
| <CRC PAGE><EXTENTS><BITMAPS> | |
| CRC page holds the crc for extents and bitmaps. The format of those two is | |
| described by the following structure: | |
| struct btrfs_free_space_entry { | |
| /* Logical address of start of free space */ | |
| __le64 offset; | |
| /* Length in bytes */ | |
| __le64 bytes; | |
| /* Whether it's BTRFS_FREE_SPACE_BITMAP or BTRFS_FREE_SPACE_EXTENT */ | |
| __u8 type; | |
| } | |
| DIR item | |
| -------- | |
| Dir items are used for both standard user-visible directories and internal | |
| directories, used to manage named extended attributes. For every visible name | |
| in a filesystem there is a corresponding DIR item. This item is also used to | |
| house the XATTRs of a file; in this case there is one DIR item for every | |
| extended attribute. | |
| Key: | |
| key.objectid = inode of directory containing this entry | |
| key.type = BTRFS_DIR_ITEM_KEY | BTRFS_XATTR_ITEM_KEY | |
| key.offset = crc32c name hash of either the direntry name (user visible) or the | |
| name of the extended attribute | |
| struct btrfs_dir_item { | |
| /* In case of ordinary direntry it holds the inode key it applies to, | |
| * in the xattr case it's zeroed out | |
| */ | |
| struct btrfs_disk_key location; | |
| /* Id of transaction which modified this dir entry */ | |
| __le64 transid; | |
| /* Length of XATTR value, 0 otherwise */ | |
| __le16 data_len; | |
| /* Length of either the XATTR name or the dir entry name */ | |
| __le16 name_len; | |
| /* BTRFS_FT_XATTR if XATTR otherwise signals the type of the inode */ | |
| __u8 type; | |
| /* In case of an ordinary dir entry then the name string will come after | |
| * the type. In case of an XATTR then first comes the xattr name, followed | |
| * by the xattr value | |
| */ | |
| } | |
| In case the dir item doesn't describe an XATTR, type can be one of: | |
| * BTRFS_FT_REG_FILE - Indicates a regular file. | |
| * BTRFS_FT_DIR - The target object is a directory | |
| * BTRFS_FT_CHRDEV - The target object is an INODE_ITEM representing a character device node. | |
| * BTRFS_FT_BLKDEV - The target object is an INODE_ITEM representing a block device node. | |
| * BTRFS_FT_FIFO - The target object is an INODE_ITEM representing a FIFO device node. | |
| * BTRFS_FT_SOCK - The target object is an INODE_ITEM representing a socket device node. | |
| * BTRFS_FT_SYMLINK - The target object is an INODE_ITEM representing a symbolic link. | |
| DIR INDEX item | |
| -------------- | |
| Dir index items are used when looking up entries in a directory. Their data | |
| portion is identical to that of the DIR item. The only difference is in the | |
| key used to lookup the item. This duplication is necessary to ensure good | |
| readdir performance. Since readdir requires monotonically incrementing indices, | |
| the key's offset field is modified to contain a counter, which is incremented | |
| every time a new entry appears in a directory. | |
| Key: | |
| key.objectid = inode number of directory we are looking up into | |
| key.type = BTRFS_DIR_INDEX_KEY | |
| key.offset = index id in the directory. Indices start at 2 (due to '.' and '..') | |
| INODE REF item | |
| -------------- | |
| This item stores a reference to the parent directory for a file. It's a | |
| reliability aid which allows the file system to find out the parent directory | |
| of a recovered file. There is one such item per file in the filesystem. | |
| Additionally it also stores all hardlinks to the file in a btrfs_inode_ref | |
| structure. Files with hard links in multiple directories have multiple | |
| reference items, one for each parent directory. Files with multiple hard links | |
| in the same directory pack all of the links' filenames into the same reference | |
| item. | |
| Key: | |
| key.objectid = inode number of file | |
| key.type = BTRFS_INODE_REF_KEY | |
| key.offset = inode number of parent of file | |
| struct btrfs_inode_ref { | |
| /* Index in the parent directory of this hardlink, given by key.offset */ | |
| __le64 index; | |
| /* Length of string, coming right after this member */ | |
| __le16 name_len; | |
| /* Variable length, non-null terminated string of length 'name_len' */ | |
| } | |
| EXTENDED INODE REF item | |
| ----------------------- | |
| The total number of inode refs that a given inode number / parent inode number | |
| tuple can have is limited by our leaf size. Unfortunately this limits the | |
| maximum number of hardlinks we can have. Extended inode refs fix this | |
| limitation by using a key offset that's a hash of the link name and parent | |
| inode number. Extended refs don't replace the existing ref array. An inode | |
| gets an extended ref for a given link only after the ref array has been | |
| filled. | |
| Key: | |
| key.objectid = inode number of file | |
| key.type = BTRFS_INODE_EXTREF_KEY | |
| key.offset = hash of link name and parent inode number | |
| struct btrfs_inode_extref { | |
| /* Inode number of parent directory */ | |
| __le64 parent_objectid; | |
| /* Index in the parent directory of this hardlink */ | |
| __le64 index; | |
| /* Length of string, coming right after this member */ | |
| __le16 name_len; | |
| /* Variable length, non-null terminated string of length 'name_len' */ | |
| __u8 name[0]; | |
| } __attribute__ ((__packed__)); | |
| QGROUP STATUS item | |
| ------------------ | |
| Used to hold information about the qgroup states for a particular subvolume. | |
| This implies there is one such item for every qgroup tree. | |
| Key: | |
| key.objectid = 0 | |
| key.type = BTRFS_QGROUP_STATUS_KEY | |
| key.offset = 0 | |
| struct btrfs_qgroup_status_item { | |
| /* Version of the qgroup subsystem, currently set to BTRFS_QGROUP_STATUS_VERSION (1) */ | |
| __le64 version; | |
| /* | |
| * the generation is updated during every commit. As older | |
| * versions of btrfs are not aware of qgroups, it will be | |
| * possible to detect inconsistencies by checking the | |
| * generation on mount time | |
| */ | |
| __le64 generation; | |
| /* flag definitions */ | |
| __le64 flags; | |
| /* | |
| * only used during scanning to record the progress | |
| * of the scan. It contains a logical address | |
| */ | |
| __le64 rescan; | |
| } | |
| * BTRFS_QGROUP_STATUS_FLAG_ON [0x1] - Subvolume quota turned on. | |
| * BTRFS_QGROUP_STATUS_FLAG_RESCAN [0x2] - Signals whether rescan is in progress. | |
| * BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT [0x4] - Turned on when some qgroup | |
| entries are known to be out of date, either because the configuration has | |
| changed in a certain way that requires rescan or because the fs has been | |
| mounted with a non-qgroup-aware version. Turning quota off and on again | |
| makes it inconsistent, too. | |
| QGROUP INFO item | |
| ---------------- | |
| Items of this type are used to track the amount of space used by a particular | |
| subvolume. There is one such item per subvolume. | |
| TODO: Expand a bit on exclusive VS referenced bytes | |
| Key: | |
| key.objectid = 0 | |
| key.type = BTRFS_QGROUP_INFO_KEY | |
| key.offset = id of qgroup (the same as the objectid of the subvolume this qgroup | |
| accounts). | |
| struct btrfs_qgroup_info_item { | |
| /* Id of transaction when this item was created */ | |
| __le64 generation; | |
| /* Referenced bytes (including shared data) */ | |
| __le64 rfer; | |
| /* Referenced bytes compressed */ | |
| __le64 rfer_cmpr; | |
| /* Exclusive bytes (i.e. bytes not being shared by a subvolume) */ | |
| __le64 excl; | |
| /* Exclusive bytes compressed */ | |
| __le64 excl_cmpr; | |
| } | |
| QGROUP LIMIT item | |
| ----------------- | |
| Configured qgroup limits are persisted on disk by the QGROUP LIMIT items. When | |
| some data is to be written on disk the used space in QGROUP INFO items is | |
| checked against the limits in QGROUP LIMIT items. | |
| Key: | |
| key.objectid = 0 | |
| key.type = BTRFS_QGROUP_LIMIT_KEY | |
| key.offset = id of qgroup (the same as the objectid of the subvolume this qgroup | |
| limits) | |
| struct btrfs_qgroup_limit_item { | |
| /* Contains information about enabled limits. */ | |
| __le64 flags; | |
| /* Maximum referenced bytes (including shared data) */ | |
| __le64 max_rfer; | |
| /* Maximum exclusive bytes */ | |
| __le64 max_excl; | |
| /* Unused */ | |
| __le64 rsv_rfer; | |
| /* Unused */ | |
| __le64 rsv_excl; | |
| } | |
| Flags: | |
| * BTRFS_QGROUP_LIMIT_MAX_RFER [0x1] Limit on maximum referenced data enabled | |
| * BTRFS_QGROUP_LIMIT_MAX_EXCL [0x2] Limit on maximum exclusive data enabled | |
| * BTRFS_QGROUP_LIMIT_RSV_RFER [0x4] Unused | |
| * BTRFS_QGROUP_LIMIT_RSV_EXCL [0x8] Unused | |
| * BTRFS_QGROUP_LIMIT_RFER_CMPR [0x10] (Unused) Limit on maximum compressed referenced data enabled | |
| * BTRFS_QGROUP_LIMIT_EXCL_CMPR [0x20] (Unused) Limit on maximum exclusive compressed data size enabled | |
| QGROUP RELATION item | |
| -------------------- | |
| Qgroup relation items are used to record a parent<=>child relationship in the | |
| quota tree. Those items are also created in pairs. In order to distinguish | |
| child from parents a numerical comparison needs to be performed of .objectid | |
| and .offset. The key assumption is that child qgroups (which inherit the id of | |
| the subvolume they represent) will have increasing objectids. Those items have | |
| no body, only a key. | |
| Key: | |
| key.objectid = parent qgroup | |
| key.type = BTRFS_QGROUP_RELATION_KEY | |
| key.offset = child qgroup | |
| ORPHAN item | |
| ----------- | |
| ORPHAN items are used to track inodes which are being deleted/truncated. They | |
| are added to the filesystem root of an inode at the beginning of a delete or | |
| truncate operation. This ensures that in case of a crash during a complex | |
| operation the filesystem will be able to recognise that this inode is likely | |
| to have garbage content and will clean it up. This item doesn't have a body | |
| but the key is used to only store the orphaned inode number. | |
| Key: | |
| key.objectid = BTRFS_ORPHAN_OBJECTID | |
| key.type = BTRFS_ORPHAN_ITEM_KEY | |
| key.offset = inode number | |
| DIR LOG item | |
| ------------ | |
| TODO: Explain what exactly this item logs and how the key type varies and how | |
| it's supposed to be used in the context of tree logging. | |
| Key: | |
| key.objectid = inode number of the directory | |
| key.type = BTRFS_DIR_LOG_ITEM_KEY or BTRFS_DIR_LOG_INDEX_KEY (Todo: Depends on what ? ) | |
| key.offset = first offset (TODO: What exactly is it? ) | |
| struct btrfs_dir_log_item { | |
| /* End of range [key.offset, end] */ | |
| __le64 end; | |
| } | |
| BALANCE item | |
| ------------ | |
| BALANCE item is inserted when a balancing operation commences. Its purpose is | |
| to enable balancing to continue in case a crash occurs. It stores the | |
| parameters passed from userspace via the ioctl interface. | |
| Key: | |
| key.objectid = BTRFS_BALANCE_OBJECTID | |
| key.type = BTRFS_TEMPORARY_ITEM_KEY | |
| key.offset = 0 | |
| struct btrfs_balance_item { | |
| /* BTRFS_BALANCE_* */ | |
| __le64 flags; | |
| /* Options for DATA chunks */ | |
| struct btrfs_disk_balance_args data; | |
| /* Options for META chunks */ | |
| struct btrfs_disk_balance_args meta; | |
| /* Options for SYSTEM chunks */ | |
| struct btrfs_disk_balance_args sys; | |
| __le64 unused[4]; | |
| } | |
| struct btrfs_disk_balance_args { | |
| /* | |
| * profiles to operate on, single is denoted by | |
| * BTRFS_AVAIL_ALLOC_BIT_SINGLE | |
| */ | |
| __le64 profiles; | |
| /* | |
| * usage filter | |
| * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' | |
| * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max | |
| */ | |
| union { | |
| __le64 usage; | |
| struct { | |
| __le32 usage_min; | |
| __le32 usage_max; | |
| }; | |
| }; | |
| /* devid filter */ | |
| __le64 devid; | |
| /* devid subset filter [pstart..pend) */ | |
| __le64 pstart; | |
| __le64 pend; | |
| /* btrfs virtual address space subset filter [vstart..vend) */ | |
| __le64 vstart; | |
| __le64 vend; | |
| /* | |
| * profile to convert to, single is denoted by | |
| * BTRFS_AVAIL_ALLOC_BIT_SINGLE | |
| */ | |
| __le64 target; | |
| /* BTRFS_BALANCE_ARGS_* */ | |
| __le64 flags; | |
| /* | |
| * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' | |
| * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum | |
| * and maximum | |
| */ | |
| union { | |
| __le64 limit; | |
| struct { | |
| __le32 limit_min; | |
| __le32 limit_max; | |
| }; | |
| }; | |
| /* | |
| * Process chunks that cross stripes_min..stripes_max devices, | |
| * BTRFS_BALANCE_ARGS_STRIPES_RANGE | |
| */ | |
| __le32 stripes_min; | |
| __le32 stripes_max; | |
| __le64 unused[6]; | |
| } | |
| UUID item | |
| ---------------- | |
| For every created subvolume/snapshot an item is inserted into the UUID tree. | |
| This item is used to map from uuid to subvolume id. | |
| Key: | |
| key.objectid = first half of UUID | |
| key.type = UUID_KEY_SUBVOL or UUID_KEY_RECEIVED_SUBVOL | |
| key.offset = second half of UUID | |
| The body of the items consists of an array of one or more (but typically one) | |
| subvolume id. For subvolumes the key's type field has a value of UUID_KEY_SUBVOL. | |
| When a subvolume is received it creates both UUID_KEY_RECEIVED_SUBVOL and | |
| UUID_KEY_SUBVOL items with the same subvol_id body. | |