diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d2b2e65a..9235643640981f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -244,8 +244,10 @@ struct btrfs_super_block {
 	__le64 cache_generation;
 	__le64 uuid_tree_generation;
 
+	/* r5log journal tail (where recovery starts) */
+	__le64 journal_tail;
 	/* future expansion */
-	__le64 reserved[30];
+	__le64 reserved[29];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
@@ -697,6 +699,7 @@ struct btrfs_stripe_hash_table {
 void btrfs_init_async_reclaim_work(struct work_struct *work);
 
 /* fs_info */
+struct btrfs_r5l_log;
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
@@ -1114,6 +1117,9 @@ struct btrfs_fs_info {
 	u32 nodesize;
 	u32 sectorsize;
 	u32 stripesize;
+
+	/* raid56 log */
+	struct btrfs_r5l_log *r5log;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -2287,6 +2293,8 @@ BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
 			 log_root_transid, 64);
 BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
 			 log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_journal_tail, struct btrfs_super_block,
+			 journal_tail, 64);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
 			 total_bytes, 64);
 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
@@ -2932,6 +2940,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
 
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+	/* kfree(NULL) is a no-op, no NULL check needed */
+	kfree(fs_info->r5log);
 	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
 	kfree(fs_info->extent_root);
@@ -3278,6 +3288,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			unsigned long new_flags);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
+/* raid56.c */
+void btrfs_r5l_write_journal_tail(struct btrfs_fs_info *fs_info);
+
 static inline __printf(2, 3)
 void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8685d67185d01b..3fbd34799d0f37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2987,6 +2987,22 @@ int open_ctree(struct super_block *sb,
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 
+	if (fs_info->r5log) {
+		u64 cp = btrfs_super_journal_tail(fs_info->super_copy);
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("%s: get journal_tail %llu\n", __func__, cp);
+#endif
+		/* if the log is not replayed, data and parity on
+		 * disk are still consistent, so we can safely move on.
+		 *
+		 * As for fsync: fsync guarantees that the data itself
+		 * is flushed onto disk and only metadata goes through
+		 * the write-ahead log, so fsync'd data will never end
+		 * up being replayed by the raid56 log.
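+		 *
+		 * The tail itself just round-trips through the super
+		 * block via the setget helpers added in ctree.h, along
+		 * the lines of:
+		 *
+		 *   commit: btrfs_set_super_journal_tail(sb, next_checkpoint);
+		 *   mount:  cp = btrfs_super_journal_tail(sb);
+		 *           btrfs_r5l_load_log(fs_info, NULL, cp);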
+ */ + btrfs_r5l_load_log(fs_info, NULL, cp); + } + ret = btrfs_recover_balance(fs_info); if (ret) { btrfs_err(fs_info, "failed to recover balance: %d", ret); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e176375f374f91..3d1ef4df4a4fde 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2653,6 +2653,50 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) return ret; } +/* identical to btrfs_ioctl_add_dev, but this is with flags */ +static long btrfs_ioctl_add_dev_v2(struct btrfs_fs_info *fs_info, void __user *arg) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + + mutex_lock(&fs_info->volume_mutex); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + if (vol_args->flags & BTRFS_DEVICE_RAID56_LOG && + fs_info->r5log) { + ret = -EEXIST; + btrfs_info(fs_info, "r5log: attempting to add another log device!"); + goto out_free; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(fs_info, vol_args->name, vol_args->flags); + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_RAID56_LOG) { + ASSERT(fs_info->r5log); + btrfs_info(fs_info, "disk added %s as raid56 log", vol_args->name); + } else { + btrfs_info(fs_info, "disk added %s", vol_args->name); + } + } +out_free: + kfree(vol_args); +out: + mutex_unlock(&fs_info->volume_mutex); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + return ret; +} + static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; @@ -2672,7 +2716,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_init_new_device(fs_info, vol_args->name); + ret = btrfs_init_new_device(fs_info, vol_args->name, 0); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); @@ -5539,6 +5583,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(fs_info, argp); + case BTRFS_IOC_ADD_DEV_V2: + return btrfs_ioctl_add_dev_v2(fs_info, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_RM_DEV_V2: diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8ea0eb76325e9..ceca41537dddc3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -43,6 +43,7 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" +#include "hash.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -177,6 +178,33 @@ struct btrfs_raid_bio { unsigned long *dbitmap; }; +/* raid56 log */ +struct btrfs_r5l_log { + /* protect this struct and log io */ + struct mutex io_mutex; + + spinlock_t io_list_lock; + struct list_head io_list; + + /* r5log device */ + struct btrfs_device *dev; + + struct btrfs_fs_info *fs_info; + + /* allocation range for log entries */ + u64 data_offset; + u64 device_size; + + u64 next_checkpoint; + + u64 last_checkpoint; + u64 last_cp_seq; + u64 seq; + u64 log_start; + u32 uuid_csum; + struct btrfs_r5l_io_unit *current_io; +}; + static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct btrfs_work *work); @@ -1034,130 +1062,1193 @@ static int alloc_rbio_pages(struct 
btrfs_raid_bio *rbio) return -ENOMEM; rbio->stripe_pages[i] = page; } - return 0; -} + return 0; +} + +/* only allocate pages for p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + + for (; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + } + return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. + */ +static int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) +{ + struct bio *last = bio_list->tail; + u64 last_end = 0; + int ret; + struct bio *bio; + struct btrfs_bio_stripe *stripe; + u64 disk_start; + + stripe = &rbio->bbio->stripes[stripe_nr]; + disk_start = stripe->physical + (page_index << PAGE_SHIFT); + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) + return fail_rbio_index(rbio, stripe_nr); + + /* see if we can add this page onto our existing bio */ + if (last) { + last_end = (u64)last->bi_iter.bi_sector << 9; + last_end += last->bi_iter.bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && stripe->dev->bdev && + !last->bi_error && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, page, PAGE_SIZE, 0); + if (ret == PAGE_SIZE) + return 0; + } + } + + /* put a new bio on the list */ + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + if (!bio) + return -ENOMEM; + + bio->bi_iter.bi_size = 0; + bio->bi_bdev = stripe->dev->bdev; + bio->bi_iter.bi_sector = disk_start >> 9; + + bio_add_page(bio, page, PAGE_SIZE, 0); + bio_list_add(bio_list, bio); + return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk. This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction. The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ + if (rbio->faila >= 0 || rbio->failb >= 0) { + BUG_ON(rbio->faila == rbio->real_stripes - 1); + __raid56_parity_recover(rbio); + } else { + finish_rmw(rbio); + } +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + struct bio_vec *bvec; + u64 start; + unsigned long stripe_offset; + unsigned long page_index; + int i; + + spin_lock_irq(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) { + start = (u64)bio->bi_iter.bi_sector << 9; + stripe_offset = start - rbio->bbio->raid_map[0]; + page_index = stripe_offset >> PAGE_SHIFT; + + bio_for_each_segment_all(bvec, bio, i) + rbio->bio_pages[page_index + i] = bvec->bv_page; + } + spin_unlock_irq(&rbio->bio_list_lock); +} + +/* r5log */ +/* XXX: this allocation may be done earlier, eg. 
when allocating rbio */ +static struct btrfs_r5l_io_unit *btrfs_r5l_alloc_io_unit(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io; + gfp_t gfp = GFP_NOFS; + + io = kzalloc(sizeof(*io), gfp); + ASSERT(io); + io->log = log; + /* need to use kmap. */ + io->meta_page = alloc_page(gfp | __GFP_HIGHMEM | __GFP_ZERO); + ASSERT(io->meta_page); + + return io; +} + +static void btrfs_r5l_free_io_unit(struct btrfs_r5l_log *log, struct btrfs_r5l_io_unit *io) +{ + __free_page(io->meta_page); + ASSERT(list_empty(&io->list)); + kfree(io); +} + +static u64 btrfs_r5l_ring_add(struct btrfs_r5l_log *log, u64 start, u64 inc) +{ + start += inc; + if (start >= log->device_size) + start = start - log->device_size; + return start; +} + +static void btrfs_r5l_reserve_log_entry(struct btrfs_r5l_log *log, struct btrfs_r5l_io_unit *io) +{ + log->log_start = btrfs_r5l_ring_add(log, log->log_start, PAGE_SIZE); + io->log_end = log->log_start; + + if (log->log_start == 0) + io->need_split_bio = true; +} + +/* the IO order is maintained in log->io_list. */ +static void btrfs_r5l_finish_io(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io, *next; + + spin_lock(&log->io_list_lock); + list_for_each_entry_safe(io, next, &log->io_list, list) { + if (io->status != BTRFS_R5L_STRIPE_END) + break; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("current log->next_checkpoint %llu (will be %llu after writing to RAID\n", log->next_checkpoint, io->log_start); +#endif + + list_del_init(&io->list); + log->next_checkpoint = io->log_start; + btrfs_r5l_free_io_unit(log, io); + } + spin_unlock(&log->io_list_lock); +} + +static void btrfs_write_rbio(struct btrfs_raid_bio *rbio); + +static void btrfs_r5l_log_endio(struct bio *bio) +{ + struct btrfs_r5l_io_unit *io = bio->bi_private; + struct btrfs_r5l_log *log = io->log; + + bio_put(bio); + + /* move data to RAID. */ + btrfs_write_rbio(io->rbio); + + io->status = BTRFS_R5L_STRIPE_END; + /* After stripe data has been flushed into raid, set ->next_checkpoint. */ + btrfs_r5l_finish_io(log); +} + +static struct bio *btrfs_r5l_bio_alloc(struct btrfs_r5l_log *log) +{ + /* this allocation will not fail. */ + struct bio *bio = btrfs_io_bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + + /* We need to make sure data/parity are settled down on the log disk. 
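+	 *
+	 * REQ_PREFLUSH drains the device cache of everything that
+	 * completed before this bio, and REQ_FUA forces this bio's own
+	 * payload to stable media, so
+	 *
+	 *   bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA;
+	 *
+	 * guarantees that by the time btrfs_r5l_log_endio() runs the
+	 * log entry is durable and the rbio can safely be written to
+	 * the RAID disks.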
*/ + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; + bio->bi_bdev = log->dev->bdev; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("log->data_offset 0x%llx log->log_start 0x%llx\n", log->data_offset, log->log_start); +#endif + bio->bi_iter.bi_sector = (log->data_offset + log->log_start) >> 9; + + return bio; +} + +static struct btrfs_r5l_io_unit *btrfs_r5l_new_meta(struct btrfs_r5l_log *log) +{ + struct btrfs_r5l_io_unit *io; + struct btrfs_r5l_meta_block *block; + + io = btrfs_r5l_alloc_io_unit(log); + ASSERT(io); + + block = kmap(io->meta_page); + clear_page(block); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s pos %llu seq %llu\n", __func__, log->log_start, log->seq); +#endif + + block->magic = cpu_to_le32(BTRFS_R5LOG_MAGIC); + block->seq = cpu_to_le64(log->seq); + block->position = cpu_to_le64(log->log_start); + + kunmap(io->meta_page); + + io->log_start = log->log_start; + io->meta_offset = sizeof(struct btrfs_r5l_meta_block); + io->seq = log->seq++; + + io->need_split_bio = false; + io->split_bio = NULL; + io->current_bio = btrfs_r5l_bio_alloc(log); + io->current_bio->bi_end_io = btrfs_r5l_log_endio; + io->current_bio->bi_private = io; + + bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + + btrfs_r5l_reserve_log_entry(log, io); + + INIT_LIST_HEAD(&io->list); + spin_lock(&log->io_list_lock); + list_add_tail(&io->list, &log->io_list); + spin_unlock(&log->io_list_lock); + return io; +} + +static int btrfs_r5l_get_meta(struct btrfs_r5l_log *log, struct btrfs_raid_bio *rbio, int payload_size) +{ + /* always allocate new meta block. */ + log->current_io = btrfs_r5l_new_meta(log); + ASSERT(log->current_io); + log->current_io->rbio = rbio; + return 0; +} + +static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u64 location, u64 devid, u32 csum) +{ + struct btrfs_r5l_io_unit *io = log->current_io; + struct btrfs_r5l_payload *payload; + void *ptr; + + ptr = kmap(io->meta_page); + payload = ptr + io->meta_offset; + payload->type = cpu_to_le16(type); + payload->flags = cpu_to_le16(0); + + if (type == R5LOG_PAYLOAD_DATA) + payload->size = cpu_to_le32(1); + else if (type == R5LOG_PAYLOAD_PARITY) + payload->size = cpu_to_le32(16); /* stripe_len / PAGE_SIZE */ + payload->devid = cpu_to_le64(devid); + payload->location = cpu_to_le64(location); + payload->csum = cpu_to_le32(csum); + kunmap(io->meta_page); + + io->meta_offset += sizeof(*payload); + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("io->meta_offset %d\n", io->meta_offset); +#endif +} + +static void btrfs_r5l_append_payload_page(struct btrfs_r5l_log *log, struct page *page) +{ + struct btrfs_r5l_io_unit *io = log->current_io; + + if (io->need_split_bio) { + /* We're submitting too much data at a time!! 
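+		 *
+		 * btrfs_r5l_reserve_log_entry() set need_split_bio when
+		 * log_start wrapped back to 0, and a single bio must not
+		 * span the end of the ring:
+		 *
+		 *   btrfs_r5l_ring_add(log, log->device_size - PAGE_SIZE,
+		 *			PAGE_SIZE) == 0
+		 *
+		 * so park the current bio in split_bio, start a fresh one
+		 * at offset 0 and tie the two together with bio_chain();
+		 * they then complete as one unit in btrfs_r5l_log_endio().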
+		 */
+		BUG_ON(io->split_bio);
+		io->split_bio = io->current_bio;
+		io->current_bio = btrfs_r5l_bio_alloc(log);
+		bio_chain(io->current_bio, io->split_bio);
+		io->need_split_bio = false;
+	}
+
+	ASSERT(bio_add_page(io->current_bio, page, PAGE_SIZE, 0));
+
+	btrfs_r5l_reserve_log_entry(log, io);
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("log->log_start %llu io->current_bio bi_iter (bi_sector 0x%llx bi_size %d)\n", log->log_start, io->current_bio->bi_iter.bi_sector << 9, io->current_bio->bi_iter.bi_size);
+#endif
+}
+
+static u64 btrfs_compute_location(struct btrfs_raid_bio *rbio, int stripe_nr, unsigned long page_index)
+{
+	struct btrfs_bio_stripe *stripe;
+
+	stripe = &rbio->bbio->stripes[stripe_nr];
+	return stripe->physical + (page_index << PAGE_SHIFT);
+}
+
+static u64 btrfs_compute_devid(struct btrfs_raid_bio *rbio, int stripe_nr)
+{
+	struct btrfs_bio_stripe *stripe;
+
+	stripe = &rbio->bbio->stripes[stripe_nr];
+	ASSERT(stripe->dev);
+	return stripe->dev->devid;
+}
+
+static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int parity_pages, struct btrfs_raid_bio *rbio)
+{
+	int meta_size;
+	int stripe, pagenr;
+	struct page *page;
+	char *kaddr;
+	u32 csum;
+	u64 location;
+	u64 devid;
+
+	/*
+	 * parity pages are contiguous on disk, thus only one
+	 * payload is required.
+	 */
+	meta_size = sizeof(struct btrfs_r5l_payload) * data_pages +
+		    sizeof(struct btrfs_r5l_payload) * (rbio->real_stripes - rbio->nr_data);
+
+	/* add meta block */
+	btrfs_r5l_get_meta(log, rbio, meta_size);
+
+	/* add data blocks which need to be written */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (!page)
+				continue;
+			/* the page is from bio, queued for log bio */
+			location = btrfs_compute_location(rbio, stripe, pagenr);
+			devid = btrfs_compute_devid(rbio, stripe);
+#ifdef BTRFS_DEBUG_R5LOG
+			trace_printk("data: stripe %d pagenr %d location 0x%llx devid %llu\n", stripe, pagenr, location, devid);
+#endif
+			kaddr = kmap(page);
+			csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE);
+			kunmap(page);
+
+			btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, location, devid, csum);
+			btrfs_r5l_append_payload_page(log, page);
+		}
+	}
+
+	/* add the whole parity blocks */
+	for (; stripe < rbio->real_stripes; stripe++) {
+		location = btrfs_compute_location(rbio, stripe, 0);
+		devid = btrfs_compute_devid(rbio, stripe);
+
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("parity: stripe %d location 0x%llx devid %llu\n", stripe, location, devid);
+#endif
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+
+			kaddr = kmap(page);
+			if (pagenr == 0)
+				csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE);
+			else
+				csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE);
+			kunmap(page);
+
+			btrfs_r5l_append_payload_page(log, page);
+		}
+
+		btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, location, devid, csum);
+	}
+}
+
+static void btrfs_r5l_submit_current_io(struct btrfs_r5l_log *log)
+{
+	struct btrfs_r5l_io_unit *io = log->current_io;
+	struct btrfs_r5l_meta_block *mb;
+	u32 csum;
+
+	if (!io)
+		return;
+
+	mb = kmap(io->meta_page);
+	mb->meta_size = cpu_to_le32(io->meta_offset);
+	ASSERT(mb->csum == 0);
+	csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE);
+	mb->csum = cpu_to_le32(csum);
+	kunmap(io->meta_page);
+
+	log->current_io = NULL;
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("io->current
bio bi_sector 0x%llx devid %llu\n", io->current_bio->bi_iter.bi_sector << 9, log->dev->devid); +#endif + /* + * make sure that r5l_log_endio does not run in interrupt + * context. + * + * if io->split_bio is available, then current_bio is just a + * chained bio. + */ + if (io->split_bio) + btrfs_bio_wq_end_io(log->fs_info, io->split_bio, BTRFS_WQ_ENDIO_RAID56); + else + btrfs_bio_wq_end_io(log->fs_info, io->current_bio, BTRFS_WQ_ENDIO_RAID56); + + submit_bio(io->current_bio); + if (io->split_bio) + submit_bio(io->split_bio); +} + +static u64 btrfs_r5l_ring_distance(struct btrfs_r5l_log *log, u64 start, u64 end) +{ + if (end >= start) + return end - start; + else + return end + (log->device_size) - start; +} + +static bool btrfs_r5l_has_free_space(struct btrfs_r5l_log *log, u64 size) +{ + u64 used_size; + used_size = btrfs_r5l_ring_distance(log, log->last_checkpoint, + log->log_start); + return log->device_size > (used_size + size); +} + +static int btrfs_r5l_sync_page_io(struct btrfs_r5l_log *log, + struct btrfs_device *dev, sector_t sector, + int size, struct page *page, int op) +{ + struct bio *bio = btrfs_io_bio_alloc(GFP_NOFS, 1); + int ret; + + bio->bi_bdev = dev->bdev; + bio->bi_opf = op; + if (dev == log->dev) + bio->bi_iter.bi_sector = (log->data_offset >> 9) + sector; + else + bio->bi_iter.bi_sector = sector; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: op %d bi_sector 0x%llx\n", __func__, op, (bio->bi_iter.bi_sector << 9)); +#endif + + bio_add_page(bio, page, size, 0); + submit_bio_wait(bio); + ret = !bio->bi_error; + bio_put(bio); + return ret; +} + +static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, u64 seq) +{ + struct page *page; + struct btrfs_r5l_meta_block *mb; + u32 csum; + int ret = 0; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: pos %llu seq %llu\n", __func__, pos, seq); +#endif + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO); + ASSERT(page); + + mb = kmap(page); + mb->magic = cpu_to_le32(BTRFS_R5LOG_MAGIC); + mb->meta_size = cpu_to_le32(sizeof(struct btrfs_r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); + + csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE); + mb->csum = cpu_to_le32(csum); + kunmap(page); + + if (!btrfs_r5l_sync_page_io(log, log->dev, (pos >> 9), PAGE_SIZE, page, REQ_OP_WRITE | REQ_FUA)) { + ret = -EIO; + } + + __free_page(page); + return ret; +} + +#define BTRFS_R5L_RECOVER_IO_POOL_SIZE BIO_MAX_PAGES +struct btrfs_r5l_recover_ctx { + u64 pos; + u64 seq; + u64 total_size; + struct page *meta_page; + struct page *io_page; + + struct page *ra_pages[BTRFS_R5L_RECOVER_IO_POOL_SIZE]; + struct bio *ra_bio; + int total; + int valid; + u64 start_offset; + + struct btrfs_r5l_log *log; +}; + +static int btrfs_r5l_recover_read_ra(struct btrfs_r5l_recover_ctx *ctx, u64 offset) +{ + bio_reset(ctx->ra_bio); + ctx->ra_bio->bi_bdev = ctx->log->dev->bdev; + ctx->ra_bio->bi_opf = REQ_OP_READ; + ctx->ra_bio->bi_iter.bi_sector = (ctx->log->data_offset + offset) >> 9; + + ctx->valid = 0; + ctx->start_offset = offset; + + while (ctx->valid < ctx->total) { + bio_add_page(ctx->ra_bio, ctx->ra_pages[ctx->valid++], PAGE_SIZE, 0); + + offset = btrfs_r5l_ring_add(ctx->log, offset, PAGE_SIZE); + if (offset == 0) + break; + } + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("to read %d pages starting from 0x%llx\n", ctx->valid, ctx->log->data_offset + ctx->start_offset); +#endif + return submit_bio_wait(ctx->ra_bio); +} + +static int btrfs_r5l_recover_read_page(struct btrfs_r5l_recover_ctx *ctx, struct 
page *page, u64 offset) +{ + struct page *tmp; + int index; + char *src; + char *dst; + int ret; + + if (offset < ctx->start_offset || offset >= (ctx->start_offset + ctx->valid * PAGE_SIZE)) { + ret = btrfs_r5l_recover_read_ra(ctx, offset); + if (ret) + return ret; + } + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("offset 0x%llx start->offset 0x%llx ctx->valid %d\n", offset, ctx->start_offset, ctx->valid); +#endif + + ASSERT(IS_ALIGNED(ctx->start_offset, PAGE_SIZE)); + ASSERT(IS_ALIGNED(offset, PAGE_SIZE)); + + index = (offset - ctx->start_offset) >> PAGE_SHIFT; + ASSERT(index < ctx->valid); + + tmp = ctx->ra_pages[index]; + src = kmap(tmp); + dst = kmap(page); + memcpy(dst, src, PAGE_SIZE); + kunmap(page); + kunmap(tmp); + return 0; +} + +static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) +{ + struct btrfs_r5l_meta_block *mb; + u32 csum; + u32 expected; + int ret = 0; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->meta_page, ctx->pos); + if (ret) + return ret; + + mb = kmap(ctx->meta_page); +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("ctx->pos %llu ctx->seq %llu pos %llu seq %llu\n", ctx->pos, ctx->seq, le64_to_cpu(mb->position), le64_to_cpu(mb->seq)); +#endif + + if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC || + le64_to_cpu(mb->position) != ctx->pos || + le64_to_cpu(mb->seq) != ctx->seq) { +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("%s: mismatch magic %llu default %llu\n", __func__, le32_to_cpu(mb->magic), BTRFS_R5LOG_MAGIC); +#endif + ret = -EINVAL; + goto out; + } + + expected = le32_to_cpu(mb->csum); + /* + * when we calculate mb->csum, it's zero, so we need to zero + * it back. + */ + mb->csum = 0; + csum = btrfs_crc32c(ctx->log->uuid_csum, mb, PAGE_SIZE); + if (csum != expected) { +#ifdef BTRFS_DEBUG_R5LOG + pr_info("%s: mismatch checksum for r5l meta block\n", __func__); +#endif + ret = -EINVAL; + goto out; + } + + ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE); + /* meta_block */ + ctx->total_size = PAGE_SIZE; + +out: + kunmap(ctx->meta_page); + + return ret; +} + +static int btrfs_r5l_recover_verify_checksum(struct btrfs_r5l_recover_ctx *ctx) +{ + u64 offset; + u32 meta_size; + u64 csum_io_offset; + u64 read_pos; + char *kaddr; + u32 csum; + int type; + struct btrfs_r5l_meta_block *mb; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + struct btrfs_device *dev; + int ret = 0; + + mb = kmap(ctx->meta_page); + meta_size = le32_to_cpu(mb->meta_size); + csum_io_offset = PAGE_SIZE; + + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; + + /* check if there is any invalid device, if so, skip writing this mb. 
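+		 *
+		 * The payload walk below mirrors the layout that
+		 * btrfs_r5l_log_stripe() wrote: the meta page first, then
+		 * the described pages in payload order, e.g. for a RAID5
+		 * stripe with two dirty data pages:
+		 *
+		 *   ctx->pos + 0 pages  meta block
+		 *   ctx->pos + 1 page   data  (payload 0, size 1)
+		 *   ctx->pos + 2 pages  data  (payload 1, size 1)
+		 *   ctx->pos + 3 pages  parity run (payload 2, size 16)
+		 *
+		 * which is why csum_io_offset advances one page per data
+		 * payload and payload->size pages for parity.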
*/ + dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL); + if (!dev || dev->missing) { + ret = -EINVAL; + goto out; + } + + type = le16_to_cpu(payload->type); + if (type == R5LOG_PAYLOAD_DATA) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ASSERT(le32_to_cpu(payload->size) == 1); + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } else if (type == R5LOG_PAYLOAD_PARITY) { + int i; + for (i = 0; i < le32_to_cpu(payload->size); i++) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + if (i == 0) + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + else + csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } + } else { + ASSERT(0); + } + + if (csum != le32_to_cpu(payload->csum)) { + trace_printk("r5l data csum fails location 0x%llx devid %llu\n", le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); + ret = -EAGAIN; + goto out; + } + } +out: + kunmap(ctx->meta_page); + return ret; +} + +static int btrfs_r5l_recover_load_data(struct btrfs_r5l_recover_ctx *ctx) +{ + u64 offset; + struct btrfs_r5l_meta_block *mb; + u32 meta_size; + u64 io_offset; + u64 read_pos; + struct btrfs_device *dev; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + int ret = 0; + + /* if any checksum fails, skip writing this mb. */ + ret = btrfs_r5l_recover_verify_checksum(ctx); + if (ret) + return ret; + + mb = kmap(ctx->meta_page); + + io_offset = PAGE_SIZE; + offset = sizeof(struct btrfs_r5l_meta_block); + meta_size = le32_to_cpu(mb->meta_size); + + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; + + /* read data from log disk and write to payload->location */ +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("payload type %d flags %d size %d location 0x%llx devid %llu\n", le16_to_cpu(payload->type), le16_to_cpu(payload->flags), le32_to_cpu(payload->size), le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); +#endif + + /* liubo: how to handle the case where dev is suddenly off? 
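+		 *
+		 * Each payload is self-describing: (devid, location) name
+		 * the target disk and the physical byte offset this page
+		 * was meant to reach, so replay is a plain synchronous
+		 * rewrite:
+		 *
+		 *   dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+		 *   btrfs_r5l_sync_page_io(log, dev, location >> 9,
+		 *			    PAGE_SIZE, page, REQ_OP_WRITE);
+		 *
+		 * A device that vanished between the checksum pass and
+		 * here still trips the ASSERT below; the checksum pass
+		 * already skips the whole meta block when a referenced
+		 * device cannot be found.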
+		 */
+		dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL);
+		ASSERT(dev && !dev->missing);
+
+		if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) {
+			read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset);
+			io_offset += PAGE_SIZE;
+
+			ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos);
+			if (ret)
+				goto out;
+
+			if (!btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) {
+				ret = -EIO;
+				goto out;
+			}
+		} else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) {
+			int i;
+
+			ASSERT(offset + sizeof(struct btrfs_r5l_payload) == meta_size);
+
+			for (i = 0; i < le32_to_cpu(payload->size); i++) {
+				u64 parity_off = le64_to_cpu(payload->location) + i * PAGE_SIZE;
+				read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset);
+				io_offset += PAGE_SIZE;
+
+				ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos);
+				if (ret)
+					goto out;
+
+				if (!btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) {
+					ret = -EIO;
+					goto out;
+				}
+			}
+		} else {
+			ASSERT(0);
+		}
+	}
+
+	ctx->total_size += (io_offset - PAGE_SIZE);
+out:
+	kunmap(ctx->meta_page);
+	return ret;
+}
+
+static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_recover_ctx *ctx)
+{
+	int ret;
+
+	while (1) {
+		ret = btrfs_r5l_recover_load_meta(ctx);
+		if (ret)
+			break;
+
+		ret = btrfs_r5l_recover_load_data(ctx);
+		if (ret && ret != -EAGAIN)
+			break;
+
+		ctx->seq++;
+		ctx->pos = btrfs_r5l_ring_add(ctx->log, ctx->pos, ctx->total_size);
+	}
+
+	return 0;
+}
+
+static int btrfs_r5l_recover_allocate_ra(struct btrfs_r5l_recover_ctx *ctx)
+{
+	struct page *page;
+
+	ctx->ra_bio = btrfs_io_bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+
+	ctx->total = 0;
+	ctx->valid = 0;
+	while (ctx->total < BTRFS_R5L_RECOVER_IO_POOL_SIZE) {
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			break;
+
+		ctx->ra_pages[ctx->total++] = page;
+	}
+
+	if (ctx->total == 0) {
+		bio_put(ctx->ra_bio);
+		return -ENOMEM;
+	}
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("readahead: %d allocated pages\n", ctx->total);
+#endif
+	return 0;
+}
+
+static void btrfs_r5l_recover_free_ra(struct btrfs_r5l_recover_ctx *ctx)
+{
+	int i;
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("readahead: %d to free pages\n", ctx->total);
+#endif
+	for (i = 0; i < ctx->total; i++)
+		__free_page(ctx->ra_pages[i]);
+	bio_put(ctx->ra_bio);
+}
+
+static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp);
+
+static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log)
+{
+	struct btrfs_r5l_recover_ctx *ctx;
+	u64 pos;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+	ASSERT(ctx);
+
+	ctx->log = log;
+	ctx->pos = log->last_checkpoint;
+	ctx->seq = log->last_cp_seq;
+	ctx->meta_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	ASSERT(ctx->meta_page);
+	ctx->io_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	ASSERT(ctx->io_page);
+
+	ret = btrfs_r5l_recover_allocate_ra(ctx);
+	ASSERT(ret == 0);
+
+	btrfs_r5l_recover_flush_log(ctx);
+
+	pos = ctx->pos;
+	log->next_checkpoint = ctx->pos;
+	/*
+	 * bump the sequence well past anything the scan may have seen
+	 * so that the terminating empty block cannot be mistaken for a
+	 * stale log entry.
+	 */
+	ctx->seq += 10000;
+	btrfs_r5l_write_empty_meta_block(log, ctx->pos, ctx->seq++);
+	ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, PAGE_SIZE);
+
+	log->log_start = ctx->pos;
+	log->seq = ctx->seq;
+	/* last_checkpoint points to the empty block.
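+	 *
+	 * After recovery the ring therefore looks like:
+	 *
+	 *   last_checkpoint -> empty meta block (freshly written)
+	 *   log_start       -> first free page right behind it
+	 *
+	 * and btrfs_r5l_write_super() below persists that tail, so a
+	 * second mount will not replay the entries we just flushed to
+	 * the RAID disks.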
+	 */
+	log->last_checkpoint = pos;
+	btrfs_r5l_write_super(log->fs_info, pos);
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("%s: log_start %llu seq %llu\n", __func__, log->log_start, log->seq);
+#endif
+	__free_page(ctx->meta_page);
+	__free_page(ctx->io_page);
+	btrfs_r5l_recover_free_ra(ctx);
+	kfree(ctx);
+	return 0;
+}
+
+/* returns 0 on success, otherwise a negative error */
+int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, struct btrfs_r5l_log *r5log, u64 cp)
+{
+	struct btrfs_r5l_log *log;
+	struct page *page;
+	struct btrfs_r5l_meta_block *mb;
+	bool create_new = false;
+	int ret = 0;
+
+	/* exactly one of @r5log and fs_info->r5log may be set */
+	if (r5log)
+		ASSERT(fs_info->r5log == NULL);
+	if (fs_info->r5log)
+		ASSERT(r5log == NULL);
+
+	if (fs_info->r5log)
+		log = fs_info->r5log;
+	else
+		/*
+		 * this only happens when adding the raid56 log for
+		 * the first time.
+		 */
+		log = r5log;
+
+	ASSERT(log);
+
+	page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	ASSERT(page);
+
+	if (!btrfs_r5l_sync_page_io(log, log->dev, (cp >> 9), PAGE_SIZE, page,
+				    REQ_OP_READ)) {
+		__free_page(page);
+		return -EIO;
+	}
+
+	mb = kmap(page);
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("r5l: mb->pos %llu cp %llu mb->seq %llu\n", le64_to_cpu(mb->position), cp, le64_to_cpu(mb->seq));
+#endif
+
+	if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC) {
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("magic does not match: create new r5l\n");
+#endif
+		create_new = true;
+		goto create;
+	}
+
+	if (le64_to_cpu(mb->position) != cp) {
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("mb->position does not match: create new r5l\n");
+#endif
+		create_new = true;
+	}
+create:
+	if (create_new) {
+		/* initialize a new r5log */
+		log->last_cp_seq = prandom_u32();
+		cp = 0;
+
+		btrfs_r5l_write_empty_meta_block(log, cp, log->last_cp_seq);
+		btrfs_r5l_write_super(fs_info, cp);
+	} else {
+		log->last_cp_seq = le64_to_cpu(mb->seq);
+	}
+
+	log->last_checkpoint = cp;
+
+	kunmap(page);
+	__free_page(page);
+
+	if (create_new) {
+		log->log_start = btrfs_r5l_ring_add(log, cp, PAGE_SIZE);
+		log->seq = log->last_cp_seq + 1;
+		log->next_checkpoint = cp;
+	} else {
+		ret = btrfs_r5l_recover_log(log);
+	}
+
+	return ret;
+}
+
+/*
+ * writing super with log->next_checkpoint
+ *
+ * This is protected by log->io_mutex.
+ */
+static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp)
+{
+	int ret;
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("r5l writing super to reclaim space, cp %llu\n", cp);
+#endif
+
+	btrfs_set_super_journal_tail(fs_info->super_for_commit, cp);
+
+	/*
+	 * flush all disk caches so that all data prior to
+	 * %next_checkpoint lands on the raid disks (recovery will
+	 * start from %next_checkpoint).
+	 */
+	ret = write_all_supers(fs_info, 1);
+	ASSERT(ret == 0);
+}
+
+/* this is called by commit transaction and it's followed by writing super. */
+void btrfs_r5l_write_journal_tail(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->r5log) {
+		u64 cp = READ_ONCE(fs_info->r5log->next_checkpoint);
+
+#ifdef BTRFS_DEBUG_R5LOG
+		trace_printk("journal_tail %llu\n", cp);
+#endif
+		btrfs_set_super_journal_tail(fs_info->super_copy, cp);
+		WRITE_ONCE(fs_info->r5log->last_checkpoint, cp);
+	}
+}
+
+/*
+ * return 0 if data/parity have been written into the log; the rbio
+ * is then written to the RAID disks from the log endio.
+ *
+ * return 1 if the log is not available or has no space left.
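+ *
+ * The intended calling pattern, as wired up in finish_rmw(), is:
+ *
+ *	ret = btrfs_r5l_write_stripe(rbio);
+ *	if (ret == 0)
+ *		return;
+ *	btrfs_write_rbio(rbio);
+ *
+ * i.e. on 0 the RAID write is deferred to the log endio, on 1 the
+ * caller falls back to writing the RAID disks directly.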
+ */ +static int btrfs_r5l_write_stripe(struct btrfs_raid_bio *rbio) +{ + int stripe, pagenr; + int data_pages = 0, parity_pages = 0; + u64 reserve; + int meta_size; + bool do_submit = false; + struct btrfs_r5l_log *log = rbio->fs_info->r5log; + + if (!log) { +#ifdef BTRFS_DEBUG_R5LOG + btrfs_info(rbio->fs_info, "r5log is not available\n"); +#endif + return 1; + } + + /* get data_pages and parity_pages */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + data_pages++; + } else { + parity_pages++; + } + } + } +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("data_pages %d parity_pages %d\n", data_pages, parity_pages); + ASSERT(parity_pages == 16 * (rbio->real_stripes - rbio->nr_data)); +#endif -/* only allocate pages for p/q stripes */ -static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) -{ - int i; - struct page *page; + /* + * parity pages are contiguous on disk, thus only one + * payload is required. + */ + meta_size = sizeof(struct btrfs_r5l_payload) * data_pages + + sizeof(struct btrfs_r5l_payload) * (rbio->real_stripes - rbio->nr_data); + + /* doesn't support large raid array */ + if (meta_size + sizeof(struct btrfs_r5l_meta_block) > PAGE_SIZE) { +#ifdef BTRFS_DEBUG_R5LOG + btrfs_info(rbio->fs_info, "meta_size (%d) is too big\n", meta_size); +#endif + return 1; + } - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + mutex_lock(&log->io_mutex); + /* meta + data/parity */ + reserve = (1 + data_pages + parity_pages) << PAGE_SHIFT; + if (btrfs_r5l_has_free_space(log, reserve)) { + btrfs_r5l_log_stripe(log, data_pages, parity_pages, rbio); + do_submit = true; + } else { +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5log: no space log->last_checkpoint %llu log->log_start %llu log->next_checkpoint %llu\n", log->last_checkpoint, log->log_start, log->next_checkpoint); +#endif - for (; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; + /* + * reclaim works via writing to log device with the + * new next_checkpoint. + */ + btrfs_r5l_write_super(rbio->fs_info, log->next_checkpoint); + + log->last_checkpoint = log->next_checkpoint; + +#ifdef BTRFS_DEBUG_R5LOG + trace_printk("r5log: after reclaim(write super) log->last_checkpoint %llu log->log_start %llu log->next_checkpoint %llu\n", log->last_checkpoint, log->log_start, log->next_checkpoint); +#endif + /* now we should have enough space. */ + ASSERT(btrfs_r5l_has_free_space(log, reserve)); + btrfs_r5l_log_stripe(log, data_pages, parity_pages, rbio); + do_submit = true; } - return 0; + + if (do_submit) { + btrfs_r5l_submit_current_io(log); + } + mutex_unlock(&log->io_mutex); + + return (do_submit ? 0 : 1); } -/* - * add a single page from a specific stripe into our list of bios for IO - * this will try to merge into existing bios if possible, and returns - * zero if all went well. 
- */ -static int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) +static void btrfs_write_rbio(struct btrfs_raid_bio *rbio) { - struct bio *last = bio_list->tail; - u64 last_end = 0; - int ret; + struct btrfs_bio *bbio = rbio->bbio; + int stripe, pagenr; + struct bio_list bio_list; struct bio *bio; - struct btrfs_bio_stripe *stripe; - u64 disk_start; - - stripe = &rbio->bbio->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_SHIFT); + int ret = 0; - /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) - return fail_rbio_index(rbio, stripe_nr); + bio_list_init(&bio_list); - /* see if we can add this page onto our existing bio */ - if (last) { - last_end = (u64)last->bi_iter.bi_sector << 9; - last_end += last->bi_iter.bi_size; + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } - /* - * we can't merge these if they are from different - * devices or if they are not contiguous - */ - if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && - last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_SIZE, 0); - if (ret == PAGE_SIZE) - return 0; + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto out; } } - /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); - if (!bio) - return -ENOMEM; + if (likely(!bbio->num_tgtdevs)) + goto write_data; - bio->bi_iter.bi_size = 0; - bio->bi_bdev = stripe->dev->bdev; - bio->bi_iter.bi_sector = disk_start >> 9; + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (!bbio->tgtdev_map[stripe]) + continue; - bio_add_page(bio, page, PAGE_SIZE, 0); - bio_list_add(bio_list, bio); - return 0; -} + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } -/* - * while we're doing the read/modify/write cycle, we could - * have errors in reading pages off the disk. This checks - * for errors and if we're not able to read the page it'll - * trigger parity reconstruction. The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) -{ - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); + ret = rbio_add_io_page(rbio, &bio_list, page, + rbio->bbio->tgtdev_map[stripe], + pagenr, rbio->stripe_len); + if (ret) + goto out; + } } -} -/* - * helper function to walk our bio list and populate the bio_pages array with - * the result. This seems expensive, but it is faster than constantly - * searching through the bio list as we setup the IO in finish_rmw or stripe - * reconstruction. 
- *
- * This must be called before you trust the answers from page_in_rbio
- */
-static void index_rbio_pages(struct btrfs_raid_bio *rbio)
-{
-	struct bio *bio;
-	struct bio_vec *bvec;
-	u64 start;
-	unsigned long stripe_offset;
-	unsigned long page_index;
-	int i;
+write_data:
+	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
 
-	spin_lock_irq(&rbio->bio_list_lock);
-	bio_list_for_each(bio, &rbio->bio_list) {
-		start = (u64)bio->bi_iter.bi_sector << 9;
-		stripe_offset = start - rbio->bbio->raid_map[0];
-		page_index = stripe_offset >> PAGE_SHIFT;
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
 
-		bio_for_each_segment_all(bvec, bio, i)
-			rbio->bio_pages[page_index + i] = bvec->bv_page;
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_end_io;
+		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+		submit_bio(bio);
 	}
-	spin_unlock_irq(&rbio->bio_list_lock);
+out:
+	ASSERT(ret == 0 || ret == -EIO);
+	if (ret == -EIO)
+		rbio_orig_end_io(rbio, -EIO);
 }
 
 /*
@@ -1170,19 +2261,14 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
-	struct btrfs_bio *bbio = rbio->bbio;
 	void *pointers[rbio->real_stripes];
 	int nr_data = rbio->nr_data;
 	int stripe;
 	int pagenr;
 	int p_stripe = -1;
 	int q_stripe = -1;
-	struct bio_list bio_list;
-	struct bio *bio;
 	int ret;
 
-	bio_list_init(&bio_list);
-
 	if (rbio->real_stripes - rbio->nr_data == 1) {
 		p_stripe = rbio->real_stripes - 1;
 	} else if (rbio->real_stripes - rbio->nr_data == 2) {
@@ -1262,68 +2348,15 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
 	 * everything else.
 	 */
-	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
-		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
-			struct page *page;
-			if (stripe < rbio->nr_data) {
-				page = page_in_rbio(rbio, stripe, pagenr, 1);
-				if (!page)
-					continue;
-			} else {
-				page = rbio_stripe_page(rbio, stripe, pagenr);
-			}
-
-			ret = rbio_add_io_page(rbio, &bio_list,
-				       page, stripe, pagenr, rbio->stripe_len);
-			if (ret)
-				goto cleanup;
-		}
-	}
-
-	if (likely(!bbio->num_tgtdevs))
-		goto write_data;
-
-	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
-		if (!bbio->tgtdev_map[stripe])
-			continue;
-
-		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
-			struct page *page;
-			if (stripe < rbio->nr_data) {
-				page = page_in_rbio(rbio, stripe, pagenr, 1);
-				if (!page)
-					continue;
-			} else {
-				page = rbio_stripe_page(rbio, stripe, pagenr);
-			}
-
-			ret = rbio_add_io_page(rbio, &bio_list, page,
-					       rbio->bbio->tgtdev_map[stripe],
-					       pagenr, rbio->stripe_len);
-			if (ret)
-				goto cleanup;
-		}
-	}
-
-write_data:
-	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
-	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
-
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
-			break;
-
-		bio->bi_private = rbio;
-		bio->bi_end_io = raid_write_end_io;
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+	/* write to the log device first */
+	ret = btrfs_r5l_write_stripe(rbio);
+	if (ret == 0)
+		return;
 
-		submit_bio(bio);
-	}
+	/* if no log, let's write data to RAID.
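+	 *
+	 * The write path is therefore:
+	 *
+	 *   finish_rmw()
+	 *     btrfs_r5l_write_stripe()  == 0: data+parity hit the log,
+	 *                                     RAID write happens in the
+	 *                                     log endio
+	 *     btrfs_write_rbio()        fallback: direct RAID write (no
+	 *                                     log, or the stripe did not
+	 *                                     fit into one meta block)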
+	 */
+	btrfs_write_rbio(rbio);
 
 	return;
-
-cleanup:
-	rbio_orig_end_io(rbio, -EIO);
 }
 
 /*
@@ -2715,3 +3748,67 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
 	if (!lock_stripe_add(rbio))
 		async_missing_raid56(rbio);
 }
+
+struct btrfs_r5l_log *btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct block_device *bdev)
+{
+	int num_devices = fs_info->fs_devices->num_devices;
+	u64 dev_total_bytes;
+	struct btrfs_r5l_log *log = kzalloc(sizeof(struct btrfs_r5l_log), GFP_NOFS);
+
+	if (!log)
+		return ERR_PTR(-ENOMEM);
+
+	ASSERT(device);
+	ASSERT(bdev);
+	dev_total_bytes = i_size_read(bdev->bd_inode);
+
+	/* see find_free_dev_extent for the 1M start offset */
+	log->data_offset = 1024ull * 1024;
+	log->device_size = dev_total_bytes - log->data_offset;
+	log->device_size = round_down(log->device_size, PAGE_SIZE);
+
+	/*
+	 * if the device is already included in fs_devices, do not
+	 * count it a second time when checking the log size.
+	 */
+	if (device->in_fs_metadata)
+		num_devices--;
+
+	if (log->device_size < (u64)BTRFS_STRIPE_LEN * num_devices * 2) {
+		btrfs_info(fs_info, "r5log: log device size (%llu < %llu) is too small", log->device_size, (u64)BTRFS_STRIPE_LEN * num_devices * 2);
+		kfree(log);
+		return ERR_PTR(-EINVAL);
+	}
+
+	log->dev = device;
+	log->fs_info = fs_info;
+	ASSERT(sizeof(device->uuid) == BTRFS_UUID_SIZE);
+	log->uuid_csum = btrfs_crc32c(~0, device->uuid, sizeof(device->uuid));
+	mutex_init(&log->io_mutex);
+	spin_lock_init(&log->io_list_lock);
+	INIT_LIST_HEAD(&log->io_list);
+
+	return log;
+}
+
+void btrfs_r5l_init_log_post(struct btrfs_fs_info *fs_info, struct btrfs_r5l_log *log)
+{
+	cmpxchg(&fs_info->r5log, NULL, log);
+	ASSERT(fs_info->r5log == log);
+
+#ifdef BTRFS_DEBUG_R5LOG
+	trace_printk("r5log: set a r5log in fs_info, alloc_range 0x%llx 0x%llx\n",
+		     log->data_offset, log->data_offset + log->device_size);
+#endif
+}
+
+int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
+{
+	struct btrfs_r5l_log *log;
+
+	log = btrfs_r5l_init_log_prepare(fs_info, device, device->bdev);
+	if (IS_ERR(log))
+		return PTR_ERR(log);
+
+	btrfs_r5l_init_log_post(fs_info, log);
+	return 0;
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 4ee4fe346838ce..fc4ff20346778f 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,6 +39,80 @@ static inline int nr_data_stripes(struct map_lookup *map)
 #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||	\
 			     ((x) == RAID6_Q_STRIPE))
 
+/* r5log */
+struct btrfs_r5l_log;
+#define BTRFS_R5LOG_MAGIC 0x6433c509
+
+#define BTRFS_R5L_STRIPE_END 1
+
+/* one meta block + several data + parity blocks */
+struct btrfs_r5l_io_unit {
+	struct btrfs_r5l_log *log;
+	struct btrfs_raid_bio *rbio;
+
+	struct list_head list;
+	int status;
+
+	/* store meta block */
+	struct page *meta_page;
+
+	/* current offset in meta page */
+	int meta_offset;
+
+	/* current bio for accepting new data/parity block */
+	struct bio *current_bio;
+
+	/* sequence number in meta block */
+	u64 seq;
+
+	/* where io_unit starts and ends */
+	u64 log_start;
+	u64 log_end;
+
+	/* split bio to hold more data */
+	bool need_split_bio;
+	struct bio *split_bio;
+};
+
+enum r5l_payload_type {
+	R5LOG_PAYLOAD_DATA = 0,
+	R5LOG_PAYLOAD_PARITY = 1,
+};
+
+/*
+ * a payload is appended to the meta block and describes the
+ * location and size of the data or parity it covers.
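+ *
+ * A meta page filled by btrfs_r5l_log_stripe() for one RAID5 stripe
+ * with two dirty data pages would carry, for example:
+ *
+ *	payload[0] = { DATA,   size 1,  location, devid, csum }
+ *	payload[1] = { DATA,   size 1,  location, devid, csum }
+ *	payload[2] = { PARITY, size 16, location, devid, csum }
+ *
+ * size is in pages; a parity payload covers the whole contiguous
+ * parity stripe (BTRFS_STRIPE_LEN / PAGE_SIZE pages).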
+ */ +struct btrfs_r5l_payload { + __le16 type; + __le16 flags; + + __le32 size; + + /* data or parity */ + __le64 location; + __le64 devid; + + __le32 csum; +}; + +/* io unit starts from a meta block. */ +struct btrfs_r5l_meta_block { + __le32 magic; + + /* the whole size of the block */ + __le32 meta_size; + + __le32 csum; + + __le64 seq; + __le64 position; + + struct btrfs_r5l_payload payload[]; +}; + +/* r5log end */ + struct btrfs_raid_bio; struct btrfs_device; @@ -65,4 +139,12 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +struct btrfs_r5l_log * btrfs_r5l_init_log_prepare(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, + struct block_device *bdev); +void btrfs_r5l_init_log_post(struct btrfs_fs_info *fs_info, + struct btrfs_r5l_log *log); +int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device); +int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, + struct btrfs_r5l_log *r5log, u64 cp); #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2168654c90a1e6..e312e5ada7cc60 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2238,6 +2238,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_set_super_log_root(fs_info->super_copy, 0); btrfs_set_super_log_root_level(fs_info->super_copy, 0); + btrfs_r5l_write_journal_tail(fs_info); + memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_copy)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67daa3bbf3..7f848d79ef513b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2313,7 +2313,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, return ret; } -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path, const u64 flags) { struct btrfs_root *root = fs_info->dev_root; struct request_queue *q; @@ -2326,6 +2326,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool is_r5log = (flags & BTRFS_DEVICE_RAID56_LOG); + struct btrfs_r5l_log *r5log = NULL; + + if (is_r5log) + ASSERT(!fs_info->fs_devices->seeding); if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2363,6 +2368,15 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (is_r5log) { + r5log = btrfs_r5l_init_log_prepare(fs_info, device, bdev); + if (IS_ERR(r5log)) { + kfree(device); + ret = PTR_ERR(r5log); + goto error; + } + } + name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { kfree(device); @@ -2382,6 +2396,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; + if (is_r5log) + device->type |= BTRFS_DEV_RAID56_LOG; device->writeable = 1; device->generation = trans->transid; device->io_width = fs_info->sectorsize; @@ -2434,11 +2450,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path /* add sysfs device entry */ btrfs_sysfs_add_device_link(fs_info->fs_devices, device); - /* - * we've got more storage, clear any full flags on the space - * infos - */ - btrfs_clear_space_info_full(fs_info); + if (!is_r5log) { + /* + * we've got more storage, clear any full flags on the 
space + * infos + */ + btrfs_clear_space_info_full(fs_info); + } mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -2502,6 +2520,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_commit_transaction(trans); } + if (is_r5log) { + /* initialize r5log with cp == 0. */ + btrfs_r5l_load_log(fs_info, r5log, 0); + btrfs_r5l_init_log_post(fs_info, r5log); + } + /* Update ctime/mtime for libblkid */ update_dev_time(device_path); return ret; @@ -4716,8 +4740,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } if (!device->in_fs_metadata || - device->is_tgtdev_for_dev_replace) + device->is_tgtdev_for_dev_replace || + (device->type & BTRFS_DEV_RAID56_LOG)) { +#ifdef BTRFS_DEBUG_R5LOG + if (device->type & BTRFS_DEV_RAID56_LOG) + btrfs_info(info, "skip a r5log when alloc chunk\n"); +#endif continue; + } if (device->total_bytes > device->bytes_used) total_avail = device->total_bytes - device->bytes_used; @@ -6689,6 +6719,18 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device->bytes_used; spin_unlock(&fs_info->free_chunk_lock); } + + if (device->type & BTRFS_DEV_RAID56_LOG) { + ret = btrfs_set_r5log(fs_info, device); + if (ret) { + btrfs_err(fs_info, "error %d on loading r5log", ret); + return ret; + } + + btrfs_info(fs_info, "devid %llu uuid %pU is raid56 log", + device->devid, device->uuid); + } + ret = 0; return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c7d0fbc915cabd..44cc3fa0a8da32 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -26,6 +26,10 @@ extern struct mutex uuid_mutex; +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_DEBUG_R5LOG +#endif + #define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; @@ -437,7 +441,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); -int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path, + const u64 flags); int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, const char *device_path, struct btrfs_device *srcdev, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index a456e5309238bb..be5991f267ee43 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -35,6 +35,7 @@ struct btrfs_ioctl_vol_args { #define BTRFS_DEVICE_PATH_NAME_MAX 1024 #define BTRFS_DEVICE_SPEC_BY_ID (1ULL << 3) +#define BTRFS_DEVICE_RAID56_LOG (1ULL << 4) #define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \ (BTRFS_SUBVOL_CREATE_ASYNC | \ @@ -818,5 +819,7 @@ enum btrfs_err_code { struct btrfs_ioctl_feature_flags[3]) #define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_ADD_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 59, \ + struct btrfs_ioctl_vol_args_v2) #endif /* _UAPI_LINUX_BTRFS_H */ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 10689e1fdf11d1..52fed59e85e768 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -347,6 +347,10 @@ struct btrfs_key { __u64 offset; } __attribute__ ((__packed__)); +/* dev_item.type */ +/* #define BTRFS_DEV_REGULAR 0 */ +#define BTRFS_DEV_RAID56_LOG (1ULL << 0) + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid;