diff --git a/README.md b/README.md
index 2531bff..9143c96 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,9 @@ All datasets are compressed with [zstd](https://github.com/facebook/zstd). You c
 
 > [!NOTE]
 > libCacheSim can directly work with compressed data, so no decompression is needed if you use libCacheSim to run simulations.
 
+#### Trace Conversion
+
+The folder `conversion_scripts` contains the scripts we use to convert the original traces into formats suitable for the [libCacheSim][libCacheSim] platform, while preserving the semantics of the source traces to the best of our knowledge.
 
 # Key-value Cache Traces
@@ -315,7 +318,7 @@ The original release can be found at [here][src-meta].
 
 - `op_time`: The time of the request
 - `block_id`: The requested block ID
-- `block_id_size`: The size of the requested block (in MB), this field is always 40
+- `block_id_size`: The size of the `block_id` field; this field is always 40
 - `io_size`: The requested size, note that requests often do not ask for the full block
 - `io_offset`: The start offset in the block
 - `user_name`: Anonymized username (represent different use cases)
@@ -325,6 +328,7 @@ The original release can be found at [here][src-meta].
 - `host_name`: Anonymized host name that serves the request
 - `rs_shard_id`: Reed-solomon shard ID
 
+A storage node block is uniquely identified by its `block_id` + `rs_shard_id`. Each block is in turn broken into chunks to be cached. The script `conversion_scripts/conv_meta_block.cpp` performs this breakdown and creates a new trace in which each chunk is treated as a cache object.
 
 ### Download Links
 * **Plain text**: [S3][metaStorage-s3-txt]
diff --git a/conversion_scripts/conv_meta_block.cpp b/conversion_scripts/conv_meta_block.cpp
new file mode 100644
index 0000000..215a5d6
--- /dev/null
+++ b/conversion_scripts/conv_meta_block.cpp
@@ -0,0 +1,104 @@
+// Script to convert Meta's block storage traces to libCacheSim csv format
+//
+// Usage:
+//   g++ conv_meta_block.cpp -o conv_meta_block
+//   ./conv_meta_block block_traces_1_conved.csv block_traces_1.csv
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+constexpr size_t CHUNK_SIZE = 128 * 1024;  // 128 KiB
+constexpr double SAMPLE_RATIO = 0.1;  // sample SAMPLE_RATIO fraction of blocks
+constexpr size_t SAMPLE_SEED = 42;
+
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    std::cout << "Usage: " << argv[0]
+              << " <output_csv> <input_csv1> [<input_csv2> ...]\n";
+    return EXIT_FAILURE;
+  }
+
+  // open files
+  std::ofstream output_csv{argv[1]};
+  if (!output_csv) {
+    std::cout << "Error: Could not open file " << argv[1] << std::endl;
+    return EXIT_FAILURE;
+  }
+  std::vector<std::ifstream> input_csvs{};
+  for (int i = 2; i < argc; ++i) {
+    input_csvs.emplace_back(argv[i]);
+    if (!input_csvs.back()) {
+      std::cout << "Error: Could not open file " << argv[i] << std::endl;
+      return EXIT_FAILURE;
+    }
+  }
+
+  // vars for sampling
+  std::unordered_set<std::string> keep{};
+  std::unordered_set<std::string> dont_keep{};
+  std::mt19937 gen{SAMPLE_SEED};
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+
+  // header of output csv, following libCacheSim csv format
+  output_csv << "timestamp,obj_id,obj_size\n";
+
+  // fields of input csvs we are interested in, following Meta's block storage
+  // traces format
+  size_t op_time;           // 0
+  std::string block_id;     // 1
+  size_t io_size;           // 3
+  size_t io_offset;         // 4
+  std::string rs_shard_id;  // 10
+
+  // read input_csvs, write to output_csv
+  std::string line;
+  std::string cell;
+  std::vector<std::string> cells{};
+  for (auto& input_csv : input_csvs) {
+    getline(input_csv, line);  // skip header
+    while (getline(input_csv, line)) {
+      cells.clear();
+
+      std::stringstream row(line);
+      while (getline(row, cell, ',')) {
+        cells.push_back(cell);
+      }
+
+      op_time = std::stoull(cells[0]);
+      block_id = cells[1];
+      io_size = std::stoull(cells[3]);
+      io_offset = std::stoull(cells[4]);
+      rs_shard_id = cells[10];
+
+      // Reference for doing conversion:
+      // https://github.com/facebook/CacheLib/commit/23a888e54e4fed22f81c114e3ca9af95d7f0787c
+      // Each storage node block is uniquely identified via block_id +
+      // rs_shard_id. It's in turn broken into chunks to be cached. We
+      // represent a chunk of a storage node block as the cache object.
+      std::string real_block_id = block_id + "_" + rs_shard_id;
+      if (keep.count(real_block_id) == 0 &&
+          dont_keep.count(real_block_id) == 0) {
+        double u = dis(gen);
+        if (u > SAMPLE_RATIO) {
+          dont_keep.insert(real_block_id);
+        } else {
+          keep.insert(real_block_id);
+        }
+      }
+
+      if (dont_keep.count(real_block_id) > 0) {
+        continue;
+      }
+
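+      // Illustrative example (values are not from the trace): with CHUNK_SIZE
+      // = 128 KiB, a request at io_offset = 300 KiB with io_size = 200 KiB
+      // covers bytes [300 KiB, 500 KiB), i.e. chunks 2 and 3, so two cache
+      // objects are emitted by the loop below.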
+      size_t start_chunk = io_offset / CHUNK_SIZE;
+      size_t end_chunk = (io_offset + io_size - 1) / CHUNK_SIZE;
+      for (size_t chunk_id = start_chunk; chunk_id <= end_chunk; ++chunk_id) {
+        std::string obj_id = real_block_id + "_" + std::to_string(chunk_id);
+        output_csv << op_time << ',' << obj_id << ',' << CHUNK_SIZE << '\n';
+      }
+    }
+  }
+}
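As a usage sketch (the file names, timestamp, and IDs below are illustrative, not taken from the traces), the converter is compiled and run as in its header comment. For each request on a sampled block it emits one row per 128 KiB chunk touched, in the three-column libCacheSim csv format, with `obj_id` composed as `<block_id>_<rs_shard_id>_<chunk_id>` and `obj_size` always equal to the chunk size:

```
g++ conv_meta_block.cpp -o conv_meta_block
./conv_meta_block block_traces_1_conved.csv block_traces_1.csv

# example contents of block_traces_1_conved.csv (values illustrative)
timestamp,obj_id,obj_size
1633036800,123456_7_2,131072
1633036800,123456_7_3,131072
```

Because blocks are sampled at SAMPLE_RATIO = 0.1, roughly 10% of blocks are kept, and every request to a kept block appears in the converted trace.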