README.md (6 changes: 5 additions & 1 deletion)

@@ -54,6 +54,9 @@ All datasets are compressed with [zstd](https://github.com/facebook/zstd). You c
> [!NOTE]
> libCacheSim can directly work with compressed data, so no decompression is needed if you use libCacheSim to run simulations.

#### Trace Conversion

The `conversion_scripts` folder contains the scripts we use to convert the original traces from their sources into formats suitable for the [libCacheSim][libCacheSim] platform, in a manner that respects the original semantics of the source traces (to the best of our knowledge).

# Key-value Cache Traces

@@ -315,7 +318,7 @@ The original release can be found [here][src-meta].

- `op_time`: The time of the request
- `block_id`: The requested block ID
- `block_id_size`: The size of the requested block (in MB), this field is always 40
- `block_id_size`: The size of the block_id field; this field is always 40
- `io_size`: The requested size; note that requests often do not ask for the full block
- `io_offset`: The start offset in the block
- `user_name`: Anonymized username (represents different use cases)
@@ -325,6 +328,7 @@ The original release can be found [here][src-meta].
- `host_name`: Anonymized host name that serves the request
- `rs_shard_id`: Reed-Solomon shard ID

A storage node block is uniquely identified by its block_id + rs_shard_id. The block is in turn broken into chunks to be cached. The script `conversion_scripts/conv_meta_block.cpp` performs this breakdown and creates a new trace in which each chunk is treated as a cache object, as sketched below.
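For illustration, here is a minimal standalone sketch of the chunking scheme (the input values and the `main` wrapper are hypothetical; the actual conversion logic lives in `conversion_scripts/conv_meta_block.cpp`):

```cpp
#include <cstddef>
#include <iostream>
#include <string>

int main() {
  constexpr std::size_t kChunkSize = 128 * 1024;  // 128 KiB chunks

  // Hypothetical request: 300,000 bytes starting at offset 200,000
  // within the block identified by block_id "b123" and rs_shard_id "7".
  std::string block_id = "b123", rs_shard_id = "7";
  std::size_t io_offset = 200000, io_size = 300000;

  // The request touches every chunk its byte range overlaps.
  std::size_t start_chunk = io_offset / kChunkSize;
  std::size_t end_chunk = (io_offset + io_size - 1) / kChunkSize;
  for (std::size_t c = start_chunk; c <= end_chunk; ++c) {
    // Each chunk becomes one cache object: b123_7_1, b123_7_2, b123_7_3
    std::cout << block_id + "_" + rs_shard_id + "_" + std::to_string(c) << '\n';
  }
}
```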

### Download Links
* **Plain text**: [S3][metaStorage-s3-txt]
conversion_scripts/conv_meta_block.cpp (104 changes: 104 additions & 0 deletions)

@@ -0,0 +1,104 @@
// Script to convert Meta's block storage traces to libCacheSim csv format
//
// Usage:
// g++ conv_meta_block.cpp -o conv_meta_block
// ./conv_meta_block block_traces_1_conved.csv block_traces_1.csv

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>

constexpr size_t CHUNK_SIZE = 128 * 1024; // 128 KiB
constexpr double SAMPLE_RATIO = 0.1; // sample SAMPLE_RATIO fraction of blocks
constexpr size_t SAMPLE_SEED = 42;

int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cout << "Usage: " << argv[0]
              << " <output_csv> <input_csv1> [<input_csv2> ...]\n";
Comment on lines +20 to +21

[medium] It's standard practice to print usage information to std::cerr rather than std::cout. This allows users to redirect standard output (e.g., to a file) without capturing usage or error text.

Suggested change:
-    std::cout << "Usage: " << argv[0]
-              << " <output_csv> <input_csv1> [<input_csv2> ...]\n";
+    std::cerr << "Usage: " << argv[0]
+              << " <output_csv> <input_csv1> [<input_csv2> ...]\n";
    return EXIT_FAILURE;
  }

  // open files
  std::ofstream output_csv{argv[1]};
  if (!output_csv) {
    std::cout << "Error: Could not open file " << argv[1] << std::endl;

[medium] Error messages should be written to std::cerr instead of std::cout to separate diagnostic output from the program's primary output.

Suggested change:
-    std::cout << "Error: Could not open file " << argv[1] << std::endl;
+    std::cerr << "Error: Could not open file " << argv[1] << std::endl;

    return EXIT_FAILURE;
  }
  std::vector<std::ifstream> input_csvs{};
  for (size_t i = 2; i < argc; ++i) {
    input_csvs.emplace_back(argv[i]);
    if (!input_csvs.back()) {
      std::cout << "Error: Could not open file " << argv[i] << std::endl;
    }
  }
Comment on lines +32 to +37

[medium] The current logic prints an error but continues if an input file fails to open. This could lead to an incomplete conversion without a clear failure signal. It's more robust to exit immediately upon failing to open a required file. Also, error messages should be directed to std::cerr.

Suggested change:
   for (size_t i = 2; i < argc; ++i) {
     input_csvs.emplace_back(argv[i]);
     if (!input_csvs.back()) {
-      std::cout << "Error: Could not open file " << argv[i] << std::endl;
+      std::cerr << "Error: Could not open file " << argv[i] << std::endl;
+      return EXIT_FAILURE;
     }
   }


  // vars for sampling: remember the keep/drop decision for each block so that
  // all requests to the same block are consistently kept or dropped
  std::unordered_set<std::string> keep{};
  std::unordered_set<std::string> dont_keep{};
  std::mt19937 gen{SAMPLE_SEED};
  std::uniform_real_distribution<> dis(0.0, 1.0);

  // header of output csv, following libCacheSim csv format
  output_csv << "timestamp,obj_id,obj_size\n";

  // fields of input csvs we are interested in, following Meta's block storage
  // traces format (the trailing number is the column index in the trace)
  size_t op_time;           // 0
  std::string block_id;     // 1
  size_t io_size;           // 3
  size_t io_offset;         // 4
  std::string rs_shard_id;  // 10
Comment on lines +50 to +54

[medium] Using magic numbers for column indices, as indicated by the comments, makes the code harder to read and maintain. If the column order of the input trace changes, you'd have to find all occurrences of these numbers. It would be better to define these as named constants.

For example:

constexpr size_t kColOpTime = 0;
constexpr size_t kColBlockId = 1;
// ...etc.

// Then use them in the loop:
op_time = std::stoull(cells[kColOpTime]);

  // read input_csvs, write to output_csv
  std::string line;
  std::string cell;
  std::vector<std::string> cells{};
  for (auto& input_csv : input_csvs) {
    getline(input_csv, line);  // skip header
    while (getline(input_csv, line)) {
      cells.clear();

      std::stringstream row(line);
      while (getline(row, cell, ',')) {
        cells.push_back(cell);
      }

      op_time = std::stoull(cells[0]);
      block_id = cells[1];
      io_size = std::stoull(cells[3]);
      io_offset = std::stoull(cells[4]);
      rs_shard_id = cells[10];
Comment on lines +70 to +74

[critical] The CSV parsing logic here is not robust and could cause the program to crash if it encounters malformed data.

1. Out-of-bounds access: The code accesses cells[10] without checking if the line has at least 11 columns. This will lead to undefined behavior on shorter lines.
2. Unhandled exceptions: std::stoull will throw an exception if a string is not a valid number, which will terminate the program.

You should add validation to handle these cases gracefully, for instance by wrapping the parsing in a try-catch block and checking cells.size():

if (cells.size() < 11) {
  std::cerr << "Skipping malformed line (not enough columns): " << line << std::endl;
  continue;
}
try {
  op_time = std::stoull(cells[0]);
  block_id = cells[1];
  io_size = std::stoull(cells[3]);
  io_offset = std::stoull(cells[4]);
  rs_shard_id = cells[10];
} catch (const std::exception& e) {
  std::cerr << "Skipping line due to conversion error: " << e.what() << " in line: " << line << std::endl;
  continue;
}


      // Reference for doing conversion:
      // https://github.com/facebook/CacheLib/commit/23a888e54e4fed22f81c114e3ca9af95d7f0787c
      // Each storage node block is uniquely identified via block_id +
      // rs_shard_id. It's in turn broken into chunks to be cached. We
      // represent a chunk of a storage node block as the cache object.
      std::string real_block_id = block_id + "_" + rs_shard_id;
      if (keep.count(real_block_id) == 0 &&
          dont_keep.count(real_block_id) == 0) {
        double u = dis(gen);
        if (u > SAMPLE_RATIO) {
          dont_keep.insert(real_block_id);
        } else {
          keep.insert(real_block_id);
        }
      }

      if (dont_keep.count(real_block_id) > 0) {
        continue;
      }

      size_t start_chunk = io_offset / CHUNK_SIZE;
      size_t end_chunk = (io_offset + io_size - 1) / CHUNK_SIZE;
      for (size_t chunk_id = start_chunk; chunk_id <= end_chunk; ++chunk_id) {
        std::string obj_id = real_block_id + "_" + std::to_string(chunk_id);
        output_csv << op_time << ',' << obj_id << ',' << CHUNK_SIZE << '\n';
      }
    }
  }
}
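
As a quick sanity check, here is a worked example (the input values are hypothetical, and it assumes the block falls into the 10% sample): a trace row with op_time 1234, block_id b123, io_size 300000, io_offset 200000, and rs_shard_id 7 covers bytes 200000 through 499999, which overlap 128 KiB chunks 1 through 3, so the converter emits three rows to the output csv:

1234,b123_7_1,131072
1234,b123_7_2,131072
1234,b123_7_3,131072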