In [14]:
import os
import random
import requests
import numpy as np
import pandas as pd
import multiprocessing as mp

from typing import List, Dict, Any, Tuple
from tqdm import tqdm_notebook as tqdm

random.seed(42)

In [15]:
# Inputs

cluster_name = "cluster1_16TB"
date = "20240115"
data_file = "data-00052-of-00100"
url_template = "https://storage.googleapis.com/thesios-io-traces/%s/%s/%s"
initialize_data_dir = True
data_dir = "data_%s_%s_%s" % (cluster_name, date, data_file)
output_trace_file = "trace_%s_%s_%s.txt" % (cluster_name, date, data_file)
remove_zero_size_ops = True

In [16]:
# Helper functions
def download_to_file(url, filename):
    response = requests.get(url)
    with open(filename, "wb") as f:
        f.write(response.content)

def format_bytes(bytes: int) -> str:
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if bytes < 1024:
            return f"{bytes:.2f} {unit}"
        bytes /= 1024
    return f"{bytes:.2f} PB"

def process_trace(trace) -> Dict[str, Dict[str, int]]:
    # We want to process the trace, operation by operation.
    # Let's simplify: assume everything pre-exists, and we only need to keep track of the last read/write offset for each file.

    # We will keep track of the last read offset for each file.
    # Initialize dictionaries to store file metadata
    file_metadata = {}

    for _, row in trace.iterrows():
        filename = row["filename"]
        offset = row["file_offset"]
        io_size = row["request_io_size_bytes"]
        operation = row["op_type"]

        if filename not in file_metadata:
            file_metadata[filename] = {
                "max_read_offset": 0,
                "max_write_offset": 0,
                "max_initialized_offset": 0,
            }

        metadata = file_metadata[filename]

        if operation == "READ":
            metadata["max_initialized_offset"] = max(metadata["max_initialized_offset"], offset + io_size)
            metadata["max_read_offset"] = max(metadata["max_read_offset"], offset + io_size)
        elif operation == "WRITE":
            metadata["max_initialized_offset"] = max(metadata["max_initialized_offset"], offset + io_size)
            metadata["max_write_offset"] = max(metadata["max_write_offset"], offset + io_size)

    return file_metadata

def print_file_metadata_stats(file_metadata) -> Tuple[int, int, int]:
    total_size_initialized = sum([metadata["max_initialized_offset"] for metadata in file_metadata.values()])
    total_size_read = sum([metadata["max_read_offset"] for metadata in file_metadata.values()])
    total_size_written = sum([metadata["max_write_offset"] for metadata in file_metadata.values()])

    print(f"Total size initialized: {format_bytes(total_size_initialized)}")
    print(f"Total size read: {format_bytes(total_size_read)}")
    print(f"Total size written: {format_bytes(total_size_written)}")
    return total_size_initialized, total_size_read, total_size_written

# Initialize a new directory and write the files needed to run the trace
def initialize_file(file_path: str, size: int):
    """Initialize a file with pseudo-random data."""
    with open(file_path, "wb") as f:
        # Write data in chunks to avoid excessive memory usage
        chunk_size = 16 * 1024 * 1024  # 16 MB chunks
        remaining_size = size

        while remaining_size > 0:
            write_size = min(chunk_size, remaining_size)
            data = np.random.bytes(write_size)
            f.write(data)
            remaining_size -= write_size
    # print(f"Initialized file: {file_path} with size: {format_bytes(size)}")

def initialize_file_single_arg(args):
    return initialize_file(*args)

def create_trace_data_dir(data_dir: str, file_metadata: Dict[str, Dict[str, int]]):
    print("Initializing data directory: %s" % data_dir)
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    else:
        print("Data directory already exists, not doing anything.")
        return

    # Process files in parallel
    total_files = len(file_metadata)
    print(f"Initializing {total_files} files...")

    with mp.Pool(mp.cpu_count()) as pool:
        tasks = []
        for filename, metadata in file_metadata.items():
            file_path = os.path.join(data_dir, filename)
            size = metadata["max_initialized_offset"]
            tasks.append((file_path, size))

        # Use multiprocessing to parallelize file initialization
        for _ in tqdm(pool.imap(initialize_file_single_arg, tasks), total=total_files):
            pass

    print("File initialization complete.")

In [17]:
local_file = "%s_%s_%s.csv" % (cluster_name, date, data_file)
download_url = url_template % (cluster_name, date, data_file)

if not os.path.exists(local_file):
    print("Downloading %s to %s" % (download_url, local_file))
    download_to_file(download_url, local_file)

In [18]:
trace = pd.read_csv(local_file)
trace.head()

Unnamed: 0,filename,file_offset,application,c_time,io_zone,redundancy_type,op_type,service_class,from_flash_cache,cache_hit,request_io_size_bytes,disk_io_size_bytes,response_io_size_bytes,start_time,disk_time,simulated_disk_start_time,simulated_latency
0,917f44e5c65607c83f7d94334b969ce23ea742e29d66e3...,837216,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705339049,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,1,32832,0,0,1705350000.0,0.0,0.0,4.4e-05
1,655858a2cfd7a28c1c9d77de3b9341ebce4331b4c07412...,4423009,bigtable,1705349515,WARM,REPLICATED,WRITE,LATENCY_SENSITIVE,0,-1,3764,3764,0,1705350000.0,0.0,0.0,5.1e-05
2,0b12b74b37fe27d1f4652b506a5f7112ac33fc93258231...,1050624,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705348783,WARM,ERASURE_CODED,WRITE,THROUGHPUT_ORIENTED,0,-1,0,0,0,1705350000.0,0.0,0.0,0.000113
3,0b12b74b37fe27d1f4652b506a5f7112ac33fc93258231...,1050624,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705348783,WARM,ERASURE_CODED,WRITE,THROUGHPUT_ORIENTED,0,-1,1050624,1050624,0,1705350000.0,0.0,0.0,0.000654
4,5bf831a5fb370e7a26c40c91ea13323e13bfb08a145e0c...,3677184,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705262431,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,0,262656,262144,262656,1705350000.0,0.001486,1705350000.0,0.001493


In [19]:
if remove_zero_size_ops:
    trace = trace[trace["request_io_size_bytes"] > 0].reset_index()

In [20]:
# How many lines is the trace?
print("Number of lines in trace: %d" % len(trace))

# What is the proportion of read and write operations?
read_ops = len(trace[trace["op_type"] == "READ"])
write_ops = len(trace[trace["op_type"] == "WRITE"])
print("Read operations: %d" % read_ops)
print("Write operations: %d" % write_ops)
print("Read proportion: %.2f%%" % (100 * read_ops / (read_ops + write_ops)))

read_bytes = trace[trace["op_type"] == "READ"]["request_io_size_bytes"].sum()
write_bytes = trace[trace["op_type"] == "WRITE"]["request_io_size_bytes"].sum()
total_bytes = read_bytes + write_bytes
print("Read bytes: %s" % format_bytes(read_bytes))
print("Write bytes: %s" % format_bytes(write_bytes))
print("Total bytes: %s" % format_bytes(total_bytes))

Number of lines in trace: 144525
Read operations: 79008
Write operations: 65517
Read proportion: 54.67%
Read bytes: 26.98 GB
Write bytes: 16.59 GB
Total bytes: 43.57 GB


In [21]:
num_unique_filenames = trace["filename"].nunique()
print(f"Number of unique filenames: {num_unique_filenames}")
print("\nUnique filenames:")
print(trace["filename"].unique())
if num_unique_filenames > 15000:
    raise Exception("Too many unique filenames: %d" % num_unique_filenames)

Number of unique filenames: 13526

Unique filenames:
['917f44e5c65607c83f7d94334b969ce23ea742e29d66e307b838583614cc5356'
 '655858a2cfd7a28c1c9d77de3b9341ebce4331b4c0741262c1b68911d49901cc'
 '0b12b74b37fe27d1f4652b506a5f7112ac33fc932582315421ccfa42b3a61e7b' ...
 '40b7dc7cfbbe8166efeb060f9a44bc08e9bdade9e2137308f78d6192a5a29561'
 '3f88740dd32663eafaa972e85ea53d8ceeda2e221bb5976ca1f644419049b1d8'
 'd01863d08b470c43922ebbf80e11217f2ba63579e72ebaeccf201df41cd44e32']


In [22]:
file_metadata = process_trace(trace)
total_size_initialized, total_size_read, total_size_written = print_file_metadata_stats(file_metadata)

Total size initialized: 64.17 GB
Total size read: 53.41 GB
Total size written: 16.70 GB


In [23]:
GiB = 2 ** 30
if total_size_initialized > 100 * GiB:
    raise Exception("Too much data initialized: %s" % format_bytes(total_size_initialized))

In [24]:
if initialize_data_dir:
    create_trace_data_dir(data_dir, file_metadata)

Initializing data directory: data_cluster1_16TB_20240115_data-00052-of-00100
Initializing 13526 files...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _ in tqdm(pool.imap(initialize_file_single_arg, tasks), total=total_files):


  0%|          | 0/13526 [00:00<?, ?it/s]

File initialization complete.


In [25]:
# Format the trace file to be used in the benchmark client
# The expected format is:
# <op_type> <filename>:<offset>:<size>
# Example:
# READ file1:0:4096
# WRITE file2:0:4096

def convert_trace_to_bench_client_format(trace: pd.DataFrame, output_file: str):
    with open(output_file, "w") as f:
        for _, row in trace.iterrows():
            filename = row["filename"]
            offset = row["file_offset"]
            io_size = row["request_io_size_bytes"]
            operation = row["op_type"]

            f.write(f"{operation} {filename}:{offset}:{io_size}\n")

if not os.path.exists(output_trace_file):
    print("Converting trace to benchmark client format...")
    convert_trace_to_bench_client_format(trace, output_trace_file)
    print("Trace file converted to benchmark client format: %s" % output_trace_file)
else:
    print("Trace file already exists: %s" % output_trace_file)
    print("Not overwriting.")

Converting trace to benchmark client format...
Trace file converted to benchmark client format: trace_cluster1_16TB_20240115_data-00052-of-00100.txt


In [26]:
# Find lines where request_io_size_bytes != disk_io_size_bytes
mismatched_io_sizes = trace[trace["request_io_size_bytes"] != trace["disk_io_size_bytes"]]

# Display the first few rows of mismatched IO sizes
print("Rows where request_io_size_bytes != disk_io_size_bytes:")
mismatched_io_sizes.head()

Rows where request_io_size_bytes != disk_io_size_bytes:


Unnamed: 0,index,filename,file_offset,application,c_time,io_zone,redundancy_type,op_type,service_class,from_flash_cache,cache_hit,request_io_size_bytes,disk_io_size_bytes,response_io_size_bytes,start_time,disk_time,simulated_disk_start_time,simulated_latency
0,0,917f44e5c65607c83f7d94334b969ce23ea742e29d66e3...,837216,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705339049,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,1,32832,0,0,1705350000.0,0.0,0.0,4.4e-05
3,4,5bf831a5fb370e7a26c40c91ea13323e13bfb08a145e0c...,3677184,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705262431,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,0,262656,262144,262656,1705350000.0,0.001486,1705350000.0,0.001493
4,5,8faac84afe631d341f8a8fdd41c546bd71abb424fdccf3...,8339328,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705327243,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,1,16416,0,0,1705350000.0,0.0,0.0,0.000165
5,6,8faac84afe631d341f8a8fdd41c546bd71abb424fdccf3...,8339328,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705327243,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,0,65664,73728,65664,1705350000.0,0.005166,1705350000.0,0.005201
8,9,1c6d2ba3324fd75086f09f92b91e00fef488ebe1853dbb...,3145728,bigtable,1705237221,WARM,ERASURE_CODED,READ,THROUGHPUT_ORIENTED,0,1,270336,0,270336,1705350000.0,0.0,0.0,6.8e-05
