In [1]:
import os
import random
import requests
import numpy as np
import pandas as pd
import multiprocessing as mp

from tqdm import tqdm_notebook as tqdm

# Seeds only the standard-library `random` module; NumPy's random generators
# are not affected by this call.
random.seed(42)

In [2]:
# Inputs: which trace shard to fetch and where its on-disk replay data lives.

url_template = "https://storage.googleapis.com/thesios-io-traces/%s/%s/%s"
cluster_name = "cluster1_16TB"
date = "20240115"
data_file = "data-00000-of-00100"

# When True, the data directory below is created and populated further down.
initialize_data_dir = True
data_dir = "_".join(("data", cluster_name, date, data_file))

In [3]:
def download_to_file(url, filename, timeout=60, chunk_size=1024 * 1024):
    """Download ``url`` and store the response body at ``filename``.

    The body is streamed to ``<filename>.part`` and only renamed to
    ``filename`` after the whole transfer succeeded. A failed or interrupted
    download therefore never leaves a file at ``filename`` that a later
    "does the file already exist?" check would mistake for a valid copy.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Destination path for the downloaded bytes.
        timeout: Seconds to wait for the server before giving up.
        chunk_size: Number of bytes read from the response per iteration.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    partial_path = f"{filename}.part"
    try:
        # stream=True avoids holding the complete response body in memory.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Refuse to store an HTTP error page as if it were the payload.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        # Atomic rename: ``filename`` only ever holds a complete download.
        os.replace(partial_path, filename)
    except BaseException:
        # Also covers KeyboardInterrupt (interrupting the notebook cell).
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

def format_bytes(bytes: int) -> str:
    """Render a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until its magnitude drops below 1024, using
    the unit labels B, KB, MB, GB, TB and finally PB.

    Args:
        bytes: Number of bytes. Negative values are scaled by their
            magnitude and keep their sign (e.g. ``-2048`` -> ``"-2.00 KB"``).

    Returns:
        The formatted string, e.g. ``"1.50 KB"``.
    """
    # The parameter name shadows the builtin but is kept so keyword callers
    # keep working; use a local alias inside the function.
    value = bytes
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        # Compare the magnitude so negative inputs are scaled as well.
        if abs(value) < 1024:
            return f"{value:.2f} {unit}"
        value /= 1024
    return f"{value:.2f} PB"

In [4]:


# Resolve the remote URL and the local cache filename for the selected shard.
download_url = url_template % (cluster_name, date, data_file)
local_file = "_".join((cluster_name, date, data_file)) + ".csv"

# Fetch the trace only when no local copy is present yet.
if not os.path.exists(local_file):
    print("Downloading %s to %s" % (download_url, local_file))
    download_to_file(download_url, local_file)

In [5]:
# Load the downloaded trace CSV into a DataFrame and preview its first rows.
trace = pd.read_csv(local_file)
trace.head()

Unnamed: 0,filename,file_offset,application,c_time,io_zone,redundancy_type,op_type,service_class,from_flash_cache,cache_hit,request_io_size_bytes,disk_io_size_bytes,response_io_size_bytes,start_time,disk_time,simulated_disk_start_time,simulated_latency
0,d206e7d484caba36fa936dffa618118382e36e0a616a41...,2453358,bigtable,1705305576,WARM,REPLICATED,WRITE,THROUGHPUT_ORIENTED,0,-1,3390,3390,0,1705306000.0,0.0,0.0,0.000125
1,0c44a2ae48ee5d8b0c299ee04f3071e53b7d95b61cc13c...,7354368,4aed9945dd146966ceb9894bafd139b178af74e984c59c...,1705167112,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,0,1050624,1052672,1050624,1705306000.0,0.004239,1705306000.0,0.004246
2,d206e7d484caba36fa936dffa618118382e36e0a616a41...,2456748,bigtable,1705305576,WARM,REPLICATED,WRITE,THROUGHPUT_ORIENTED,0,-1,12294,12294,0,1705306000.0,0.0,0.0,9.1e-05
3,95f38d49a2da49d63a4bd459b79e3d68c24e1e13878dd1...,417618,51d511367f2bdd00717fc55ff6db252e785fb629018a8b...,1705305525,WARM,REPLICATED,WRITE,THROUGHPUT_ORIENTED,0,-1,0,0,0,1705306000.0,0.0,0.0,8.9e-05
4,95f38d49a2da49d63a4bd459b79e3d68c24e1e13878dd1...,417618,51d511367f2bdd00717fc55ff6db252e785fb629018a8b...,1705305525,WARM,REPLICATED,WRITE,THROUGHPUT_ORIENTED,0,-1,904237,904237,0,1705306000.0,0.0,0.0,0.000371


In [6]:
# How many rows (one per line of trace data) does the loaded trace contain?
print("Number of lines in trace: %d" % len(trace))

Number of lines in trace: 168590


In [7]:
# Count the distinct filenames referenced by the trace and list them.
num_unique_filenames = trace["filename"].nunique()
print(f"Number of unique filenames: {num_unique_filenames}")
print("\nUnique filenames:")
print(trace["filename"].unique())
# Safety guard: stop the notebook when the trace references more than 15000
# distinct files.
# NOTE(review): presumably this bounds how many files get created on disk
# later in the notebook -- confirm the rationale for the limit.
if num_unique_filenames > 15000:
    raise Exception("Too many unique filenames: %d" % num_unique_filenames)

Number of unique filenames: 13565

Unique filenames:
['d206e7d484caba36fa936dffa618118382e36e0a616a4117f4d13d06b7dc7fe6'
 '0c44a2ae48ee5d8b0c299ee04f3071e53b7d95b61cc13c4289c11a56b5250a36'
 '95f38d49a2da49d63a4bd459b79e3d68c24e1e13878dd19aedfe0e2832c17d6e' ...
 '1f2ac552b5b9935abf7ef5df672e5da2e12b551fa63370baf3658f5c1b9876d2'
 '551dc659b6aa5b2b16c6267643236fb6746575f66d467a82fca72b51b1922c06'
 'db132f3a38568dd6e1147bf081b02c1e3a3e7b061fa1a9887ba4f2e24b20a95d']


In [8]:
def process_trace(trace):
    """Compute, per file, the largest byte extents touched by the trace.

    The trace is walked operation by operation in row order. As a
    simplification every file is assumed to pre-exist; only the highest end
    offset (``file_offset + request_io_size_bytes``) seen per file is tracked.

    Args:
        trace: DataFrame with the columns ``filename``, ``file_offset``,
            ``request_io_size_bytes`` and ``op_type``.

    Returns:
        A dict mapping each filename (in order of first appearance) to a dict
        with the keys:

        * ``max_read_offset``: highest end offset of any ``READ``.
        * ``max_write_offset``: highest end offset of any ``WRITE``.
        * ``max_initialized_offset``: highest end offset of any ``READ`` or
          ``WRITE``, i.e. the size the file needs to have for a replay.

        Files that only appear with other operation types get an entry with
        all three values set to 0.
    """
    file_metadata = {}

    # Iterate over just the needed columns; ``DataFrame.iterrows`` would build
    # a full Series object for every row, which is far slower on large traces.
    operations = zip(
        trace["filename"],
        trace["file_offset"],
        trace["request_io_size_bytes"],
        trace["op_type"],
    )

    for filename, offset, io_size, operation in operations:
        metadata = file_metadata.get(filename)
        if metadata is None:
            metadata = file_metadata[filename] = {
                "max_read_offset": 0,
                "max_write_offset": 0,
                "max_initialized_offset": 0,
            }

        if operation == "READ":
            extent_key = "max_read_offset"
        elif operation == "WRITE":
            extent_key = "max_write_offset"
        else:
            # Other operation types do not contribute to any extent.
            continue

        end_offset = offset + io_size
        # Reads and writes both require the file to exist up to end_offset.
        metadata["max_initialized_offset"] = max(metadata["max_initialized_offset"], end_offset)
        metadata[extent_key] = max(metadata[extent_key], end_offset)

    return file_metadata

def print_file_metadata_stats(file_metadata):
    """Print and return the byte totals summed over all files.

    Args:
        file_metadata: Mapping of filename to a dict holding
            ``max_initialized_offset``, ``max_read_offset`` and
            ``max_write_offset``.

    Returns:
        Tuple ``(total_size_initialized, total_size_read, total_size_written)``.
    """
    per_file = list(file_metadata.values())

    def column_total(key):
        # Sum one metadata field across every file entry.
        return sum(entry[key] for entry in per_file)

    total_size_initialized = column_total("max_initialized_offset")
    total_size_read = column_total("max_read_offset")
    total_size_written = column_total("max_write_offset")

    print(f"Total size initialized: {format_bytes(total_size_initialized)}")
    print(f"Total size read: {format_bytes(total_size_read)}")
    print(f"Total size written: {format_bytes(total_size_written)}")
    return total_size_initialized, total_size_read, total_size_written


# Derive the per-file offset extents from the trace, then print the aggregate
# byte totals and keep them for the checks that follow.
file_metadata = process_trace(trace)
total_size_initialized, total_size_read, total_size_written = print_file_metadata_stats(file_metadata)

Total size initialized: 65.21 GB
Total size read: 53.07 GB
Total size written: 18.92 GB


In [9]:
# Refuse to continue when the files to be initialized exceed 100 GiB in total.
GiB = 1 << 30
if 100 * GiB < total_size_initialized:
    raise Exception("Too much data initialized: %s" % format_bytes(total_size_initialized))

In [10]:
# Initialize a new directory and write the files needed to run the trace

def initialize_file(file_path: str, size: int):
    """Create (or truncate) ``file_path`` and fill it with pseudo-random data.

    Args:
        file_path: Path of the file to write.
        size: Number of bytes to write. A value <= 0 yields an empty file.

    Side effects:
        Writes the file and prints one confirmation line.
    """
    # Use a fresh generator per call, seeded from OS entropy. The global
    # ``np.random`` state is copied into every worker of a fork-based
    # multiprocessing pool, so ``np.random.bytes`` would make different
    # workers emit the same byte stream into different files.
    rng = np.random.default_rng()

    # Write data in chunks to avoid excessive memory usage.
    chunk_size = 16 * 1024 * 1024  # 16 MiB chunks

    with open(file_path, "wb") as f:
        remaining_size = size
        while remaining_size > 0:
            write_size = min(chunk_size, remaining_size)
            f.write(rng.bytes(write_size))
            remaining_size -= write_size

    print(f"Initialized file: {file_path} with size: {format_bytes(size)}")

def initialize_file_single_arg(args):
    """Adapter for pool ``map``-style APIs, which pass one argument per task.

    Args:
        args: Iterable of positional arguments forwarded to ``initialize_file``.

    Returns:
        Whatever ``initialize_file`` returns.
    """
    outcome = initialize_file(*args)
    return outcome

def create_trace_data_dir(data_dir, file_metadata, resume=False):
    """Create ``data_dir`` and populate it with one data file per trace file.

    Each file is filled with pseudo-random bytes up to its
    ``max_initialized_offset``; the files are written in parallel by a pool
    with one worker process per CPU.

    Args:
        data_dir: Directory that receives the files.
        file_metadata: Mapping of filename to a dict that contains
            ``max_initialized_offset`` (the size to write for that file).
        resume: Controls what happens when ``data_dir`` already exists.
            ``False`` (default) leaves the directory untouched and returns.
            ``True`` writes only the files that are missing or whose size on
            disk differs from the expected size. The directory is created
            before any file is written, so an interrupted run leaves a
            partial directory that this option can complete.
    """
    print("Initializing data directory: %s" % data_dir)
    if os.path.exists(data_dir):
        if not resume:
            print("Data directory already exists, not doing anything.")
            return
        print("Data directory already exists, initializing missing or incomplete files.")
    else:
        os.makedirs(data_dir)

    # Build the work list up front; with resume=True skip files that are
    # already present with the expected size.
    tasks = []
    for filename, metadata in file_metadata.items():
        file_path = os.path.join(data_dir, filename)
        size = metadata["max_initialized_offset"]
        if resume and os.path.isfile(file_path) and os.path.getsize(file_path) == size:
            continue
        tasks.append((file_path, size))

    total_files = len(tasks)
    print(f"Initializing {total_files} files...")

    if tasks:
        # Process files in parallel. The results are discarded, so completion
        # order is irrelevant; imap_unordered keeps the progress bar moving
        # instead of waiting on the slowest file at the head of the queue.
        with mp.Pool(mp.cpu_count()) as pool:
            for _ in tqdm(pool.imap_unordered(initialize_file_single_arg, tasks), total=total_files):
                pass

    print("File initialization complete.")



# Only materialize the data files when enabled through the
# `initialize_data_dir` flag.
if initialize_data_dir:
    create_trace_data_dir(data_dir, file_metadata)

Initializing data directory: data_cluster1_16TB_20240115_data-00000-of-00100
Initializing 13565 files...
Initialized file: data_cluster1_16TB_20240115_data-00000-of-00100/3c62afce01d8a10a352c92a396a37dadfb8d4f5f03c1c7049769ad46abe41b96 with size: 1.00 MBInitialized file: data_cluster1_16TB_20240115_data-00000-of-00100/80008845eaf1b477c0597ecf3ea452e12e629ed08c03c1bc1098234f912902e1 with size: 1.25 MB

Initialized file: data_cluster1_16TB_20240115_data-00000-of-00100/4ffe32bf2b6ed9e5970687ce370ace496e352b099e191fd997f1922e0844738d with size: 1.65 MB


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _ in tqdm(pool.imap(initialize_file_single_arg, tasks), total=total_files):


  0%|          | 0/13565 [00:00<?, ?it/s]

Initialized file: data_cluster1_16TB_20240115_data-00000-of-00100/a315c36cf16f5d7ea3454f217f6c0e02103a5f2dbe97892801f9cfa76dbcef7f with size: 7.01 MBInitialized file: data_cluster1_16TB_20240115_data-00000-of-00100/d206e7d484caba36fa936dffa618118382e36e0a616a4117f4d13d06b7dc7fe6 with size: 5.51 MBInitialized file: data_cluster1_16TB_20240115_data-00000-of-00100/dbc19080ff913d52ed5bd386c87ac2aaddeef34aae9a661f26f788b34703d6f3 with size: 8.00 MB
Initialized file: data_cluster1_16TB_20240115_data-00000-of-00100/f43c17c5d744cc9fbdf644333076cd386548430d7043593a25445d2a9c7dbfd5 with size: 8.02 MBInitialized file: data_cluster1_16TB_20240115_data-00000-of-00100/f076bed1429596e9b4fcda20b80ceb289d28019e8f3a8cc875e59f4df6aa2105 with size: 8.02 MBInitialized file: data_cluster1_16TB_20240115_data-00000-of-00100/0c44a2ae48ee5d8b0c299ee04f3071e53b7d95b61cc13c4289c11a56b5250a36 with size: 8.02 MB
Initialized file: data_cluster1_16TB_20240115_data-00000-of-00100/7291e51345a204e38e840e0eedac87dfa4f0f8

In [11]:
# Select the trace rows whose requested I/O size differs from the disk I/O size.
mismatched_io_sizes = trace.loc[
    trace["request_io_size_bytes"].ne(trace["disk_io_size_bytes"])
]

# Preview the first few mismatching rows.
print("Rows where request_io_size_bytes != disk_io_size_bytes:")
mismatched_io_sizes.head()

Rows where request_io_size_bytes != disk_io_size_bytes:


Unnamed: 0,filename,file_offset,application,c_time,io_zone,redundancy_type,op_type,service_class,from_flash_cache,cache_hit,request_io_size_bytes,disk_io_size_bytes,response_io_size_bytes,start_time,disk_time,simulated_disk_start_time,simulated_latency
1,0c44a2ae48ee5d8b0c299ee04f3071e53b7d95b61cc13c...,7354368,4aed9945dd146966ceb9894bafd139b178af74e984c59c...,1705167112,WARM,REPLICATED,READ,THROUGHPUT_ORIENTED,0,0,1050624,1052672,1050624,1705306000.0,0.004239,1705306000.0,0.004246
7,ccc5dd26c7981bb3805b340632295fcf6abae62d81ce8c...,0,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1705304559,WARM,ERASURE_CODED,READ,THROUGHPUT_ORIENTED,0,0,1050624,1052672,1050624,1705306000.0,0.031058,1705306000.0,0.067846
10,dbc19080ff913d52ed5bd386c87ac2aaddeef34aae9a66...,5484544,bigtable,1705056906,WARM,ERASURE_CODED,READ,THROUGHPUT_ORIENTED,0,0,540672,536576,540672,1705306000.0,0.002441,1705306000.0,0.061153
11,dbc19080ff913d52ed5bd386c87ac2aaddeef34aae9a66...,6021120,bigtable,1705056906,WARM,ERASURE_CODED,READ,THROUGHPUT_ORIENTED,0,0,139264,135168,139264,1705306000.0,0.000961,1705306000.0,0.045035
12,4ffe32bf2b6ed9e5970687ce370ace496e352b099e191f...,1050624,eccab0ec807ba5e9c86ea4d72b7272534653995c86e7d3...,1704572620,WARM,ERASURE_CODED,READ,THROUGHPUT_ORIENTED,0,0,411648,413696,411648,1705306000.0,0.002005,1705306000.0,0.045428
