In [2]:
import os
import math
import string
import random
import requests
import numpy as np
import pandas as pd
from subprocess import run
import multiprocessing as mp

from typing import List, Dict, Any, Tuple
from tqdm import tqdm_notebook as tqdm

# Seed Python's `random` module so the notebook's random.choice / random.shuffle
# calls in this process start from a fixed state.
random.seed(42)

In [8]:
# Inputs

# Previous Inputs:
# - cluster_17, 0, 12M rows

# Trace file (zstd compressed) and the maximum number of rows to load from it
cluster_name = "cluster34"
# part = 0
trace_file = f"/mydata/twitter_traces/{cluster_name}"
compressed_trace_file = f"{trace_file}.sort.zst"
num_rows_to_keep = 100_000_000

# Generated workload files live next to the trace
init_workload_file = f"{trace_file}_init.txt"
bench_workload_file = f"{trace_file}_bench.txt"

# Trace analysis
analyze_trace = True

data_dir = f"data_twitter_{cluster_name}"


In [9]:
# Load the compressed trace into a pandas DataFrame.
# Each line is comma separated, in this order:
#   timestamp, key, key size, value size, client id, operation, ttl
columns = [
    "timestamp",
    "key",
    "key_size",
    "value_size",
    "client_id",
    "op_type",
    "ttl",
]
trace = pd.read_csv(
    compressed_trace_file,
    names=columns,
    index_col=False,
    compression="zstd",
    nrows=num_rows_to_keep,  # read at most this many rows
)
print("Trace rows: ", len(trace))

Trace rows:  100000000


In [5]:
# Binary size units, in bytes
KiB = 1 << 10
MiB = 1 << 20
GiB = 1 << 30


def format_number(num: int) -> str:
    """Return `num` rendered with comma thousands separators."""
    return f"{num:,}"


def format_bytes(num_bytes: int) -> str:
    """Format a byte count in human-readable form.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then printed with two decimals and the matching unit label (B, KB, MB,
    GB, TB, falling through to PB).

    Args:
        num_bytes: Number of bytes. May be negative (e.g. a size difference);
            the sign is kept and the magnitude is scaled.

    Returns:
        A string such as "1.50 KB".
    """
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        # Compare on magnitude: a plain `num_bytes < 1024` is true for every
        # negative value, which would leave negative sizes unscaled in bytes.
        if abs(num_bytes) < 1024:
            return f"{num_bytes:.2f} {unit}"
        num_bytes /= 1024
    return f"{num_bytes:.2f} PB"


In [5]:
# Make sure the trace is ordered by timestamp, sorting it if necessary.
is_ordered = trace["timestamp"].is_monotonic_increasing
print("Timestamps are ordered:", is_ordered)

if not is_ordered:
    # Sort the trace by timestamp
    print("Sorting trace by timestamp...")
    # mergesort is stable: rows that share a timestamp keep their original
    # relative order. The default quicksort does not guarantee that.
    trace = trace.sort_values("timestamp", kind="mergesort")
    # Verify sorting worked (fails e.g. when timestamps contain NaN)
    is_ordered = trace["timestamp"].is_monotonic_increasing
    assert is_ordered, "trace is still not ordered by timestamp after sorting"
    # Save the sorted rows to the uncompressed trace path. Note this is
    # trace_file, not the compressed file the trace was read from.
    print("Saving sorted trace...")
    trace.to_csv(trace_file, index=False, header=False)
    print("Sorted trace saved")

# Keep only first num_rows_to_keep rows
if len(trace) > num_rows_to_keep:
    print(f"Keeping first {num_rows_to_keep:,} rows out of {len(trace):,} total rows")
    trace = trace.head(num_rows_to_keep)
else:
    print(f"Trace has {len(trace):,} rows, no truncation needed")

Timestamps are ordered: True
Keeping first 12,000,000 rows out of 125,111,121 total rows


In [111]:
# Frequency table of value sizes, most common size first
value_size_counts = trace["value_size"].value_counts()
print("Value size counts:", value_size_counts, sep="\n")

# Same table for key sizes
key_size_counts = trace["key_size"].value_counts()
print("Key size counts:", key_size_counts, sep="\n")

Value size counts:
1         4544025
109       2906983
0          656976
102        585666
39         422552
           ...   
12866           1
25454           1
11674           1
152946          1
21969           1
Name: value_size, Length: 27848, dtype: int64
Key size counts:
42    1284899
14    1147953
15     963136
43     949503
26     900665
33     828715
24     811886
23     766890
13     561270
34     556306
32     467932
41     446652
27     441010
35     436206
36     397251
25     395657
44     208764
31     110063
22      88919
29      59064
20      36094
19      32683
28      30634
12      28763
16      23648
21      13162
18       8561
30       1441
11       1308
9         791
17        113
10         55
8           6
Name: key_size, dtype: int64


In [6]:
# Mean and median sizes, aggregated per key first so that every distinct key
# contributes once regardless of how many rows reference it.
value_sizes_by_key = trace.groupby("key")["value_size"]
mean_value_size = value_sizes_by_key.mean().mean()
median_value_size = value_sizes_by_key.median().median()

print("Mean value size: ", int(mean_value_size))
print("Median value size: ", int(median_value_size))

# Convert key_size to numeric; entries that cannot be parsed become NaN and
# are skipped by mean()/median(). Only this one column is converted, so the
# full trace is not copied.
numeric_key_size = pd.to_numeric(trace["key_size"], errors="coerce")
key_sizes_by_key = numeric_key_size.groupby(trace["key"])

# Calculate mean and median key sizes
mean_key_size = key_sizes_by_key.mean().mean()
median_key_size = key_sizes_by_key.median().median()

print("Mean key size: ", int(mean_key_size))
print("Median key size: ", int(median_key_size))

# Fixed sizes used for the generated workload: the per-key means, truncated
# to whole bytes.
key_size = int(mean_key_size)
value_size = int(mean_value_size)

print("Key size: ", key_size)
print("Value size: ", value_size)


Mean value size:  512
Median value size:  1
Mean key size:  32
Median key size:  33
Key size:  32
Value size:  512


In [113]:
# Calculate how many keys are needed to fill the target dataset size.
# NOTE(review): the target is 110 GiB although this step was originally
# labelled "~100GB"; presumably the extra 10 GiB is deliberate headroom — confirm.
target_size = 110 * GiB
# Stored as an int: a key count is a whole number (callers that wrap it in
# int() keep working).
num_keys = int(target_size / (mean_key_size + mean_value_size))
# Report the size that was actually used instead of a hard-coded "100GB".
print("Number of keys needed for %d GiB dataset: " % (target_size // GiB), format_number(num_keys))

Number of keys needed for 100GB dataset:  309,588,024


In [None]:
# Working set size: number of distinct keys in the trace times the fixed
# per-entry size (key_size + value_size).
distinct_key_count = trace["key"].nunique()
wss = distinct_key_count * (key_size + value_size)
print("Working set size: ", format_bytes(wss))

In [7]:
# How many lines is the trace?
print("Number of lines in trace: %d" % len(trace))

# What is the proportion of read and write operations?
# Count by summing boolean masks; filtering the DataFrame would materialise a
# copy of every matching row just to take its length.
op_types = trace["op_type"]
read_ops = int((op_types == "get").sum())
write_ops = int((op_types == "set").sum())
print("Read operations: %d" % read_ops)
print("Write operations: %d" % write_ops)

# Only "get" and "set" rows are counted; guard against a trace with neither.
get_set_total = read_ops + write_ops
if get_set_total > 0:
    print("Read proportion: %.2f%%" % (100 * read_ops / get_set_total))
else:
    print("Read proportion: n/a (no get/set operations in trace)")


Number of lines in trace: 50000000
Read operations: 46912736
Write operations: 3087264
Read proportion: 93.83%
Working set size:  3.49 GB


In [115]:
def pad_key(key: str, target_size: int) -> str:
    """Force `key` to a fixed length.

    Keys longer than `target_size` are cut down to their first `target_size`
    characters; shorter keys are left-padded with zeros via str.zfill.
    """
    # Slicing is a no-op for short keys and zfill is a no-op for keys that
    # already have the target length, so one expression covers both cases.
    return key[:target_size].zfill(target_size)

def random_key(key_size: int) -> str:
    """Return a random alphanumeric string of `key_size` characters.

    Characters are drawn one at a time with random.choice from ASCII letters
    and digits.
    """
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(key_size):
        picked.append(random.choice(alphabet))
    return "".join(picked)


def random_keys(num_keys: int, key_size: int) -> List[str]:
    """Return a list of `num_keys` random keys, each `key_size` characters long."""
    generated: List[str] = []
    for _ in range(num_keys):
        generated.append(random_key(key_size))
    return generated


def random_keys_single_arg(args: Tuple[int, int]) -> List[str]:
    """Call random_keys with its arguments packed into one tuple.

    random_keys_parallel passes this function to pool.imap, which hands each
    worker call a single item; the item is a (num_keys, key_size) tuple that
    is unpacked here.
    """
    return random_keys(*args)


def random_keys_parallel(num_keys: int, key_size: int, num_processes: int) -> List[str]:
    """Generate `num_keys` random keys using a pool of worker processes.

    The work is split into chunks of 10,000 keys (plus one smaller final
    chunk if needed); each chunk is produced by random_keys_single_arg in a
    worker and the results are concatenated in chunk order.

    Args:
        num_keys: Number of keys to generate. Zero or a negative number
            yields an empty list.
        key_size: Length of each key in characters.
        num_processes: Size of the multiprocessing pool.

    Returns:
        A flat list with the generated keys.

    NOTE(review): keys are produced in worker processes, so the random.seed()
    call made in the parent presumably does not make them reproducible —
    confirm if determinism matters.
    """
    # Nothing to do. Without this guard a negative num_keys would still
    # generate keys, because Python's % returns a non-negative remainder
    # (e.g. -5 % 10000 == 9995), and num_keys == 0 would start a pool for
    # no work.
    if num_keys <= 0:
        return []

    # Split the number of keys into chunks of 10k keys
    chunk_size = 10000
    num_chunks, remaining_keys = divmod(num_keys, chunk_size)

    # One (num_keys, key_size) tuple per chunk
    args = [(chunk_size, key_size)] * num_chunks

    # Add remaining keys as final chunk if needed
    if remaining_keys > 0:
        args.append((remaining_keys, key_size))

    # Use multiprocessing pool with progress bar
    with mp.Pool(num_processes) as pool:
        keys = list(tqdm(pool.imap(random_keys_single_arg, args), total=len(args), desc="Generating keys"))

    # Flatten the list of lists
    return [key for sublist in keys for key in sublist]

def generate_init_workload(trace: pd.DataFrame, key_size: int, num_keys: int, filename: str) -> None:
    """Write the init workload file: one fixed-size key per line.

    The file contains every distinct key of the trace (padded or truncated
    to `key_size` with pad_key) plus enough random keys to reach `num_keys`
    lines in total, in shuffled order.

    Args:
        trace: Trace DataFrame; only its "key" column is read.
        key_size: Length every written key is forced to.
        num_keys: Total number of keys wanted in the file. If the trace
            already has at least this many distinct keys, no random keys are
            added.
        filename: Path of the file to write (overwritten if it exists).
    """
    # str() matches what generate_bench_workload does before padding, so both
    # files spell a key identically even if pandas parsed it as a non-string.
    # dict.fromkeys removes duplicates that appear when distinct trace keys
    # collapse to the same padded/truncated key, keeping first-seen order.
    padded_keys = (pad_key(str(key), key_size) for key in trace["key"].unique())
    trace_keys = list(dict.fromkeys(padded_keys))

    # Clamp at zero: the trace may already hold more keys than requested.
    num_remaining_keys = max(int(num_keys) - len(trace_keys), 0)
    print("Generating %s remaining keys" % format_number(num_remaining_keys))
    remaining_keys = random_keys_parallel(num_remaining_keys, key_size, mp.cpu_count())

    # Combine trace keys and remaining keys
    all_keys = trace_keys + remaining_keys

    # Permute the keys
    random.shuffle(all_keys)

    # Write keys to file
    print("Writing keys to file")
    with open(filename, "w") as f:
        f.writelines(key + "\n" for key in all_keys)

# Build the init workload file from the loaded trace
generate_init_workload(
    trace=trace,
    key_size=key_size,
    num_keys=num_keys,
    filename=init_workload_file,
)


Generating 306,201,418 remaining keys


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  keys = list(tqdm(pool.imap(random_keys_single_arg, args), total=len(args), desc="Generating keys"))


Generating keys:   0%|          | 0/30621 [00:00<?, ?it/s]

Writing keys to file


'1.20 GB'

In [116]:
# Number of rows per operation type
trace.loc[:, "op_type"].value_counts()

get    11342618
set      657382
Name: op_type, dtype: int64

In [117]:
# Now generate the bench workload file
# Each line is an operation followed by a key (space separated)
# Operations are get, update or insert

def generate_bench_workload(trace: pd.DataFrame, key_size: int, filename: str) -> None:
    """Write the bench workload file: one "<operation> <key>" line per trace row.

    Rows are written in trace order. The trace op type "get" is written as
    "get", "cas" as "update", and every other op type as "insert". Keys are
    converted with str() and forced to `key_size` characters with pad_key.

    Args:
        trace: Trace DataFrame; the "op_type" and "key" columns are read.
        key_size: Length every written key is forced to.
        filename: Path of the file to write (overwritten if it exists).
    """
    print("Generating bench workload")

    # Op types with a dedicated verb; anything else falls back to "insert".
    # NOTE(review): that fallback also covers any other read-like or delete
    # op types the trace may contain — confirm this is intended.
    verbs = {"get": "get", "cas": "update"}

    with open(filename, "w") as f:
        # Walk only the two columns that are needed. iterrows() builds a
        # Series object for every row, which is far slower on large traces.
        ops_and_keys = zip(trace["op_type"], trace["key"])
        for op_type, raw_key in tqdm(ops_and_keys, total=len(trace), desc="Writing operations"):
            key = pad_key(str(raw_key), key_size)
            # Write operation and key
            f.write(verbs.get(op_type, "insert") + " " + key + "\n")

    print("Done generating bench workload")


# Build the bench workload file from the loaded trace
generate_bench_workload(
    trace=trace,
    key_size=key_size,
    filename=bench_workload_file,
)

Generating bench workload


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _, row in tqdm(trace.iterrows(), total=len(trace), desc="Writing operations"):


Writing operations:   0%|          | 0/12000000 [00:00<?, ?it/s]

Done generating bench workload


In [118]:
# Optionally run libCacheSim's traceAnalyzer on the uncompressed trace file.
libcachesim_dir = "/mydata/libCacheSim"
trace_analyzer = os.path.join(libcachesim_dir, "_build/bin/traceAnalyzer")

# Check the flag first so that a missing binary is only reported when the
# analysis was actually requested.
if analyze_trace:
    if not os.path.exists(trace_analyzer):
        print("libCacheSim traceAnalyzer not found. Skipping trace analysis.")
    else:
        # The column options point at timestamp, key and value size
        # (assuming the analyzer counts columns from 1 — TODO confirm).
        cmd = [trace_analyzer, trace_file, "csv", "--common", "-t", "time-col=1, obj-id-col=2, obj-size-col=4, delimiter=,, has-header=false"]
        print("Running trace analysis")
        # check=True raises CalledProcessError if the analyzer exits non-zero
        run(cmd, check=True)

Running trace analysis


dat: cluster34.000
number of requests: 116734382, number of objects: 8697706
number of req GiB: 22.5188, number of obj GiB: 7.3966
compulsory miss ratio (req/byte): 0.0745/0.3285
object size weighted by req/obj: 207/913
frequency mean: 13.4213
time span: 37098(0.4294 day)
write: 0(0), overwrite: 0(0), del:0(0)
request rate min 1576.1800 req/s, max 3952.9600 req/s, window 300s
object rate min 955.8100 obj/s, max 2303.3500 obj/s, window 300s
popularity: Zipf linear fitting slope=1.0368, intercept=-1.0000, R2=-1.0000
X-hit (number of obj accessed X times): 1710366(0.1966), 2233810(0.2568), 1132913(0.1303), 611823(0.0703), 398030(0.0458), 318951(0.0367), 236664(0.0272), 177397(0.0204), 
freq (fraction) of the most popular obj: 4340172(0.0372), 1177272(0.0101), 1166011(0.0100), 972589(0.0083), 936991(0.0080), 918408(0.0079), 770925(0.0066), 709380(0.0061), 

