In [10]:
import json
import os
import shutil
import tarfile
import hashlib
import pickle
import docker

In [11]:
# Benchmarks processed by every per-benchmark loop in this notebook.
benchmarks = ["rnn", "face_detection", "gzip", "cnn", "image_rotate", "json_serdes", "matmul", "pyaes", "chameleon", "lr_training"]
# Name of the base image directory (joined with image_dir to locate the base filesystem).
base_image = "debian_python"

overlay_logs = "/home/XXXX/FaaSSnapper/analysis/benchmark/overlay_logs" # Holds one access log per benchmark; the on-demand file lists are parsed from these.
package_dir = "/home/XXXX/FaaSSnapper/analysis/faasimage/data/packages" # Per-package copies of site-packages files are written here.
faasimage_dir = "/home/XXXX/FaaSSnapper/analysis/faasimage/data/images" # Per-benchmark build contexts (Dockerfile + layer tarballs) are written here.
layer_dir = "/home/XXXX/FaaSSnapper/analysis/faasimage/data/layers" # Unpacked layer trees ("common", "<bench>_func", ...) are written here.
image_dir = "/home/XXXX/FaaSSnapper/analysis/image_dir/images" # Complete, unpacked filesystem of every image (base and benchmarks).
input_dir = "/home/XXXX/FaaSSnapper/analysis/faasimage/data/input" # Per-benchmark JSON listing each package and its on-demand files.

# NOTE(review): the two constants below are not referenced by the code visible
# in this notebook, which spells these paths out inline instead.
image_python_root = "usr/local/lib/python3.10/"
image_python_package_root = "usr/local/lib/python3.10/site-packages"

def get_gear_dir(bench):
    """Return the path of the prefetched image directory for ``bench``."""
    return "/home/XXXX/FaaSSnapper/analysis/image_dir/prefetched/{bench}/image".format(bench=bench)

**Helper Functions**

In [12]:
def get_file_hash(file_path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is hashed in ``chunk_size``-byte pieces so that large files
    (shared libraries, model weights, ...) are never loaded into memory in
    one go. The digest is identical to hashing the whole content at once.

    Args:
        file_path: Path of the file to hash; opened in binary mode.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The MD5 digest as a lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

def get_dir_structure(image_path):
    """Walk ``image_path`` and describe its files and directories.

    Every entry is a string of the form ``"<tag>:::<path relative to image_path>"``.

    Returns:
        A ``(files, dirs)`` tuple of lists where

        * ``files`` holds one entry per file: the tag is ``1`` for a symlink
          and the file's content hash (via ``get_file_hash``) otherwise;
        * ``dirs`` holds, always tagged ``1``, every visited directory that
          directly contains no files, plus every symlink listed among a
          directory's sub-directories.
    """
    def relative(abs_path):
        # Path of ``abs_path`` as seen from the image root.
        return str(os.path.relpath(abs_path, image_path))

    file_entries = []
    dir_entries = []

    for current_root, subdir_names, file_names in os.walk(image_path):
        # Directories holding no files would otherwise leave no trace.
        if not file_names:
            dir_entries.append(f"1:::{relative(current_root)}")

        # Symlinked directories are reported by os.walk as sub-directories
        # but are not descended into, so they are recorded here.
        for subdir_name in subdir_names:
            subdir_path = os.path.join(current_root, subdir_name)
            if os.path.islink(subdir_path):
                dir_entries.append(f"1:::{relative(subdir_path)}")

        for file_name in file_names:
            candidate = os.path.join(current_root, file_name)
            if os.path.islink(candidate):
                file_entries.append(f"1:::{relative(candidate)}")
                continue
            if os.path.exists(candidate):
                digest = get_file_hash(candidate)
                file_entries.append(f"{digest}:::{relative(candidate)}")

    return file_entries, dir_entries

def get_common(base, bench, on_demand):
    """Return the on-demand paths whose entries are identical in both images.

    ``base`` and ``bench`` are iterables of ``"<tag>:::<path>"`` entries. An
    entry counts as shared only when the whole string (tag and path) occurs
    in both. Of the shared entries, the bare paths that are members of
    ``on_demand`` are returned as a list in no particular order.
    """
    shared_entries = set(base) & set(bench)
    # print(set(base).difference(set(bench)))
    shared_paths = (entry.split(':::')[-1].strip() for entry in shared_entries)
    return [path for path in shared_paths if path in on_demand]

def get_diff(set_a, set_b):
    """List the members of ``set_a`` missing from ``set_b``, minus library paths.

    Entries containing ``usr/local/lib/python3.10/`` or
    ``usr/lib/x86_64-linux-gnu/`` are filtered out of the result. The order of
    the returned list is unspecified.
    """
    ignored_fragments = ("usr/local/lib/python3.10/", "usr/lib/x86_64-linux-gnu/")
    return [
        entry
        for entry in set_a.difference(set_b)
        if not any(fragment in entry for fragment in ignored_fragments)
    ]

def create_dir(path):
    """Create ``path`` and any missing parents; do nothing if it already exists."""
    if os.path.exists(path):
        return
    os.makedirs(path)

def remove_dir(path):
    """Recursively delete the directory tree at ``path`` when it exists."""
    if not os.path.exists(path):
        return
    shutil.rmtree(path)

def parse_dirname(path):
    """Return the directory portion of ``path`` (everything before the last component)."""
    parent, _leaf = os.path.split(path)
    return parent


Let us first collect the on-demand files for each image, i.e. the paths recorded in its overlay access log.

In [13]:
# Maps each benchmark name to the set of file paths read from its overlay log.
demand_paths = dict()

def get_on_demand(benchmark, log_dir=None, strip_components=7):
    """Collect the file paths recorded in a benchmark's overlay access log.

    Only lines containing ``PID`` are considered. For each of them the text
    after the marker ``is accessing file`` is taken as the accessed path and
    its first ``strip_components`` ``/``-separated components are dropped
    (presumably the host-side overlay mount prefix -- confirm against the
    log format), leaving a path relative to the image root.

    Args:
        benchmark: Name of the log file inside the log directory.
        log_dir: Directory holding the logs. Defaults to the notebook-level
            ``overlay_logs`` setting when omitted.
        strip_components: How many leading path components to discard. The
            default of 7 reproduces the previously hard-coded behaviour.

    Returns:
        A set of path strings.

    Raises:
        OSError: If the log file cannot be opened.
    """
    if log_dir is None:
        log_dir = overlay_logs
    overlay_file = f"{log_dir}/{benchmark}"
    files = set()
    with open(overlay_file, 'r') as log:
        for line in log:
            if 'PID' not in line:
                continue
            file_path = line.split("is accessing file")[-1].strip()
            files.add('/'.join(file_path.split('/')[strip_components:]))
    return files

# Parse every benchmark's overlay log once and keep the result for later cells.
for bench in benchmarks:
    demand_paths.update({bench: get_on_demand(bench)})

Make a base layer that stores Python, the standard packages, and OS files, and stores pointers (hash placeholders) to the rest of the files.

In [15]:
def create_common_layer(faasimage_base, base_image_dir):
    """Materialise the shared "common" layer under ``{layer_dir}/common``.

    The layer mirrors the directory tree of the base image. Each base-image
    file becomes one of:

    * a symlink with the same target, when the source is a symlink;
    * a full copy (``shutil.copy2``), when its path appears in the ``"files"``
      set of at least one benchmark in ``faasimage_base``;
    * otherwise a small text placeholder that contains only the file's hash.

    Directory permissions and access/modification times are copied from the
    base image in a final pass, after the tree has been fully populated.
    Applying them while files are still being written (as was done before)
    lets every later write overwrite the directory's mtime, and a read-only
    source mode could prevent the directory from being filled at all.

    Args:
        faasimage_base: Mapping ``bench -> {"files": paths, ...}`` for every
            name in the notebook-level ``benchmarks`` list.
        base_image_dir: Root of the unpacked base image filesystem.

    Returns:
        None. The previous content of the common layer directory is deleted.
    """
    common_layer_dir = f"{layer_dir}/common"
    base_files, base_dirs = get_dir_structure(base_image_dir)

    # Paths that must be present with their real content in the layer.
    common_files = set()
    for bench in benchmarks:
        common_files.update(faasimage_base[bench]["files"])

    remove_dir(common_layer_dir)
    create_dir(common_layer_dir)

    # layer directory -> base-image directory whose mode/times it mirrors.
    # Keys are normalised so that "<layer>/." and "<layer>" collapse.
    dir_metadata = {}

    for entry in base_files:
        hash_value = entry.split(':::')[0].strip()
        base_file = entry.split(':::')[-1].strip()
        item_path = os.path.join(base_image_dir, base_file)  # Source path (filesystem image).
        dest_path = os.path.join(common_layer_dir, base_file)  # Destination path (layer).

        dest_dir = parse_dirname(dest_path)
        create_dir(dest_dir)
        dir_metadata[os.path.normpath(dest_dir)] = parse_dirname(item_path)

        if os.path.islink(item_path):
            os.symlink(os.readlink(item_path), dest_path)
        elif base_file in common_files:
            shutil.copy2(item_path, dest_path)
        else:
            # Pointer file: only the content hash is stored.
            with open(dest_path, "w") as f:
                f.write(hash_value)

    for entry in base_dirs:
        base_dir = entry.split(':::')[-1].strip()
        item_path = os.path.join(base_image_dir, base_dir)  # Source path (filesystem image).
        dest_path = os.path.join(common_layer_dir, base_dir)  # Destination path (layer).
        create_dir(parse_dirname(dest_path))

        if os.path.islink(item_path):
            os.symlink(os.readlink(item_path), dest_path, target_is_directory=True)
        else:
            create_dir(dest_path)
            dir_metadata[os.path.normpath(dest_path)] = item_path

    # Longest paths first: a directory is always handled before its ancestors,
    # so a restrictive parent mode cannot block access to its children.
    for dest_dir in sorted(dir_metadata, key=len, reverse=True):
        source_stat = os.stat(dir_metadata[dest_dir])
        os.chmod(dest_dir, source_stat.st_mode)  # Set permissions
        os.utime(dest_dir, (source_stat.st_atime, source_stat.st_mtime))  # Set access/modification times

# Snapshot the base image once; every benchmark image is compared against it.
base_image_dir = f"{image_dir}/{base_image}"
base_files, base_dirs = get_dir_structure(base_image_dir)

# Per benchmark: on-demand files/dirs that are identical in base and benchmark image.
faasimage_base = {}
for bench in benchmarks:
    bench_files, bench_dirs = get_dir_structure(f"{image_dir}/{bench}")
    faasimage_base[bench] = {
        "files": set(get_common(base_files, bench_files, demand_paths[bench])),
        "dirs": set(get_common(base_dirs, bench_dirs, demand_paths[bench])),
    }
    print(bench, len(faasimage_base[bench]["files"]), len(faasimage_base[bench]["dirs"]))

create_common_layer(faasimage_base, base_image_dir)

# for bench1 in benchmarks:
#     for bench2 in benchmarks:
#         print(bench1, bench2, get_diff(faasimage_base[bench1]["files"], faasimage_base[bench2]["files"]))
#         print(bench1, bench2, get_diff(faasimage_base[bench1]["dirs"], faasimage_base[bench2]["dirs"]))


rnn 265 18
face_detection 237 16
gzip 224 15
cnn 409 22
image_rotate 223 15
json_serdes 223 15
matmul 196 16
pyaes 223 14
chameleon 203 15
lr_training 262 19


Get the function files, i.e. (IMAGE - BASE_IMAGE).

In [16]:
def get_func_image(base, bench):
    """Return the set of entries present in ``bench`` but absent from ``base``."""
    bench_only = set(bench) - set(base)
    return bench_only

def print_non_python(func_dirs):
    """Return, in iteration order, the entries that do not contain ``pycache``.

    Despite the name, nothing is printed; the filtered list is returned.
    """
    return [entry for entry in func_dirs if "pycache" not in entry]

# Per benchmark: everything in the benchmark image that the base image lacks.
base_files, base_dirs = get_dir_structure(base_image_dir)
faasimage_func = {}
for bench in benchmarks:
    bench_files, bench_dirs = get_dir_structure(f"{image_dir}/{bench}")
    faasimage_func[bench] = {
        "files": set(get_func_image(base_files, bench_files)),
        "dirs": set(get_func_image(base_dirs, bench_dirs)),
    }
    print(bench, len(faasimage_func[bench]["files"]), len(faasimage_func[bench]["dirs"]))

rnn 19023 18
face_detection 4030 12
gzip 1043 0
cnn 16832 524
image_rotate 3510 11
json_serdes 1043 0
matmul 2449 4
pyaes 1237 0
chameleon 950 0
lr_training 10533 9


Split the image into Python function packages, app dependencies, and pycache + function + function pointers.

In [17]:
def get_directory(file_path, from_top):
    """Return the path component at index ``from_top`` of ``file_path``.

    The path is split on ``os.sep`` and components are counted from 0.

    Args:
        file_path: Path string to split.
        from_top: Index of the wanted component.

    Returns:
        The component as a string, or ``None`` when the path has no component
        at that index. The earlier ``len(parts) >= from_top`` guard raised
        ``IndexError`` for a path with exactly ``from_top`` components.
    """
    parts = file_path.split(os.sep)
    try:
        return parts[from_top]
    except IndexError:
        return None

def check_base_package(file_path):
    """Tell whether ``file_path`` belongs to a package that ships with the base image.

    Returns ``True`` only when the path contains the site-packages root and
    its component at index 5 is one of the listed base entries (pip,
    setuptools, wheel and their metadata, the top-level ``__pycache__``, ...).
    """
    base_entries = ["_distutils_hack", "pip", "pkg_resources", "setuptools", "wheel", "distutils-precedence.pth", "pip-23.0.1.dist-info", "README.txt", "setuptools-65.5.1.dist-info", "wheel-0.43.0.dist-info", "__pycache__"]
    if "usr/local/lib/python3.10/site-packages" not in file_path:
        return False
    return get_directory(file_path, 5) in base_entries

def split_image(faasimage_func, bench, bench_path, demand_files, has_apt=False):
    """Distribute a benchmark's function-only files over its layer directories.

    Routing of every file entry in ``faasimage_func[bench]["files"]``:

    * paths containing ``bench/``, base packages (``check_base_package``) and
      paths under ``usr/local/lib/python3.10/`` outside ``site-packages`` go
      to the ``{bench}_func`` layer;
    * other ``site-packages`` paths are not written here but collected and
      returned for ``parse_package``;
    * everything else goes to the ``{bench}_apt`` layer when ``has_apt`` is
      true, and to the ``{bench}_func`` layer otherwise.

    A written file is a recreated symlink, a real copy (when its path is in
    ``demand_files``) or a placeholder holding only the content hash.
    Directory entries outside ``site-packages`` are created in the apt layer
    (``has_apt``) or in the func layer.

    Args:
        faasimage_func: Mapping ``bench -> {"files": entries, "dirs": entries}``
            with ``"<hash>:::<relative path>"`` entries.
        bench: Benchmark name; selects the entry and names the layers.
        bench_path: Root of the unpacked benchmark image.
        demand_files: Container of relative paths accessed on demand.
        has_apt: Whether a separate apt layer is produced.

    Returns:
        List of relative ``site-packages`` paths left for ``parse_package``.
    """
    bench_files = faasimage_func[bench]["files"]
    bench_dirs = faasimage_func[bench]["dirs"]
    package_files = []
    bench_layers = {
        "func+pycache+funcptrs": f"{layer_dir}/{bench}_func",
    }
    if has_apt:
        # This key was never registered before, so has_apt=True raised KeyError.
        # The directory name matches the "{bench}_apt" layer that
        # generate_image packs when include_apt is set.
        bench_layers["apt_package"] = f"{layer_dir}/{bench}_apt"

    # Start every layer from an empty directory.
    for layer_path in bench_layers.values():
        remove_dir(layer_path)
        create_dir(layer_path)

    for entry in bench_files:
        hash_value = entry.split(':::')[0].strip()
        bench_file = entry.split(':::')[-1].strip()
        item_path = os.path.join(bench_path, bench_file)

        if ("bench/" in bench_file) or (check_base_package(bench_file)) or (("usr/local/lib/python3.10/" in bench_file) and ("site-packages" not in bench_file)):
            dest_path = os.path.join(bench_layers["func+pycache+funcptrs"], bench_file)

        elif "usr/local/lib/python3.10/site-packages" in bench_file:
            # Third-party package content is handled by parse_package.
            package_files.append(bench_file)
            continue
        else:
            if has_apt:
                dest_path = os.path.join(bench_layers["apt_package"], bench_file)
            else:
                dest_path = os.path.join(bench_layers["func+pycache+funcptrs"], bench_file)

        item_dir = parse_dirname(item_path)
        dest_dir = parse_dirname(dest_path)
        create_dir(dest_dir)
        stat = os.stat(item_dir)
        os.chmod(dest_dir, stat.st_mode)  # Set permissions
        os.utime(dest_dir, (stat.st_atime, stat.st_mtime))  # Set access/modification times

        if (os.path.islink(item_path)):
            target_link = os.readlink(item_path)
            os.symlink(target_link, dest_path)
        elif (bench_file in demand_files):
            shutil.copy2(item_path, dest_path)
        else:
            # Pointer file: only the content hash is stored.
            with open(dest_path, "w") as f:
                f.write(hash_value)

    for entry in bench_dirs:
        bench_dir = entry.split(':::')[-1].strip()
        if ("usr/local/lib/python3.10/site-packages" in bench_dir):
            continue
        item_path = os.path.join(bench_path, bench_dir)  # Source path (filesystem image).
        if has_apt:
            dest_path = os.path.join(bench_layers["apt_package"], bench_dir)
        else:
            dest_path = os.path.join(bench_layers["func+pycache+funcptrs"], bench_dir)

        item_dir = parse_dirname(item_path)
        dest_dir = parse_dirname(dest_path)
        create_dir(dest_dir)
        stat = os.stat(item_dir)
        os.chmod(dest_dir, stat.st_mode)  # Set permissions
        os.utime(dest_dir, (stat.st_atime, stat.st_mtime))  # Set access/modification times

        if os.path.islink(item_path):
            target_link = os.readlink(item_path)
            os.symlink(target_link, dest_path, target_is_directory=True)
        else:
            create_dir(dest_path)

    return package_files

def parse_package(bench, bench_path, package_files, demand_files):
    """Move a benchmark's third-party site-packages files into per-package stores.

    Steps, in order:

    1. Scan ``<bench_path>/usr/local/lib/python3.10/site-packages`` to build
       the list of package identifiers (see the loop below).
    2. Recreate an empty ``{package_dir}/<package>`` directory per identifier.
    3. For each path in ``package_files``: ``__pycache__`` content is copied
       below ``{layer_dir}/{bench}_func/package_repo/<package>/``, everything
       else below ``{package_dir}/<package>/``. Non-pycache paths that are in
       ``demand_files`` are also recorded for the JSON output.
    4. In the ``{bench}_func`` layer, replace every touched top-level
       site-packages entry by a symlink to
       ``/package_repo/<package>/<entry>``.
    5. Write the package -> on-demand-files mapping to
       ``{input_dir}/{bench}.json``.

    Args:
        bench: Benchmark name.
        bench_path: Root of the unpacked benchmark image.
        package_files: Relative site-packages paths (as returned by
            ``split_image``).
        demand_files: Container of relative paths accessed on demand.

    Returns:
        None. NOTE(review): when a file matches no package the function
        prints diagnostics and returns early, so steps 4 and 5 are skipped
        and files copied so far are left in place.
    """
    bench_func_layer = f"{layer_dir}/{bench}_func"
    package_list = []  # package identifiers, in os.listdir order
    package_folders = set()  # top-level site-packages entries seen in package_files
    docker_package_repo = dict()  # package identifier -> on-demand files (dumped as JSON)
    package_repo = {}  # top-level site-packages entry -> package identifier
    pycache_repo = {}  # pycache directory -> package identifier; only read by commented-out code below
    for dir in os.listdir(os.path.join(bench_path, "usr/local/lib/python3.10/site-packages")):
        # Entries belonging to the base image are never treated as packages.
        if (dir in ["_distutils_hack", "pip", "pkg_resources", "setuptools", "wheel", "distutils-precedence.pth", "pip-23.0.1.dist-info", "README.txt", "setuptools-65.5.1.dist-info", "wheel-0.43.0.dist-info", "__pycache__"]):
            continue
        # Hand-picked import directories are registered under their own name
        # (presumably because it differs from the distribution name -- TODO confirm).
        if (dir in ["cv2", "tensorflow", "clang", "PIL", "sklearn", "absl", "dateutil", "google", "grpc", "pasta"]):
            docker_package_repo[dir] = []
            package_list.append(dir)
        # "<name>-<version>.dist-info" registers the identifier "<name>-<version>".
        if('dist-info' in dir):
            docker_package_repo[dir.split('.dist-info')[0]] = []
            package_list.append(dir.split('.dist-info')[0])

    # print(package_list)

    for package in package_list:
        remove_dir(f"{package_dir}/{package}")
        create_dir(f"{package_dir}/{package}")
    
    for file in package_files:
        bench_file = file.split(':::')[-1].strip()
        # Component 5 is the entry directly below site-packages.
        package_name = get_directory(bench_file, 5)
        package_root = f"usr/local/lib/python3.10/site-packages/{package_name}"
        package_folders.add(package_root)
        item_path = os.path.join(bench_path, bench_file)
        dest_path = None
        # Case-insensitive substring match of the identifier's part before the
        # first '-' against the entry name. There is no break: the last
        # matching package decides dest_path and package_repo, while an
        # on-demand file is appended to the JSON list of every match.
        for package in package_list:
            if (package.split('-')[0].strip().lower() in package_name.lower()):
                package_repo[package_name] = package
                if "/__pycache__/" in bench_file:
                    pycache_repo[parse_dirname(os.path.relpath(bench_file, "usr/local/lib/python3.10/site-packages"))] = package
                    dest_path = os.path.join(bench_func_layer, "package_repo", package, os.path.relpath(bench_file, "usr/local/lib/python3.10/site-packages"))
                else:
                    if(bench_file in demand_files):
                        docker_package_repo[package].append(os.path.relpath(bench_file, "usr/local/lib/python3.10/site-packages"))
                    dest_path = os.path.join(f"{package_dir}/{package}", os.path.relpath(bench_file, "usr/local/lib/python3.10/site-packages"))
        
        # No package claimed this file: dump the state and abort the whole run.
        if(dest_path is None):
            print(item_path, package_name, package_list, package_repo)
            return

        item_dir = parse_dirname(item_path)
        dest_dir = parse_dirname(dest_path)
        create_dir(dest_dir)
        stat = os.stat(item_dir)
        os.chmod(dest_dir, stat.st_mode)  # Set permissions
        os.utime(dest_dir, (stat.st_atime, stat.st_mtime))  # Set access/modification times

        # Unlike split_image, every file is copied in full here; demand_files
        # only influences the JSON listing.
        if (os.path.islink(item_path)):
            target_link = os.readlink(item_path)
            os.symlink(target_link, dest_path)
        else:
            shutil.copy2(item_path, dest_path)
    
    # Point each top-level site-packages entry at its copy under /package_repo.
    for folder in package_folders:
        dest_path = os.path.join(bench_func_layer, folder)
        create_dir(parse_dirname(dest_path))
        package_name = get_directory(folder, 5)
        os.symlink(f"/package_repo/{package_repo[package_name]}/{package_name}", dest_path, target_is_directory=True)

    # for file in pycache_repo.keys():
    #     dest_path = f"{package_dir}/{pycache_repo[file]}/{file}"
    #     create_dir(parse_dirname(dest_path))
    #     os.symlink(f"/pycache_repo/{file}", dest_path, target_is_directory=True)

    with open(os.path.join(input_dir, f"{bench}.json"), "w") as outfile:
        # print(docker_package_repo)
        json.dump(docker_package_repo, outfile, indent=4)
    

# Start from empty output directories so stale packages / JSON inputs never leak in.
for output_dir in (package_dir, input_dir):
    remove_dir(output_dir)
    create_dir(output_dir)

for bench in benchmarks:
    bench_root = f"{image_dir}/{bench}"
    package_files = split_image(faasimage_func, bench, bench_root, demand_paths[bench], has_apt=False)
    parse_package(bench, bench_root, package_files, demand_paths[bench])
    print(bench)


rnn
face_detection
gzip
cnn
image_rotate
json_serdes
matmul
pyaes
chameleon
lr_training


Create Image Folder for faasimage files

In [4]:
def compress_layer(layer_path, layer_tar_path):
    """Pack the tree at ``layer_path`` into a gzip-compressed tarball.

    The archive is written to ``layer_tar_path``. An empty ``arcname`` makes
    the members relative to the layer root rather than prefixed with
    ``layer_path``.
    """
    with tarfile.open(name=layer_tar_path, mode="w:gz") as archive:
        archive.add(layer_path, arcname='')

def generate_image(bench, include_package, image_data, include_apt):
    """Build and push the ``localhost:5000/{bench}_faas`` image from its layers.

    A build context is assembled in ``{faasimage_dir}/{bench}``: every
    selected layer directory under ``layer_dir`` is packed into
    ``<layer>.tar.gz`` and a ``FROM scratch`` Dockerfile is generated that
    sets the environment, ADDs the tarballs in order and declares the
    entrypoint. The shell script at the end removes any existing local image,
    builds, pushes, and removes the local image again.

    Args:
        bench: Benchmark name.
        include_package: Append the ``packages`` layer when true.
        image_data: Mapping ``bench -> {"env": ["KEY=VALUE", ...],
            "entrypoint": [arg, ...]}``.
        include_apt: Insert the ``{bench}_apt`` layer between ``common`` and
            ``{bench}_func`` when true.

    Raises:
        ValueError: If an ``env`` entry contains no ``=``.
    """
    bench_image_path = os.path.join(faasimage_dir, bench)
    create_dir(bench_image_path)

    # Layer order is the order of the ADD instructions in the Dockerfile.
    layers = ["common"]
    if include_apt:
        layers.append(f"{bench}_apt")
    layers.append(f"{bench}_func")
    if include_package:
        layers.append("packages")

    dockerfile_data = "FROM scratch\n"

    for env_var in image_data[bench]["env"]:
        # Split on the first '=' only: values may themselves contain '='
        # (the former split('=')[1] silently truncated them).
        name, separator, value = env_var.partition('=')
        if not separator:
            raise ValueError(f"Malformed environment entry {env_var!r} for '{bench}': expected KEY=VALUE")
        dockerfile_data += f"ENV {name} {value} \n"
    if(bench == "face_detection"):
        dockerfile_data += "ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/lib/:/package_repo/opencv_python-4.9.0.80/opencv_python.libs \n"
    elif(bench == "lr_training"):
        dockerfile_data += "ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/lib/:/package_repo/scikit_learn-1.4.2/scikit_learn.libs \n"

    else:
        dockerfile_data += f"ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/lib/ \n"

    for layer_name in layers:
        compress_layer(f"{layer_dir}/{layer_name}", f"{bench_image_path}/{layer_name}.tar.gz")
        dockerfile_data += f"ADD {layer_name}.tar.gz / \n"

    # The exec form of ENTRYPOINT is a JSON array; json.dumps escapes quotes
    # and backslashes that plain string concatenation would leave broken.
    # Compact separators keep the output identical for simple arguments.
    entrypoint_json = json.dumps(list(image_data[bench]['entrypoint']), separators=(',', ':'))
    dockerfile_data += f"ENTRYPOINT {entrypoint_json} \n"

    with open(os.path.join(bench_image_path, "Dockerfile"), 'w') as file:
        file.write(dockerfile_data)

    # print(dockerfile_data)

    docker_script = f""" cd {bench_image_path}
    docker rmi localhost:5000/{bench}_faas
    docker build -t localhost:5000/{bench}_faas .
    docker push localhost:5000/{bench}_faas
    docker rmi localhost:5000/{bench}_faas
    """
    os.system(docker_script)

# Directory holding cached, pickled notebook state; the "image_data" object is loaded from here.
cache_dir = "/home/XXXX/FaaSSnapper/analysis/image_dir/cache"

def load_object_from_file(file_path):
    """Unpickle and return the object stored at ``file_path``.

    A confirmation line is printed after a successful load.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files produced by a trusted run of this project.
    """
    with open(file_path, "rb") as handle:
        restored = pickle.load(handle)
    print(f"Object loaded from '{file_path}'.")
    return restored

# Env/entrypoint metadata of every original image, cached by an earlier step.
image_data_path = os.path.join(cache_dir, "image_data")
image_data = load_object_from_file(image_data_path)
# Full benchmark list, kept for reference:
# ['rnn', 'face_detection', 'gzip', 'cnn', 'image_rotate', 'json_serdes', 'matmul', 'pyaes', 'chameleon', 'lr_training']
for bench in ("rnn",):
    generate_image(bench, include_package=False, image_data=image_data, include_apt=False)


Object loaded from '/home/XXXX/FaaSSnapper/analysis/image_dir/cache/image_data'.
Untagged: localhost:5000/rnn_faas:latest
Untagged: localhost:5000/rnn_faas@sha256:61576ef1a10a420a123011fc6bdd74edbdc71dfe5ac96406ede59de17cd1e5ab
Deleted: sha256:873bdb80f29b5add3e376d828cab25b801c21b2284aa670c8e3d29030e0e533b
Deleted: sha256:d591a16737ed57f5f1b5c3bf83e4477023ffd94bed7a07c000ac6a399d4191b7
Deleted: sha256:91daec0a785085b0f7af2545b966247f330dbd0ef0384359de7eba6598c07a0f


#2 [internal] load build definition from Dockerfile
#2 sha256:6f13112a5db13fcb7eb46e2df6118579d614be1d6964fb86efbda1337b25215d
#2 transferring dockerfile:
#2 ...

#1 [internal] load .dockerignore
#1 sha256:4cae10160ead90366bee46ea882562c1ed3be8a179addc347e4d6882ef79b092
#1 transferring context: 2B done
#1 DONE 0.7s

#2 [internal] load build definition from Dockerfile
#2 sha256:6f13112a5db13fcb7eb46e2df6118579d614be1d6964fb86efbda1337b25215d
#2 transferring dockerfile: 761B done
#2 DONE 0.8s

#3 [internal] load build context
#3 sha256:38467c3fdc74844c7c17dec5719f267c12eb3ca642b216d1a1c4aedfe4f387d1
#3 transferring context: 31.40MB 0.2s done
#3 DONE 0.5s

#4 [1/2] ADD common.tar.gz /
#4 sha256:1dac2db13cc20fcc3f0831a63bdc293e71c4891646453567f73b0977f1e332e4
#4 DONE 2.1s

#5 [2/2] ADD rnn_func.tar.gz /
#5 sha256:0c9ed1764f24d50c91d2025126841a072f9a406d0c4501c0d9e9a77689249934
#5 DONE 9.2s

#6 exporting to image
#6 sha256:e8c613e07b0b7ff33893b694f7759a10d42e180f2b4dc349fb57dc6b71dcab00
#6 

Using default tag: latest
The push refers to repository [localhost:5000/rnn_faas]
2d4582e3586c: Preparing
91daec0a7850: Preparing
91daec0a7850: Layer already exists
2d4582e3586c: Pushed
latest: digest: sha256:8b5df42ba4654c8af50a3243cf439e99d51c2a1a2af0969c5508f0c59a335250 size: 740
Untagged: localhost:5000/rnn_faas:latest
Untagged: localhost:5000/rnn_faas@sha256:8b5df42ba4654c8af50a3243cf439e99d51c2a1a2af0969c5508f0c59a335250
Deleted: sha256:3420b098073d06203360b6b24507ad6168bd9ec1c5f7b74c4771968216943e56
