In [None]:
!pip install diffusers transformers ray memory_profiler line_profiler accelerate zerocopy

In [None]:
%load_ext memory_profiler
%load_ext line_profiler

In [18]:
import copy
import gc
import numpy as np
import pandas as pd
import pickle
import ray
import time
import torch
import torch.nn as nn
import zerocopy

from diffusers import StableDiffusionPipeline
from functools import wraps
from typing import Dict, List, Tuple

In [19]:
def flush():
    """Collect garbage and release cached CUDA memory.

    Runs Python's garbage collector first so that unreachable objects
    (including tensors) are dropped, then asks PyTorch to empty its CUDA
    caching allocator and to collect CUDA IPC memory.

    The CUDA part is skipped when no CUDA device is available, so the helper
    is safe to call on CPU-only machines.
    """
    gc.collect()

    if not torch.cuda.is_available():
        # Nothing to release; the CUDA calls below may fail without a usable GPU.
        return

    # Same call sequence as before (cache release + IPC collection, twice).
    # No torch.no_grad() wrapper is needed: none of these calls records
    # autograd operations.
    for _ in range(2):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

In [20]:
def extract_tensors(m: torch.nn.Module) -> Tuple[torch.nn.Module, List[Dict]]:
    """Split a module into a weight-free skeleton and its weights as NumPy arrays.

    Args:
        m: The module to process. It is not modified.

    Returns:
        A tuple ``(m_copy, tensors)`` where ``m_copy`` is a deep copy of ``m``
        in eval mode with every parameter and buffer set to ``None``, and
        ``tensors`` holds one ``{"params": {...}, "buffers": {...}}`` dict per
        entry of ``m.named_modules()`` (same order), mapping each directly
        owned parameter/buffer name to a NumPy copy of its data.
    """

    def _as_numpy(tensor: torch.Tensor):
        # clone() so the array owns its memory; cpu() so tensors living on an
        # accelerator can be converted too (it is a no-op for CPU tensors).
        return torch.clone(tensor).detach().cpu().numpy()

    tensors = []
    for _, module in m.named_modules():
        # recurse=False: only tensors owned directly by this submodule, so
        # nothing is stored twice across parent and child modules.
        tensors.append(
            {
                "params": {
                    name: _as_numpy(param)
                    for name, param in module.named_parameters(recurse=False)
                },
                "buffers": {
                    name: _as_numpy(buf)
                    for name, buf in module.named_buffers(recurse=False)
                },
            }
        )

    # NOTE: the deep copy duplicates all weights before they are dropped below.
    m_copy = copy.deepcopy(m)
    for _, module in m_copy.named_modules():
        # Collect the names first; the attributes are overwritten in the loop.
        owned_names = [name for name, _ in module.named_parameters(recurse=False)]
        owned_names += [name for name, _ in module.named_buffers(recurse=False)]
        for name in owned_names:
            setattr(module, name, None)

    m_copy.train(False)
    return m_copy, tensors


def replace_tensors(m: torch.nn.Module, tensors: List[Dict]):
    """Re-attach parameters and buffers to a module, in place.

    Args:
        m: Module whose submodules receive the tensors.
        tensors: One ``{"params": {...}, "buffers": {...}}`` dict per submodule,
            paired positionally with the traversal order of ``m``'s submodules.
            Values are array-likes accepted by ``torch.as_tensor``.
    """
    submodules = list(m.modules())
    for submodule, saved in zip(submodules, tensors):
        for param_name, param_array in saved["params"].items():
            restored = torch.nn.Parameter(torch.as_tensor(param_array))
            submodule.register_parameter(param_name, restored)
        for buffer_name, buffer_array in saved["buffers"].items():
            submodule.register_buffer(buffer_name, torch.as_tensor(buffer_array))

### Download files

In [21]:
# Hugging Face model identifier and the local files the serialized pipeline is written to.
model_id = "runwayml/stable-diffusion-v1-5"
# Derive the file names from model_id ("/" is not valid inside a file name) so they cannot drift apart.
_file_stem = model_id.replace("/", "_")
pickle_id = f"{_file_stem}.pkl"
torch_id = f"{_file_stem}.pt"

In [None]:
# Load the fp16 variant of the pipeline, then move it to the GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe = pipe.to("cuda")

In [33]:
# Display the pipeline configuration: which library/class backs each component.
pipe.config

FrozenDict([('vae', ('diffusers', 'AutoencoderKL')),
            ('text_encoder', ('transformers', 'CLIPTextModel')),
            ('tokenizer', ('transformers', 'CLIPTokenizer')),
            ('unet', ('diffusers', 'UNet2DConditionModel')),
            ('scheduler', ('diffusers', 'PNDMScheduler')),
            ('safety_checker',
             ('stable_diffusion', 'StableDiffusionSafetyChecker')),
            ('feature_extractor', ('transformers', 'CLIPImageProcessor')),
            ('requires_safety_checker', True),
            ('_name_or_path', 'runwayml/stable-diffusion-v1-5')])

In [34]:
# Print, for each pipeline component, whether it is a torch nn.Module.
for component_name in (
    "unet",
    "vae",
    "text_encoder",
    "tokenizer",
    "scheduler",
    "safety_checker",
    "feature_extractor",
):
    print(isinstance(getattr(pipe, component_name), nn.Module))

True
True
True
False
False
True
False


In [36]:
%%time
# Serialize the whole pipeline object with pickle; load_model_2 reads this file back.
with open(pickle_id, "wb") as file:
    pickle.dump(pipe, file)

CPU times: user 2.12 s, sys: 3.36 s, total: 5.49 s
Wall time: 10.8 s


In [37]:
%%time
# Serialize the whole pipeline object with torch.save; load_model_3/4/5 read this file back.
torch.save(pipe, torch_id)

CPU times: user 2.7 s, sys: 2.61 s, total: 5.31 s
Wall time: 21.5 s


In [38]:
# Drop the notebook-level reference to the pipeline so its memory can be reclaimed before benchmarking.
del pipe

In [46]:
# Run garbage collection and release cached CUDA memory before the benchmarks start.
flush()

In [47]:
# Maps benchmark name -> elapsed seconds; filled by the `timed` decorator when called with save_result=True.
benchmark_dict = {}

In [48]:
def timed(name: str = None):
    """Build a decorator that measures the wall-clock time of each call.

    The decorated function gains two keyword-only arguments that are consumed
    by the wrapper and are NOT forwarded to the wrapped function:

    * ``log`` (default ``False``): print the elapsed time after the call.
    * ``save_result`` (default ``False``): store the elapsed time in the
      module-level ``benchmark_dict`` under ``name`` (only if ``name`` is not
      ``None``).

    Args:
        name: Key used in ``benchmark_dict``; ``None`` disables saving.

    Returns:
        A decorator. The wrapped function's return value is passed through
        unchanged. If the wrapped function raises, nothing is logged or saved.
    """

    def outer_wrapper(f):
        @wraps(f)
        def wrapper(*args, log=False, save_result=False, **kwargs):
            # perf_counter is monotonic and high-resolution, unlike time.time(),
            # which can jump when the system clock is adjusted.
            start = time.perf_counter()
            ret = f(*args, **kwargs)
            elapsed = time.perf_counter() - start

            if name is not None and save_result:
                # Item assignment only mutates the dict, so no `global` is needed.
                benchmark_dict[name] = elapsed
            if log:
                print(
                    f"[BENCHMARK] Execution of {f.__name__} took {elapsed:.3f} seconds"
                )
            return ret

        return wrapper

    return outer_wrapper

In [49]:
@timed()
def sum_example(l: list):
    """Return the sum of every element multiplied by its index (toy workload for `timed`)."""
    return sum(position * value for position, value in enumerate(l))

In [50]:
%%time
# Demonstrate the `timed` decorator on lists of 10**0 .. 10**7 elements.
for i in range(8):
    size = 10**i
    l = list(range(size))
    # log=True is consumed by the `timed` wrapper and prints the elapsed time.
    sum_example(l, log=True)

[BENCHMARK] Execution of sum_example took 0.000 seconds
[BENCHMARK] Execution of sum_example took 0.000 seconds
[BENCHMARK] Execution of sum_example took 0.000 seconds
[BENCHMARK] Execution of sum_example took 0.000 seconds
[BENCHMARK] Execution of sum_example took 0.002 seconds
[BENCHMARK] Execution of sum_example took 0.011 seconds
[BENCHMARK] Execution of sum_example took 0.279 seconds
[BENCHMARK] Execution of sum_example took 1.102 seconds
CPU times: user 1.49 s, sys: 211 ms, total: 1.7 s
Wall time: 1.7 s


# Benchmarks

In [60]:
@timed(name=None)
def do_stable_diffusion_inference(pipe):
    """Run a fixed 10-step text-to-image generation and return the first image.

    Args:
        pipe: A callable pipeline that accepts a prompt and
            ``num_inference_steps`` and returns an object with ``.images``.
    """
    output = pipe("a photo of an astronaut riding a horse", num_inference_steps=10)
    return output.images[0]


@timed(name="load_model_using_diffusers_pipeline")
def load_model_1(do_inference=True):
    """Benchmark: load the pipeline with ``from_pretrained`` and move it to CUDA.

    Args:
        do_inference: When true, also run one inference and return its image.

    Returns:
        The generated image, or ``None`` when ``do_inference`` is false.
    """
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id, torch_dtype=torch.float16, variant="fp16"
    ).to("cuda")

    try:
        if do_inference:
            return do_stable_diffusion_inference(pipe, log=True)
        return None
    finally:
        # Always drop the pipeline and release memory, even if inference
        # raised, so later benchmarks start from a clean state.
        del pipe
        flush()


@timed(name="load_model_using_pickle")
def load_model_2(do_inference=True):
    """Benchmark: unpickle the pipeline from ``pickle_id`` and move it to CUDA.

    Args:
        do_inference: When true, also run one inference and return its image.

    Returns:
        The generated image, or ``None`` when ``do_inference`` is false.
    """
    # SECURITY: pickle.load can execute arbitrary code. This is acceptable only
    # because the file is produced by this notebook; never point it at an
    # untrusted file.
    with open(pickle_id, "rb") as file:
        pipe: StableDiffusionPipeline = pickle.load(file).to("cuda")

    try:
        if do_inference:
            return do_stable_diffusion_inference(pipe, log=True)
        return None
    finally:
        # Always drop the pipeline and release memory, even if inference
        # raised, so later benchmarks start from a clean state.
        del pipe
        flush()


@timed(name="load_model_using_torch_load_map_cpu")
def load_model_3(do_inference=True):
    """Benchmark: ``torch.load`` the pipeline from ``torch_id`` onto the CPU.

    The pipeline is not moved to CUDA afterwards.
    NOTE(review): with ``do_inference=True`` inference therefore runs on the
    CPU with whatever dtype was saved — confirm this is supported by the
    installed PyTorch build.

    Args:
        do_inference: When true, also run one inference and return its image.

    Returns:
        The generated image, or ``None`` when ``do_inference`` is false.
    """
    # The file holds a whole pickled pipeline object, not just tensors, so the
    # restricted unpickler must be disabled explicitly (newer PyTorch releases
    # default to weights_only=True). Only load files written by this notebook.
    pipe = torch.load(
        torch_id, map_location=torch.device("cpu"), weights_only=False
    )

    try:
        if do_inference:
            return do_stable_diffusion_inference(pipe, log=True)
        return None
    finally:
        # Always drop the pipeline and release memory, even if inference raised.
        del pipe
        flush()


@timed(name="load_model_using_torch_load_map_cuda")
def load_model_4(do_inference=True):
    """Benchmark: ``torch.load`` the pipeline from ``torch_id`` directly onto CUDA.

    Args:
        do_inference: When true, also run one inference and return its image.

    Returns:
        The generated image, or ``None`` when ``do_inference`` is false.
    """
    # The file holds a whole pickled pipeline object, not just tensors, so the
    # restricted unpickler must be disabled explicitly (newer PyTorch releases
    # default to weights_only=True). Only load files written by this notebook.
    pipe = torch.load(
        torch_id, map_location=torch.device("cuda"), weights_only=False
    )

    try:
        if do_inference:
            return do_stable_diffusion_inference(pipe, log=True)
        return None
    finally:
        # Always drop the pipeline and release memory, even if inference raised.
        del pipe
        flush()


@timed(name="load_model_using_torch_load_map_cpu_move_to_cuda")
def load_model_5(do_inference=True):
    """Benchmark: ``torch.load`` the pipeline onto the CPU, then move it to CUDA.

    Args:
        do_inference: When true, also run one inference and return its image.

    Returns:
        The generated image, or ``None`` when ``do_inference`` is false.
    """
    # The file holds a whole pickled pipeline object, not just tensors, so the
    # restricted unpickler must be disabled explicitly (newer PyTorch releases
    # default to weights_only=True). Only load files written by this notebook.
    pipe = torch.load(
        torch_id, map_location=torch.device("cpu"), weights_only=False
    )
    pipe.to("cuda")

    try:
        if do_inference:
            return do_stable_diffusion_inference(pipe, log=True)
        return None
    finally:
        # Always drop the pipeline and release memory, even if inference raised.
        del pipe
        flush()


@timed(name=None)
def load_model_6(do_inference=True, log=True, save_result=True):
    """Benchmark rebuilding the pipeline from pre-extracted weights, in-process.

    Setup (loading from the hub and extracting tensors) happens outside the
    measured section; only the inner ``load_model_6_impl`` is recorded under
    ``"load_model_using_extract_and_replace_tensors"``.

    NOTE: the ``timed`` wrapper around this function declares its own ``log``
    and ``save_result`` keywords and does not forward them, so keyword values
    passed by callers never reach the parameters below; inside this function
    they keep their defaults (``True``).

    Args:
        do_inference: When true, also run one inference and return its image.
        log: Passed to the inner timed call (print its elapsed time).
        save_result: Passed to the inner timed call (store its elapsed time).

    Returns:
        The generated image, or ``None`` when ``do_inference`` is false.
    """
    module_components = ("unet", "vae", "text_encoder", "safety_checker")

    pipe = StableDiffusionPipeline.from_pretrained(
        model_id, torch_dtype=torch.float16, variant="fp16"
    )
    # Components are extracted one by one: extract_tensors(pipe) does not work
    # because StableDiffusionPipeline does not expose common named_modules for
    # all internal models.
    extracted = {
        component: extract_tensors(getattr(pipe, component))
        for component in module_components
    }
    tokenizer = pipe.tokenizer
    scheduler = pipe.scheduler
    feature_extractor = pipe.feature_extractor

    del pipe
    flush()

    @timed(name="load_model_using_extract_and_replace_tensors")
    def load_model_6_impl():
        # Re-attach the saved weights to each skeleton module, in the same
        # order they were extracted.
        for component in module_components:
            replace_tensors(*extracted[component])

        pipe = StableDiffusionPipeline(
            unet=extracted["unet"][0],
            vae=extracted["vae"][0],
            text_encoder=extracted["text_encoder"][0],
            tokenizer=tokenizer,
            scheduler=scheduler,
            safety_checker=extracted["safety_checker"][0],
            feature_extractor=feature_extractor,
        ).to("cuda")

        try:
            if do_inference:
                return do_stable_diffusion_inference(pipe, log=True)
            return None
        finally:
            # Release the pipeline even if inference raised.
            del pipe
            flush()

    try:
        return load_model_6_impl(log=log, save_result=save_result)
    finally:
        # Drop every remaining reference to the model pieces, then reclaim
        # memory, regardless of whether the measured section succeeded.
        del extracted, tokenizer, scheduler, feature_extractor
        flush()


@timed(name=None)
def load_model_7(do_inference=True, log=True, save_result=True):
    """Benchmark rebuilding the pipeline from weights kept in the Ray object store.

    Setup (starting Ray, loading from the hub, extracting tensors and
    ``ray.put``-ing them) happens outside the measured section; only the inner
    ``load_model_7_impl`` is recorded under
    ``"load_model_using_extract_and_replace_tensors_with_ray"``.

    NOTE: the ``timed`` wrapper around this function declares its own ``log``
    and ``save_result`` keywords and does not forward them, so keyword values
    passed by callers never reach the parameters below; inside this function
    they keep their defaults (``True``).

    Args:
        do_inference: When true, also run one inference and return its image.
        log: Passed to the inner timed call (print its elapsed time).
        save_result: Passed to the inner timed call (store its elapsed time).

    Returns:
        The generated image, or ``None`` when ``do_inference`` is false.
    """
    module_components = ("unet", "vae", "text_encoder", "safety_checker")

    ray.init()
    try:
        pipe = StableDiffusionPipeline.from_pretrained(
            model_id, torch_dtype=torch.float16, variant="fp16"
        )
        # One object-store reference per component, each holding the
        # (skeleton module, tensors) pair returned by extract_tensors.
        refs = {
            component: ray.put(extract_tensors(getattr(pipe, component)))
            for component in module_components
        }
        tokenizer = pipe.tokenizer
        scheduler = pipe.scheduler
        feature_extractor = pipe.feature_extractor

        del pipe
        flush()

        @timed(name="load_model_using_extract_and_replace_tensors_with_ray")
        def load_model_7_impl():
            # Fetch everything first, then re-attach the weights, keeping the
            # original component order.
            extracted = {
                component: ray.get(refs[component])
                for component in module_components
            }
            for component in module_components:
                replace_tensors(*extracted[component])

            pipe = StableDiffusionPipeline(
                unet=extracted["unet"][0],
                vae=extracted["vae"][0],
                text_encoder=extracted["text_encoder"][0],
                tokenizer=tokenizer,
                scheduler=scheduler,
                safety_checker=extracted["safety_checker"][0],
                feature_extractor=feature_extractor,
            ).to("cuda")

            try:
                if do_inference:
                    return do_stable_diffusion_inference(pipe, log=True)
                return None
            finally:
                # Release the pipeline and the fetched modules even if
                # inference raised.
                del pipe, extracted
                flush()

        try:
            return load_model_7_impl(log=log, save_result=save_result)
        finally:
            del refs, tokenizer, scheduler, feature_extractor
            flush()
    finally:
        # Always stop the local Ray instance; otherwise a failure above would
        # leave it running and the next ray.init() call would fail.
        ray.shutdown()

In [52]:
%%time
%memit load_model_1(do_inference=False, log=True, save_result=True)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


[BENCHMARK] Execution of load_model_1 took 3.993 seconds
peak memory: 4436.16 MiB, increment: 1497.08 MiB
CPU times: user 3.06 s, sys: 636 ms, total: 3.69 s
Wall time: 4.54 s


In [53]:
# %%time
# %memit load_model_1(do_inference=True, log=True)

In [54]:
%%time
%memit load_model_2(do_inference=False, log=True, save_result=True)

[BENCHMARK] Execution of load_model_2 took 14.676 seconds
peak memory: 4262.87 MiB, increment: 1314.58 MiB
CPU times: user 2.44 s, sys: 3.38 s, total: 5.82 s
Wall time: 15.3 s


In [55]:
# %%time
# %memit load_model_2(do_inference=True, log=True)

In [56]:
%%time
%memit load_model_3(do_inference=False, log=True, save_result=True)

[BENCHMARK] Execution of load_model_3 took 13.048 seconds
peak memory: 4161.11 MiB, increment: 1209.73 MiB
CPU times: user 1.16 s, sys: 2.81 s, total: 3.97 s
Wall time: 13.6 s


In [57]:
%%time
%memit load_model_4(do_inference=False, log=True, save_result=True)

[BENCHMARK] Execution of load_model_4 took 11.066 seconds
peak memory: 3009.07 MiB, increment: 56.48 MiB
CPU times: user 2.12 s, sys: 2.23 s, total: 4.35 s
Wall time: 11.7 s


In [58]:
%%time
%memit load_model_5(do_inference=False, log=True, save_result=True)

[BENCHMARK] Execution of load_model_5 took 11.644 seconds
peak memory: 4067.66 MiB, increment: 1122.70 MiB
CPU times: user 1.88 s, sys: 2.99 s, total: 4.86 s
Wall time: 12.2 s


In [64]:
%%time
%memit load_model_6(do_inference=False, log=True, save_result=True)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


[BENCHMARK] Execution of load_model_6_impl took 1.243 seconds
[BENCHMARK] Execution of load_model_6 took 16.299 seconds
peak memory: 7570.57 MiB, increment: 3870.75 MiB
CPU times: user 5.38 s, sys: 4.3 s, total: 9.68 s
Wall time: 17 s


In [62]:
%%time
%memit load_model_7(do_inference=False, log=True, save_result=True)

2023-10-31 20:31:05,498	INFO worker.py:1642 -- Started a local Ray instance.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


[BENCHMARK] Execution of load_model_7_impl took 1.527 seconds
[BENCHMARK] Execution of load_model_7 took 23.011 seconds
peak memory: 7459.48 MiB, increment: 3030.56 MiB
CPU times: user 5.46 s, sys: 6.5 s, total: 12 s
Wall time: 23.9 s


In [68]:
# Show full benchmark names instead of truncating long strings.
pd.set_option("display.max_colwidth", None)

# One row per recorded benchmark, displayed fastest first.
df = pd.DataFrame(
    {
        "name": list(benchmark_dict.keys()),
        "time": list(benchmark_dict.values()),
    }
)
df.sort_values(by=["time"])

Unnamed: 0,name,time
5,load_model_using_extract_and_replace_tensors,1.243478
6,load_model_using_extract_and_replace_tensors_with_ray,1.52697
0,load_model_using_diffusers_pipeline,3.992747
3,load_model_using_torch_load_map_cuda,11.066344
4,load_model_using_torch_load_map_cpu_move_to_cuda,11.644324
2,load_model_using_torch_load_map_cpu,13.047926
1,load_model_using_pickle,14.675687
