In [2]:
import torch
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
x = torch.tensor([1.0], device=device)
y = x * 2
print(y)


tensor([2.], device='cuda:1')


In [3]:
import os
import torch
from PIL import Image
from diffusers import CogVideoXDPMScheduler
from diffusers.utils import export_to_video
from custom_cogvideox_pipe import CustomCogVideoXPipeline
from transformers import CLIPProcessor, CLIPTokenizer, CLIPTextModel, CLIPVisionModel


# Move the pipeline to the appropriate device (GPU or CPU)
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


[2024-11-11 04:20:55,479] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




In [4]:
# Define the paths to your pretrained model and the output directory where your checkpoints are saved
pretrained_model_name_or_path = "THUDM/CogVideoX-5b"  # Replace with your pretrained model path or name
output_dir = "/mnt/carpedkm_data/finetune_result/finetune4000_custom_zero_init_t5_full_custom_with_clip/checkpoint-800"  # Replace with your output directory where the checkpoints are saved

# Prepare the input prompt and reference image
prompt = "Two dogs one with a black and tan coat and another with a black and white coat appear to be playing on a lush green lawn with trees and a building in the background"  # Replace with your desired prompt
reference_image_path = "/root/daneul/projects/refactored/CogVideo/finetune/val_samples/854179_background_boxes.jpg"  # Replace with the path to your reference image
ref_image = Image.open(reference_image_path).convert('RGB')

In [5]:
# Set the LoRA parameters (use the same values as during training)
lora_alpha = 128  # Replace with your value if different
rank = 128        # Replace with your value if different
lora_scaling = lora_alpha / rank

In [6]:
# Load the pipeline
pipe = CustomCogVideoXPipeline.from_pretrained(
    pretrained_model_name_or_path,
    customization=True  # Ensure this is set to True for customization
)
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.41it/s]it/s]
Loading pipeline components...: 100%|██████████| 5/5 [00:04<00:00,  1.13it/s]


In [7]:
import torch.nn as nn
class SkipProjectionLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.projection = nn.Linear(in_features, out_features)

    def forward(self, x):
        return x + self.projection(x)
    
class ProjectionLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.projection = nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.projection(x)

In [8]:

pipe = pipe.to(device)

# Load the LoRA weights directly from the local file
lora_weights_path = os.path.join(output_dir, "pytorch_lora_weights_transformer.safetensors")
if not os.path.exists(lora_weights_path):
    raise FileNotFoundError(f"LoRA weights not found at {lora_weights_path}")

# Load the LoRA state dictionary
from safetensors.torch import load_file
lora_state_dict = load_file(lora_weights_path)

# Load the LoRA weights into the pipeline
pipe.load_lora_weights(
    pretrained_model_name_or_path_or_dict=lora_state_dict,
    adapter_name="cogvideox-lora"
)
pipe.set_adapters(["cogvideox-lora"], [lora_scaling])

# Load additional components (projection layers and reference vision encoder)
# Ensure the paths are correct and the files exist
projection_layers = {
    "T5ProjectionLayer": "T5ProjectionLayer.pth",
    "CLIPTextProjectionLayer": "CLIPTextProjectionLayer.pth",
    "CLIPVisionProjectionLayer": "CLIPVisionProjectionLayer.pth",
    "reference_vision_encoder": "pytorch_clip_vision_model.bin"
}

for layer_name, filename in projection_layers.items():
    layer_path = os.path.join(output_dir, filename)
    if not os.path.exists(layer_path):
        raise FileNotFoundError(f"{layer_name} weights not found at {layer_path}")
pipe.transformer.T5ProjectionLayer = SkipProjectionLayer(4096, 4096)
pipe.transformer.CLIPTextProjectionLayer = ProjectionLayer(512, 4096)
pipe.transformer.CLIPVisionProjectionLayer = ProjectionLayer(768, 4096)
# pipe.transformer.reference_vision_encoder = CLIPVisionEncoder()
# Correctly initialize the CLIPVisionModel using from_pretrained
pipe.transformer.reference_vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch16")

# Move the reference vision encoder to the appropriate device
pipe.transformer.reference_vision_encoder = pipe.transformer.reference_vision_encoder.to(device)

# Load the projection layers and vision encoder
pipe.transformer.T5ProjectionLayer.load_state_dict(
    torch.load(os.path.join(output_dir, "T5ProjectionLayer.pth"), map_location=device)
)
pipe.transformer.CLIPTextProjectionLayer.load_state_dict(
    torch.load(os.path.join(output_dir, "CLIPTextProjectionLayer.pth"), map_location=device)
)
pipe.transformer.CLIPVisionProjectionLayer.load_state_dict(
    torch.load(os.path.join(output_dir, "CLIPVisionProjectionLayer.pth"), map_location=device)
)
pipe.transformer.reference_vision_encoder.load_state_dict(
    torch.load(os.path.join(output_dir, "pytorch_clip_vision_model.bin"), map_location=device)
)

  torch.load(os.path.join(output_dir, "T5ProjectionLayer.pth"), map_location=device)
  torch.load(os.path.join(output_dir, "CLIPTextProjectionLayer.pth"), map_location=device)
  torch.load(os.path.join(output_dir, "CLIPVisionProjectionLayer.pth"), map_location=device)
  torch.load(os.path.join(output_dir, "pytorch_clip_vision_model.bin"), map_location=device)


<All keys matched successfully>

In [9]:
# Move models to device
pipe.transformer.to(device)
pipe.text_encoder.to(device)
pipe.clip_text_encoder.to(device)
pipe.vae.to(device)
pipe.transformer.reference_vision_encoder.to(device)
pipe.transformer.T5ProjectionLayer.to(device)
pipe.transformer.CLIPTextProjectionLayer.to(device)
pipe.transformer.CLIPVisionProjectionLayer.to(device)
# dtype = torch.float16  # or torch.float32, depending on your setup
dtype = torch.float32

pipe.transformer.to(dtype=dtype)
pipe.text_encoder.to(dtype=dtype)
pipe.clip_text_encoder.to(dtype=dtype)
pipe.vae.to(dtype=dtype)
pipe.transformer.reference_vision_encoder.to(dtype=dtype)
pipe.transformer.T5ProjectionLayer.to(dtype=dtype)
pipe.transformer.CLIPTextProjectionLayer.to(dtype=dtype)
pipe.transformer.CLIPVisionProjectionLayer.to(dtype=dtype)

ProjectionLayer(
  (projection): Linear(in_features=768, out_features=4096, bias=True)
)

In [11]:
# Generate the video
# Process the reference image
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
processed_image = clip_processor(images=ref_image, return_tensors="pt")
ref_img_states = processed_image['pixel_values'].to(device)
print("ref_img_states shape:", ref_img_states.shape)
print("ref_img_states device:", ref_img_states.device)
ref_img_states = ref_img_states.to(dtype=dtype)

with torch.no_grad():
    video_output = pipe(
        prompt=prompt,
        ref_img_states=ref_img_states,
        guidance_scale=1.0,
        # guidance_scale=6,          # Adjust guidance scale if needed
        use_dynamic_cfg=True,      # Set to True if you want to use dynamic CFG
        num_frames=49,             # Adjust the number of frames if needed
        height=480,                # Set the desired video height
        width=720,                 # Set the desired video width
        num_inference_steps=50,    # Adjust the number of inference steps if needed
        output_type='np',          # Output as numpy array
        eval=True                  # Set to True for evaluation mode
    )

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Extract the video frames from the output
video_frames = video_output.frames[0]  # Assuming frames is a list of videos

# Save the video to a file
output_video_path = "output_video.mp4"  # Specify the output video file path
export_to_video(video_frames, output_video_path, fps=8)  # Adjust FPS if needed

print(f"Video saved to {output_video_path}")

In [1]:
# inference_script.py

import os
import torch
from PIL import Image
from diffusers import CogVideoXDPMScheduler
from diffusers.utils import export_to_video
from custom_cogvideox_pipe import CustomCogVideoXPipeline, SkipProjectionLayer, ProjectionLayer
from transformers import CLIPProcessor, CLIPTokenizer, CLIPTextModel, CLIPVisionModel

# Device setup
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
dtype = torch.float32  # Use float32 to avoid issues with float16
# Define the paths to your pretrained model and the output directory where your checkpoints are saved
pretrained_model_name_or_path = "THUDM/CogVideoX-5b"  # Replace with your pretrained model path or name
output_dir = "/mnt/carpedkm_data/finetune_result/finetune4000_custom_zero_init_t5_full_custom_with_clip/checkpoint-800"  # Replace with your output directory where the checkpoints are saved

# Prepare the input prompt and reference image
prompt = "Two dogs one with a black and tan coat and another with a black and white coat appear to be playing on a lush green lawn with trees and a building in the background"  # Replace with your desired prompt
negative_prompt = "Low quality, bad image, artifacts" 
reference_image_path = "/root/daneul/projects/refactored/CogVideo/finetune/val_samples/854179_background_boxes.jpg"  # Replace with the path to your reference image
# ref_image = Image.open(reference_image_path).convert('RGB')

# Define the paths to your pretrained model and the output directory where your checkpoints are saved
# pretrained_model_name_or_path = "THUDM/CogVideoX-5b"  # Replace with your pretrained model path or name
# output_dir = "/path/to/output_dir"  # Replace with your output directory where the checkpoints are saved

# Prepare the input prompt and reference image
# prompt = "Your prompt here"  # Replace with your desired prompt
# negative_prompt = ""  # Optional negative prompt; can be customized
# reference_image_path = "/path/to/reference_image.jpg"  # Replace with the path to your reference image
if not os.path.exists(reference_image_path):
    raise FileNotFoundError(f"Reference image not found at {reference_image_path}")
ref_image = Image.open(reference_image_path).convert('RGB')

# Set the LoRA parameters (use the same values as during training)
lora_alpha = 128  # Replace with your value if different
rank = 128        # Replace with your value if different
lora_scaling = lora_alpha / rank

# Load the pipeline
pipe = CustomCogVideoXPipeline.from_pretrained(
    pretrained_model_name_or_path,
    customization=True  # Ensure this is set to True for customization
)
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)

# Load the LoRA weights directly from the local file
lora_weights_path = os.path.join(output_dir, "pytorch_lora_weights_transformer.safetensors")
if not os.path.exists(lora_weights_path):
    raise FileNotFoundError(f"LoRA weights not found at {lora_weights_path}")

# Load the LoRA state dictionary
from safetensors.torch import load_file
lora_state_dict = load_file(lora_weights_path)

# Load the LoRA weights into the pipeline
pipe.load_lora_weights(
    pretrained_model_name_or_path_or_dict=lora_state_dict,
    adapter_name="cogvideox-lora"
)
pipe.set_adapters(["cogvideox-lora"], [lora_scaling])

# Load additional components (projection layers and reference vision encoder)
# Ensure the paths are correct and the files exist
projection_layers = {
    "T5ProjectionLayer": "T5ProjectionLayer.pth",
    "CLIPTextProjectionLayer": "CLIPTextProjectionLayer.pth",
    "CLIPVisionProjectionLayer": "CLIPVisionProjectionLayer.pth",
    "reference_vision_encoder": "pytorch_clip_vision_model.bin"
}

for layer_name, filename in projection_layers.items():
    layer_path = os.path.join(output_dir, filename)
    if not os.path.exists(layer_path):
        raise FileNotFoundError(f"{layer_name} weights not found at {layer_path}")

# Initialize and load projection layers and vision encoder
pipe.transformer.T5ProjectionLayer = SkipProjectionLayer(4096, 4096)
pipe.transformer.CLIPTextProjectionLayer = ProjectionLayer(512, 4096)
pipe.transformer.CLIPVisionProjectionLayer = ProjectionLayer(768, 4096)
pipe.transformer.reference_vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch16")

# Load the projection layers and vision encoder
pipe.transformer.T5ProjectionLayer.load_state_dict(
    torch.load(os.path.join(output_dir, "T5ProjectionLayer.pth"), map_location=device)
)
pipe.transformer.CLIPTextProjectionLayer.load_state_dict(
    torch.load(os.path.join(output_dir, "CLIPTextProjectionLayer.pth"), map_location=device)
)
pipe.transformer.CLIPVisionProjectionLayer.load_state_dict(
    torch.load(os.path.join(output_dir, "CLIPVisionProjectionLayer.pth"), map_location=device)
)
pipe.transformer.reference_vision_encoder.load_state_dict(
    torch.load(os.path.join(output_dir, "pytorch_clip_vision_model.bin"), map_location=device)
)

# Move models to device and set dtype
pipe.transformer.to(device=device, dtype=dtype)
pipe.text_encoder.to(device=device, dtype=dtype)
pipe.clip_text_encoder.to(device=device, dtype=dtype)
pipe.vae.to(device=device, dtype=dtype)
pipe.transformer.reference_vision_encoder.to(device=device, dtype=dtype)
pipe.transformer.T5ProjectionLayer.to(device=device, dtype=dtype)
pipe.transformer.CLIPTextProjectionLayer.to(device=device, dtype=dtype)
pipe.transformer.CLIPVisionProjectionLayer.to(device=device, dtype=dtype)

# Ensure models are in evaluation mode
pipe.transformer.eval()
pipe.text_encoder.eval()
pipe.clip_text_encoder.eval()
pipe.vae.eval()
pipe.transformer.reference_vision_encoder.eval()

# Process the reference image
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
processed_image = clip_processor(images=ref_image, return_tensors="pt")
ref_img_states = processed_image['pixel_values'].to(device=device, dtype=dtype)

# Optional: Print the shape and device of ref_img_states for debugging
print("ref_img_states shape:", ref_img_states.shape)
print("ref_img_states device:", ref_img_states.device)
print("ref_img_states dtype:", ref_img_states.dtype)

# Generate the video
with torch.no_grad():
    video_output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        ref_img_states=ref_img_states,
        guidance_scale=6,          # Adjust guidance scale if needed
        use_dynamic_cfg=True,      # Set to True if you want to use dynamic CFG
        num_frames=49,             # Adjust the number of frames if needed
        height=480,                # Set the desired video height
        width=720,                 # Set the desired video width
        num_inference_steps=50,    # Adjust the number of inference steps if needed
        output_type='numpy',       # Output as numpy array
        eval=True                  # Set to True for evaluation mode
    )

# Extract the video frames from the output
video_frames = video_output.frames[0]  # Assuming frames is a list of videos

# Save the video to a file
output_video_path = "output_video.mp4"  # Specify the output video file path
export_to_video(video_frames, output_video_path, fps=8)  # Adjust FPS if needed

print(f"Video saved to {output_video_path}")


  from .autonotebook import tqdm as notebook_tqdm


[2024-11-11 04:42:51,033] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
Loading pipeline components...: 100%|██████████| 5/5 [00:08<00:00,  1.76s/it]
  torch.load(os.path.join(output_dir, "T5ProjectionLayer.pth"), map_location=device)
  torch.load(os.path.join(output_dir, "CLIPTextProjectionLayer.pth"), map_location=device)
  torch.load(os.path.join(output_dir, "CLIPVisionProjectionLayer.pth"), map_location=device)
  torch.load(os.path.join(output_dir, "pytorch_clip_vision_model.bin"), map_location=device)


ref_img_states shape: torch.Size([1, 3, 224, 224])
ref_img_states device: cuda:1
ref_img_states dtype: torch.float32


  with torch.cuda.amp.autocast(enabled=False):


RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [1]:
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
from transformers import T5Tokenizer, T5EncoderModel, CLIPProcessor, CLIPTokenizer, CLIPTextModel
from custom_cogvideox_pipe import CustomCogVideoXPipeline
from custom_cogvideox import CustomCogVideoXTransformer3DModel
from PIL import Image
import os
from safetensors.torch import load_file
from diffusers.models.attention_processor import LoRAAttnProcessor

def main():
    # Model paths and parameters
    pretrained_model_name_or_path = "THUDM/CogVideoX-5b"
    output_dir = "/mnt/carpedkm_data/finetune_result/finetune4000_custom_zero_init_t5_full_custom_with_clip/checkpoint-800"
    prompt = "Two dogs one with a black and tan coat and another with a black and white coat appear to be playing on a lush green lawn with trees and a building in the background"
    negative_prompt = "Low quality, bad image, artifacts"
    reference_image_path = "/root/daneul/projects/refactored/CogVideo/finetune/val_samples/854179_background_boxes.jpg"

    # Device and dtype setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16

    # Load models and processors
    print("Loading models and processors...")
    tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
    text_encoder = T5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder")
    vae = AutoencoderKLCogVideoX.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
    scheduler = CogVideoXDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
    
    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")
    clip_text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch16")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

    print("Loading transformer...")
    transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="transformer",
        torch_dtype=dtype,
        customization=True,
    )

    print("Creating pipeline...")
    pipe = CustomCogVideoXPipeline(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        transformer=transformer,
        vae=vae,
        scheduler=scheduler,
        clip_tokenizer=clip_tokenizer,
        clip_text_encoder=clip_text_encoder,
        customization=True,
    )

    print("Loading LoRA weights...")
    # Load LoRA weights manually
    lora_path = os.path.join(output_dir, "pytorch_lora_weights_transformer.safetensors")
    if not os.path.exists(lora_path):
        lora_path = os.path.join(output_dir, "pytorch_lora_weights.safetensors")
    
    if os.path.exists(lora_path):
        state_dict = load_file(lora_path)
        pipe.transformer.load_state_dict(state_dict, strict=False)
        print(f"Successfully loaded LoRA weights from {lora_path}")
    else:
        print(f"Warning: Could not find LoRA weights in {output_dir}")
        print("Available files:", os.listdir(output_dir))
        raise FileNotFoundError("LoRA weights not found")

    print("Loading additional components...")
    # Load additional components
    component_files = {
        "T5ProjectionLayer": ["T5ProjectionLayer.pth", "T5ProjectionLayer.safetensors"],
        "CLIPTextProjectionLayer": ["CLIPTextProjectionLayer.pth", "CLIPTextProjectionLayer.safetensors"],
        "CLIPVisionProjectionLayer": ["CLIPVisionProjectionLayer.pth", "CLIPVisionProjectionLayer.safetensors"],
        "reference_vision_encoder": ["reference_vision_encoder.pth", "reference_vision_encoder.safetensors"]
    }

    for component_name, filenames in component_files.items():
        loaded = False
        for filename in filenames:
            filepath = os.path.join(output_dir, filename)
            if os.path.exists(filepath):
                try:
                    if filename.endswith('.pth'):
                        state_dict = torch.load(filepath)
                    else:
                        state_dict = load_file(filepath)
                    getattr(pipe.transformer, component_name).load_state_dict(state_dict)
                    print(f"Successfully loaded {component_name} from {filename}")
                    loaded = True
                    break
                except Exception as e:
                    print(f"Error loading {component_name} from {filename}: {e}")
        
        if not loaded:
            print(f"Warning: Could not load {component_name} from any of the attempted files")

    # Move pipeline to device and set to eval mode
    pipe.to(device)
    pipe.transformer.eval()
    pipe.text_encoder.eval()
    pipe.vae.eval()
    pipe.clip_text_encoder.eval()

    print("Processing reference image...")
    ref_image = Image.open(reference_image_path).convert('RGB')
    processed_image = clip_processor(
        images=ref_image,
        return_tensors="pt"
    ).to(device, dtype=dtype)

    # Generate video
    print("Generating video...")
    generator = torch.Generator(device=device).manual_seed(42)
    
    with torch.no_grad():
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            ref_img_states=processed_image.pixel_values,
            height=480,
            width=720,
            num_frames=49,
            num_inference_steps=50,
            guidance_scale=6.0,
            use_dynamic_cfg=True,
            generator=generator,
            output_type="pil",
        )

    # Save the output video
    output_path = "output_video.mp4"
    from diffusers.utils import export_to_video
    export_to_video(output.frames[0], output_path, fps=8)
    print(f"Video saved to {output_path}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


[2024-11-11 05:09:37,899] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




Loading models and processors...


Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 3816.47it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


Loading transformer...


Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 21290.88it/s]


Creating pipeline...
Loading LoRA weights...
Successfully loaded LoRA weights from /mnt/carpedkm_data/finetune_result/finetune4000_custom_zero_init_t5_full_custom_with_clip/checkpoint-800/pytorch_lora_weights_transformer.safetensors
Loading additional components...


  state_dict = torch.load(filepath)


Error loading T5ProjectionLayer from T5ProjectionLayer.pth: 'NoneType' object has no attribute 'load_state_dict'
Error loading CLIPTextProjectionLayer from CLIPTextProjectionLayer.pth: 'NoneType' object has no attribute 'load_state_dict'
Error loading CLIPVisionProjectionLayer from CLIPVisionProjectionLayer.pth: 'NoneType' object has no attribute 'load_state_dict'
Processing reference image...


TypeError: BatchEncoding.to() got an unexpected keyword argument 'dtype'

In [1]:
import os
import torch
from PIL import Image
from diffusers import CogVideoXDPMScheduler
from diffusers.utils import export_to_video
from custom_cogvideox_pipe import CustomCogVideoXPipeline, SkipProjectionLayer, ProjectionLayer
from transformers import CLIPProcessor, CLIPTokenizer, CLIPTextModel, CLIPVisionModel
import gc
import time

def clear_cuda_cache():
    """Helper function to clear CUDA cache and garbage collect"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    gc.collect()

def main():
    # Reset CUDA
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    
    # Device setup
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    dtype = torch.float32

    # Path setup
    pretrained_model_name_or_path = "THUDM/CogVideoX-5b"
    output_dir = "/mnt/carpedkm_data/finetune_result/finetune4000_custom_zero_init_t5_full_custom_with_clip/checkpoint-800"
    prompt = "Two dogs one with a black and tan coat and another with a black and white coat appear to be playing on a lush green lawn with trees and a building in the background"
    negative_prompt = "Low quality, bad image, artifacts"
    reference_image_path = "/root/daneul/projects/refactored/CogVideo/finetune/val_samples/854179_background_boxes.jpg"

    # Load pipeline
    print("Loading pipeline...")
    try:
        pipe = CustomCogVideoXPipeline.from_pretrained(
            pretrained_model_name_or_path,
            torch_dtype=dtype,
            customization=True
        )
        pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)
        print("Pipeline loaded")
        
    except Exception as e:
        print(f"Error in pipeline initialization: {e}")
        raise

    clear_cuda_cache()

    # Load LoRA weights
    print("Loading LoRA weights...")
    try:
        lora_weights_path = os.path.join(output_dir, "pytorch_lora_weights_transformer.safetensors")
        if not os.path.exists(lora_weights_path):
            raise FileNotFoundError(f"LoRA weights not found at {lora_weights_path}")

        from safetensors.torch import load_file
        lora_state_dict = load_file(lora_weights_path)
        pipe.load_lora_weights(
            pretrained_model_name_or_path_or_dict=lora_state_dict,
            adapter_name="cogvideox-lora"
        )
        pipe.set_adapters(["cogvideox-lora"], [128/128])
    except Exception as e:
        print(f"Error loading LoRA weights: {e}")
        raise

    clear_cuda_cache()

    # Load additional components
    print("Loading additional components...")
    try:
        pipe.transformer.T5ProjectionLayer = SkipProjectionLayer(4096, 4096)
        pipe.transformer.CLIPTextProjectionLayer = ProjectionLayer(512, 4096)
        pipe.transformer.CLIPVisionProjectionLayer = ProjectionLayer(768, 4096)
        pipe.transformer.reference_vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch16")

        for name, path in {
            "T5ProjectionLayer": "T5ProjectionLayer.pth",
            "CLIPTextProjectionLayer": "CLIPTextProjectionLayer.pth",
            "CLIPVisionProjectionLayer": "CLIPVisionProjectionLayer.pth",
            "reference_vision_encoder": "pytorch_clip_vision_model.bin"
        }.items():
            full_path = os.path.join(output_dir, path)
            if not os.path.exists(full_path):
                raise FileNotFoundError(f"{path} not found")
            state_dict = torch.load(full_path, map_location='cpu')
            getattr(pipe.transformer, name).load_state_dict(state_dict)
            print(f"Loaded {name}")

    except Exception as e:
        print(f"Error loading additional components: {e}")
        raise

    clear_cuda_cache()

    # Process reference image
    print("Processing reference image...")
    try:
        clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
        ref_image = Image.open(reference_image_path).convert('RGB')
        processed_image = clip_processor(images=ref_image, return_tensors="pt")
        ref_img_states = processed_image['pixel_values'].to(device=device, dtype=dtype)
        print("Reference image processed successfully")
    except Exception as e:
        print(f"Error processing reference image: {e}")
        raise

    # Move components to GPU
    print("Moving components to GPU...")
    try:
        # Move all models to the same device
        pipe.to(device)
        pipe.tokenizer.padding_side = "left"  # Ensure consistent padding
        print("Models moved to device")

        # Set models to eval mode
        for model in [pipe.text_encoder, pipe.clip_text_encoder, pipe.vae, pipe.transformer]:
            model.eval()
            if hasattr(model, "requires_grad_"):
                model.requires_grad_(False)

    except Exception as e:
        print(f"Error moving components to GPU: {e}")
        raise

    clear_cuda_cache()

    # Generate video
    print("Generating video...")
    try:
        print(f"Current device: {device}")
        with torch.inference_mode():
            # Pre-process prompt
            text_inputs = pipe.tokenizer(
                prompt,
                padding="max_length",
                max_length=226,
                truncation=True,
                return_tensors="pt"
            ).to(device)

            # Generate
            video_output = pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                ref_img_states=ref_img_states,
                guidance_scale=6,
                use_dynamic_cfg=True,
                num_frames=49,
                height=480,
                width=720,
                num_inference_steps=50,
                output_type='numpy',
                eval=True
            )
    except Exception as e:
        print(f"Error during generation: {e}")
        print(f"Error type: {type(e)}")
        print(f"Error details: {str(e)}")

        # Debug device information
        print("\nDebug information:")
        if hasattr(pipe, "text_encoder") and hasattr(pipe.text_encoder, "device"):
            print(f"Text encoder device: {pipe.text_encoder.device}")
        if hasattr(pipe, "clip_text_encoder") and hasattr(pipe.clip_text_encoder, "device"):
            print(f"CLIP text encoder device: {pipe.clip_text_encoder.device}")
        if hasattr(pipe, "transformer") and hasattr(pipe.transformer, "device"):
            print(f"Transformer device: {pipe.transformer.device}")
        raise
    finally:
        clear_cuda_cache()

    # Save video
    print("Saving video...")
    output_video_path = "output_video.mp4"
    export_to_video(video_output.frames[0], output_video_path, fps=8)
    print(f"Video saved to {output_video_path}")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f"Fatal error: {e}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        raise

  from .autonotebook import tqdm as notebook_tqdm


[2024-11-11 05:09:19,561] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




ImportError: cannot import name 'SkipProjectionLayer' from 'custom_cogvideox_pipe' (/root/daneul/projects/refactored/CogVideo/finetune/custom_cogvideox_pipe.py)

In [1]:
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
from transformers import T5Tokenizer, T5EncoderModel, CLIPProcessor, CLIPTokenizer, CLIPTextModel, CLIPVisionModel
from custom_cogvideox_pipe import CustomCogVideoXPipeline
from custom_cogvideox import CustomCogVideoXTransformer3DModel
from PIL import Image
import os
from safetensors.torch import load_file

# Define the custom layers if not already defined
import torch.nn as nn

class SkipProjectionLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.projection = nn.Linear(in_features, out_features)

    def forward(self, x):
        return x + self.projection(x)
    
class ProjectionLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.projection = nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.projection(x)

def main():
    # Model paths and parameters
    pretrained_model_name_or_path = "THUDM/CogVideoX-5b"
    output_dir = "/mnt/carpedkm_data/finetune_result/finetune4000_custom_zero_init_t5_full_custom_with_clip/checkpoint-800"
    prompt = "Two dogs one with a black and tan coat and another with a black and white coat appear to be playing on a lush green lawn with trees and a building in the background"
    negative_prompt = "Low quality, bad image, artifacts"
    reference_image_path = "/root/daneul/projects/refactored/CogVideo/finetune/val_samples/854179_background_boxes.jpg"

    # Device and dtype setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float32  # Use torch.float32 for better compatibility

    print("Using device:", device)

    # Load models and processors
    print("Loading models and processors...")
    tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
    text_encoder = T5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder")
    vae = AutoencoderKLCogVideoX.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
    scheduler = CogVideoXDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
    
    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")
    clip_text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch16")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

    print("Loading transformer...")
    transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="transformer",
        torch_dtype=dtype,
        customization=True,
    )

    print("Creating pipeline...")
    pipe = CustomCogVideoXPipeline(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        transformer=transformer,
        vae=vae,
        scheduler=scheduler,
        clip_tokenizer=clip_tokenizer,
        clip_text_encoder=clip_text_encoder,
        customization=True,
    )

    print("Loading LoRA weights...")
    # Load LoRA weights manually
    lora_path = os.path.join(output_dir, "pytorch_lora_weights_transformer.safetensors")
    if not os.path.exists(lora_path):
        lora_path = os.path.join(output_dir, "pytorch_lora_weights.safetensors")
    
    if os.path.exists(lora_path):
        state_dict = load_file(lora_path)
        pipe.transformer.load_state_dict(state_dict, strict=False)
        print(f"Successfully loaded LoRA weights from {lora_path}")
    else:
        print(f"Warning: Could not find LoRA weights in {output_dir}")
        print("Available files:", os.listdir(output_dir))
        raise FileNotFoundError("LoRA weights not found")

    print("Initializing additional components...")
    # Initialize additional components before loading their state dictionaries
    pipe.transformer.T5ProjectionLayer = SkipProjectionLayer(4096, 4096)
    pipe.transformer.CLIPTextProjectionLayer = ProjectionLayer(512, 4096)
    pipe.transformer.CLIPVisionProjectionLayer = ProjectionLayer(768, 4096)
    pipe.transformer.reference_vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch16")

    print("Loading additional components...")
    # Load additional components
    component_files = {
        "T5ProjectionLayer": ["T5ProjectionLayer.pth", "T5ProjectionLayer.safetensors"],
        "CLIPTextProjectionLayer": ["CLIPTextProjectionLayer.pth", "CLIPTextProjectionLayer.safetensors"],
        "CLIPVisionProjectionLayer": ["CLIPVisionProjectionLayer.pth", "CLIPVisionProjectionLayer.safetensors"],
        "reference_vision_encoder": ["pytorch_clip_vision_model.bin", "reference_vision_encoder.safetensors"]
    }

    for component_name, filenames in component_files.items():
        loaded = False
        for filename in filenames:
            filepath = os.path.join(output_dir, filename)
            if os.path.exists(filepath):
                try:
                    if filename.endswith('.safetensors'):
                        state_dict = load_file(filepath)
                    else:
                        state_dict = torch.load(filepath, map_location=device)
                    getattr(pipe.transformer, component_name).load_state_dict(state_dict)
                    print(f"Successfully loaded {component_name} from {filename}")
                    loaded = True
                    break
                except Exception as e:
                    print(f"Error loading {component_name} from {filename}: {e}")
        
        if not loaded:
            print(f"Warning: Could not load {component_name} from any of the attempted files")

    # Move pipeline components to device and set data types
    pipe.transformer.to(device=device, dtype=dtype)
    pipe.text_encoder.to(device=device, dtype=dtype)
    pipe.vae.to(device=device, dtype=dtype)
    pipe.clip_text_encoder.to(device=device, dtype=dtype)
    pipe.transformer.reference_vision_encoder.to(device=device, dtype=dtype)
    pipe.transformer.T5ProjectionLayer.to(device=device, dtype=dtype)
    pipe.transformer.CLIPTextProjectionLayer.to(device=device, dtype=dtype)
    pipe.transformer.CLIPVisionProjectionLayer.to(device=device, dtype=dtype)

    # Set models to eval mode
    pipe.transformer.eval()
    pipe.text_encoder.eval()
    pipe.vae.eval()
    pipe.clip_text_encoder.eval()
    pipe.transformer.reference_vision_encoder.eval()

    print("Processing reference image...")
    ref_image = Image.open(reference_image_path).convert('RGB')
    processed_image = clip_processor(
        images=ref_image,
        return_tensors="pt"
    )
    # Move the pixel_values tensor to device and dtype
    pixel_values = processed_image['pixel_values'].to(device=device, dtype=dtype)

    # Generate video
    print("Generating video...")
    generator = torch.Generator(device=device).manual_seed(42)
    
    with torch.no_grad():
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            ref_img_states=pixel_values,
            height=480,
            width=720,
            num_frames=49,
            num_inference_steps=50,
            guidance_scale=6.0,
            use_dynamic_cfg=True,
            generator=generator,
            output_type="pil",
        )

    # Save the output video
    output_path = "output_video.mp4"
    from diffusers.utils import export_to_video
    export_to_video(output.frames[0], output_path, fps=8)
    print(f"Video saved to {output_path}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


[2024-11-11 05:21:15,629] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




Using device: cuda
Loading models and processors...


Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 6379.17it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.73s/it]


Loading transformer...


Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 14716.86it/s]


Creating pipeline...
Loading LoRA weights...
Successfully loaded LoRA weights from /mnt/carpedkm_data/finetune_result/finetune4000_custom_zero_init_t5_full_custom_with_clip/checkpoint-800/pytorch_lora_weights_transformer.safetensors
Initializing additional components...
Loading additional components...


  state_dict = torch.load(filepath, map_location=device)


Successfully loaded T5ProjectionLayer from T5ProjectionLayer.pth
Successfully loaded CLIPTextProjectionLayer from CLIPTextProjectionLayer.pth
Successfully loaded CLIPVisionProjectionLayer from CLIPVisionProjectionLayer.pth
Successfully loaded reference_vision_encoder from pytorch_clip_vision_model.bin
Processing reference image...
Generating video...
Device of text_input_ids: cuda:0
Device of attention_mask: cuda:0
Device of text_encoder: cuda:0


  with torch.cuda.amp.autocast(enabled=False):
  2%|▏         | 1/50 [00:44<36:15, 44.39s/it]


KeyboardInterrupt: 