In [1]:
import os

# Environment variables consumed by the CUDA runtime: enumerate devices in
# PCI bus order and expose only devices 0 and 1 to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "0,1",
})

# cogvlm2-llama3-caption

In [2]:
import io

import argparse
import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer

from matplotlib import pyplot as plt
import pandas as pd

from tqdm import tqdm
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hugging Face Hub identifier of the captioning checkpoint.
MODEL_PATH = "THUDM/cogvlm2-llama3-caption"

# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# bfloat16 is selected only when the GPU reports compute capability major
# version >= 8; every other configuration uses float16.
if DEVICE == 'cuda' and torch.cuda.get_device_capability()[0] >= 8:
    TORCH_TYPE = torch.bfloat16
else:
    TORCH_TYPE = torch.float16

In [4]:

# A notebook has no real command line, so the parser is fed an empty argv
# and every option therefore takes its default value.
parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo")
parser.add_argument(
    '--quant',
    type=int,
    choices=[4, 8],
    default=0,
    help='Enable 4-bit or 8-bit precision loading',
)
args = parser.parse_args([])

In [5]:
def load_video(video_data, strategy='chat', num_frames=24):
    """Decode an in-memory video and sample the frames fed to the model.

    Args:
        video_data: Raw bytes of an encoded video file; wrapped in
            ``io.BytesIO`` and handed to ``decord.VideoReader``.
        strategy: Frame sampling scheme.
            ``'chat'`` picks, for each whole second 0, 1, 2, ..., the frame
            whose start timestamp is closest to that second, stopping after
            ``num_frames`` frames.
            ``'base'`` spreads ``num_frames`` indices evenly over the first
            60 seconds of the clip (or the whole clip if it is shorter).
        num_frames: Maximum number of frames for ``'chat'`` and exact number
            of (possibly repeated) indices for ``'base'``. Must be >= 1.

    Returns:
        The batch returned by ``VideoReader.get_batch`` with its axes permuted
        by ``(3, 0, 1, 2)``.

    Raises:
        ValueError: If ``strategy`` is unknown, ``num_frames`` is below 1, or
            the video contains no frames.
    """
    # Fail early with a clear message instead of passing None to get_batch.
    if strategy not in ('base', 'chat'):
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'base' or 'chat'")
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    bridge.set_bridge('torch')
    decord_vr = VideoReader(io.BytesIO(video_data), ctx=cpu(0))

    total_frames = len(decord_vr)
    if total_frames == 0:
        raise ValueError("video_data contains no decodable frames")

    if strategy == 'base':
        clip_start_sec, clip_end_sec = 0, 60
        fps = decord_vr.get_avg_fps()
        start_frame = int(clip_start_sec * fps)
        end_frame = min(total_frames, int(clip_end_sec * fps))
        frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)
    else:  # strategy == 'chat'
        # Column 0 of each timestamp pair is the value the sampling keys on.
        frame_starts = np.asarray(
            decord_vr.get_frame_timestamp(np.arange(total_frames))
        )[:, 0]
        max_second = round(float(frame_starts.max())) + 1
        # argmin returns the first closest frame, matching the tie-breaking of
        # the former min(..., key=...) + list.index lookup.
        frame_id_list = [
            int(np.argmin(np.abs(frame_starts - second)))
            for second in range(min(max_second, num_frames))
        ]

    video_data = decord_vr.get_batch(frame_id_list)
    video_data = video_data.permute(3, 0, 1, 2)
    return video_data

In [6]:
def get_video_length_in_seconds(video_data):
    """Return an integer length estimate, in seconds, for an in-memory video.

    The value is ``round(t) + 1`` where ``t`` is the largest first element of
    the per-frame timestamp pairs reported by ``decord``. It is therefore a
    rounded estimate, not the exact duration.

    Args:
        video_data: Raw bytes of an encoded video file.

    Returns:
        int: The rounded length estimate described above.

    Raises:
        ValueError: If the video contains no frames.
    """
    bridge.set_bridge('torch')
    decord_vr = VideoReader(io.BytesIO(video_data), ctx=cpu(0))

    total_frames = len(decord_vr)
    if total_frames == 0:
        raise ValueError("video_data contains no decodable frames")

    timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
    last_frame_start = max(frame_span[0] for frame_span in timestamps)
    return round(last_frame_start) + 1

In [7]:
# trust_remote_code=True is passed to both loaders, so code shipped with the
# checkpoint is executed locally.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Load the weights in TORCH_TYPE with automatic device placement, then put the
# network in evaluation mode for inference.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=TORCH_TYPE,
    trust_remote_code=True,
    device_map="auto",
)
model = model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A new version of the following files was downloaded from https://huggingface.co/THUDM/cogvlm2-llama3-caption:
- configuration_cogvlm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/THUDM/cogvlm2-llama3-caption:
- util.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/THUDM/cogvlm2-llama3-caption:
- visual.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded fr

In [8]:
def predict(prompt, video_data, temperature):
    """Generate a caption for one video with the globally loaded model.

    Args:
        prompt: Text instruction sent to the model as the query.
        video_data: Raw bytes of the encoded video; decoded by ``load_video``
            with the ``'chat'`` strategy.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
            NOTE(review): ``do_sample`` is False below, so decoding is
            presumably greedy and this value may have no effect - confirm.

    Returns:
        str: Decoded text of the newly generated tokens only (the prompt
        tokens are sliced off and special tokens are skipped).
    """
    strategy = 'chat'

    video = load_video(video_data, strategy=strategy)

    print(video.size())

    conversation = model.build_conversation_input_ids(
        tokenizer=tokenizer,
        query=prompt,
        images=[video],
        history=[],
        template_version=strategy
    )
    # Add a batch dimension and move everything to DEVICE. Using the DEVICE
    # constant (instead of a hard-coded .cuda()) keeps this working on hosts
    # without a GPU.
    inputs = {
        'input_ids': conversation['input_ids'].unsqueeze(0).to(DEVICE),
        'token_type_ids': conversation['token_type_ids'].unsqueeze(0).to(DEVICE),
        'attention_mask': conversation['attention_mask'].unsqueeze(0).to(DEVICE),
        'images': [[conversation['images'][0].to(DEVICE).to(TORCH_TYPE)]],
    }
    gen_kwargs = {
        "max_new_tokens": 500,
        "pad_token_id": 128002,
        "top_k": 1,
        "do_sample": False,
        "top_p": 0.1,
        "temperature": temperature,
    }
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
        # Keep only the tokens generated after the prompt.
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

# Generate captions for a dataset

In [11]:
# Instruction sent to the captioning model for every video. The commented-out
# assignments further down are alternative prompts; only one PROMPT is active.
PROMPT="Please describe this video in detail." # original prompt

### 09/20/2024
## general prompt
# with examples
# PROMPT="A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than 140 words. Here are some examples of good descriptions: 1. A stylish woman in a black leather jacket, long flowing red dress, black boots, and dark sunglasses strides confidently down a bustling, neon-lit Tokyo street. Her bold red lipstick complements her outfit, while she holds a sleek black purse in one hand. The damp pavement glistens, reflecting the vibrant neon signs from surrounding buildings, casting a kaleidoscope of colors across the scene. Pedestrians move around her, but her confident walk makes her stand out amidst the crowd. The camera follows her from a slightly low angle, emphasizing her commanding presence, while the shallow depth of field blurs the surrounding faces, keeping her in sharp focus. The bright neon lights contrast with the dark tones of her attire, enhancing the scene’s urban, futuristic aesthetic. The mood is electric and stylish, blending modern fashion with the vibrant, fast-paced energy of Tokyo’s nightlife. 2. Several massive wooly mammoths slowly make their way across a snowy meadow, their thick fur rippling in the gusty wind. The towering creatures are framed by a landscape of snow-covered trees and imposing, jagged mountains in the distance. Above, wispy clouds stretch across the sky, with the mid-afternoon sun casting a soft, warm glow over the scene, enhancing the contrast between the cold snow and the rich textures of the mammoths' fur. The low camera angle captures the scale of the mammoths, making them appear even more majestic as they dominate the foreground. Their slow, steady movements create a peaceful yet powerful atmosphere. 
The warm sunlight filtering through the clouds adds a golden hue to the snowy expanse, while the dramatic mountains provide a rugged backdrop. The combination of lighting, natural beauty, and the sheer size of the mammoths creates a timeless and awe-inspiring visual. 3. A drone glides over the rugged cliffs of Big Sur’s Garay Point Beach, capturing the raw power of waves crashing against the jagged coastline. The deep blue waters, capped with white-tipped waves, shimmer under the warm glow of the setting sun. The drone's high vantage point reveals the steep drop from the coastal road to the beach below, emphasizing the dramatic vertical landscape. Lush green shrubbery clings to the cliffs, adding texture and color to the rocky terrain. In the distance, a small island with a lighthouse stands against the horizon, its isolated presence enhancing the sense of vastness and tranquility. The composition masterfully balances the rugged cliffs, turbulent waters, and serene island, while the golden sunlight casts long shadows, adding depth and warmth. The dynamic movement of the ocean and the sweeping coastal views create a striking visual of nature’s untamed beauty."
# without examples
# PROMPT="A video is given by providing several frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than 140 words."

## film prompt
# with examples
# PROMPT="A video is given by providing three frames in chronological order. Describe this video including the objects, their performance, the scene, and the visual style. Pay attention to all the objects in the video. Besides, describe the video based on the following essential elements of professional film footages: 1. Camera: Camera movement, lens, and depth of field directly affect the quality, style,and narrative of the footage. 2. Lighting: Lighting direction and color tone are crucial for setting the mood of the scene. It affects how the colors and shadows appear and can be used creatively to influence the story's atmosphere and the audience's perception of characters and settings. 3. Composition: This involves the arrangement of elements within the frame, including characters, props, and scenery. Composition techniques, such as the rule of thirds, leading lines, and framing, can help guide the viewer’s attention and make the visuals more engaging. Note: Do not describe each frame individually. Do not reply with words like 'first frame'. The entire description should be no more than 140 words, which should be useful for AI to re-generate the video. Here are some examples of descriptions: 1. A stylish woman in a black leather jacket and vibrant red dress walks confidently through a neon-lit Tokyo street, her figure dominating the frame as the camera follows her with a shallow depth of field. The busy urban scene blurs softly behind her, highlighting her bold presence amidst a sea of lights and reflections from the rain-slicked pavement. The camera's steady movement keeps her at the center, guiding the viewer’s focus on her assertive stride. Dynamic lighting intensifies the colors, with neon hues bouncing off the wet ground, adding depth and contrast to the visuals. Clever use of the rule of thirds draws attention to the woman while maintaining balance with the bustling background. 
The lighting enhances the mood, bathing the scene in vibrant colors, while the shallow focus emphasizes her isolation amidst the crowd, heightening the drama and sense of purpose in her walk. 2. Several wooly mammoths move steadily across a vast snowy meadow, their thick fur rippling in the brisk wind. A low-angle camera captures the scene, giving the mammoths a towering, majestic presence as they traverse the rugged landscape. The deep focus keeps both the mammoths and the snow-laden trees in sharp detail, while the dramatic mountains in the background add grandeur to the composition. Mid-afternoon sunlight casts a golden glow over the scene, creating a striking contrast between the warm light and the cold snow. The lighting highlights the texture of the mammoths' fur, making them stand out against the icy surroundings. The composition artfully balances these creatures against the dramatic backdrop, using leading lines from the terrain to guide the viewer’s eyes. This harmony of camera movement, light, and natural elements enhances the narrative, conveying the mammoths' dominance and resilience in a harsh environment. 3. A drone soars over Big Sur’s Garay Point Beach, capturing powerful waves as they crash against the steep, jagged cliffs. The camera’s aerial perspective emphasizes the vastness of the coastline, revealing the steep drop to the rocky shore below. In the distance, a small island with a lighthouse adds a focal point to the scene, enhancing the visual narrative of isolation and rugged beauty. The setting sun bathes the landscape in a warm, golden light, contrasting beautifully with the deep blue waters and white-tipped waves. The soft, natural lighting accentuates the textures of the cliffs and the movement of the ocean, adding depth and drama to the composition. 
The rule of thirds is expertly used, with the cliffs occupying one side, the ocean in the middle, and the distant island drawing the eye outward, creating a balanced and visually captivating scene."
# without examples
# PROMPT="A video is given by providing several frames in chronological order. Describe this video including the objects, their performance, the scene, and the visual style. Pay attention to all the objects in the video. Besides, describe the video based on the following essential elements of professional film footages: 1. Camera: Camera movement, lens, and depth of field directly affect the quality, style,and narrative of the footage. 2. Lighting: Lighting direction and color tone are crucial for setting the mood of the scene. It affects how the colors and shadows appear and can be used creatively to influence the story's atmosphere and the audience's perception of characters and settings. 3. Composition: This involves the arrangement of elements within the frame, including characters, props, and scenery. Composition techniques, such as the rule of thirds, leading lines, and framing, can help guide the viewer’s attention and make the visuals more engaging. Note: Do not describe each frame individually. Do not reply with words like 'first frame'. The entire description should be no more than 140 words, which should be useful for AI to re-generate the video."

# CSV listing the videos to caption; its "path" column is read by the captioning loop.
video_csv_path = "/root/projects/Data/CelebV-HQ/video_paths_subset_100_full_path.csv" # provide the path here
# Candidate output CSV names, one per prompt variant. Both are disabled, so
# new_csv_path is not defined unless one of these lines is uncommented.
# new_csv_path = video_csv_path.replace(".csv", "_caption_CogVLM2Caption_GeneralPromptNoExamples_140words_09_20.csv") # change the name when you change the prompt!
# new_csv_path = video_csv_path.replace(".csv", "_caption_CogVLM2Caption_OriginalPrompt.csv") # change the name when you change the prompt!

In [12]:
# Caption the first MAX_VIDEOS videos listed in the CSV and record, per video,
# the caption, the wall-clock processing time and the estimated length.
MAX_VIDEOS = 31      # stop after this many videos (previously `if i == 30: break`)
temperature = 0.1    # forwarded to predict(); constant for the whole run

df = pd.read_csv(video_csv_path)
video_path_list = df["path"].tolist()
caption_list = []

video_seconds_list = []
process_time_list = []

for i, video_path in enumerate(tqdm(video_path_list)):
    start_time = time.time()
    # Read via a context manager so the file handle is closed promptly
    # instead of being left for the garbage collector.
    with open(video_path, 'rb') as video_file:
        video_data = video_file.read()
    response = predict(PROMPT, video_data, temperature)
    end_time = time.time()

    caption_list.append(response)

    # statistics (timing covers file read + captioning, not the length probe)
    process_time_list.append(end_time - start_time)
    max_seconds = get_video_length_in_seconds(video_data)
    video_seconds_list.append(max_seconds)

    print(f"{video_path} {max_seconds} seconds\n{response}\n\n")

    if i + 1 >= MAX_VIDEOS:
        break

# save (disabled). Only the first len(caption_list) rows were captioned, so the
# frame must be sliced before the column is attached, otherwise pandas raises
# on the length mismatch:
# df_captioned = df.iloc[:len(caption_list)].assign(text=caption_list)
# df_captioned.to_csv(new_csv_path, index=False)

  0%|          | 0/100 [00:00<?, ?it/s]

torch.Size([3, 6, 512, 512])


  1%|          | 1/100 [00:40<1:06:01, 40.02s/it]

/root/projects/Data/CelebV-HQ/celebvhq/-mjnbKL7fHQ_2.mp4 6 seconds
 A muscular Black man with short hair and a goatee, wearing a white sleeveless top, stands in an indoor gym, exuding determination and focus. The gym is equipped with a clock and a bulletin board, suggesting a setting of physical activity or training. As time passes, the man maintains his intense gaze and posture, indicating a serious conversation or confrontation with another person, who is not fully visible. The gym's atmosphere is highlighted by the presence of a clock and a poster, adding to the scene's intensity. The man's muscular build and the gym's equipment suggest a narrative of physical activity or training.


torch.Size([3, 14, 512, 512])


  2%|▏         | 2/100 [00:52<39:14, 24.02s/it]  

/root/projects/Data/CelebV-HQ/celebvhq/0wLgQQGob1U_14.mp4 14 seconds
 A man with a light complexion and short, dark hair is seated in a white chair, engaging in a conversation while smiling warmly at the camera. He is dressed casually in a black t-shirt, set against a modern, well-lit interior featuring a large window, a sleek black cabinet, and a white countertop. The room's decor includes a bookshelf filled with books and trinkets, a small sculpture, and a plant, contributing to a personalized and inviting atmosphere. Throughout, the man's expressions range from friendly and open to animated, suggesting a lively and engaging dialogue.


torch.Size([3, 5, 512, 512])


  3%|▎         | 3/100 [01:02<27:59, 17.32s/it]

/root/projects/Data/CelebV-HQ/celebvhq/10X4Th3YE30_4.mp4 5 seconds
A man with dark hair and a stubble beard, wearing a white shirt with a black collar, is deeply engaged in playing a red electric guitar. His expression is one of intense concentration, suggesting a moment of musical immersion. The warm red backdrop highlights his features and the vibrant color of the guitar, emphasizing the emotional depth of the performance. As time passes, his focused expression and the dynamic positioning of his hands on the fretboard remain consistent, indicating a passionate and intense musical engagement.


torch.Size([3, 5, 512, 512])


  4%|▍         | 4/100 [01:12<23:18, 14.57s/it]

/root/projects/Data/CelebV-HQ/celebvhq/1o0B9w6J3DM_10.mp4 5 seconds
A woman with long, dark hair and red lipstick is seated in a television studio, wearing a black top with sheer sleeves. Initially, she appears engaged in conversation, with a blurred figure and a bookshelf filled with books and trinkets in the background. The setting is warmly lit, creating an inviting atmosphere. Two seconds later, her expression shifts to one of pleasant surprise or mild amusement, with the bookshelf now displaying an eclectic mix of books and figurines, and a vibrant abstract painting behind her, enhancing the talk show ambiance.


torch.Size([3, 3, 512, 512])


  5%|▌         | 5/100 [01:23<21:09, 13.36s/it]

/root/projects/Data/CelebV-HQ/celebvhq/2lMIShoMuIk_6.mp4 3 seconds
A woman with long, curly brown hair and a black top is seen smiling broadly in an urban setting, possibly a European city, with a street market stall displaying postcards in the background. The scene is vibrant, with a blurred passerby adding to the lively atmosphere. Two seconds later, the same woman, now with a contemplative expression, stands in front of a display rack filled with postcards and books. Her attire includes a sleeveless top, a peach-colored scarf, and a green necklace, with the urban backdrop and a passerby still visible.


torch.Size([3, 8, 512, 512])


  6%|▌         | 6/100 [01:35<20:04, 12.81s/it]

/root/projects/Data/CelebV-HQ/celebvhq/3HBpd1aGqq4_4.mp4 8 seconds
A young woman with red hair and glasses, wearing a black coat, scarf, and carrying a red shoulder bag, is seen in an urban park, engaging in conversation with a man in a dark jacket. Initially, her expression is one of earnest engagement, but as time passes, her demeanor becomes more relaxed and friendly, with a slight smile and a look of pleasant surprise. The background features bare trees and a modern building, suggesting a cooler season. The presence of a microphone indicates she might be an interviewee or speaker, and the overall atmosphere is one of casual interaction and enjoyment.


torch.Size([3, 4, 512, 512])


  7%|▋         | 7/100 [01:46<18:59, 12.25s/it]

/root/projects/Data/CelebV-HQ/celebvhq/3Sz7dbX2kTY_1.mp4 4 seconds
A young man with dark hair and a light complexion appears in a close-up, wearing a beige shirt and displaying a calm and introspective expression. The soft lighting highlights his features against a blurred background, suggesting an indoor setting. Two seconds later, the same young boy, now in a light-colored shirt, is seen mid-speech with an earnest and slightly concerned expression. His dark hair is neatly combed, and his eyes are focused intently on something off-camera, indicating a moment of significant engagement or concern. The warm lighting continues to accentuate his features against the blurred indoor backdrop.


torch.Size([3, 5, 512, 512])


  8%|▊         | 8/100 [01:58<18:26, 12.02s/it]

/root/projects/Data/CelebV-HQ/celebvhq/5K6sh7HZri4_2.mp4 5 seconds
A contemplative woman with blonde hair and bangs, wearing a pink floral top, is seated indoors, possibly in a kitchen or living room, with a hand on her cheek and a subtle smile. The scene shifts to the same woman, now in a pink floral nightgown, sitting in a dental chair with a concerned expression, her hand raised to her cheek. The background is blurred, suggesting a clinical setting. The overall content hints at a narrative involving a woman in a pink dress, possibly reflecting on a personal moment or engaging in a conversation, with a focus on her expressions and the domestic environment.


torch.Size([3, 5, 512, 512])


  9%|▉         | 9/100 [02:09<17:44, 11.70s/it]

/root/projects/Data/CelebV-HQ/celebvhq/5enqrVvjxg0_0.mp4 5 seconds
 A man in a dark jacket and blue shirt, wearing a white cervical collar and a patch over his right eye, appears distressed and fatigued, suggesting recent physical trauma. His expression is one of concern and mild discomfort. The scene is set in an indoor environment, possibly a hospital or clinic, with a blurred background that emphasizes the man's condition. The presence of the 'MOVIECLIPS' logo indicates that this is a still from a movie. The man's attire and the setting remain consistent, highlighting his recent injury and the ongoing care he is receiving.


torch.Size([3, 8, 512, 512])


 10%|█         | 10/100 [02:20<17:23, 11.60s/it]

/root/projects/Data/CelebV-HQ/celebvhq/5qKySTAWpiY_6.mp4 8 seconds
 A contemplative woman with short, wavy brown hair and fair skin is captured in various moments of introspection. Initially, her eyes are downcast, and her expression is one of deep thought or concern, highlighted by soft lighting that accentuates her features. As time passes, her gaze shifts slightly away from the camera, maintaining the reflective mood. Her vintage-style attire and the warm lighting continue to emphasize her introspective demeanor. Throughout, her expression remains one of deep thought or concern, with the soft lighting enhancing her features against a blurred background, suggesting an indoor setting.


torch.Size([3, 5, 512, 512])


 11%|█         | 11/100 [02:32<17:13, 11.62s/it]

/root/projects/Data/CelebV-HQ/celebvhq/6qZZWAScCn8_1.mp4 5 seconds
A male athlete with a short beard and reddish-brown hair, marked by dried blood, is seen sitting in an octagon-shaped cage, displaying signs of exhaustion and focus. His tattoos are visible on his skin, and he is wearing a clear plastic headset, indicating he is engaged in a sporting event. Two individuals are seen attending to him, one holding a white cloth to his face, suggesting post-fight care. The scene is set against a blurred audience and a chain-link fence, typical of an indoor sports arena. The athlete's expression and the presence of individuals around him hint at a recent bout.


torch.Size([3, 18, 512, 512])


 12%|█▏        | 12/100 [02:46<18:13, 12.43s/it]

/root/projects/Data/CelebV-HQ/celebvhq/7FyjCUDR0IM_2.mp4 18 seconds
A young woman with long brown hair and a black mask stands in front of a green metal fence, holding a microphone, suggesting she is engaged in a sports-related interview or commentary. The scene is set outdoors, likely at a sports facility, with a tennis court visible in the background. Throughout, her attire includes a grey turtleneck and a black jacket, indicating a casual yet professional setting. The presence of the 'EASYSOCCER' and 'EASYLEAGUE' logos suggests the event is related to soccer. Her expressions range from focused and serious to slightly smiling, maintaining a calm demeanor.


torch.Size([3, 5, 512, 512])


 13%|█▎        | 13/100 [02:56<17:00, 11.73s/it]

/root/projects/Data/CelebV-HQ/celebvhq/8EcrxgHLhIg_4.mp4 5 seconds
A woman with fair skin and wavy brown hair appears in a series of close-ups, her expression shifting from concern to surprise and then to intense focus. Her green eyes are wide and alert, and her lips are slightly parted, suggesting she is reacting to an unexpected event. The lighting is warm, casting soft shadows on her face and highlighting her features. Her expressions and the soft lighting create a dramatic and emotional atmosphere, with her gaze fixed intently on something outside the frame, indicating a moment of significant engagement or reaction.


torch.Size([3, 14, 970, 970])


 14%|█▍        | 14/100 [03:09<17:26, 12.17s/it]

/root/projects/Data/CelebV-HQ/celebvhq/8cuISKrdEtM_5_3.mp4 14 seconds
 A young man with glasses, dressed in a light blue checkered shirt, is seated at a table in an indoor setting, possibly a conference room, engaging in a serious discussion or interview. The background features promotional banners for the 'AGRESEARCH INSTITUTE OF TENNESSEE' and the 'AGRICULTURE INSTITUTE OF TENNESSEE', indicating a focus on agricultural research. Throughout the sequence, the man's expression remains earnest and focused, suggesting a deep involvement in the conversation. The presence of a microphone and the consistent backdrop banners emphasize the professional and educational context of the event.


torch.Size([3, 6, 928, 928])


 15%|█▌        | 15/100 [03:21<17:03, 12.04s/it]

/root/projects/Data/CelebV-HQ/celebvhq/9kGHXqNutwg_13_0.mp4 6 seconds
A middle-aged man with a mustache and goatee, wearing a straw hat and a white shirt, stands in a rural setting, engaging earnestly with the camera. He is equipped with a black backpack, suggesting he is on a journey or adventure. The background features rolling hills and a faint mountain range under a partly cloudy sky, indicating a high-altitude or tropical climate. Throughout, the man gestures with his right hand, possibly emphasizing a point or explaining something, while his expression remains earnest and focused. The consistent scenery suggests little to no change in the man's activity or the environment.


torch.Size([3, 5, 512, 512])


 16%|█▌        | 16/100 [03:31<15:54, 11.37s/it]

/root/projects/Data/CelebV-HQ/celebvhq/ADRGgyhX4YE_0.mp4 5 seconds
A man with a contemplative expression, wearing a straw hat and a burgundy shirt, is seated outdoors, engaging in a conversation or making a point. His hands are raised mid-gesture, emphasizing his speech. The setting suggests a casual, possibly tropical environment with lush greenery. As time passes, his expression becomes more animated, and he continues to gesture with his hands, indicating an ongoing discussion or presentation. The consistent outdoor backdrop and his attire remain unchanged, highlighting a moment of earnest communication or explanation.


torch.Size([3, 4, 512, 512])


 17%|█▋        | 17/100 [03:41<15:21, 11.11s/it]

/root/projects/Data/CelebV-HQ/celebvhq/AeMESEz39ps_5.mp4 4 seconds
A young man with dark hair and a light complexion, wearing a white hairnet and a black t-shirt, stands in a professional kitchen. He appears to be engaged in a conversation or interview, looking directly at the camera with a slight smile. The kitchen is equipped with stainless steel appliances, including a deep fryer and a grill, and is clean and organized, suggesting a setting that values hygiene and culinary precision. The man's friendly demeanor and direct gaze indicate an open and approachable attitude, possibly as a chef or food preparer.


torch.Size([3, 17, 512, 512])


 18%|█▊        | 18/100 [03:54<15:52, 11.62s/it]

/root/projects/Data/CelebV-HQ/celebvhq/Agw7D-sR-c0_2.mp4 17 seconds
 A woman with short, dark hair and fair skin is engaged in a serious conversation, wearing a white blouse and a golden leaf-shaped hair accessory. Her expressions range from contemplative to earnest, suggesting a deep involvement in the discussion. The setting appears to be an outdoor urban environment, with a blurred cityscape in the background. Throughout the video, her demeanor shifts from thoughtful to confident, and her attire, including the consistent white blouse and the golden leaf accessory, remains elegant and professional. The natural lighting highlights her features, adding warmth to the scene.


torch.Size([3, 7, 898, 898])


 19%|█▉        | 19/100 [04:08<16:24, 12.16s/it]

/root/projects/Data/CelebV-HQ/celebvhq/Bju7yxstSm8_5_0.mp4 7 seconds
 A middle-aged woman with shoulder-length brown hair and a dark coat over a blue floral top is seated indoors, looking off-camera with a serious expression. A microphone is in front of her, suggesting she is being interviewed. Another woman in a black blazer stands behind her, observing. The setting appears to be a room with a checkered floor, minimalistic furniture, and a plain wall. As time passes, the woman in the dark coat speaks into the microphone, her expression earnest, while the other woman, now in a black jacket and patterned scarf, listens attentively. The room's simplicity and the presence of a watermark indicate the footage is from a news broadcast.


torch.Size([3, 7, 1080, 1080])


 20%|██        | 20/100 [04:20<16:30, 12.38s/it]

/root/projects/Data/CelebV-HQ/celebvhq/CfnbOOy54K0_1_0.mp4 7 seconds
 Initially, a glass jar filled with dark red sauce is shown on a wooden surface, next to a wooden spoon, with a blurred video call interface in the background. A young woman with long, dark hair and a green hoodie is then seen sitting at a wooden table, engaging in conversation, surrounded by a cozy room with a Christmas tree, a nutcracker, and personal photographs. The scene shifts slightly as she continues her conversation, now with a wooden dresser and a crocheted afghan in the background, maintaining the warm, festive atmosphere. Throughout, she appears to be discussing cooking or baking, indicated by the presence of a jar of sauce.


torch.Size([3, 5, 512, 512])


 21%|██        | 21/100 [04:32<16:02, 12.18s/it]

/root/projects/Data/CelebV-HQ/celebvhq/D4HGK-_5TkY_0.mp4 5 seconds
 An elderly man with a serious demeanor is seated indoors, wearing a plaid shirt and a checkered blazer, his silver hair neatly combed. His expression, one of deep thought or concern, is highlighted by the soft lighting. As time passes, his expression shifts to one of deep contemplation or concern, with his eyes slightly downcast and a furrowed brow. The setting remains consistent, with a dark background that focuses attention on him. By the end of the sequence, his eyes are closed, and his face shows a reflective or sorrowful expression, suggesting a moment of introspection or concern.


torch.Size([3, 5, 512, 512])


 22%|██▏       | 22/100 [04:43<15:19, 11.79s/it]

/root/projects/Data/CelebV-HQ/celebvhq/D6LGGWwpQRg_0.mp4 5 seconds
 A young woman with blonde hair and a solemn expression is captured in a series of black and white photographs, each portraying her in a dark top and a string of pearls, suggesting a formal occasion. Initially, her gaze is directed off-camera, indicating introspection or engagement with something unseen. As the scenes progress, her expressions shift from solemn to surprised, and then to contemplative, with her eyes slightly narrowed and lips parted in the final frame. The consistent monochromatic tone and the absence of color throughout these images highlight the emotional depth and the timeless quality of the scenes.


torch.Size([3, 10, 512, 512])


 23%|██▎       | 23/100 [04:56<15:25, 12.02s/it]

/root/projects/Data/CelebV-HQ/celebvhq/DIdGRrayLCM_4.mp4 10 seconds
 A man in a dark suit and white shirt is engaged in a serious conversation, likely an interview, against a backdrop featuring the title 'Doctor Strange' in a mystical, golden font. The setting suggests a fantasy theme, with a warm color palette of oranges, reds, and golds, and intricate designs. The man appears focused and earnest, indicating a significant discussion, possibly about the film or its themes. The backdrop evolves subtly, maintaining the fantasy atmosphere with a blend of warm and cool tones, and the man's expression remains concentrated throughout, suggesting a deep engagement with the subject matter.


torch.Size([3, 5, 512, 512])


 24%|██▍       | 24/100 [05:05<14:16, 11.28s/it]

/root/projects/Data/CelebV-HQ/celebvhq/DUjB9LTtzGg_0.mp4 5 seconds
A woman with blonde hair, wearing a white blouse, is in a dimly lit, possibly haunted room. Initially, she appears shocked or fearful, with wide eyes and an open mouth, as if reacting to something unexpected. Two seconds later, her expression intensifies to one of concern or alarm, with her mouth slightly open as if she's shouting or exclaiming. The dark background and her fair complexion are accentuated by the lighting, creating a dramatic and tense atmosphere.


torch.Size([3, 4, 512, 512])


 25%|██▌       | 25/100 [05:16<14:05, 11.28s/it]

/root/projects/Data/CelebV-HQ/celebvhq/EOQeU_6vbeg_0.mp4 4 seconds
 An older woman with a concerned expression and a younger woman, who appears distressed and is holding her cheek, are in a dimly lit room. The older woman, wearing a dark green blazer, seems to be in a professional or authoritative role, while the younger woman's attire is casual. The scene suggests a tense interaction, possibly involving a confrontation or a moment of emotional vulnerability. Two seconds later, the atmosphere remains intimate and emotional, with the older woman's concerned expression and the younger woman's contemplative demeanor. The warm lighting highlights their features, emphasizing the emotional depth of their interaction.


torch.Size([3, 4, 512, 512])


 26%|██▌       | 26/100 [05:27<13:33, 10.99s/it]

/root/projects/Data/CelebV-HQ/celebvhq/Eu1EfSDUKFg_1.mp4 4 seconds
 A man with short, dark hair and a stubble beard, wearing a plaid shirt, is in a serious conversation with a woman in a yellow top, indoors. They are surrounded by a wooden bookcase filled with books and a painting, suggesting a casual yet intellectual setting. The man appears earnest and slightly concerned, while the woman listens intently. Two seconds later, the man's expression turns animated, showing surprise or excitement, as he continues the conversation. The background remains consistent, with the bookcase and painting still visible.


torch.Size([3, 4, 512, 512])


 27%|██▋       | 27/100 [05:38<13:29, 11.09s/it]

/root/projects/Data/CelebV-HQ/celebvhq/F58XJgGx3DY_1.mp4 4 seconds
 A woman with short dark hair, wearing a floral nightgown, is in a dimly lit room having a serious conversation with a man. Initially, her expression is one of earnestness or concern, and the man's back is to the camera, indicating he is listening intently. Two seconds later, her expression shifts to one of concern or mild distress, while the man's presence is suggested by a blurred figure in the background. The scene is marked by soft lighting, creating a warm atmosphere, and the presence of a watermark from 'MOVIECLIPS' suggests it is from a film.


torch.Size([3, 7, 512, 512])


 28%|██▊       | 28/100 [05:49<13:25, 11.19s/it]

/root/projects/Data/CelebV-HQ/celebvhq/FQeoB_lTbf8_1.mp4 7 seconds
A man with a light beard and short hair, wearing dark sunglasses and a fur-lined hood, stands on a beach, engaging in an animated conversation with the camera. Initially, his expression is earnest and slightly open-mouthed, suggesting a lively discussion. As time passes, his demeanor becomes more relaxed and friendly, with a slight smile and a direct gaze that creates an inviting atmosphere. The background features a calm sea and a clear sky, indicating a serene coastal setting. Throughout, the man's attire suggests cooler weather, and his consistent engagement with the camera maintains a sense of casual confidence.


torch.Size([3, 4, 512, 512])


 29%|██▉       | 29/100 [06:01<13:16, 11.22s/it]

/root/projects/Data/CelebV-HQ/celebvhq/GcB7m_3Tlv8_1.mp4 4 seconds
A man with short, light brown hair and stubble is standing in a dimly lit room, wearing a black jacket over a white shirt. He appears to be in a serious conversation or interview, looking off-camera with an earnest expression. The room's wooden ceiling and beams add to the intimate atmosphere. As time passes, the man's expression shifts to one of deep thought or concern, with his eyes slightly narrowed and brows furrowed. The warm lighting continues to highlight his features against the dark background, maintaining the video's intimate and contemplative mood.


torch.Size([3, 4, 512, 512])


 30%|███       | 30/100 [06:11<12:51, 11.03s/it]

/root/projects/Data/CelebV-HQ/celebvhq/GplPjaD4VYk_0.mp4 4 seconds
A woman with dark hair tied up in an elegant bun and wearing a white shirt under a black vest is seen in a professional setting, possibly a kitchen or server area, with a blurred background featuring posters and a framed picture. Her expression is one of concentration and determination, suggesting she is engaged in a serious conversation or task. The scene is from a television show or movie, as indicated by the text "billibili" in the upper right corner. The same woman appears again, maintaining her focused demeanor in what seems to be a formal or professional environment.


torch.Size([3, 8, 512, 512])


 30%|███       | 30/100 [06:22<14:53, 12.77s/it]

/root/projects/Data/CelebV-HQ/celebvhq/H9gqgAcOaE4_0.mp4 8 seconds
 A man in historical attire, including a white shirt and dark vest, is seen in various states of shock and distress against a stone wall. Initially, he appears with his mouth open and eyes wide, suggesting a moment of intense emotion. As moments pass, his expressions evolve from shock to fear, with his eyes wide and mouth agape, indicating a reaction to an unforeseen event. The dim lighting throughout these scenes casts dramatic shadows, enhancing the tension and mystery. The presence of the text 'A24' suggests a connection to a film or television production.







In [13]:
# Summarize captioning throughput over the clips processed by the loop above.
# The per-video lists are converted to arrays so they can be reduced with numpy.
video_seconds_list = np.array(video_seconds_list)
process_time_list = np.array(process_time_list)

# Compute each total once; both are reused by the prints below.
total_process_time = process_time_list.sum()
total_video_seconds = video_seconds_list.sum()

# Alternative metric (mean of the per-video ratios, i.e. every clip weighted equally):
# print(f'processing time per video seconds:{np.mean(process_time_list/video_seconds_list)}')

# Ratio of the totals: processing time spent per unit of video duration overall.
print(f'processing time per video seconds:{total_process_time/total_video_seconds}')

# The original cell printed process_time_list.sum() under the "total video time"
# label; report both totals, each under its correct label.
print(f'total video time:{total_video_seconds}')
print(f'total processing time:{total_process_time}')

processing time per video seconds:2.159343232287473
processing time per video seconds:1.8137862784037657


In [1]:
# Back-of-the-envelope extrapolation; the notebook does not state what each factor means.
# NOTE(review): presumably 6,000,000 videos x 10 s per video x 1.5 s of processing per
# video second, then / 3600 / 24 to convert seconds -> hours -> days — confirm.
6000000 * 10 * 1.5 / 3600 / 24

1041.6666666666667