In [2]:
import sys
# Add the parent directory to the module search path; presumably so that the
# local DPF package imported in the next cell resolves without being
# installed — TODO confirm against the repository layout.
sys.path.append('../')

In [3]:
from DPF import ShardedFilesDatasetConfig, DatasetReader

# Describe the dataset located at 'example_video_dataset': `video_name_col`
# names the column holding the video file name and `text_col` the column
# holding the caption text.
config = ShardedFilesDatasetConfig.from_path_and_columns(
    'example_video_dataset',
    video_name_col='video_name',
    text_col='caption'
)

# Read the dataset described by `config`. The returned `processor` is the
# object every filter in this notebook is applied to.
reader = DatasetReader()
processor = reader.read_from_config(config)

  0%|          | 0/1 [00:00<?, ?it/s]

# Check dataset and its info

In [4]:
# Print a short overview of the loaded dataset: format, path, modalities,
# number of columns and number of samples.
processor.print_summary()

Dataset format: sharded_files
Path: example_video_dataset
Modalities: ['video', 'text']
Columns: 3
Total samples: 5


In [5]:
# Display the dataset table: one row per video with its path, caption text
# and split name.
processor.df

Unnamed: 0,video_path,text,split_name
0,example_video_dataset/0/0.mp4,Businessman Rides In A Car,0
1,example_video_dataset/0/1.mp4,Workspace In Natural Light,0
2,example_video_dataset/0/2.mp4,Woman In Dress Walking In Tulip Field,0
3,example_video_dataset/0/3.mp4,Film Burns Overlay,0
4,example_video_dataset/0/4.mp4,Portrait Leader Of The Roman Army,0


# Running filters

## VideoInfoFilter

In [6]:
from DPF.filters.videos.info_filter import VideoInfoFilter

datafilter = VideoInfoFilter(workers=16) 
print(datafilter.result_columns)  # list of columns that this filter will add

# Run the filter over the dataset; its result columns are appended to
# processor.df.
processor.apply_data_filter(datafilter)

['is_correct', 'error', 'width', 'height', 'fps', 'duration']


100%|██████████| 5/5 [00:00<00:00, 12.70it/s]


In [7]:
# The dataframe now also contains the VideoInfoFilter result columns
# (is_correct, error, width, height, fps, duration).
processor.df

Unnamed: 0,video_path,text,split_name,is_correct,error,width,height,fps,duration
0,example_video_dataset/0/0.mp4,Businessman Rides In A Car,0,True,,1280,720,25.0,14.28
1,example_video_dataset/0/1.mp4,Workspace In Natural Light,0,True,,1280,720,25.0,10.08
2,example_video_dataset/0/2.mp4,Woman In Dress Walking In Tulip Field,0,True,,1280,720,25.0,12.12
3,example_video_dataset/0/3.mp4,Film Burns Overlay,0,True,,1280,720,23.976024,24.5245
4,example_video_dataset/0/4.mp4,Portrait Leader Of The Roman Army,0,True,,1280,720,29.97003,9.2092


## Farneback optical flow

In [8]:
from DPF.filters.videos.farneback_filter import GunnarFarnebackFilter

# Farneback optical-flow filter constructed with its default arguments.
# Applying it adds the 'mean_optical_flow_farneback' column to processor.df.
datafilter = GunnarFarnebackFilter()
processor.apply_data_filter(datafilter)

100%|██████████| 5/5 [00:02<00:00,  2.44it/s]


In [9]:
# Per-video value written by the Farneback filter.
processor.df['mean_optical_flow_farneback']

0     7.428
1     5.926
2    10.874
3     0.194
4    10.816
Name: mean_optical_flow_farneback, dtype: float32

## RAFT optical flow

In [10]:
from DPF.filters.videos.raft_filter import RAFTOpticalFlowFilter

# RAFT optical-flow filter constructed with its default arguments.
# Applying it adds the 'mean_optical_flow_raft' column to processor.df.
datafilter = RAFTOpticalFlowFilter()
processor.apply_data_filter(datafilter)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|██████████| 5/5 [00:03<00:00,  1.58it/s]


In [11]:
# Per-video value written by the RAFT filter.
# NOTE(review): these values are on a visibly different scale from the
# Farneback column, so the two are presumably not directly comparable — confirm.
processor.df['mean_optical_flow_raft']

0     20.997999
1     38.886002
2     49.102001
3    169.417999
4     65.942001
Name: mean_optical_flow_raft, dtype: float32

## VideoLLaVA captioning

In [8]:
from DPF.filters.videos.video_llava_filter import VideoLLaVAFilter

# Video-LLaVA captioning filter running on the first CUDA device.
# NOTE(review): "detailed_video" is presumably the name of a predefined
# prompt rather than the prompt text itself; `workers` and `batch_size`
# presumably control data-loading parallelism and samples per model
# call — confirm against VideoLLaVAFilter.
datafilter = VideoLLaVAFilter(
    prompt="detailed_video",
    device="cuda:0",
    workers=16,
    batch_size=8
)

# Generates one caption per video and stores it in a new dataframe column.
processor.apply_data_filter(datafilter)

[2024-03-29 15:16:16,768] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


config.json: 100%|██████████| 1.09k/1.09k [00:00<00:00, 4.03MB/s]
Downloading shards: 100%|██████████| 2/2 [00:00<00:00,  5.66it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:34<00:00, 17.04s/it]
Some weights of the model checkpoint at LanguageBind/Video-LLaVA-7B were not used when initializing LlavaLlamaForCausalLM: ['model.image_tower.image_tower.encoder.layers.16.self_attn.k_proj.weight', 'model.image_tower.image_tower.encoder.layers.16.self_attn.q_proj.weight', 'model.image_tower.image_tower.encoder.layers.10.self_attn.q_proj.weight', 'model.image_tower.image_tower.encoder.layers.3.self_attn.out_proj.bias', 'model.image_tower.image_tower.encoder.layers.20.self_attn.q_proj.bias', 'model.image_tower.image_tower.encoder.layers.5.mlp.fc2.weight', 'model.video_tower.video_tower.encoder.layers.4.temporal_attn.q_proj.weight', 'model.video_tower.video_tower.encoder.layers.15.self_attn.k_proj.bias', 'model.video_tower.video_tower.encoder.layers.23.self_attn.v_proj.bias', 'model.ima

100%|██████████| 1/1 [00:15<00:00, 15.02s/it]


In [9]:
# The dataframe now includes the
# 'caption Video-LLaVA-7B prompt detailed_video' column.
processor.df

Unnamed: 0,video_path,text,split_name,is_correct,error,width,height,fps,duration,caption Video-LLaVA-7B prompt detailed_video
0,example_video_dataset/0/0.mp4,Businessman Rides In A Car,0,True,,1280,720,25.0,14.28,This video is a black and white portrait of a ...
1,example_video_dataset/0/1.mp4,Workspace In Natural Light,0,True,,1280,720,25.0,10.08,The video begins with a close-up shot of a tab...
2,example_video_dataset/0/2.mp4,Woman In Dress Walking In Tulip Field,0,True,,1280,720,25.0,12.12,The video starts with a woman walking through ...
3,example_video_dataset/0/3.mp4,Film Burns Overlay,0,True,,1280,720,23.976024,24.5245,This video features a woman with long blonde h...
4,example_video_dataset/0/4.mp4,Portrait Leader Of The Roman Army,0,True,,1280,720,29.97003,9.2092,"The video showcases a man wearing a large, orn..."


In [10]:
# Convert the caption column to a plain list so the full, untruncated text of
# every caption is shown.
processor.df['caption Video-LLaVA-7B prompt detailed_video'].tolist()

["This video is a black and white portrait of a man sitting in the back of his car, dressed in formal attire. He is wearing a suit and tie, which is typical of the formal dress code. As he sits in his seat, he looks out of frame, indicating that he is either waiting for someone or simply enjoying the view outside. His gaze is directed towards the rearview mirror, suggesting that there might be something interesting happening outside the car. Overall, the video captures a moment of stillness and contemplation, with the man'taking his gazed fixed on something outside of view.",
 'The video begins with a close-up shot of a table with various items on it, including a laptop, a mouse, some books, flowers, vases, potted plants, cups, bowls, bottles, scissors, pens, an apple, notebooks, glasses, remote controls, candles and a cell phone. A wooden chair is also present in the scene. After the shot, the camera pans around the table, showing the items from different angles. Then, it cuts to a sh

## LLaVA 1.5 captioning

In [12]:
from DPF.filters.images.llava_captioning_filter import LLaVaCaptioningFilter
from DPF.filters.videos.image_filter_adapter import ImageFilterAdapter

# LLaVaCaptioningFilter lives under DPF.filters.images, so it is wrapped in
# ImageFilterAdapter before being applied to the video dataset.
datafilter = LLaVaCaptioningFilter(
    workers=8,
    prompt='short-video',
    batch_size=1,
    device='cuda:0'
)
# NOTE(review): the positional 0.5 presumably selects which frame of each
# video is passed to the image filter (e.g. the middle frame) — confirm
# against ImageFilterAdapter's signature.
video_adapter = ImageFilterAdapter(
    datafilter,
    0.5,
    workers=8
)

# Apply the adapter (not the raw image filter) to the video dataset.
processor.apply_data_filter(video_adapter)

Describe this video very shortly in 1-2 short sentences. Describe what is happening in this video.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at openai/clip-vit-large-patch14-336 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encod

100%|██████████| 5/5 [00:05<00:00,  1.12s/it]


In [13]:
# The dataframe now includes the
# 'caption liuhaotian/llava-v1.5-13b prompt short-video' column.
processor.df

Unnamed: 0,video_path,text,split_name,is_correct,error,width,height,fps,duration,mean_optical_flow_farneback,mean_optical_flow_raft,caption liuhaotian/llava-v1.5-13b prompt short-video
0,example_video_dataset/0/0.mp4,Businessman Rides In A Car,0,True,,1280,720,25.0,14.28,7.428,20.997999,"A man in a suit is sitting in a car, looking o..."
1,example_video_dataset/0/1.mp4,Workspace In Natural Light,0,True,,1280,720,25.0,10.08,5.926,38.886002,"The image features a white desk with a laptop,..."
2,example_video_dataset/0/2.mp4,Woman In Dress Walking In Tulip Field,0,True,,1280,720,25.0,12.12,10.874,49.102001,"A woman is walking through a flower garden, ca..."
3,example_video_dataset/0/3.mp4,Film Burns Overlay,0,True,,1280,720,23.976024,24.5245,0.194,169.417999,"The video is a colorful, abstract scene with a..."
4,example_video_dataset/0/4.mp4,Portrait Leader Of The Roman Army,0,True,,1280,720,29.97003,9.2092,10.816,65.942001,A man wearing a helmet and a blue feather is s...


In [14]:
# Convert the caption column to a plain list so the full, untruncated text of
# every caption is shown.
processor.df['caption liuhaotian/llava-v1.5-13b prompt short-video'].tolist()

['A man in a suit is sitting in a car, looking out the window.',
 'The image features a white desk with a laptop, a chair, and a vase. The desk is situated in front of a window, and the laptop is open, possibly in use.',
 'A woman is walking through a flower garden, carrying a hat.',
 'The video is a colorful, abstract scene with a blurry background and a bright yellow light in the foreground.',
 'A man wearing a helmet and a blue feather is standing in front of a blue sky.']

## LITA captioning

In [4]:
from DPF.filters.videos.lita_filter import LITAFilter

# LITA captioning filter; every other constructor argument keeps its default.
# NOTE(review): the load log refers to the checkpoint as
# ./lita-vicuna-v1-3-13b-finetune, so the weights presumably have to be
# present at that local path before this cell is run — confirm.
datafilter = LITAFilter(batch_size=8)

# Generates one caption per video and stores it in a new dataframe column.
processor.apply_data_filter(datafilter)

[2024-04-12 10:33:34,250] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at ./lita-vicuna-v1-3-13b-finetune were not used when initializing LitaLlamaForCausalLM: ['model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.post_layernorm.weight', 'model.vision_tower.vision_towe

100%|██████████| 1/1 [00:22<00:00, 22.03s/it]


In [5]:
# Names of the columns the LITA filter adds to the dataframe.
datafilter.result_columns

['caption lita-vicuna-v1-3-13b-finetune prompt detailed_video']

In [6]:
# Convert the caption column to a plain list so the full, untruncated text of
# every caption is shown.
processor.df['caption lita-vicuna-v1-3-13b-finetune prompt detailed_video'].tolist()

["The image features a man wearing a suit and tie, sitting in a car and looking forward. The scene has a noir atmosphere, with the man's suit and tie giving him a classic, sophisticated appearance. The car appears to be a vintage model, adding to the nostalgic and mysterious ambiance of the scene. The man's focused gaze and the overall composition of the image evoke a sense of anticipation and intrigue.",
 'The video is a white screen with a projected image of a desk with a laptop on it. The desk is made of wood and has a laptop sitting on top of it. There are also two books on the desk, one positioned to the left and the other to the right. \n\nIn the background, there is a window with light coming in, casting a shadow of the desk onto the wall. The overall style of the video is minimalistic and focuses on the desk setup with the laptop and books.',
 'The video is a short, romantic scene featuring a woman walking through a beautiful field of flowers. She is wearing a white dress and c