<a href="https://colab.research.google.com/github/aLehav/MLVideoDescriptionResearch/blob/main/blip2_for_vidCaption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Naive extraction of frames from a video at a fixed rate (`SAVING_FRAMES_PER_SECOND`, set to 6 frames per second below) using OpenCV. Later on, we can look into different methods of extracting the change-points.



In [1]:
from datetime import timedelta
import cv2
import numpy as np
import os
import sys

# Mount Google Drive and work from the project folder there, so that files
# referenced or written with relative paths (the video, the extracted frames,
# the captions CSV) live on Drive.
from google.colab import drive
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/Colab Notebooks/vidCaptions")
# Also put the project folder on the module search path.
sys.path.append("/content/drive/My Drive/Colab Notebooks/vidCaptions")

# Target number of frames to save per second of video.
# main() caps this at the video's own FPS.
SAVING_FRAMES_PER_SECOND = 6

Mounted at /content/drive


Helper functions

In [2]:
def format_timedelta(td):
    """Format a timedelta as a filename-safe string with two decimal places.

    The default ``str(td)`` form (e.g. ``0:00:20.050000``) is rounded to
    hundredths of a second and every ``:`` is replaced by ``-``, giving
    e.g. ``0-00-20.05``.

    Args:
        td: ``datetime.timedelta`` to format.

    Returns:
        str: the formatted duration; ``.00`` is appended when ``td`` has no
        fractional part.
    """
    result = str(td)
    try:
        result, ms = result.split(".")
    except ValueError:
        # str(td) has no "." when the microseconds component is zero
        return (result + ".00").replace(":", "-")
    # the fractional part of str(td) is in microseconds; scale to hundredths
    hundredths = round(int(ms) / 1e4)
    if hundredths == 100:
        # A fraction that rounds up to a full second must carry into the
        # seconds field; otherwise the result would read ".100".
        whole = td - timedelta(microseconds=td.microseconds) + timedelta(seconds=1)
        result = str(whole)
        hundredths = 0
    return f"{result}.{hundredths:02}".replace(":", "-")


def get_saving_frames_durations(cap, saving_fps):
    """Return the list of timestamps (in seconds) at which frames are to be saved.

    The clip length is the capture's frame count divided by its frames per
    second; timestamps start at 0 and advance by ``1 / saving_fps`` while they
    stay below that length.
    """
    frame_total = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    native_fps = cap.get(cv2.CAP_PROP_FPS)
    clip_seconds = frame_total / native_fps
    # np.arange supports the fractional step that range() does not
    return list(np.arange(0, clip_seconds, 1 / saving_fps))

Main function to extract the frames from video.

In [3]:
def main(video_file):
    """Extract frames from ``video_file`` and save them as JPEG images.

    Frames are written to a folder named after the video with its extension
    replaced by ``-opencv``. At most ``SAVING_FRAMES_PER_SECOND`` frames are
    kept per second of video (capped at the video's own FPS); each file is
    named ``frame<timestamp>.jpg`` using ``format_timedelta``.

    Args:
        video_file: path to the video to read.

    Raises:
        IOError: if the video cannot be opened.
        ValueError: if the video does not report a positive FPS.
    """
    filename, _ = os.path.splitext(video_file)
    filename += "-opencv"
    # read the video file
    cap = cv2.VideoCapture(video_file)
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video file: {video_file!r}")
        # get the FPS of the video
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # without this check the timestamp math below would divide by zero
            raise ValueError(f"Video reports an invalid FPS ({fps}): {video_file!r}")
        # make a folder by the name of the video file
        os.makedirs(filename, exist_ok=True)
        # if the SAVING_FRAMES_PER_SECOND is above video FPS, then set it to FPS (as maximum)
        saving_frames_per_second = min(fps, SAVING_FRAMES_PER_SECOND)
        # get the list of duration spots to save
        saving_frames_durations = get_saving_frames_durations(cap, saving_frames_per_second)
        # index of the next duration spot still waiting for a frame
        next_spot = 0
        count = 0
        # stop as soon as every duration spot has been saved
        while next_spot < len(saving_frames_durations):
            is_read, frame = cap.read()
            if not is_read:
                # break out of the loop if there are no frames to read
                break
            # get the duration by dividing the frame count by the FPS
            frame_duration = count / fps
            if frame_duration >= saving_frames_durations[next_spot]:
                # this frame is the first one at or past the pending spot: save it
                frame_duration_formatted = format_timedelta(timedelta(seconds=frame_duration))
                cv2.imwrite(os.path.join(filename, f"frame{frame_duration_formatted}.jpg"), frame)
                # move on to the next duration spot, since this one is already saved
                next_spot += 1
            # increment the frame count
            count += 1
    finally:
        # always free the capture handle, even when an error occurs
        cap.release()

Run the functions above to extract frames from the video.

In [4]:
# Path of the video to process. It is relative, so it is resolved against the
# working directory set in the setup cell. `location` is reused by the
# captioning cell further down to find the extracted-frames folder.
location = "byxOvuiIJV0.mp4" # put location of video path here
video_file = location
# Extract the frames into the "<video name>-opencv" folder.
main(video_file)

#### Large RAM is required to load the larger models. Running on GPU can optimize inference speed.

In [5]:
import sys
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    !pip3 install salesforce-lavis

Running in Colab.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting salesforce-lavis
  Downloading salesforce_lavis-1.0.0-py3-none-any.whl (495 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.8/495.8 KB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting contexttimer
  Downloading contexttimer-0.3.3.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting pre-commit
  Downloading pre_commit-3.1.1-py2.py3-none-any.whl (202 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m202.3/202.3 KB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops>=0.4.1
  Dow

In [6]:
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess

#### Helper to load an image from a URL

In [7]:
def loadImg(imgPath, timeout=30):
  """Download an image from a URL and return it as an RGB PIL image.

  Args:
    imgPath: URL of the image to fetch (passed to ``requests.get``).
    timeout: seconds to wait for the server before giving up, so an
      unresponsive host cannot hang the notebook indefinitely.

  Returns:
    The downloaded image converted to RGB mode.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    requests.Timeout: if the server does not respond within ``timeout``.
  """
  # the context manager closes the connection once the image has been decoded
  with requests.get(imgPath, stream=True, timeout=timeout) as response:
    # fail on 4xx/5xx here instead of trying to decode an error page as an image
    response.raise_for_status()
    # convert() reads the full image data, so the result is safe to use
    # after the response is closed
    return Image.open(response.raw).convert('RGB')

In [8]:
# setup device to use: the GPU when CUDA is available, otherwise the CPU.
# Always build a torch.device so `device` has the same type in both cases.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Load pretrained/finetuned BLIP captioning model (BLIP2 variants are listed below, commented out)

In [9]:
# Load the captioning model together with its image preprocessors, which makes
# inference easier. The active configuration is "blip_caption" / "base_coco",
# loaded in eval mode onto `device`. The third return value is not used here.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="base_coco", is_eval=True, device=device
)

# Other available models (BLIP2 variants). To use one, replace the call above
# with one of these:
#
# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_opt", model_type="pretrain_opt2.7b", is_eval=True, device=device
# )
# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_opt", model_type="pretrain_opt6.7b", is_eval=True, device=device
# )
# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_opt", model_type="caption_coco_opt2.7b", is_eval=True, device=device
# )
# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_opt", model_type="caption_coco_opt6.7b", is_eval=True, device=device
# )
#
# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_t5", model_type="pretrain_flant5xl", is_eval=True, device=device
# )
#
# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_t5", model_type="caption_coco_flant5xl", is_eval=True, device=device
# )

# Show which preprocessors were returned; the captioning loop uses the "eval" one.
vis_processors.keys()

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

  0%|          | 0.00/2.50G [00:00<?, ?B/s]

dict_keys(['train', 'eval'])

Generate text description with the model and save it to a list

In [10]:
# Folder written by main(): the video path with its extension replaced by
# "-opencv". splitext mirrors how main() builds the name, so this also works
# for videos that are not ".mp4".
frames_dir = os.path.splitext(location)[0] + "-opencv"
fields = ["Title", "Cap1", "Cap2", "Cap3"]
rows = []
# os.listdir returns entries in arbitrary order; sort by file name so the rows
# follow the timestamped frame names.
for rawImg in sorted(os.listdir(frames_dir)):
  if not rawImg.lower().endswith(".jpg"):
    # skip anything that is not one of the extracted JPEG frames
    continue
  # open inside a context manager so the file handle is closed after decoding
  with Image.open(os.path.join(frames_dir, rawImg)) as frame_file:
    procRaw = frame_file.convert("RGB")
  # preprocess with the "eval" transform and add a leading batch dimension
  image = vis_processors["eval"](procRaw).unsqueeze(0).to(device)
  # one row per frame: file name followed by the generated captions
  # (nucleus sampling is stochastic, so captions differ between runs)
  rows.append([rawImg] + model.generate({"image": image}, use_nucleus_sampling=True, num_captions=3))

Sort the rows with respect to the frames

In [11]:
# sorted() only returns a new list, which left `rows` (and therefore the CSV
# saved below) unsorted. Sort in place by frame file name, then display.
rows.sort(key=lambda x: x[0])
rows

[['frame0-00-00.00.jpg',
  'two people in a dark place taking a photo with their phones',
  'the man is wearing a black jacket and carrying an orange frisbee',
  'a couple of giraffes standing on top of a grass covered field'],
 ['frame0-00-00.17.jpg',
  'a clock and the sky at the time of 9 29',
  'the dark is not so dark as a few clouds',
  'a group of three giraffes standing on a field'],
 ['frame0-00-00.33.jpg',
  'a couple of bananas sitting next to each other',
  'a man wearing a face mask riding a surf board',
  'a picture of a person walking by a sign on the side'],
 ['frame0-00-00.50.jpg',
  'a man and two children in the dark watching a movie on a large screen',
  'a man with glasses sits in front of the dark background',
  'the silhouette of a large elephant with dark background'],
 ['frame0-00-00.67.jpg',
  'a white plate that has bread on it',
  'a person taking a selfie in the dark with a camera',
  'an orange clock sitting on the side of a white clock tower'],
 ['frame0-

Save into a CSV file next to the frames folder (`<frames folder>_captions.csv`)

In [14]:
import pandas as pd

# One row per frame: file name plus its captions. Every column holds strings,
# so no dtype is forced: asking for float cannot succeed on this data and only
# worked through a deprecated fallback that newer pandas versions reject.
df = pd.DataFrame(rows, columns=fields)
# Written next to the frames folder as "<frames_dir>_captions.csv".
df.to_csv(frames_dir + "_captions.csv")

# Alternative using the standard-library csv module (kept for reference):

# import csv

# device = torch.device("cpu")

# filename = frames_dir + "_captions.csv"

# with open(filename, 'w') as csvfile:
#   csvwriter = csv.writer(csvfile)
#   csvwriter.writerow(fields)
#   csvwriter.writerows(rows)

  exec(code_obj, self.user_global_ns, self.user_ns)
