In [2]:
from transformers import pipeline
import os
import pathlib
import cv2
import time
import argparse
import requests
from transformers import pipeline, WhisperTokenizer

### VIDEO BLIP

In [4]:
# Image-captioning pipeline used to describe each sampled frame.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Sampled frames are written here. cv2.imwrite does not create missing
# directories (it just returns False), so make sure the folder exists first.
frames_dir = "video_frames"
os.makedirs(frames_dir, exist_ok=True)

video = cv2.VideoCapture("tony.mp4")
if not video.isOpened():
    raise IOError("Could not open video file: tony.mp4")

# Set the desired frequency (in seconds)
save_frequency = 1

try:
    # Number of frames between two saved images. Clamped to at least 1 so a
    # zero FPS report or a very small save_frequency cannot trigger a modulo
    # by zero below.
    frame_step = max(1, int(save_frequency * video.get(cv2.CAP_PROP_FPS)))

    # Loop through all the frames
    is_frame_available, frame = video.read()
    frame_count = 0

    while is_frame_available:
        # Save every frame_step-th frame with its timestamp in the filename
        if frame_count % frame_step == 0:
            # Current position of the capture, converted from milliseconds to seconds
            timestamp = video.get(cv2.CAP_PROP_POS_MSEC) / 1000
            frame_file = f'{frames_dir}/frame$${timestamp:.1f}$$.jpg'
            if not cv2.imwrite(frame_file, frame):
                raise IOError(f"Could not write frame image: {frame_file}")
            print(f"Video processing: {timestamp:.1f}")

        # Read the next frame. No artificial delay is needed: this is offline
        # processing, not real-time playback.
        is_frame_available, frame = video.read()
        frame_count += 1
finally:
    # Release the video file even if reading or writing failed
    video.release()

# Only consider files that follow the frame naming scheme, so stray entries
# (e.g. .DS_Store) cannot break the timestamp parsing below.
# NOTE: frames left in the folder by an earlier run are still picked up here;
# clear the folder when switching to a different video or sampling rate.
image_path = [
    name for name in os.listdir(frames_dir)
    if name.startswith("frame$$") and name.endswith("$$.jpg")
]
# Order frames by the timestamp embedded between the "$$" markers
sorted_images = sorted(image_path, key=lambda x: float(x.split('$$')[1]))

descriptions = []
for image in sorted_images:
    frame_file = f"{frames_dir}/{image}"
    print("Processing by BLIP: " + frame_file)
    caption = pipe(frame_file)[0].get("generated_text")
    descriptions.append("time code " + image.split("$$")[1] + ": " + caption)


# The caption of the last sampled frame is dropped.
# NOTE(review): presumably because the final frame is unreliable — confirm this is intended.
descriptions = descriptions[:-1]
joined_descriptions = ".\n".join(descriptions)
prompt_descriptions = ". ".join(descriptions)

print("""
FRAME-BY-FRAME DESCRIPTION:\n""", joined_descriptions)

Video processing: 0.0
Video processing: 1.0
Video processing: 2.0
Video processing: 3.0
Video processing: 4.0
Video processing: 5.0
Video processing: 6.0
Video processing: 7.0
Video processing: 8.0
Processing by BLIP: video_frames/frame$$0.0$$.jpg
Processing by BLIP: video_frames/frame$$1.0$$.jpg
Processing by BLIP: video_frames/frame$$2.0$$.jpg
Processing by BLIP: video_frames/frame$$3.0$$.jpg
Processing by BLIP: video_frames/frame$$4.0$$.jpg
Processing by BLIP: video_frames/frame$$5.0$$.jpg
Processing by BLIP: video_frames/frame$$6.0$$.jpg
Processing by BLIP: video_frames/frame$$7.0$$.jpg
Processing by BLIP: video_frames/frame$$8.0$$.jpg

FRAME-BY-FRAME DESCRIPTION:
 time code 0.0: a man in a suit and tie holding a piece of paper.
time code 1.0: a man in a suit and tie holding a piece of paper.
time code 2.0: a man in a suit and tie holding a piece of paper.
time code 3.0: a man in a suit and tie standing in front of a screen.
time code 4.0: a man in a suit and tie is looking at some

### WHISPER

In [5]:
# Speech-recognition pipeline: the AlanRobotics/whisper-tiny-ru model combined
# with the tokenizer loaded from the openai/whisper-tiny checkpoint.
whisper_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
pipe = pipeline(
    "automatic-speech-recognition",
    model="AlanRobotics/whisper-tiny-ru",
    tokenizer=whisper_tokenizer,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### EXTRACT AUDIO

In [6]:
from moviepy.editor import VideoFileClip

# Load the video file. The path is relative, matching the "tony.mp4" used for
# frame extraction, so the notebook does not depend on one user's home directory.
video = VideoFileClip("tony.mp4")

try:
    # Extract the audio; the attribute is None when the clip has no audio track
    audio = video.audio
    if audio is None:
        raise ValueError("tony.mp4 has no audio track to extract")

    # Save the audio file
    audio.write_audiofile('tony.wav')
finally:
    # Close the clip so its underlying reader resources are released
    video.close()

MoviePy - Writing audio in tony.wav


                                                        

MoviePy - Done.




In [7]:
# Load the extracted soundtrack as raw bytes and hand it to the
# speech-recognition pipeline.
audio = pathlib.Path("tony.wav").read_bytes()

# Keep only the "text" entry of the pipeline result.
recognition_result = pipe(audio)
transcription = recognition_result.get("text")
transcription

'Я железный человек.'

### VICUNA

In [8]:
def get_response(prompt, url="http://10.207.0.31:5005/get_answer", timeout=(10, 300)):
    """Send a prompt to the LLM HTTP endpoint and return its answer text.

    Args:
        prompt: Prompt to send; it is converted to ``str`` before sending.
        url: Endpoint that receives the request. Defaults to the address
            used throughout this notebook.
        timeout: Timeout passed to ``requests.get`` — either a single number
            or a ``(connect, read)`` tuple, in seconds. Without it an
            unreachable server would block the cell forever.

    Returns:
        The ``"result"`` field of the JSON reply with its first two
        characters removed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer in time.
    """
    res = requests.get(url=url, json={"prompt": str(prompt)}, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError later on.
    res.raise_for_status()
    # The first two characters of the result are skipped.
    # NOTE(review): presumably leading whitespace emitted by the server — confirm.
    answer = res.json()["result"][2:]
    return answer

In [17]:
# Instruction part of the prompt; the speech transcription is embedded in it.
prompt = f"This is a text description of consecutive frames with timecodes from the video. The video shows only one person. Based on these descriptions, try to briefly tell what is happening on the video? This video also have audio from person on video: {transcription}. Что происходит на видео? Ответь на русском"
# Put the frame captions on their own line. Plain concatenation fused the last
# word of the instruction with the first caption ("...русскомtime code 0.0: ...").
full_prompt = f"{prompt}\n{prompt_descriptions}"
res = get_response(prompt=full_prompt)
print("""
MODEL RESPONSE: """, res)


MODEL RESPONSE:  На видео показан один человек, надетый в костюм и галстук. В первых трех кадрах он держит лист бумаги. На четвертом кадре он стоят перед экраном. На пятом и шестом кадрах он смотрит на что-то. На седьмом кадре он смотрит в камеру. На протяжении всего видео он произносит фразу "Я железный человек".
