In [1]:
import av
import numpy as np
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Pretrained components for video captioning: frame preprocessor, text tokenizer,
# and the encoder-decoder model (moved to the selected device).
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
).to(device)

  return self.fget.__get__(instance, owner)()


In [2]:
from scenedetect import detect, AdaptiveDetector, split_video_ffmpeg
# Find scene boundaries with the adaptive detector, then have ffmpeg cut the
# source video into one clip per detected scene.
source_video = 'videos/1/vid2.mp4'
scene_list = detect(source_video, AdaptiveDetector())
split_video_ffmpeg(source_video, scene_list)

0

In [None]:
import os
from moviepy.editor import VideoFileClip

# Directory containing the per-scene .mp4 files
input_directory = "scenes/vid1"

# Output directory for the audio tracks extracted from those scenes
output_directory = "scenes/audio1"

# Ensure the output directory exists, if not create it
os.makedirs(output_directory, exist_ok=True)

# Write one "<scene>_audio.mp3" per scene clip
for filename in os.listdir(input_directory):
    if not filename.endswith(".mp4"):
        continue

    input_file = os.path.join(input_directory, filename)
    # Swap only the extension; str.replace would also rewrite ".mp4" elsewhere in the name.
    scene_stem = os.path.splitext(filename)[0]
    output_audio_file = os.path.join(output_directory, scene_stem + "_audio.mp3")

    video = VideoFileClip(input_file)
    try:
        audio = video.audio
        if audio is None:
            # A clip without an audio track has no `audio` object to export.
            # NOTE(review): later cells pair scenes, transcripts and audio tags by list
            # position, so a skipped scene shifts those indices — confirm how to handle.
            print(f"Warning: {filename} has no audio track; no mp3 written.")
            continue
        audio.write_audiofile(output_audio_file)  # Save the audio file
    finally:
        # Release the reader even if the export fails
        video.close()

In [4]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import soundfile as sf


# Use the first CUDA device with half precision when available; otherwise CPU with float32.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

# Load the speech-to-text checkpoint in the chosen precision and move it to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and feature extractor the pipeline needs.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline configured for 30-second chunks, batches of 16,
# and timestamped output.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Timestamps do not always work, so they were removed for generalizability.

In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip

transcript_data = []

# Collect the extracted .mp3 files in lexicographic filename order
files = [f for f in os.listdir(output_directory) if f.endswith(".mp3")]
files.sort()

# Total duration (seconds) of the audio files processed so far
running_len = 0

# Transcribe each audio file; transcript_data gets one list of lines per file
for filename in files:
    curr_transcript = []
    file_path = os.path.join(output_directory, filename)

    # The clip is opened only to read its duration, so close it right away
    # instead of leaving one open reader per file.
    audio__ = AudioFileClip(file_path)
    try:
        audio_duration = audio__.duration
    finally:
        audio__.close()
    print(audio_duration)

    result = pipe(file_path, return_timestamps=True, generate_kwargs={"language": "english"})
    print(result["chunks"])

    # Only the text of each chunk is kept; the chunk timestamps are not used.
    for chunk in result["chunks"]:
        chunk_text = chunk['text']
        curr_transcript.append(f"Transcribed Speech: {chunk_text}")
        print(f"Transcribed Speech: {chunk_text}")

    transcript_data.append(curr_transcript)
    running_len += audio_duration


In [21]:
# Display the collected transcripts: one inner list per processed audio file.
transcript_data

[['Transcribed Speech:  🎵'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  Thank you.'],
 ['Transcribed Speech:  Thank you.'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  Thank you.'],
 ['Transcribed Speech:  Thank you.'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  Thank you.'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  Thank you.'],
 ['Transcribed Speech:  piano plays softly'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  you'],
 ['Transcribed Speech:  Bye-bye, bye-bye', 'Transcribed Speech:  Knock'],
 ["Transcribed Speech:  Can't change."],
 ['Transcribed Speech:  Bye.'],
 ['Transcribed Speech:  Even if I try'],
 ['Transcribed Speech:  Even if'],
 ['Transcribed Speech:  If I wanted to'],
 ["Transcribed Speech:  And I can't"],
 ['Transcribed Speech:  change'],
 ['Transcribed Spe

In [7]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from datasets import load_dataset
import torch

# Both the feature extractor and the audio classifier come from the same checkpoint.
ast_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor2 = AutoFeatureExtractor.from_pretrained(ast_checkpoint)
model2 = ASTForAudioClassification.from_pretrained(ast_checkpoint)

In [8]:
import librosa

# Gather the extracted .mp3 tracks in lexicographic filename order
files = sorted(name for name in os.listdir(output_directory) if name.endswith(".mp3"))

audio_tags = []

# Tag every track with the labels of its five highest-scoring classes
for filename in files:
    file_path = os.path.join(output_directory, filename)

    # Load as mono, resampled to 16 kHz
    audio1, sr = librosa.load(file_path, sr=16000, mono=True)
    inputs1 = feature_extractor2(audio1, sampling_rate=sr, return_tensors="pt")

    # Inference only, so no gradients are tracked
    with torch.no_grad():
        logits = model2(**inputs1).logits

    # Class ids of the 5 largest logits for the first (only) item in the batch
    top5 = torch.topk(logits, 5, dim=-1)
    predicted_class_ids = top5.indices[0].tolist()
    predicted_label = [model2.config.id2label.get(class_id) for class_id in predicted_class_ids]
    audio_tags.append(predicted_label)
    print(predicted_label)

['Ambient music', 'Music', 'New-age music', 'Singing bowl', 'Gong']
['Music', 'Ambient music', 'New-age music', 'Musical instrument', 'Piano']
['Music', 'Ambient music', 'Musical instrument', 'Plucked string instrument', 'New-age music']
['Music', 'Vehicle', 'Musical instrument', 'Piano', 'Keyboard (musical)']
['Music', 'Speech', 'Ambient music', 'Singing bowl', 'Television']
['Music', 'Television', 'Scary music', 'Ambient music', 'Foghorn']
['Music', 'Piano', 'Keyboard (musical)', 'Musical instrument', 'Electric piano']
['Music', 'Piano', 'Musical instrument', 'Electric piano', 'Keyboard (musical)']
['Music', 'Piano', 'Keyboard (musical)', 'Electric piano', 'Musical instrument']
['Music', 'Musical instrument', 'Piano', 'Keyboard (musical)', 'Electric piano']
['Music', 'Speech', 'Piano', 'Musical instrument', 'Keyboard (musical)']
['Music', 'Piano', 'Musical instrument', 'Keyboard (musical)', 'Electric piano']
['Music', 'Piano', 'Musical instrument', 'Keyboard (musical)', 'Electric pia

In [9]:
# Display the collected audio tags: one list of five labels per processed audio file.
audio_tags

[['Ambient music', 'Music', 'New-age music', 'Singing bowl', 'Gong'],
 ['Music', 'Ambient music', 'New-age music', 'Musical instrument', 'Piano'],
 ['Music',
  'Ambient music',
  'Musical instrument',
  'Plucked string instrument',
  'New-age music'],
 ['Music', 'Vehicle', 'Musical instrument', 'Piano', 'Keyboard (musical)'],
 ['Music', 'Speech', 'Ambient music', 'Singing bowl', 'Television'],
 ['Music', 'Television', 'Scary music', 'Ambient music', 'Foghorn'],
 ['Music',
  'Piano',
  'Keyboard (musical)',
  'Musical instrument',
  'Electric piano'],
 ['Music',
  'Piano',
  'Musical instrument',
  'Electric piano',
  'Keyboard (musical)'],
 ['Music',
  'Piano',
  'Keyboard (musical)',
  'Electric piano',
  'Musical instrument'],
 ['Music',
  'Musical instrument',
  'Piano',
  'Keyboard (musical)',
  'Electric piano'],
 ['Music', 'Speech', 'Piano', 'Musical instrument', 'Keyboard (musical)'],
 ['Music',
  'Piano',
  'Musical instrument',
  'Keyboard (musical)',
  'Electric piano'],
 ['M

In [None]:
import base64
import cv2

def extract_frames(video_path, frame_interval):
    """Sample frames from a video and return them as base64-encoded JPEG strings.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        frame_interval: Keep every frame whose 0-based index is a multiple of this
            value, so the first frame is always kept. Must be non-zero.

    Returns:
        A list with one base64 string per sampled frame, in reading order. The list
        is empty (and an error is printed) when the video cannot be opened, so
        callers can always take ``len()`` of the result.
    """
    cap = cv2.VideoCapture(video_path)
    base64Frames = []

    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return base64Frames

        current_frame = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # No more frames or error

            # Only every `frame_interval`-th frame is encoded and kept
            if current_frame % frame_interval == 0:
                encoded_ok, buffer = cv2.imencode(".jpg", frame)
                # Skip frames that fail to encode rather than base64-encoding a bad buffer
                if encoded_ok:
                    base64Frames.append(base64.b64encode(buffer).decode("utf-8"))

            current_frame += 1
    finally:
        # Release the capture handle on every exit path, including exceptions
        cap.release()

    return base64Frames

# Collect the per-scene .mp4 clips from the same directory they are read from below.
# Lexicographic order; this matches scene order only if the scene numbers in the
# filenames are zero-padded — TODO confirm.
vid2_scene_files = sorted(f for f in os.listdir(input_directory) if f.endswith(".mp4"))

# Keep every `rate`-th frame of each scene
rate = 200

# One list of base64-encoded frames per scene, in the order of vid2_scene_files
all_scenes_frames = []

for filename in vid2_scene_files:
    input_file = os.path.join(input_directory, filename)
    # Fall back to an empty list if extract_frames yields nothing usable (e.g. None for
    # an unreadable clip) so the per-scene list stays aligned with the file list.
    base64Frames = extract_frames(input_file, rate) or []
    print(len(base64Frames), "frames read.")
    all_scenes_frames.append(base64Frames)


In [22]:
# System prompt used as the first message of the chat request. It describes the
# per-scene inputs (sampled frames, transcribed speech, audio classification labels)
# and the response template the model is asked to follow.
system_message = """
Your task is to identify metaphors in a video. I will give you frames. 

The procedure I followed in generating these frames is as follows: I first splitted the video into distinct scenes. 
Then sampled frames from each of the scene videos. 

This is not all. For each scene, in addition to sequence of frames, you will receive list of transcribed speech, which may include song lyrics,
and a list of audio classifications. Audio classification will include 5 most likely classifications for the audio in
that scene, with the first element being the most likely. 

Transcribed speech is a list, because there may be multiple speeches in a scene.
Since the most likely audio label for the audio in this scene is music, you will process the 'Transcribed Speech' accordingly.

I will send groups of frames, each of which represents a distinct scene, and two additional audio descriptions.
For each scene, I want you to consider what's going on, 

Here is the template:

Scene 1:
    Description of the scene.
    Metaphors, if any.

I ask you to do something similar for each scene, while putting each of these into their respective context in the sequence. 
Then, I want you to give me 3 most likely metaphors that are used to convey a message in the whole video.
"""

'\n    1. Objects, if any.\n    2. Subjects, if any.\n    3. Actions, if any.\nThen, using what you found in each frame, answer the following:"\n\n    1. Contextualize each frame according to their location in temporality of the scene. Consider\n    any transformations, actions, or interactions happening between or being performed by any\n    of the objects, subjects, and actions in the video, and generate 5 most likely metaphors that\n    are used in this scene.\n    2. Restructure each of the 5 metaphors using the format: a is a metaphor for b. You will fill in\n    a and b using most likely concepts. Note that b is not necessarily a metaphor for a.\n\nAfterwards, answer the following:\n    1. Contextualize each scene according to their location in temporality of the video. Describe\n    any transformations, actions, or interactions happening between or being performed by any\n    of the objects, subjects, and actions in the video.\n    2. What are the 5 most likely metaphors for the

In [37]:
# Positions (into the per-scene lists) of the scenes that are sent to the model
vid1_scenes_list = [8, 10, 11, 13, 14, 15, 16, 17, 19, 22, 24, 27, 31, 32, 33, 34, 36, 38]

# The conversation opens with the system instructions; one user message per
# selected scene is appended after it.
PROMPT_MESSAGES = [{"role": "system", "content": system_message}]

for scene_idx in vid1_scenes_list:
    # Each sampled frame is passed as a base64 image with a resize hint
    scene_frames = [{"image": frame, "resize": 600} for frame in all_scenes_frames[scene_idx]]
    PROMPT_MESSAGES.append({
        "role": "user",
        "content": [
            f"Image/Frame Information for scene {scene_idx}",
            *scene_frames,
            "Transcribed Speeches",
            str(transcript_data[scene_idx]),
            "Audio Classification Labels",
            # NOTE(review): only the first 3 labels are sent, while system_message
            # says 5 classifications are provided — confirm which is intended.
            str(audio_tags[scene_idx][:3]),
        ],
    })

In [26]:
# Inspect the content of the first user message (index 0 is the system message).
(PROMPT_MESSAGES[1]['content'])

['Image/Frame Information for scene 3',
 {'image': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCALQBQADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD4rqGTtU1Qydq5z6AveGP+Qsn0NfcXgz/km0X1/rXw94Y/5CyV9weDP+SbRf5715mJ3A2fg9qMMKo0jHmdsce9fQnhC7iuNKTy25Svmn4TtmSFS3/LZ/6V

In [39]:
from openai import OpenAI


# Request payload: target model, the assembled conversation, and a cap on the reply length
params = {
    "model": "gpt-4o",
    "messages": PROMPT_MESSAGES,
    "max_tokens": 4000,
}

# Read the API key from the environment so no credential is stored in the notebook.
# Set OPENAI_API_KEY before running this cell.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Send the conversation and print the text of the first returned choice
result = client.chat.completions.create(**params)
print(result.choices[0].message.content)

Considering what's going on in each scene, here are the metaphorical interpretations:

Scene 8:
    Description: An holographic image of two individuals hugging and kissing, viewed by a group of people.
    Metaphors: "Skeletons Hugging and Kissing" (love transcends physical appearances).

Scene 10:
    Description: A crowd watching intently towards the performance or screen.
    Metaphors: "Unified Interest and Curiosity" (collective engagement and shared experiences).

Scene 11:
    Description: Close-up of the holographic skeletons kissing.
    Metaphors: "Bare Bones of Love" (love is universal beyond surface-level appearances).

Scene 13:
    Description: One half of the holographic skeleton, visible on screen, with a crowd watching.
    Metaphors: "Half-visible" (only part of someone's identity is visible, representing hidden depths or unseen attributes).

Scene 14:
    Description: Side view of the holographic skeleton, with someone peeking out.
    Metaphors: "Hidden Identity Em