In [1]:
import os
from tqdm import tqdm
from typing import List, cast
import numpy as np
import pprint

In [2]:
import torch
import speech_recognition as sr

from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from PIL import Image

In [3]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, PaliGemmaProcessor

In [4]:
from colpali_engine.models import ColPali, ColPaliProcessor
from colpali_engine.utils.torch_utils import ListDataset, get_torch_device

In [5]:
from scenedetect import open_video,SceneManager,StatsManager
from scenedetect.detectors import ContentDetector

In [6]:
import openai
from llama_index.core import SimpleDirectoryReader

In [7]:
api_key = 'AIzaSyB497zdXQLJBOI7wUj9g7mjhh7gOpa_UBU'

# 1. Video Precessing

In [7]:
video_path = './video/input_vid.mp4'
output_folder = "./img/"
output_audio_path = "./img/output_audio.wav"

In [8]:
def video_to_images(video_path, output_folder):
    output = output_folder
    video = open_video(video_path)

    scene_manager = SceneManager(stats_manager=StatsManager())
    scene_manager.add_detector(ContentDetector())
    scene_manager.detect_scenes(video)

    scene_list = scene_manager.get_scene_list()
    for index, scene in enumerate(scene_list):
        padded_index = f'{index:03}'
        save_images(scene_list=[scene], 
                    video=video,
                    image_extension='png',
                    image_name_template=f'$VIDEO_NAME-Scene-{padded_index}',
                    output_dir=output,
                    num_images=1)

# 2. Vector Store

## 2. 1 Load Embedding Model

In [9]:
Embedding_model_name = "vidore/colpali-v1.2"

Embedding_model = ColPali.from_pretrained(
    Embedding_model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",  # or "mps" if on Apple Silicon
).eval()

processor = ColPaliProcessor.from_pretrained(Embedding_model_name)



`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 2. 2 Video Embedding

In [10]:
# フォルダ内のPNGファイルをファイル名順に取得
images = []
png_files = sorted([filename for filename in os.listdir(output_folder) if filename.endswith('.png')])

# 画像を開いてリストに追加
for filename in png_files:
    image_path = os.path.join(output_folder, filename)
    images.append(Image.open(image_path))

# Run inference - docs
dataloader = DataLoader(
    dataset=ListDataset[str](images),
    batch_size=4,
    shuffle=False,
    collate_fn=lambda x: processor.process_images(x),
)
ds: List[torch.Tensor] = []
for batch_doc in tqdm(dataloader):
    with torch.no_grad():
        batch_doc = {k: v.to(Embedding_model.device) for k, v in batch_doc.items()}
        embeddings_doc = Embedding_model(**batch_doc)
    ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))

  0%|                                                                                                     | 0/6 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.79it/s]


## 2. 3 Query Embedding

In [12]:
queries = [
    "How many astronauts in this spaceship?",
    "What are the names of the astronauts?",
    "What is the theme of this video?",
    "What kind of video is this?",
    "What activities are the astronauts performing?",
    "What year was this mission conducted?",
    "What are the key challenges faced by astronauts in space?",
    "What equipment is used by astronauts in the video?",
    "How does this mission contribute to space exploration?",
    "What is the target audience for this video?"
]
# Run inference - queries
dataloader = DataLoader(
    dataset=ListDataset[str](queries),
    batch_size=4,
    shuffle=False,
    collate_fn=lambda x: processor.process_queries(x),
)

qs: List[torch.Tensor] = []
for batch_query in dataloader:
    with torch.no_grad():
        batch_query = {k: v.to(Embedding_model.device) for k, v in batch_query.items()}
        embeddings_query = Embedding_model(**batch_query)
    qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))

# 3. Retrieve

In [13]:
# Run scoring
scores = processor.score(qs, ds).cpu().numpy()
idx_top_1 = scores.argsort(axis=1)[:, -5:][:, ::-1]
print("Indices of the top-3 retrieved documents for each query:\n" , idx_top_1)

Indices of the top-3 retrieved documents for each query:
 [[20 12 11  1 13]
 [11 12  1 20  6]
 [18 17 21  4  0]
 [21 17  4 18 15]
 [12 20 11  5  8]
 [ 1  7 13 11 12]
 [20 12 11  6  1]
 [20 12 10  5  6]
 [22 12  7 10 13]
 [17 18  4 21 10]]


# 4. Answer Generation (


## 4. 1 Gemini

In [13]:
import google.generativeai as genai


genai.configure(api_key=api_key)
model_gemini = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [14]:
image_test = Image.open('./img/input_vid-Scene-011.png')
response = model_gemini.generate_content([images[22], queries[0]])
print(response.text)

There are no astronauts in this image. The image shows the NASA logo.


## 4. 2 PaliGemma

In [12]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
model_id = "google/paligemma-3b-ft-nlvr2-448"
model_paligemma = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
processor_paligemma = PaliGemmaProcessor.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
def answer_query(query_number):
    inputs = processor_paligemma(images=[np.array(images[i]) for i in idx_top_1[query_number]], text=[queries[query_number]]*3, return_tensors="pt")
    output = model_paligemma.generate(**inputs, max_new_tokens=20)
    print(processor_paligemma.decode(output[0], skip_special_tokens=True))

In [19]:
answer_query(3)
answer_query(4)
answer_query(5)


What kind of video is this?
True
What activities are the astronauts performing?
None
What year was this mission conducted?
2016


## 4.3 LLaMa

In [14]:
from dotenv import load_dotenv
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

In [15]:
qa_tmpl_str = (
    """
 Given the provided information, including relevant images and retrieved context from the video, \
 accurately and precisely answer the query without any additional prior knowledge.\n"
    "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
    "---------------------\n"
    "Context: {context_str}\n"
    "---------------------\n"
    "Query: {query_str}\n"
    "Answer: "
"""
)

In [22]:
def audio_to_text(audio_path):
    """
    Convert an audio file to text.

    Parameters:
    audio_path (str): The path to the audio file.

    Returns:
    test (str): The text recognized from the audio.

    """
    print(audio_path)
    recognizer = sr.Recognizer()
    audio = sr.AudioFile(audio_path)

    with audio as source:
        # Record the audio data
        audio_data = recognizer.record(source)

        try:
            # Recognize the speech
            text = recognizer.recognize_whisper(audio_data)
        except sr.UnknownValueError:
            print("Speech recognition could not understand the audio.")
        except sr.RequestError as e:
            print(f"Could not request results from service; {e}")

    return text
def video_to_audio(video_path, output_audio_path):
    """
    Convert a video to audio and save it to the output path.

    Parameters:
    video_path (str): The path to the video file.
    output_audio_path (str): The path to save the audio to.

    """
    from moviepy.editor import VideoFileClip
    clip = VideoFileClip(video_path)
    audio = clip.audio
    audio.write_audiofile(output_audio_path)

In [24]:
video_to_audio(video_path, output_audio_path)
text_data = audio_to_text(output_audio_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

MoviePy - Writing audio in ./img/output_audio.wav


                                                                                                                                

MoviePy - Done.
./img/output_audio.wav


100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 41.5MiB/s]


In [25]:
with open(output_folder + "output_text.txt", "w") as file:
    file.write(text_data)
print("Text data saved to file")
file.close()
os.remove(output_audio_path)
print("Audio file removed")

Text data saved to file
Audio file removed


In [16]:
def run_llama(index, qa_tmpl_str):
    # テキスト情報をまとめる
    txt = ["As I look back on the mission that we've had here on the International Space Station, I'm proud to have been a part of much of the science activities that happened over the last two months. I didn't think I would do another spacewalk and to now have the chance to have done four more was just icing on the cake for a wonderful mission. The 10th one, do you like the first one? No, a little more comfortable. It's hard to put into words just what it was like to be a part of this expedition, the Expedition 63. It'll be kind of a memory that will last a lifetime for me. It's been a true honor. Try and space X, Undock sequence commanded. The thrusters looking good. The hardest part was getting us launched, but the most important part is bringing us home. I've been trying that day. We love you. Hurry home for weeks and don't get my dog. Slash down. Welcome back to Planet Earth and thanks for flying SpaceX. We're literally on our own. Space dads are back on Earth after a 19-hour return journey from space. The Earth is a very important part of the planet. The Earth is a very important part of the planet. The Earth is a very important part of the planet. The Earth is a very important part of the planet. The Earth is a very important part of the planet."]    

    # 入力画像
    img = []
    for i in idx_top_1[index]:
        img.append(f"{output_folder}input_vid-Scene-{str(i).zfill(3)}.png")

    # クエリ
    query_str = queries[index]

    # ドキュメント
    image_documents = SimpleDirectoryReader(
        input_dir=output_folder, input_files=img
    ).load_data()
    context_str = "".join(txt)

    # LLM読み込み
    openai_mm_llm = OpenAIMultiModal(
        model="gpt-4o", api_key=os.getenv('OPENAI_API_KEY'), max_new_tokens=1500
    )

    # 回答文を生成
    response_1 = openai_mm_llm.complete(
        prompt=qa_tmpl_str.format(
            context_str=context_str, query_str=query_str, ),
        image_documents=image_documents,
    )
    print(response_1.text)

In [17]:
run_llama(1, qa_tmpl_str)

The astronauts mentioned are Douglas Hurley and Robert Behnken.
