In [1]:
import os
from dotenv import load_dotenv
from video_prompter import get_connection

# TODO: setup .env file
load_dotenv()
OPENAI_KEY = os.getenv('OPENAI_API_KEY')

# connect to VideoDB
conn = get_connection()

### ----- Upload a fresh video --------- #####
def fresh_video(url):
    video = conn.upload(url=url)
    #index spoken content in the video
    video.index_spoken_words()
    return video

#### ------ run the prompt on video --------####
def videodb_prompter(video_id, prompt):
    video = get_video(video_id)
    #get all the segment of videos that are
    return video_prompter(video, prompt)

In [2]:
from videodb import play_stream

# ----Existing video case ----

# TODO: replace with your video id
# # video_id = "m-replace-with-your-video-id-24-7"
video_id = 'm-beb3ee4a-3169-4cb6-8dc1-beee73eb00a1'
# video_id = 'm-beb3ee4a-3169-4cb6-8dc1-beee73eb00a1'


# ---- Fresh video case ----
# url = "https://youtu.be/pAMy7IhOVQE?si=iNvbJE02Pf8XXGd8"
# video = fresh_video(url)
# video_id = video.id


#watch the original video
video = conn.get_collection().get_video(video_id)
video.player_url

'https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/5b3d1e2c-1fc8-4b31-865c-a7da97ca0379.m3u8'

In [3]:
scene = video.list_scene_index()
print(scene)

[{'name': 'Scene Index 2024-07-12 10:42', 'scene_index_id': '245692238bce5048', 'status': 'done'}, {'name': 'Scene Index 2024-07-22 05:46', 'scene_index_id': '54c3ac895b77d3dc', 'status': 'done'}, {'name': 'Scene Index 2024-07-16 15:48', 'scene_index_id': '5f5a9ab5e724499a', 'status': 'done'}, {'name': 'Scene Index 2024-07-18 04:47', 'scene_index_id': 'bc32304c34e5e5a6', 'status': 'done'}]


In [5]:
scene_index = video.get_scene_index("bc32304c34e5e5a6")
print(scene_index)

[{'description': 'In these images, a group of armed individuals dressed in tactical gear and helmets are infiltrating a building. They are navigating through hallways and rooms cautiously, with firearms ready and using flashlights to illuminate their surroundings. The setting appears to be indoors, possibly a residential or office space, with furniture like chairs and tables visible. The team is moving in a coordinated manner, suggesting a planned operation. The immediate environment is dimly lit, which might be why they are using flashlights.', 'end': 2.48, 'start': 0.0}, {'description': 'In these images, a group of heavily armored individuals, likely soldiers or special forces, is moving quickly through a dimly lit hallway. They are wearing full-body protective gear, including helmets, and carry backpacks and equipment. The hallway appears to be in a building, possibly a hotel or office, with doors lining both sides. The lighting is subdued, and the figures are in motion, indicating 

In [4]:
transcript = video.get_transcript()

In [6]:
def create_multimodal_chunks_individual(transcript, scenes, chunk_size=5):
    """
    Create multimodal data chunks for each individual scene within specified chunk size intervals.

    :param transcript: List of transcript entries, each with 'start', 'end', and 'text' fields.
    :param scenes: List of scene data, each with 'start', 'end', and 'description' fields.
    :param chunk_size: Number of scenes to group into each chunk. Default is 5.
    :return: List of multimodal data chunks.
    """
    def filter_text_by_time(transcript, start_time, end_time):
        result = []

        for entry in transcript:
            if float(entry['end']) > start_time and float(entry['start']) < end_time:
                text = entry['text']
                if text != '-':
                    result.append(text)

        return ' '.join(result)

    chunks = []

    for i in range(0, len(scenes), chunk_size):
        chunk = []
        for scene in scenes[i:i+chunk_size]:
            spoken = filter_text_by_time(transcript, float(scene["start"]), float(scene["end"]))
            data = {
                "visual": scene["description"], 
                'spoken': spoken, 
                'start': scene["start"], 
                'end': scene["end"]
            }
            chunk.append(data)
        chunks.append(chunk)

    return chunks

 


chunks = create_multimodal_chunks_individual(transcript, scene_index)
print(chunks)


[[{'visual': 'In these images, a group of armed individuals dressed in tactical gear and helmets are infiltrating a building. They are navigating through hallways and rooms cautiously, with firearms ready and using flashlights to illuminate their surroundings. The setting appears to be indoors, possibly a residential or office space, with furniture like chairs and tables visible. The team is moving in a coordinated manner, suggesting a planned operation. The immediate environment is dimly lit, which might be why they are using flashlights.', 'spoken': '', 'start': 0.0, 'end': 2.48}, {'visual': 'In these images, a group of heavily armored individuals, likely soldiers or special forces, is moving quickly through a dimly lit hallway. They are wearing full-body protective gear, including helmets, and carry backpacks and equipment. The hallway appears to be in a building, possibly a hotel or office, with doors lining both sides. The lighting is subdued, and the figures are in motion, indica

In [7]:
import concurrent.futures
import json
from dotenv import load_dotenv
from videodb import connect
from llm_agent import LLM, LLMType

load_dotenv()


def get_connection():
    """
    Get connection and load the env.
    :return:
    """
    conn = connect()
    return conn


def get_video(id):
    """
    Get video object
    :param id:
    :return:
    """
    conn = get_connection()
    all_videos = conn.get_collection().get_videos()
    video = next(vid for vid in all_videos if vid.id == id)
    return video


# def chunk_transcript(docs, chunk_size):
#     """
#     chunk transcript to fit into context of your LLM
#     :param docs:
#     :param chunk_size:
#     :return:
#     """
#     for i in range(0, len(docs), chunk_size):
#         yield docs[i: i + chunk_size]  # Yield the current chunk


def chunk_descriptions(descriptions, chunk_size=2000):
    """
    Chunk scene descriptions to fit into context of your LLM.
    :param descriptions: List of scene descriptions
    :param chunk_size: Number of descriptions per chunk
    :return: Generator yielding chunks of descriptions
    """
    for i in range(0, len(descriptions), chunk_size):
        # print(descriptions[i:i + chunk_size])
        yield descriptions[i:i + chunk_size]


def send_msg_openai(chunk_prompt, llm=LLM()):
    print("Sendiing call to OPENAI",chunk_prompt)
    response = llm.chat(message=chunk_prompt)
    print(response)
    return json.loads(response["choices"][0]["message"]["content"])
    # sentences = output.get('sentence')
    # return sentences


def send_msg_claude(chunk_prompt, llm):
    response = llm.chat(message=chunk_prompt)
    # TODO : add claude reposnse parser
    return response


def text_prompter_2(chunks, prompt, llm=None):
    # chunk_size = 10000
    # sentence tokenizer
    # chunks = chunk_descriptions(transcript_text, chunk_size=chunk_size)

    if llm is None:
        llm = LLM()

    if llm.type == LLMType.OPENAI:
        llm_caller_fn = send_msg_openai
    else:
        llm_caller_fn = send_msg_claude

    matches = []
    prompts = []
    i = 0
    for chunk in chunks:
        # visual_descriptions = ' '.join([entry['visual'] for entry in chunk])
        # spoken_texts = ' '.join([entry['spoken'] for entry in chunk if entry['spoken']])

        chunk_prompt = f"""
       You are given visual and spoken information of the video of each second, and a transcipt of what's being spoken along with timestamp.
        Your task is to evaluate the data for relevance to the specified user prompt.
        Corelate visual and spoken content to find the relevant video segment.
        provide the start and end timestamps by analyse the full chunk and give longest matching timestamps. You can merge the 1 second chunks and transcripts to make continuous response.

        Multimodal Data:
        video: {chunk}
        User Prompt: {prompt}

    
        """
        chunk_prompt += """
         **Output Format**: Return a JSON list named 'result' that containes the  fileds `sentence`, `start`, `end` Ensure the final output
        strictly adheres to the JSON format specified without including additional text or explanations. \
        If there is no match return empty list without additional text. Use the following structure for your response:
        {"result":{"sentence":<>, "start":<>, "end":<>}}
        """
        prompts.append(chunk_prompt)
        i += 1

        # chunk_prompt = f"""
        
        # data: {chunk}
        # User Prompt: {prompt}

        # You will be given a user prompt and some data. You need to do a keyword search on the data and find similar and return as array 

        # """

    for prompt in prompts:
      try:
        res = llm_caller_fn(prompt)
        # print(res)
        matches.append(res)
      except Exception as e:
        print(f"Chunk failed to work with LLM {str(e)}")
    return matches


In [18]:

chunks = create_multimodal_chunks_individual(transcript, scene_index)
print(len(chunks))
prompt = "A man, wearing a suit and tie"
matches = text_prompter_2(chunks, prompt)
print(matches)


43
Sendiing call to OPENAI 
       You are given visual and spoken information of the video of each second, and a transcipt of what's being spoken along with timestamp.
        Your task is to evaluate the data for relevance to the specified user prompt.
        Corelate visual and spoken content to find the relevant video segment.
        provide the start and end timestamps by analyse the full chunk and give longest matching timestamps. You can merge the 1 second chunks and transcripts to make continuous response.

        Multimodal Data:
        video: [{'visual': 'In these images, a group of armed individuals dressed in tactical gear and helmets are infiltrating a building. They are navigating through hallways and rooms cautiously, with firearms ready and using flashlights to illuminate their surroundings. The setting appears to be indoors, possibly a residential or office space, with furniture like chairs and tables visible. The team is moving in a coordinated manner, suggesting 

In [19]:
matches

[{'result': []},
 {'result': []},
 {'result': []},
 {'result': []},
 {'result': []},
 {'result': []},
 {'result': []},
 {'result': [{'sentence': 'Three men are engaged in conversation. The man on the left is wearing a police or military uniform with a hat, while the two men on the right are dressed in suits. The man in the center is holding his hand to his ear, possibly listening to an earpiece or communicating via a device.',
    'start': 47.08,
    'end': 47.72},
   {'sentence': 'Two men, both dressed in suits, are in the foreground. One of them, wearing an earpiece, seems to be communicating or listening intently as he is looking to his side. Behind them, several firefighters wearing helmets and reflective gear are standing next to or moving around a fire truck, which has a sign written in a non-Latin script (possibly Cyrillic).',
    'start': 47.72,
    'end': 48.32},
   {'sentence': 'A man dressed in a suit, likely a central character, is running or moving briskly towards the came

In [13]:
from videodb import SearchType
from videodb import IndexType
from videodb.asset import VideoAsset, AudioAsset, ImageAsset
from videodb.timeline import Timeline


def extract_timestamps(data):
    timeframes = []
    for item in data:
        data_2 = item.get("result")
        for time in data_2:
            start_time = time.get('start')
            end_time = time.get('end')
            if start_time and end_time:
                start_seconds = convert_to_seconds(start_time)
                end_seconds = convert_to_seconds(end_time)
                timeframes.append({
                'start_time': start_seconds,
                'end_time': end_seconds
            })
            print(f"Extracted timeframe: start={start_seconds}, end={end_seconds}")  
    return timeframes

def convert_to_seconds(time):
    if isinstance(time, str):
        parts = time.split(':')
        if len(parts) == 3:
            return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
        elif len(parts) == 2:
            return int(parts[0]) * 60 + int(parts[1])
        else:
            return int(time)
    elif isinstance(time, int):
        return time
    else:
        raise ValueError("Unsupported time format")

def merge_timeframes(timeframes):
    if not timeframes:
        print("No timeframes to merge")  
        return []
    
    sorted_timeframes = sorted(timeframes, key=lambda x: x['start_time'])
    merged_timeframes = [sorted_timeframes[0]]

    for timeframe in sorted_timeframes[1:]:
        last = merged_timeframes[-1]
        if last['end_time'] >= timeframe['start_time']:
            last['end_time'] = max(last['end_time'], timeframe['end_time'])
            print(f"Merging: {last}")  
        else:
            merged_timeframes.append(timeframe)
            print(f"Appending: {timeframe}")  

    return merged_timeframes

def construct_timeline_and_stream( selected_frames):
    timeline = Timeline(conn)
    
    print("Selected frames:", selected_frames)  

    for frame in selected_frames:
        print("Adding frame:", frame)  
        video_asset = VideoAsset(
            asset_id=video_id,
            start=frame['start_time'],
            end=frame['end_time']
        )
        timeline.add_inline(video_asset)
    
    print("Timeline created:", timeline)  
    
    stream_url = timeline.generate_stream()
    print("Stream URL:", stream_url)
    return play_stream(stream_url)


In [20]:


timeframes = extract_timestamps(matches)
merged_timeframes = merge_timeframes(timeframes)
construct_timeline_and_stream(merged_timeframes)

ValueError: Unsupported time format