In [13]:
import re
import json
import argparse
import os
import weaviate
from extract_data_all_files import extract_metadata_and_sentences
from frame_desc_all import process_video_frames

In [14]:
def main(input_directory, output_folder, frames_folder="../frames/"):
    """Run the extraction pipeline over every file in ``input_directory``.

    The work happens in two passes, in this order:

    1. Each ``.vrt`` file is passed to ``extract_metadata_and_sentences`` and
       the returned data is saved as ``<name without .vrt>.json`` inside
       ``output_folder``.
    2. Each ``.mp4`` file is passed to ``process_video_frames`` together with
       the path ``<name without .mp4>.v4.json`` inside ``output_folder`` and
       ``frames_folder``.

    Args:
        input_directory: Directory that is scanned for ``.vrt`` and ``.mp4``
            files (non-recursive).
        output_folder: Directory that receives the JSON files; it is created
            (including missing parents) when it does not exist.
        frames_folder: Third argument handed to ``process_video_frames``.
            Defaults to ``"../frames/"``, the value that used to be
            hard-coded.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Pass 1: one JSON file per VRT transcript.
    vrt_files = [f for f in os.listdir(input_directory) if f.endswith('.vrt')]
    for file_name in vrt_files:
        input_file = os.path.join(input_directory, file_name)

        video_data = extract_metadata_and_sentences(input_file)

        # splitext drops only the final ".vrt", so "clip.v4.vrt" -> "clip.v4.json".
        stem = os.path.splitext(file_name)[0]
        output_file = os.path.join(output_folder, f"{stem}.json")

        # Explicit encoding keeps the written file independent of the locale.
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(video_data, file, indent=1)

        print(f"Extraction completed. Output file: {output_file}")

    # Pass 2: frame processing for every video, run only after all the
    # transcripts above have been written.
    video_files = [f for f in os.listdir(input_directory) if f.endswith('.mp4')]
    for file_name in video_files:
        input_file = os.path.join(input_directory, file_name)
        stem = os.path.splitext(file_name)[0]
        json_file = os.path.join(output_folder, f"{stem}.v4.json")
        process_video_frames(input_file, json_file, frames_folder)


In [15]:
# `input_files` should contain only the new .vrt files and their corresponding .mp4 files that you want to populate. Remove any files that have already been processed to avoid duplicate insertions, because the call below processes every file found in the `input_files` folder.

In [16]:
# Process everything in ../input_files (the directory containing the videos
# and .vrt files) and write the JSON results to ../output_data.
main(input_directory="../input_files", output_folder="../output_data")

input_file ../input_files/Second Persian Invasion.mp4


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf6 in position 26: invalid start byte

In [None]:
# Connect to the Weaviate instance listening on localhost:8080.
client = weaviate.Client(url="http://localhost:8080")

# Show the current schema; on the very first run it reports classes: []
# because no classes have been created yet.
client.schema.get()

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by the cells below.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2'
)

def get_embedding(text):
    """Return the embedding of ``text`` produced by the notebook-level ``model``.

    Args:
        text: Value handed unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``text``.
    """
    # Encode the argument itself; the previous body referenced an undefined
    # name (`sentences`) and raised NameError on every call.
    return model.encode(text)

In [None]:
# Register the "Video_text" class in the Weaviate schema.
# Note: this call gives an error if the class already exists.
class_obj = {"class": "Video_text"}
client.schema.create_class(class_obj)

In [None]:
# Register the "Video_text_description" class in the Weaviate schema.
# NOTE(review): presumably fails if the class already exists — confirm.
class_obj1 = {"class": "Video_text_description"}
client.schema.create_class(class_obj1)

In [None]:
# Register the "Video_description" class in the Weaviate schema.
# NOTE(review): presumably fails if the class already exists — confirm.
class_obj2 = {"class": "Video_description"}
client.schema.create_class(class_obj2)

In [None]:
# Folder holding the JSON files to populate. It must be the same folder that
# main() wrote to earlier in this notebook ("../output_data"); the previous
# value 'output_data' pointed at a different directory relative to the
# notebook's working directory.
input_directory = '../output_data'

In [None]:
# Names (not full paths) of every .json file directly inside input_directory.
json_files = list(
    filter(lambda name: name.endswith('.json'), os.listdir(input_directory))
)

In [None]:
# Shows the JSON files that are about to be populated. Verify that these files have not been populated already, to avoid duplicate data.
json_files

In [None]:
# Make sure the files listed in `json_files` have not been populated before;
# otherwise duplicate records will be inserted into Weaviate.

def _frame_description(sent):
    """Join the entries of sent['frame_data'] into one '.'-terminated string."""
    return ", ".join(caption.strip(" .") for caption in sent['frame_data']) + '.'


def _queue_object(batch, class_name, text, sent, metadata, video_id):
    """Embed `text` with `model` and queue it on `batch` as a `class_name` object."""
    batch.add_data_object(
        {
            "text": text,
            "starttime": sent['starttime'],
            "endtime": sent['endtime'],
            "metadata": metadata,
            "video_id": video_id,
        },
        class_name,
        vector=model.encode(text),
    )


for file_name in json_files:
    input_file = os.path.join(input_directory, file_name)

    # Load the whole document first so the file handle is closed before the
    # (potentially long) upload starts.
    with open(input_file) as f:
        data = json.load(f)

    metadata = data['metadata']['file']
    # Drops the first three characters of text_id — presumably a fixed
    # prefix; TODO confirm against the extraction code.
    video_id = data['metadata']['text_id'][3:]

    with client.batch(batch_size=100) as batch:
        for sent in data['sentences']:
            spoken_text = sent['sentence']
            # Built once and reused for both description-based classes.
            frame_text = _frame_description(sent)
            combined_text = (
                "In the video you can hear: " + spoken_text
                + " In the video you can see: " + frame_text
            )

            # Same three objects, in the same order, for every sentence:
            # transcript only, transcript + frames, frames only.
            _queue_object(batch, "Video_text", spoken_text, sent, metadata, video_id)
            _queue_object(batch, "Video_text_description", combined_text, sent, metadata, video_id)
            _queue_object(batch, "Video_description", frame_text, sent, metadata, video_id)
    print("file done")

In [None]:
# Ask Weaviate for the meta count of the "Video_text_description" class.
(
    client.query
    .aggregate("Video_text_description")
    .with_meta_count()
    .do()
)