In [15]:
import weaviate
client = weaviate.Client("http://localhost:8080")
client.schema.get()  # For the first time you will get classes: [] as you have no classes.

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


{'classes': []}

In [16]:
!pip install -U sentence-transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_embedding(text):
    embeddings = model.encode(sentences)
    return embeddings




In [18]:
class_obj = {
    "class": "Video_text"
}
client.schema.create_class(class_obj)  # This will give error if class already exists.

In [19]:
class_obj1 = {
    "class": "Video_text_description"
}
client.schema.create_class(class_obj1)

In [20]:
class_obj2 = {
    "class": "Video_description"
}
client.schema.create_class(class_obj2)

In [21]:
import os
import json
input_directory = 'output_data'

In [22]:
json_files = [f for f in os.listdir(input_directory) if f.endswith('.json')]


In [23]:
# This list will show json files that you will be populating. Verify once that this file is already not populated to avoid duplicate data.
json_files

['Second Persian Invasion.v4.json',
 'Caesar, Cleopatra and the Ides of March  World History  Khan Academy.v4.json',
 'The Peloponnesian War  World History  Khan Academy.v4.json',
 'Rise of Julius Caesar  World History  Khan Academy.v4.json',
 'Roman social and political structures  World History  Khan Academy.v4.json',
 'Pre Columbian Americas   World History  Khan Academy.v4.json',
 'Overview of ancient Persia  World History  Khan Academy.v4.json',
 'Socrates Plato Aristotle  World History  Khan Academy.v4.json',
 'Background of the Carthaginians  World History  Khan Academy.v4.json',
 'Ancient Egypt  Early Civilizations  World History  Khan Academy.v4.json',
 'Ancient Mesopotamia  Early Civilizations  World History  Khan Academy.v4.json',
 'Overview of ancient Greece  World History  Khan Academy.v4.json',
 'Indus Valley Civilization   Early Civilizations  World History  Khan Academy.v4.json',
 'Sanskrit connections to English   World History  Khan Academy.v4.json',
 'Beginning of th

In [24]:
# Make sure that json_files you have in variable 'json_files' have already not been populated. Otherwise it will insert duplicate records in weaviate.

for file_name in json_files:
    input_file = os.path.join(input_directory, file_name)
    with open(input_file) as f:
        data = json.load(f)  
        metadata = data['metadata']['file']
        video_id = data['metadata']['text_id']
        with client.batch(batch_size=100) as batch:
            
            for sent in data['sentences']:
                
                embedding_video_text = model.encode(sent['sentence'])
                properties_text = {
                   "text": sent['sentence'],
                   "starttime" : sent['starttime'],
                   "endtime" : sent['endtime'],
                   "metadata" : metadata,
                   "video_id" : video_id
                }
                
                client.batch.add_data_object(
                    properties_text,
                    "Video_text",
                    vector = embedding_video_text
                )
                                
                combined_text = "In the video you can hear: " + sent['sentence'] + " In the video you can see: " + ", ".join([sentence.strip(" .") for sentence in []]) + '.'
                embedding_video_text_desc = model.encode(combined_text)
                properties_video_text_desc = {
                   "text": combined_text,
                   "starttime" : sent['starttime'],
                   "endtime" : sent['endtime'],
                   "metadata" : metadata,
                   "video_id" : video_id
                }
                client.batch.add_data_object(
                    properties_video_text_desc,
                    "Video_text_description",
                     vector = embedding_video_text_desc
                )
                
                video_desc = ", ".join([sentence.strip(" .") for sentence in []]) + '.'
                embedding_video_desc = model.encode(video_desc)
                properties_video_desc = {
                   "text": video_desc,
                   "starttime" : sent['starttime'],
                   "endtime" : sent['endtime'],
                   "metadata" : metadata,
                   "video_id" : video_id
                }
                client.batch.add_data_object(
                    properties_video_desc,
                    "Video_description",
                     vector = embedding_video_desc
                )
    print("file done")

file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done
file done


In [25]:
client.query.aggregate("Video_text_description").with_meta_count().do()

{'data': {'Aggregate': {'Video_text_description': [{'meta': {'count': 3657}}]}}}

In [26]:
## Vector Similarity Search

text_search_input = "Ted Cruz scores a huge victory"
image_search_input = "a group of people taking photos"
combined_text = "In the video you can hear: " + text_search_input + "In the video you can see: " + image_search_input
vector = model.encode(combined_text)

response = (
    client.query
    .get("Video_text_description", ["text", "starttime", "endtime", "metadata","video_id"])
    .with_near_vector({
        "vector" : vector
    })
    .with_limit(5)
    .with_additional(["distance"])
    .do()
)
print(json.dumps(response, indent=4))


{
    "data": {
        "Get": {
            "Video_text_description": [
                {
                    "_additional": {
                        "distance": 0.42231953
                    },
                    "endtime": "003.48",
                    "metadata": "/db/tv/2016/2016-02/2016-02-02/2016-02-02_0000_US_FOX-News_On_the_Record_with_Greta_Van_Susteren.txt",
                    "starttime": "056.96",
                    "text": "In the video you can hear: And so you can imagine as he is now victorious, the senators are worried. In the video you can see: .",
                    "video_id": "Rise of Julius Caesar  World History  Khan Academy.mp4"
                },
                {
                    "_additional": {
                        "distance": 0.43169546
                    },
                    "endtime": "042.44",
                    "metadata": "/db/tv/2016/2016-02/2016-02-02/2016-02-02_0000_US_FOX-News_On_the_Record_with_Greta_Van_Susteren.txt",
            