In [40]:
import sys
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True)

from sentence_transformers import SentenceTransformer
from src.database.database_utils import get_weaviate_client
import requests
import pandas as pd

In [6]:
# Build the project's Weaviate client wrapper; the cells below reach the
# underlying Weaviate client through its `_client` attribute.
client = get_weaviate_client()
import weaviate.classes as wvc

In [7]:
# HNSW vector index using cosine distance (swap the metric here if another one is preferred)
question_index_config = wvc.config.Configure.VectorIndex.hnsw(
    distance_metric=wvc.config.VectorDistances.COSINE
)

# Create the "Question" collection with no vectorizer module configured
client._client.collections.create(
    "Question",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=question_index_config,
)

<weaviate.collections.collection.Collection at 0x7f1e9dbebee0>

In [8]:
# Liveness probe; as the last expression in the cell its result is displayed.
client._client.is_live()

True

In [9]:
import json
# Fetch the meta information reported through the client, kept for inspection.
metainfo = client._client.get_meta()

In [39]:
# NOTE: this rebinds `print` to rich's implementation for every cell executed
# after this one, shadowing the builtin.
from rich import print
# print(json.dumps(metainfo, indent=4))

In [12]:
# Close the connection; later cells call `connect()` again before using the client.
client._client.close()

In [16]:
import weaviate

import weaviate.classes.config as wc
import os


# Reuse the `client` wrapper created earlier in the notebook and open its connection
client._client.connect()

# Schema of the "Movies" collection as (property name, Weaviate data type) pairs
movie_schema = [
    ("title", wc.DataType.TEXT),
    ("overview", wc.DataType.TEXT),
    ("vote_average", wc.DataType.NUMBER),
    ("genre_ids", wc.DataType.INT_ARRAY),
    ("release_date", wc.DataType.DATE),
    ("tmdb_id", wc.DataType.INT),
]

client._client.collections.create(
    name="Movies",
    properties=[
        wc.Property(name=prop_name, data_type=prop_type)
        for prop_name, prop_type in movie_schema
    ],
    # No vectorizer module: the vectors are supplied by this notebook
    vectorizer_config=wc.Configure.Vectorizer.none(),
    # Generative module
    generative_config=wc.Configure.Generative.openai(),
)

<weaviate.collections.collection.Collection at 0x7f1e9d11b520>

In [17]:
# Close the connection that the preceding cell opened with `connect()`.
client._client.close()

In [20]:
# Download the movie metadata (JSON) and load it into a DataFrame.
data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json"
# A timeout keeps the kernel from hanging forever on a stalled connection.
resp = requests.get(data_url, timeout=60)
# Stop here on an HTTP error rather than trying to parse an error page as JSON.
resp.raise_for_status()
df = pd.DataFrame(resp.json())

In [24]:
def query(texts, model_id="sentence-transformers/all-MiniLM-L6-v2", timeout=120):
    """Request feature-extraction output for `texts` from the Hugging Face Inference API.

    Args:
        texts: Value sent as the request's ``inputs`` field (presumably a string
            or a list of strings -- confirm against the API documentation).
        model_id: Hub id of the model appended to the feature-extraction URL.
        timeout: Seconds to wait for the HTTP request. Generous by default
            because the request asks the API to wait for the model to load.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    # Imported locally so the function is self-contained and does not depend on
    # which notebook cells have already been executed.
    import requests
    import os

    hf_token = os.getenv("HF_TOKEN")

    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    # Only send credentials when a token is configured; otherwise the header
    # would carry the literal string "Bearer None".
    headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}

    response = requests.post(
        api_url,
        headers=headers,
        json={"inputs": texts, "options": {"wait_for_model": True}},
        timeout=timeout,
    )
    # Surface HTTP failures instead of returning the error payload as if it were data.
    response.raise_for_status()
    return response.json()

In [29]:
# First 50 overview texts as a plain Python list
sample = df["overview"].head(50).tolist()

In [41]:
# Load the sentence-embedding model into the kernel; `model` is used by later
# cells to encode both the movie texts and the search query.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)

In [43]:
EMBED_BATCH_SIZE = 50  # number of source strings passed to each `model.encode` call

# One source string per movie: title and overview joined into a single text
src_texts = [
    "Title: " + row.title + "; Overview: " + row.overview
    for row in df.itertuples(index=False)
]

# Encode chunk by chunk and keep one frame of embeddings per chunk
emb_dfs = []
for start in range(0, len(src_texts), EMBED_BATCH_SIZE):
    chunk = src_texts[start : start + EMBED_BATCH_SIZE]
    output = model.encode(chunk, show_progress_bar=True).tolist()
    emb_dfs.append(pd.DataFrame(output))

# Combine the per-chunk frames. `ignore_index=True` gives every movie a unique
# row label; without it the labels 0..49 repeat for every chunk.
emb_df = pd.concat(emb_dfs, ignore_index=True)

# Embeddings are matched to movies by row position, so the counts must agree
assert len(emb_df) == len(df), "expected exactly one embedding per movie"

# Save the data as a CSV
emb_df.to_csv(
    "/tmp/movies_data_1990_2024_embeddings.csv",
    index=False,
)

  self.comm = Comm(**args)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [45]:
import weaviate
import pandas as pd
import requests
from datetime import datetime, timezone
import json
from weaviate.util import generate_uuid5
from tqdm import tqdm
import os

# Instantiate your client (not shown). e.g.:
# client = weaviate.connect_to_wcs(...) or
# client = weaviate.connect_to_local(...)

# Download the movie metadata (JSON) and load it into a DataFrame
data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json"
data_resp = requests.get(data_url, timeout=60)
# Stop here on an HTTP error rather than trying to parse an error page as JSON
data_resp.raise_for_status()
df = pd.DataFrame(data_resp.json())

# Embeddings written by the embedding cell of this notebook
# embs_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_embeddings.csv"
embs_path = "/tmp/movies_data_1990_2024_embeddings.csv"
emb_df = pd.read_csv(embs_path)

# Vectors are paired with movies by row position below, so a length mismatch
# would attach the wrong vector to a movie (or fail midway through the import)
if len(emb_df) != len(df):
    raise ValueError(
        f"Embedding rows ({len(emb_df)}) do not match movie rows ({len(df)})"
    )

client._client.connect()
try:
    # Get the collection
    movies = client._client.collections.get("Movies")

    # Enter context manager
    with movies.batch.dynamic() as batch:
        # Loop through the data
        for i, movie in enumerate(df.itertuples(index=False)):
            # Convert data types
            # Convert a JSON date to `datetime` and add time zone information
            release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
                tzinfo=timezone.utc
            )
            # Convert a JSON array to a list of integers
            genre_ids = json.loads(movie.genre_ids)

            # Build the object payload
            movie_obj = {
                "title": movie.title,
                "overview": movie.overview,
                "vote_average": movie.vote_average,
                "genre_ids": genre_ids,
                "release_date": release_date,
                "tmdb_id": movie.id,
            }

            # Get the vector stored at the same row position as the movie
            vector = emb_df.iloc[i].to_list()

            # Add object (including vector) to batch queue
            batch.add_object(
                properties=movie_obj,
                uuid=generate_uuid5(movie.id),
                vector=vector,  # Add the custom vector
                # references=reference_obj  # You can add references here
            )
            # Batcher automatically sends batches

    # Check for failed objects
    failed_objects = movies.batch.failed_objects
    if len(failed_objects) > 0:
        print(f"Failed to import {len(failed_objects)} objects")
finally:
    # Release the connection even when the import raises part-way through
    client._client.close()



In [46]:
# List the collections using the helper exposed on the project's client wrapper.
client.show_all_collections()

['Huberman_minilm_256',
 'Huberman_minilm_128',
 'Huberman_minilm_512',
 'Question',
 'Movie',
 'Movies']

In [57]:
# Handle to the "Movies" collection.
movies = client._client.collections.get("Movies")
# Encode the search text with the local model and convert the result to plain Python lists.
query_vector = model.encode("history").tolist()
import weaviate.classes.query as wq

In [54]:
# Open the connection, then fetch the 5 nearest objects together with their distances
client._client.connect()
response = movies.query.near_vector(
    near_vector=query_vector,
    limit=5,
    return_metadata=wq.MetadataQuery(distance=True),
)

In [56]:
# Print title, release year and query distance for every returned object
for hit in response.objects:
    props = hit.properties
    print(props['title'], props['release_date'].year)
    print(f"Distance to query: {hit.metadata.distance:.3f}\n")

In [64]:
from datetime import datetime, timezone

client._client.connect()
try:
    movies = client._client.collections.get("Movies")

    # The import cell stored `release_date` as UTC-aware datetimes, so the
    # cutoff is made timezone-aware too instead of passing a naive datetime.
    release_cutoff = datetime(2020, 1, 1, tzinfo=timezone.utc)

    # Perform query
    response = movies.query.near_vector(
        near_vector=query_vector,
        limit=5,
        return_metadata=wq.MetadataQuery(distance=True),
        filters=wq.Filter.by_property("release_date").greater_than(release_cutoff),
    )

    # Inspect the response
    for o in response.objects:
        print(
            o.properties["title"], o.properties["release_date"].year
        )  # Print the title and release year (note the release date is a datetime object)
        print(
            f"Distance to query: {o.metadata.distance:.3f}\n"
        )  # Print the distance of the object from the query
finally:
    # Release the connection even if the query or the printing fails
    client._client.close()

In [61]:
# Display the raw query return object.
response

QueryReturn(objects=[])