In [None]:
#
# Weaviate Academy
# Course: 101T - Working with text data
#
import weaviate
from weaviate.classes.init import Auth
import weaviate.classes.config as wc
import weaviate.classes.query as wq
import pandas as pd
import requests
from datetime import datetime, timezone
from weaviate.util import generate_uuid5
from tqdm import tqdm
import os
import json
import logging


headers = {
    "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
}

# 
# Connect to a local Weaviate instance.
#
client = weaviate.connect_to_local(headers=headers)


In [None]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

In [None]:
logger.info("Test")

In [None]:
# Grab the movie data.
data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json"
resp = requests.get(data_url)
df = pd.DataFrame(resp.json())

In [8]:
# Create a movie collection.

try:
    assert client.is_live()
    logger.info(f'Creating movie collection')
    logger.info(client.get_meta())
    
    if client.collections.exists("Movie"):
        logger.info("Deleting existing Movie collection.")
        client.collections.delete("Movie")

    client.collections.create(
    name="Movie",
    properties=[
        wc.Property(name="title", data_type=wc.DataType.TEXT),
        wc.Property(name="overview", data_type=wc.DataType.TEXT),
        wc.Property(name="vote_average", data_type=wc.DataType.NUMBER),
        wc.Property(name="genre_ids", data_type=wc.DataType.INT_ARRAY),
        wc.Property(name="release_date", data_type=wc.DataType.DATE),
        wc.Property(name="tmdb_id", data_type=wc.DataType.INT),
    ],
    # Define the vectorizer module
    vectorizer_config=wc.Configure.Vectorizer.text2vec_openai(),
    # Define the generative module
    generative_config=wc.Configure.Generative.openai()
    )

finally:
    logger.info('created Movie collection')

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/live "HTTP/1.1 200 OK"
INFO:root:Creating movie collection
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:root:{'grpcMaxMessageSize': 104858000, 'hostname': 'http://[::]:8080', 'modules': {'generative-anthropic': {'documentationHref': 'https://docs.anthropic.com/en/api/getting-started', 'name': 'Generative Search - Anthropic'}, 'generative-anyscale': {'documentationHref': 'https://docs.anyscale.com/endpoints/overview', 'name': 'Generative Search - Anyscale'}, 'generative-aws': {'documentationHref': 'https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html', 'name': 'Generative Search - AWS'}, 'generative-cohere': {'documentationHref': 'https://docs.cohere.com/reference/chat', 'name': 'Generative Search - Cohere'}, 'generative-databricks': {'documentationHref': 'https://docs.databricks.com/en/machine-learning/foundation-models/api-reference.html#completion-task', 'name': 'Gen

In [10]:
# Import movies

try:
        # Get the collection
    movies = client.collections.get("Movie")

    # Enter context manager
    with movies.batch.fixed_size(batch_size=200) as batch:
        # Loop through the data
        for i, movie in tqdm(df.iterrows()):
            # Convert data types
            # Convert a JSON date to `datetime` and add time zone information
            release_date = datetime.strptime(movie["release_date"], "%Y-%m-%d").replace(
                tzinfo=timezone.utc
            )
            # Convert a JSON array to a list of integers
            genre_ids = json.loads(movie["genre_ids"])

            # Build the object payload
            movie_obj = {
                "title": movie["title"],
                "overview": movie["overview"],
                "vote_average": movie["vote_average"],
                "genre_ids": genre_ids,
                "release_date": release_date,
                "tmdb_id": movie["id"],
            }

            # Add object to batch queue
            batch.add_object(
                properties=movie_obj,
                uuid=generate_uuid5(movie["id"])
                # references=reference_obj  # You can add references here
            )
            # Batcher automatically sends batches

    # Check for failed objects
    if len(movies.batch.failed_objects) > 0:
        logger.error(f"Failed to import {len(movies.batch.failed_objects)} objects")
    else:
        logger.info(f'{movies.batch.failed_objects = }')

finally:
    logger.info('imported movies')

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema/Movie "HTTP/1.1 200 OK"
680it [00:00, 5746.28it/s]
INFO:root:movies.batch.failed_objects = []
INFO:root:imported movies


In [7]:
try:
    # Perform query
    print("Query = dystopian future")
    response = movies.query.near_text(
        query="dystopian future", limit=5, return_metadata=wq.MetadataQuery(distance=True)
    )

    # Inspect the response
    for o in response.objects:
        print(
            o.properties["title"], o.properties["release_date"].year
        )  # Print the title and release year (note the release date is a datetime object)
        print(
            f"Distance to query: {o.metadata.distance:.3f}\n"
        )  # Print the distance of the object from the query

    # Perform query
    print("BM25 query for history")
    response = movies.query.bm25(
        query="history", limit=5, return_metadata=wq.MetadataQuery(score=True)
    )

    # Inspect the response
    for o in response.objects:
        print(
            o.properties["title"], o.properties["release_date"].year
        )  # Print the title and release year (note the release date is a datetime object)
        print(
            f"BM25 score: {o.metadata.score:.3f}\n"
        )  # Print the BM25 score of the object from the query

    # Hybrid Query
    print("Hybrid Query")
    response = movies.query.hybrid(
        query="history", limit=5, return_metadata=wq.MetadataQuery(score=True)
    )

    # Inspect the response
    for o in response.objects:
        print(
            o.properties["title"], o.properties["release_date"].year
        )  # Print the title and release year (note the release date is a datetime object)
        print(
            f"Hybrid score: {o.metadata.score:.3f}\n"
        )  # Print the hybrid search score of the object from the query

    # Perform query
    print("Query using release_date filter")
    response = movies.query.near_text(
        query="dystopian future",
        limit=5,
        return_metadata=wq.MetadataQuery(distance=True),
        filters=wq.Filter.by_property("release_date").greater_than(datetime(2020, 1, 1))
    )

    # Inspect the response
    for o in response.objects:
        print(
            o.properties["title"], o.properties["release_date"].year
        )  # Print the title and release year (note the release date is a datetime object)
        print(
            f"Distance to query: {o.metadata.distance:.3f}\n"
        )  # Print the distance of the object from the query

    # Single Prompt
    print("Single prompt query: Translate this into French")

    response = movies.generate.near_text(
        query="dystopian future",
        limit=5,
        single_prompt="Translate this into French: {title}"
    )

    # Inspect the response
    for o in response.objects:
        print(o.properties["title"])  # Print the title
        print(o.generated)  # Print the generated text (the title, in French)

    # Generative Search
    print("Grouped task prompt query: What do these movies have in common?")
    response = movies.generate.near_text(
        query="dystopian future",
        limit=5,
        grouped_task="What do these movies have in common?",
        # grouped_properties=["title", "overview"]  # Optional parameter; for reducing prompt length
    )

    # Inspect the response
    for o in response.objects:
        print(o.properties["title"])  # Print the title
    print(response.generated)  # Print the generated text (the commonalities between them)

finally:
    print('finished with queries')


Query = dystopian future
In Time 2011
Distance to query: 0.564

Mad Max: Fury Road 2015
Distance to query: 0.574

I, Robot 2004
Distance to query: 0.585

Gattaca 1997
Distance to query: 0.587

Children of Men 2006
Distance to query: 0.593

BM25 query for history
A Beautiful Mind 2001
BM25 score: 2.723

Legends of the Fall 1994
BM25 score: 2.483

Night at the Museum 2006
BM25 score: 2.412

Hacksaw Ridge 2016
BM25 score: 2.367

The Butterfly Effect 2004
BM25 score: 2.202

Hybrid Query
GoodFellas 1990
Hybrid score: 0.700

A Beautiful Mind 2001
Hybrid score: 0.628

The Butterfly Effect 2004
Hybrid score: 0.549

Hidden Figures 2016
Hybrid score: 0.424

Hancock 2008
Hybrid score: 0.391

Query using release_date filter


            To use a different timezone, specify it in the datetime object. For example:
            datetime.datetime(2021, 1, 1, 0, 0, 0, tzinfo=datetime.timezone(-datetime.timedelta(hours=2))).isoformat() = 2021-01-01T00:00:00-02:00
            


The Adam Project 2022
Distance to query: 0.659

Jurassic World Dominion 2022
Distance to query: 0.663

Dune 2021
Distance to query: 0.666

Greenland 2020
Distance to query: 0.673

Don't Look Up 2021
Distance to query: 0.674

Single prompt query: Translate this into French
In Time
À temps
Mad Max: Fury Road
Mad Max: Fury Road
I, Robot
Moi, Robot
Gattaca
Gattaca
Children of Men
Les enfants des hommes
Grouped task prompt query: What do these movies have in common?
In Time
Mad Max: Fury Road
I, Robot
Gattaca
Children of Men
These movies all take place in a dystopian future where society has undergone significant changes and faces various challenges. They explore themes such as inequality, survival, technology, and the consequences of human actions.
finished with queries


In [None]:
client.close()