In [1]:
!pip install google-cloud-aiplatform



In [2]:
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel
from google.cloud.aiplatform_v1.types import index as index_types
import uuid


In [9]:
# 1. Initialise the Vertex AI SDK.
# NOTE(review): PROJECT_ID looks like a redacted placeholder — replace it with
# the real GCP project ID before running.
PROJECT_ID = "project-xxxxxxxxxxxxx"
# Region handed to the SDK together with the project.
LOCATION = "us-east1"
# Passes the project and region to the SDK once, before any other aiplatform call in this notebook.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [10]:
# 2. Locate the source text file in Google Cloud Storage (GCS).
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from google.cloud import storage

# Bucket name and object name of the text file downloaded in the next cell.
BUCKET_NAME = "vertexai88978786"
BLOB_NAME = "books.txt"

In [11]:
# The client is created without explicit credentials or project, so it relies
# on whatever the runtime environment provides.
storage_client = storage.Client()
# Resolve bucket -> blob handles from the names configured above.
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(BLOB_NAME)
# Read the whole object into memory as a single string; it is split into
# lines in the following cell.
text_data = blob.download_as_text()

In [12]:
# Build the list of texts to embed: split the file on "\n", trim surrounding
# whitespace from every line, and drop lines that are empty after trimming.
documents = list(filter(None, map(str.strip, text_data.split("\n"))))

In [13]:
# 3. Create embeddings
# Load the pretrained "text-embedding-005" model. The same model object is
# used for the document embeddings and, later, for the query embedding.
embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-005")

In [14]:
# Embed every document, sending several texts per request instead of issuing
# one network round-trip per document (the previous form made len(documents)
# sequential calls).
# get_embeddings() returns one embedding per input text, in input order, so
# embeddings[i] still corresponds to documents[i].
# NOTE(review): the maximum number of texts per request depends on the model
# and region; 5 is a deliberately conservative value — raise it after checking
# the Vertex AI text-embeddings limits for this project.
EMBED_BATCH_SIZE = 5

embeddings = [
    embedding.values
    for start in range(0, len(documents), EMBED_BATCH_SIZE)
    for embedding in embedding_model.get_embeddings(
        documents[start:start + EMBED_BATCH_SIZE]
    )
]

In [15]:
# 4. Pick a display name for the STREAM_UPDATE vector index: a fixed prefix
# followed by the first six hex characters of a random UUID, so repeated runs
# do not reuse the same name. The name is echoed for reference.
index_id = "my-book-index-" + uuid.uuid4().hex[:6]
print(index_id)

my-book-index-6124f1


In [17]:
# Create a Tree-AH vector index named after index_id, using cosine distance.
# NOTE(review): an earlier inline note said "768 for Gecko", but the model
# loaded in this notebook is text-embedding-005 — confirm the expected size.
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=index_id,
    dimensions=len(embeddings[0]),     # vector length read from the first embedding; IndexError if embeddings is empty
    distance_measure_type="COSINE_DISTANCE",
    index_update_method="STREAM_UPDATE",   # allows datapoints to be upserted later
    approximate_neighbors_count=10,         # required for Tree-AH
    leaf_node_embedding_count=500
)
# Presumably blocks until the index resource is ready before it is deployed — TODO confirm.
index.wait()

In [18]:
# Create the index endpoint that the index is deployed to and queried through.
# Its display name is the index name with an "-endpoint" suffix.
endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"{index_id}-endpoint",
    public_endpoint_enabled=True,   # public endpoint; set False for a private one
)


In [19]:
# Deploy the index to the endpoint under a fixed deployment ID. The same
# literal ID is passed to find_neighbors in the fallback query cell, so the
# two must be kept in sync if either is changed.
endpoint.deploy_index(index=index,  deployed_index_id="my_unique_deployment_id_v2")

<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7a3f18577a70> 
resource name: projects/582918007154/locations/us-east1/indexEndpoints/3624434527847317504

In [20]:
# Wrap each embedding in an IndexDatapoint. The datapoint ID is the vector's
# position in `embeddings`, rendered as a string; the result-printing cells
# convert it back with int() to look up the matching entry in `documents`.
datapoints = [
    index_types.IndexDatapoint(datapoint_id=str(position), feature_vector=vector)
    for position, vector in enumerate(embeddings)
]

In [21]:
# Send all datapoints to the index in a single upsert call, then report how
# many were submitted (this counts the local list, not a server acknowledgement).
index.upsert_datapoints(datapoints=datapoints)
print(f"Upserted {len(datapoints)} vectors ")

Upserted 500 vectors 


In [22]:
# 1. Build the query embedding with the same model used for the documents.
query_text = "find similar to Bible"
# get_embeddings takes a list; the single result's vector is taken from index 0.
query_emb = embedding_model.get_embeddings([query_text])[0].values

In [23]:
# 2. Query the deployed index for the 5 nearest neighbours of the query vector.
# The deployment ID is read from the first entry of endpoint.deployed_indexes.
# NOTE(review): a later cell repeats this call with a literal deployment ID and
# is labelled "on error this code is written" — presumably this version
# failed; confirm and keep only one of the two.
neighbors_list = endpoint.find_neighbors(
    deployed_index_id=endpoint.deployed_indexes[0].id,
    queries=[query_emb],
    num_neighbors=5,
)

In [26]:
# Fallback query cell, written after an earlier find_neighbors attempt hit an
# error. It passes the deployment ID as a literal and prints the results
# defensively.
import time  # only needed if the optional sleep below is re-enabled

print("Waiting for index to sync...")
# Give it a moment if you just upserted
# time.sleep(60)

# 1. Call find_neighbors with the literal ID that was given to deploy_index.
neighbors_list = endpoint.find_neighbors(
    deployed_index_id="my_unique_deployment_id_v2",
    queries=[query_emb],
    num_neighbors=5,
)

print("\n Query Results:")
if not neighbors_list or not neighbors_list[0]:
    print("No results found. The index may still be syncing or the deployment is warming up.")
else:
    # neighbors_list holds one result list per query; only one query was sent.
    for neighbor in neighbors_list[0]:
        # Only the int() conversion is guarded, so an unrelated ValueError
        # raised while formatting is not misreported as a bad ID.
        try:
            idx = int(neighbor.id)
        except ValueError:
            print(f"- neighbor.id is not an integer: {neighbor.id}")
            continue
        # Accept only IDs that are valid positions in `documents`. The lower
        # bound matters: a negative ID would otherwise pass the check and
        # silently index from the end of the list, printing the wrong text.
        if 0 <= idx < len(documents):
            print(f"- {documents[idx]} (score={neighbor.distance:.4f})")
        else:
            print(f"- [ID {idx} found in index but not in local documents list]")

Waiting for index to sync...

 Query Results:
- The Testament (score=0.4721)
- Book-Title (score=0.4897)
- The Gospel of Judas: A Novel (score=0.4903)
- Poisonwood Bible Edition Uk (score=0.4967)
- Chocolate Jesus (score=0.5041)


In [27]:
# Compact listing of the neighbours returned for the first (and only) query.
# Unlike the guarded version in the previous cell, nothing is checked here: a
# non-integer ID raises ValueError and an out-of-range ID raises IndexError.
print("\n Query Results:")
for neighbor in neighbors_list[0]:  # first query
    idx = int(neighbor.id)  # datapoint ID, used as a position in `documents`
    print(f"- {documents[idx]} (score={neighbor.distance:.4f})")


 Query Results:
- The Testament (score=0.4721)
- Book-Title (score=0.4897)
- The Gospel of Judas: A Novel (score=0.4903)
- Poisonwood Bible Edition Uk (score=0.4967)
- Chocolate Jesus (score=0.5041)
