# Weaviate Quickstart

## 1. Load API Key with .env

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# The file should define both values that are read later via os.environ, e.g.:
#   WEAVIATE_API_KEY=abc
#   WEAVIATE_URL=https://example.com
load_dotenv()

## 2. Initialize Weaviate client

Next, use your API key to initialize your client.
Weaviate is open source so you can deploy your clusters and collections locally or use their managed cloud clusters.
Just make sure you're pointing to the correct cluster endpoint.

In [None]:
import os
import weaviate
import weaviate.classes as wvc

# Validate the connection settings up front so a missing variable fails here
# with a clear message instead of being passed to the client as None.
_missing = [name for name in ("WEAVIATE_URL", "WEAVIATE_API_KEY") if not os.environ.get(name)]
if _missing:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing)
        + ". Add them to your .env file and re-run the load_dotenv() cell."
    )

# Connect to the cluster at WEAVIATE_URL, authenticating with the API key.
client = weaviate.connect_to_wcs(
    cluster_url=os.environ["WEAVIATE_URL"],
    auth_credentials=weaviate.auth.AuthApiKey(os.environ["WEAVIATE_API_KEY"]),
)

## 3. Prepare language model for vector encoder

We use a small sentence-transformers language model to create 384-dimensional embeddings. You can swap out models for generating embeddings.

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained all-MiniLM-L6-v2 model from sentence-transformers.
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode one sample string and keep the result for inspection.
embeddings = embedder.encode("Example sentences.")
# Size of the first axis of the result; presumably the embedding width for a
# single input string — display `dimension` to confirm.
dimension = embeddings.shape[0]

## 4. Create a Weaviate collection

This creates a collection named "quickstart" that performs similarity search with your vectors.

In [None]:
collection_name = "quickstart"

# Creating a collection whose name already exists fails, so drop any leftover
# copy from an earlier run to keep this cell re-runnable.
# NOTE: this permanently deletes the objects stored in that collection.
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

collection = client.collections.create(
    collection_name,
    # Vectors are supplied by this notebook, so no vectorizer is configured.
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
        distance_metric=wvc.config.VectorDistances.COSINE  # select preferred distance metric
    ),
)

## 5. Generate vector values from wikipedia text

We retrieve a Wikipedia-based dataset with Hugging Face's datasets library. Note that this dataset contains Cohere's vectors, but we're generating our own in this notebook.

In [None]:
from datasets import load_dataset
import torch
import cohere
from tqdm.auto import tqdm

from itertools import islice

max_docs = 1000  # Increase to use the entire dataset
docs_stream = load_dataset("Cohere/wikipedia-22-12-simple-embeddings", split="train", streaming=True)

# islice caps the stream at exactly max_docs items (also when max_docs is 0),
# and wrapping the iterable in tqdm closes the progress bar when the loop ends.
# Each document is embedded from its title and text joined by a space, and the
# title, text and url are stored as properties alongside the vector.
vectors = [
    wvc.data.DataObject(
        vector=embedder.encode(doc["title"] + " " + doc["text"]).tolist(),
        properties={
            "title": doc["title"],
            "text": doc["text"],
            "url": doc["url"],
        },
    )
    for doc in tqdm(islice(docs_stream, max_docs), total=max_docs)
]


## 6. Upsert vectors

Now that you’ve created your collection and the vector embeddings of your wikipedia data, you can upsert these vectors into your collection.

In [None]:
# Send all prepared objects to the collection in a single insert_many call and
# keep the response in `res` for inspection.
res = collection.data.insert_many(vectors)

## 7. Check that the vectors were inserted into the collection

In [None]:
# Display the insert_many response as the cell output.
res

## 8. Run a similarity search

In [None]:
query = "What's the greatest human advancement of all time?"


def search(query: str):
    """Return the properties of the first object matching `query`.

    The query text is embedded with `embedder` and sent to the collection as a
    near-vector search. An empty dict is returned when the search yields no
    objects.
    """
    query_vector = embedder.encode(query).tolist()
    response = collection.query.near_vector(
        near_vector=query_vector,
        limit=2,
        return_metadata=wvc.query.MetadataQuery(certainty=True),
    )
    hits = response.objects
    # Guard clause: nothing came back, so there are no properties to return.
    if not hits:
        return {}
    return hits[0].properties


search(query)

## 9. Deploy an app to port forward and share publicly

In [None]:
import gradio as gr

# Close any Gradio apps that are still running, e.g. from an earlier run of this cell.
gr.close_all()

# The UI passes the textbox contents to search() and renders the result as JSON.
iface = gr.Interface(
    fn=search,
    inputs=gr.Textbox(lines=2, placeholder="Type your query here..."),
    outputs="json",
    title="Search Wikipedia text with a Weaviate Collection",
    description="This is a simple wikipedia search engine powered by transformers and weaviate",
)

# Run the Gradio app on localhost:5000 (change server_port to use another port)
iface.launch(server_port=5000, inline=False, quiet=True)

# Only advertise the forwarded URL when one is configured; otherwise the
# message would end in "None".
port_forward_url = os.environ.get("PORT_FORWARD_URL")
if port_forward_url:
    print(f"See your app deployed publically with the port you're securely forwarding: {port_forward_url}")
else:
    print("PORT_FORWARD_URL is not set; the app is only reachable locally on port 5000.")

## 10. Clean up

When you no longer need the collection, call `client.collections.delete` with the collection's name to delete it.

In [None]:
try:
    # Remove the collection created earlier in this notebook.
    client.collections.delete(collection_name)
finally:
    # Release the client's connection even if the delete fails.
    client.close()