Install client library:


In [None]:
!pip install weaviate-client

Connect to client:

In [1]:
import weaviate
import json

In [2]:
client = weaviate.Client(
    url="http://localhost:8080",   
)

# Schema

Normal class schema with properties

In [15]:
class_obj = {'class':'Product',
             'properties' : [
                 {
                     'name':'title',
                     'dataType':['text']
                 },
                 {
                     'name':'body',
                     'dataType':['text'],
                 },
             ],}

In [16]:
client.schema.create_class(class_obj)

Class schema with Vectorizer

In [28]:
class_obj = {'class':'test2class',
             'properties' : [
                 {
                     'name':'title',
                     'dataType':['text']
                 },
             ],
            'vectorizer':'text2vec-transformers',
            "description": "Content that will be vectorized",
            "moduleConfig": {
            "text2vec-transformers": {
              "skip": False,
              "vectorizePropertyName": False }
             }
            }	
client.schema.create_class(class_obj)

In [45]:
class_obj = {'class':'new_class_test2',
             'vectorizer':None,
             "moduleConfig": {
            "text2vec-contextionary": {
            "vectorizeClassName": False
            },
             'properties' : [
                 {
                     'name':'title',
                     'dataType':['text']
                 },
             ],
            }	}
client.schema.create_class(class_obj)

In [3]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Weaviate
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from torch import cuda, bfloat16
from langchain.llms import HuggingFacePipeline
import transformers

from langchain.chains import ChatVectorDBChain,RetrievalQA

  from .autonotebook import tqdm as notebook_tqdm


In [46]:
loader = PyPDFLoader("../pdf_querying_summarization/temp_file/doc.pdf")

documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

docs = text_splitter.split_documents(documents)

embeddings = HuggingFaceInstructEmbeddings(
            model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"}
        )

load INSTRUCTOR_Transformer
max_seq_length  512


In [15]:
docs

[Document(page_content='A Short Introduction to  \nthe GRI Standards\nwww.globalreporting.org www.globalreporting.org', metadata={'source': '../pdf_querying_summarization/temp_file/doc.pdf', 'page': 0}),
 Document(page_content='2\nA Short Introduction to the GRI Standards\nIntroduction\nThe GRI Standards are a modular system \nof interconnected standards. They allow \norganizations to publicly report the \nimpacts of their activities in a structured \nway that is transparent to stakeholders \nand other interested parties.\nThis Short Introduction will:\n•  give new users of the GRI Standards an overview \nof how the Standards are set up, and equip them to \nstart working with the various elements involved in the \nreporting process;\n•  be of assistance to experienced users in gaining an \nunderstanding of changes in the system and the role of \nthe GRI Sector Standards; and\n•  aid stakeholders and other information users (such \nas analysts and policymakers) to understand how the \nr

In [17]:
def get_embeddings(text):
        #string_text = [doc.page_content for doc in text]
        embeddings_vec = embeddings.embed_query(text)
        return embeddings_vec

In [18]:
get_embeddings

<function __main__.get_embeddings(text)>

In [48]:
embeddings_test = [get_embeddings(text) for text in [doc.page_content for doc in docs]]

In [36]:
embeddings_test

[[0.057569120079278946,
  -0.02708945795893669,
  0.023874597623944283,
  0.0009388840990141034,
  -0.06048440560698509,
  -0.043822143226861954,
  -0.08492988348007202,
  0.02667873725295067,
  -0.013093858025968075,
  -0.003932942170649767,
  0.030655259266495705,
  0.03695352002978325,
  -0.016235746443271637,
  -0.10847834497690201,
  -0.026136623695492744,
  0.03254524618387222,
  0.00814276747405529,
  -0.040576886385679245,
  -0.01013531256467104,
  -0.01243689563125372,
  0.0033774753101170063,
  -0.04618673399090767,
  -0.0391654297709465,
  0.020864658057689667,
  -0.01573486439883709,
  -0.05547720193862915,
  0.03146849572658539,
  0.0071702152490615845,
  -0.0423942469060421,
  -0.041371092200279236,
  0.046772610396146774,
  -0.020595015957951546,
  0.007197203580290079,
  -0.04409021884202957,
  0.045877523720264435,
  -0.007676710374653339,
  -0.022702062502503395,
  -0.020662201568484306,
  0.0005983285955153406,
  -0.0022936032619327307,
  -0.05718176066875458,
  0.02

In [49]:
def serialize_document(doc, embed):
    return {
        "page_content": doc.page_content,
        "vector": embed,
    }

In [50]:
serialized_docs = [
    serialize_document(doc, embed) 
    for doc, embed in zip(docs, embeddings_test)
]

In [51]:
serialized_docs

[{'page_content': 'A Short Introduction to  \nthe GRI Standards\nwww.globalreporting.org www.globalreporting.org',
  'vector': [0.057569120079278946,
   -0.02708945795893669,
   0.023874597623944283,
   0.0009388840990141034,
   -0.06048440560698509,
   -0.043822143226861954,
   -0.08492988348007202,
   0.02667873725295067,
   -0.013093858025968075,
   -0.003932942170649767,
   0.030655259266495705,
   0.03695352002978325,
   -0.016235746443271637,
   -0.10847834497690201,
   -0.026136623695492744,
   0.03254524618387222,
   0.00814276747405529,
   -0.040576886385679245,
   -0.01013531256467104,
   -0.01243689563125372,
   0.0033774753101170063,
   -0.04618673399090767,
   -0.0391654297709465,
   0.020864658057689667,
   -0.01573486439883709,
   -0.05547720193862915,
   0.03146849572658539,
   0.0071702152490615845,
   -0.0423942469060421,
   -0.041371092200279236,
   0.046772610396146774,
   -0.020595015957951546,
   0.007197203580290079,
   -0.04409021884202957,
   0.0458775237202644

In [21]:
serialized_docs = [serialize_document(doc,embeddings_test[docs]) for doc in docs]

TypeError: list indices must be integers or slices, not list

In [23]:
serialized_docs

[{'page_content': 'A Short Introduction to  \nthe GRI Standards\nwww.globalreporting.org www.globalreporting.org',
  'vector': [0.057569120079278946,
   -0.02708945795893669,
   0.023874597623944283,
   0.0009388840990141034,
   -0.06048440560698509,
   -0.043822143226861954,
   -0.08492988348007202,
   0.02667873725295067,
   -0.013093858025968075,
   -0.003932942170649767,
   0.030655259266495705,
   0.03695352002978325,
   -0.016235746443271637,
   -0.10847834497690201,
   -0.026136623695492744,
   0.03254524618387222,
   0.00814276747405529,
   -0.040576886385679245,
   -0.01013531256467104,
   -0.01243689563125372,
   0.0033774753101170063,
   -0.04618673399090767,
   -0.0391654297709465,
   0.020864658057689667,
   -0.01573486439883709,
   -0.05547720193862915,
   0.03146849572658539,
   0.0071702152490615845,
   -0.0423942469060421,
   -0.041371092200279236,
   0.046772610396146774,
   -0.020595015957951546,
   0.007197203580290079,
   -0.04409021884202957,
   0.0458775237202644

In [52]:
class_names = "new_class_test2"

In [24]:
with client.batch as batch:
    for serialized_doc in serialized_docs:
        try:
            batch.add_data_object(
                {
                    "page_content": serialized_doc["page_content"]
                },
                class_name="new_class_test",  # Replace with the actual class name
                vector=serialized_doc["vector"]
            )
        except Exception as e:
            print(f"Failed to add a document: {str(e)}")

In [53]:
client.batch.configure(batch_size=100) 
with client.batch as batch:
    for i, data_obj in enumerate(serialized_docs):
        batch.add_data_object(data_obj, class_names,vector=embeddings_test[i])

In [65]:
from weaviate.util import generate_uuid5

In [7]:
from weaviate.util import generate_uuid5

class_name = "YourClassName"  # Replace with your class name
data_objs = [
    {"title": f"Object {i+1}"} for i in range(5)  # Replace with your actual objects
]
client.batch.configure(batch_size=100)  # Configure batch
with client.batch as batch:
    for data_obj in data_objs:
        batch.add_data_object(
            data_obj,
            class_name,
            uuid=generate_uuid5(data_obj)  # Optional: Specify an object ID
        )

In [7]:
model_id = 'meta-llama/Llama-2-70b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

        # set quantization configuration to load large model with less GPU memory
        # this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16
        )

        # begin initializing HF items, need auth token for these
model_config = transformers.AutoConfig.from_pretrained(
            model_id
        )

model = transformers.AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            config=model_config,
            quantization_config=bnb_config,
            device_map='auto'
        )
model.eval()

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
generate_text = transformers.pipeline(
            model=model, tokenizer=tokenizer,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # we pass model parameters here too
            #stopping_criteria=stopping_criteria,  # without this model rambles during chat
            temperature=0.01,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            max_new_tokens=512,  # mex number of tokens to generate in the output
            repetition_penalty=1.1  # without this output begins repeating
        )
llm = HuggingFacePipeline(pipeline=generate_text)

Loading checkpoint shards: 100%|██████████| 15/15 [00:17<00:00,  1.13s/it]


In [60]:
vectorstore = Weaviate(client, "new_class_test2", "vector")

In [61]:
QA_document = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever())

In [62]:
QA_document.run("What are the GRI standards?")

ValueError: Error during query: [{'locations': [{'column': 6, 'line': 1}], 'message': "explorer: get class: vector search: object vector search at index new_class_test2: shard new_class_test2_QvoEEUrhAWwc: vector search: knn search: distance between entrypoint and query node: vector lengths don't match: 768 vs 384", 'path': ['Get', 'New_class_test2']}]

In [33]:
from langchain.chains import ChatVectorDBChain
qa = ChatVectorDBChain.from_llm(llm, vectorstore)



In [35]:
chat_history = []

In [36]:
qa({"question": "What are the GRI standards?", "chat_history": chat_history})

TypeError: class name must be of type str but was <class 'langchain.embeddings.huggingface.HuggingFaceInstructEmbeddings'>

In [63]:
client.schema.get(class_name="new_class_test2")

{'class': 'New_class_test2',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
   'vectorizeClassName': True}},
 'multiTenancyConfig': {'enabled': False},
 'properties': [{'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-transformers': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'title',
   'tokenization': 'word'},
  {'dataType': ['text'],
   'description': "This property was generated by Weaviate's auto-schema feature on Thu Oct  5 12:28:16 2023",
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-transformers': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'page_content',
   'tokenization': 'word'},
  {'dataType': ['number[]'],
   'description': "This property was g

In [30]:
client.data_object.get(class_name="new_class_test")

{'deprecations': [],
 'objects': [{'class': 'New_class_test',
   'creationTimeUnix': 1696507275744,
   'id': '56ddbcbc-7db9-453f-85e8-d4ee9fee6c52',
   'lastUpdateTimeUnix': 1696507275744,
   'properties': {'page_content': '2\nA Short Introduction to the GRI Standards\nIntroduction\nThe GRI Standards are a modular system \nof interconnected standards. They allow \norganizations to publicly report the \nimpacts of their activities in a structured \nway that is transparent to stakeholders \nand other interested parties.\nThis Short Introduction will:\n•  give new users of the GRI Standards an overview \nof how the Standards are set up, and equip them to \nstart working with the various elements involved in the \nreporting process;\n•  be of assistance to experienced users in gaining an \nunderstanding of changes in the system and the role of \nthe GRI Sector Standards; and\n•  aid stakeholders and other information users (such \nas analysts and policymakers) to understand how the \nrepor

In [15]:
client.data_object.get_by_id('f73db862-ee72-4810-9ee5-10e3a8decaa5')



{'class': 'Wipwip',
 'creationTimeUnix': 1696429405484,
 'id': 'f73db862-ee72-4810-9ee5-10e3a8decaa5',
 'lastUpdateTimeUnix': 1696429405484,
 'properties': {'page_content': 'PO Box 10039\n1001 EA  Amsterdam\nThe Netherlands\ninfo@globalreporting.orgwww.globalreporting.org'},
 'vectorWeights': None}

In [29]:
json_docs = json.dumps(serialized_docs)

In [30]:
json_docs

'[{"page_content": "A Short Introduction to  \\nthe GRI Standards\\nwww.globalreporting.org www.globalreporting.org", "metadata": {"source": "../pdf_querying_summarization/temp_file/doc.pdf", "page": 0}}, {"page_content": "2\\nA Short Introduction to the GRI Standards\\nIntroduction\\nThe GRI Standards are a modular system \\nof interconnected standards. They allow \\norganizations to publicly report the \\nimpacts of their activities in a structured \\nway that is transparent to stakeholders \\nand other interested parties.\\nThis Short Introduction will:\\n\\u2022  give new users of the GRI Standards an overview \\nof how the Standards are set up, and equip them to \\nstart working with the various elements involved in the \\nreporting process;\\n\\u2022  be of assistance to experienced users in gaining an \\nunderstanding of changes in the system and the role of \\nthe GRI Sector Standards; and\\n\\u2022  aid stakeholders and other information users (such \\nas analysts and policyma

In [8]:
type(docs)

list

In [5]:
data_objs = [{"title": doc} for doc in docs]

In [6]:
from weaviate.util import generate_uuid5

In [45]:
client.batch.configure(batch_size=50)

with client.batch as batch:
    for serialized_doc in serialized_docs:
        batch.add_data_object(
            serialized_doc,
            class_name="Test_with_vectorizer",
        )

TypeError: Object of type Document is not JSON serializable

In [3]:
name= "londonwelcome"
class_obj = {
            "class": str(name),
            "description": "normal desc",
        }
client.schema.create_class(class_obj)

Getting the schemas:

In [18]:
client.schema.get()

{'classes': [{'class': 'Vec',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'answer',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       've

In [16]:
class_name = [cls['class'] for cls in test['classes']]

In [17]:
class_name

['Vec',
 'Article',
 'Product',
 'Product_with_vectorizer',
 'Web_data',
 'JeopardyQuestion',
 'YourClassName',
 'LangChain_138df2f1432f4c5c825cff1cf8d79ee9',
 'LangChain_7a5927f0c2a145c3b6c40189e7977144',
 'LangChain_cbbf8734b3684ba2a61621f2f3448040',
 'LangChain_9f38fc3bc65c4ce5887f4bf58df3ffc1',
 'LangChain_f66777b2f3f64ca382e8b5fe6e334e7e',
 'LangChain_a86c9267ee0c4e5d8ee96c859d8ebb3a',
 'Londonwelcome',
 'Forbidden_feelings',
 'Vectorizzar']

Deleting class:

In [None]:
client.schema.delete_class('Vec')

Example of a schema:

In [None]:
{
  "classes": [
    {
      "class": "Article",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-openai": {
          "model": "ada",
          "modelVersion": "002",
          "type": "text",
          "vectorizeClassName": true
        }
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "moduleConfig": {
            "text2vec-openai": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "title",
          "tokenization": "word"
        },
        {
          "dataType": [
            "text"
          ],
          "moduleConfig": {
            "text2vec-openai": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "body",
          "tokenization": "word"
        }
      ],
      "replicationConfig": {
        "factor": 1
      },
      "shardingConfig": {
        "virtualPerPhysical": 128,
        "desiredCount": 1,
        "actualCount": 1,
        "desiredVirtualCount": 128,
        "actualVirtualCount": 128,
        "key": "_id",
        "strategy": "hash",
        "function": "murmur3"
      },
      "vectorIndexConfig": {
        "skip": false,
        "cleanupIntervalSeconds": 300,
        "maxConnections": 64,
        "efConstruction": 128,
        "ef": -1,
        "dynamicEfMin": 100,
        "dynamicEfMax": 500,
        "dynamicEfFactor": 8,
        "vectorCacheMaxObjects": 1000000000000,
        "flatSearchCutoff": 40000,
        "distance": "cosine",
        "pq": {
          "enabled": false,
          "bitCompression": false,
          "segments": 0,
          "centroids": 256,
          "encoder": {
            "type": "kmeans",
            "distribution": "log-normal"
          }
        }
      },
      "vectorIndexType": "hnsw",
      "vectorizer": "text2vec-openai"
    }
  ]
}

# Modules

Modules are added in the docker-compose file. To enable modules we need to add the variable 'ENABLE_MODULES' who takes as value the name of the wanted modules.

```yaml
services:
  weaviate:
    environment:
      ENABLE_MODULES: 'text2vec-contextionary,generative-cohere,qna-openai'
```

Some modules require module-specific variables, for example for a backup-S3 we use the variable: BACKUP_S3_BUCKET. 
For Vectorizer there is a list of all vectorizer modules here: https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules

For default Vectorizer: 
```yaml
services:
  weaviate:
    environment:
      DEFAULT_VECTORIZER_MODULE: text2vec-huggingface
```

Finally we can create our own modules if needed: https://weaviate.io/developers/weaviate/modules/other-modules/custom-modules

# Backups

Integrated with AWS S3, GCS, Azure.
Can backup an entire instance or the selected classes. 

Add backup modules to docker file: 

```ENABLE_MODULES=backup-filesystem,text2vec-transformers```

To make backups:

```POST /v1/backups/{backend}```

Making a backup with filesystem:

In [None]:
#client = weaviate.Client('http://localhost:8080')
result = client.backup.create(
  backup_id='test_backup',
  backend='filesystem',
  include_classes=['Article', 'Product'],
  wait_for_completion=True,
)

print(result)

# Indexes

Weaviate use vector-first storage system to manage all storage operations with a vector index, which is needed to store the vectors that capture the semantic or context of the data which in turn allowing to do semantic search. 

HNSW is supported by Weaviate and is considered the default vector index type. HNSW is great for querying but less good in the building of the DB process. 

Configuring index type:
* vectorIndexType is the ANN to be used, by default it is Hierarchical Navigable Small World (HNSW)
* vectorIndexConfig allow to set parameters to the vector index such as:
    * distance (cosine, euclidian)
    * ef (high ef is high accuracy but slower search)
    * efConstruction
    * etc... https://weaviate.io/developers/weaviate/configuration/indexes

In [21]:
schema_index = {
  "class": "Article",
  "description": "string",
  "properties": [
    {
      "name": "title",
      "description": "string",
      "dataType": ["text"]
    }
  ],
  "vectorIndexType": "hnsw",
  "vectorIndexConfig": {
    "skip": False,
    "ef": 100,
    "efConstruction": 128,
    "maxConnections": 64,
  }
}

# Authentication

We can also manage authentication and authorization: https://weaviate.io/developers/weaviate/configuration/authentication

# Persistence

With Docker and Kubernetes, can persist data by mounting a volume to store the data outside the container, then when the container is reloaded so is the persist volume. 

To initiate the volumes it needs to be specified in the docker-compose file like that: 

```yaml
services:
  weaviate:
    volumes:
      - /var/weaviate:/var/lib/weaviate
    environment:
      CLUSTER_HOSTNAME: 'node1'
```

Example of a config file that supports persist volume:

```yaml
---
version: '3.4'
services:
  weaviate:
    command:
    - --host
    - 0.0.0.0
    - --port
    - '8080'
    - --scheme
    - http
    image: semitechnologies/weaviate:1.21.2
    ports:
    - 8080:8080
    restart: on-failure:0
    volumes:
      - /var/weaviate:/var/lib/weaviate # <== set a volume here
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: ''
      CLUSTER_HOSTNAME: 'node1' # <== this can be set to an arbitrary name
...
```

# Monitoring

Compatible with Prometheus and Grafana. To enable it in the docker-compose add: ```PROMETHEUS_MONITORING_ENABLED=true```, then it becomes accessible here: ```<hostname>:2112/metrics``` https://weaviate.io/developers/weaviate/configuration/monitoring

# Managing Data

## Creation of DB

In [22]:
class_name = "Web_data"
class_obj = {'class':class_name,
             'vectorizer': 'text2vec-transformers',
             'vectorIndexConfig':{
                 'distance':"cosine",
             },
            "properties": [
                {
                    "name": "title",
                    "dataType": ["text"]
                },
                {
                    "name": "chunk",
                    "dataType": ["text"]
                },
                {
                    "name": "chunk_no",
                    "dataType": ["int"]
                },
                {
                    "name": "url",
                    "dataType": ["text"],
                    "tokenization": "field"
                },
            ],
        }
client.schema.create_class(class_obj)

In [24]:
response=client.schema.get(class_name)
print(json.dumps(response, indent=2))

{
  "class": "Web_data",
  "invertedIndexConfig": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanupIntervalSeconds": 60,
    "stopwords": {
      "additions": null,
      "preset": "en",
      "removals": null
    }
  },
  "moduleConfig": {
    "text2vec-transformers": {
      "poolingStrategy": "masked_mean",
      "vectorizeClassName": true
    }
  },
  "multiTenancyConfig": {
    "enabled": false
  },
  "properties": [
    {
      "dataType": [
        "text"
      ],
      "indexFilterable": true,
      "indexSearchable": true,
      "moduleConfig": {
        "text2vec-transformers": {
          "skip": false,
          "vectorizePropertyName": false
        }
      },
      "name": "title",
      "tokenization": "word"
    },
    {
      "dataType": [
        "text"
      ],
      "indexFilterable": true,
      "indexSearchable": true,
      "moduleConfig": {
        "text2vec-transformers": {
          "skip": false,
          "vectorizePropertyName": false
   

In [25]:
uuid = client.data_object.create({
    'question': 'This vector DB is OSS & supports automatic property type inference on import',
    # 'answer': 'Weaviate',  # schema properties can be omitted
    'somePropNotInTheSchema': 123,  # will be automatically added as a number property
}, 'JeopardyQuestion')

print(uuid)  # the return value is the object's UUID

36f36ee9-088f-478a-b126-71529631d019


In [28]:
from weaviate.util import generate_uuid5

class_name = "YourClassName"  # Replace with your class name
data_objs = [
    {"title": f"Object {i+1}"} for i in range(5)
]
client.batch.configure(batch_size=100)  # Configure batch
with client.batch as batch:
    for data_obj in data_objs:
        batch.add_data_object(
            data_obj,
            class_name,
            uuid=generate_uuid5(data_obj)
            # tenant="tenantA"  # If multi-tenancy is enabled, specify the tenant to which the object will be added.
        )

In [30]:
client.data_object.get(class_name='YourClassName')

{'deprecations': [],
 'objects': [{'class': 'YourClassName',
   'creationTimeUnix': 1694617319061,
   'id': '0ca5ea3e-7661-48b5-9711-927cbf56fb73',
   'lastUpdateTimeUnix': 1694617319061,
   'properties': {'title': 'Object 3'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617410523,
   'id': '23bdb1f7-2149-5199-88ef-4d939e2ad2ed',
   'lastUpdateTimeUnix': 1694617410523,
   'properties': {'title': 'Object 4'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617319061,
   'id': '2cf9567a-fe6e-4020-a0c7-f25c80c90f87',
   'lastUpdateTimeUnix': 1694617319061,
   'properties': {'title': 'Object 5'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617410523,
   'id': '376747d4-5685-56f2-8cb2-d260291c1ff0',
   'lastUpdateTimeUnix': 1694617410523,
   'properties': {'title': 'Object 3'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617319061,
   'id': 

In [6]:
class_obj = {
    "class": "Vec",
    "vectorizer": "text2vec-transformers",  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
}

client.schema.create_class(class_obj)

UnexpectedStatusCodeException: Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Vec" already exists'}]}.

In [8]:
client.data_object.get()

{'deprecations': None,
 'objects': [{'class': 'Vec',
   'creationTimeUnix': 1694529898324,
   'id': '10b92bef-cfe9-4762-8703-97dd1d8b9a77',
   'lastUpdateTimeUnix': 1694529898324,
   'properties': {'answer': 'the nose or snout',
    'category': 'ANIMALS',
    'question': 'The gavial looks very much like a crocodile except for this bodily feature'},
   'vectorWeights': None},
  {'class': 'Vec',
   'creationTimeUnix': 1694529898324,
   'id': '39aae306-204c-481d-8548-6257d67a1fcd',
   'lastUpdateTimeUnix': 1694529898324,
   'properties': {'answer': 'Sound barrier',
    'category': 'SCIENCE',
    'question': 'In 70-degree air, a plane traveling at about 1,130 feet per second breaks it'},
   'vectorWeights': None},
  {'class': 'Vec',
   'creationTimeUnix': 1694529898325,
   'id': '470426e8-0383-4e88-a66e-22e7b4d84dd4',
   'lastUpdateTimeUnix': 1694529898325,
   'properties': {'answer': 'the atmosphere',
    'category': 'SCIENCE',
    'question': 'Changes in the tropospheric layer of this ar

In [9]:
client.schema.get()

{'classes': [{'class': 'Vec',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'answer',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       've

In [4]:
class_obj = {
    "class": "Vec",
    "vectorizer": "text2vec-huggingface",  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    "moduleConfig": {
        "text2vec-huggingface": {},
    }
}

client.schema.create_class(class_obj)

In [18]:
import requests
resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')
data = json.loads(resp.text)  # Load data

client.batch.configure(batch_size=100)  # Configure batch
with client.batch as batch:  # Initialize a batch process
    for i, d in enumerate(data):  # Batch import data
        print(f"importing question: {i+1}")
        properties = {
            "answer": d["Answer"],
            "question": d["Question"],
            "category": d["Category"],
        }
        batch.add_data_object(
            data_object=properties,
            class_name="Vec"
        )

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [20]:
import weaviate
import json

client = weaviate.Client(
    url = "http://localhost:8080",  # Replace with your endpoint
    #auth_client_secret=weaviate.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"),  # Replace w/ your Weaviate instance API key

)

response = (
    client.query
    .get("Question", ["question", "answer", "category"])
    .with_near_text({"concepts": ["biology"]})
    .with_limit(2)
    .do()
)

print(json.dumps(response, indent=4))

{
    "errors": [
        {
            "locations": [
                {
                    "column": 6,
                    "line": 1
                }
            ],
            "message": "Cannot query field \"Question\" on type \"GetObjectsObj\".",
            "path": null
        }
    ]
}
