Install client library:


In [None]:
!pip install weaviate-client

Connect to client:

In [1]:
import weaviate
import json

In [2]:
client = weaviate.Client(
    url="http://localhost:8080",   
)

# Schema

Normal class schema with properties

In [15]:
class_obj = {'class':'Product',
             'properties' : [
                 {
                     'name':'title',
                     'dataType':['text']
                 },
                 {
                     'name':'body',
                     'dataType':['text'],
                 },
             ],}

In [16]:
client.schema.create_class(class_obj)

Class schema with Vectorizer

In [17]:
class_obj = {'class':'Product_with_vectorizer',
             'properties' : [
                 {
                     'name':'title',
                     'dataType':['text']
                 },
             ],
             'vectorizer':'text2vec-transformers'
             }

client.schema.create_class(class_obj)

Getting the schemas:

In [18]:
client.schema.get()

{'classes': [{'class': 'Vec',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'answer',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       've

Deleting class:

In [None]:
client.schema.delete_class('Vec')

Example of a schema:

In [None]:
{
  "classes": [
    {
      "class": "Article",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-openai": {
          "model": "ada",
          "modelVersion": "002",
          "type": "text",
          "vectorizeClassName": true
        }
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "moduleConfig": {
            "text2vec-openai": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "title",
          "tokenization": "word"
        },
        {
          "dataType": [
            "text"
          ],
          "moduleConfig": {
            "text2vec-openai": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "body",
          "tokenization": "word"
        }
      ],
      "replicationConfig": {
        "factor": 1
      },
      "shardingConfig": {
        "virtualPerPhysical": 128,
        "desiredCount": 1,
        "actualCount": 1,
        "desiredVirtualCount": 128,
        "actualVirtualCount": 128,
        "key": "_id",
        "strategy": "hash",
        "function": "murmur3"
      },
      "vectorIndexConfig": {
        "skip": false,
        "cleanupIntervalSeconds": 300,
        "maxConnections": 64,
        "efConstruction": 128,
        "ef": -1,
        "dynamicEfMin": 100,
        "dynamicEfMax": 500,
        "dynamicEfFactor": 8,
        "vectorCacheMaxObjects": 1000000000000,
        "flatSearchCutoff": 40000,
        "distance": "cosine",
        "pq": {
          "enabled": false,
          "bitCompression": false,
          "segments": 0,
          "centroids": 256,
          "encoder": {
            "type": "kmeans",
            "distribution": "log-normal"
          }
        }
      },
      "vectorIndexType": "hnsw",
      "vectorizer": "text2vec-openai"
    }
  ]
}

# Modules

Modules are added in the docker-compose file. To enable modules we need to add the variable 'ENABLE_MODULES' who takes as value the name of the wanted modules.

```yaml
services:
  weaviate:
    environment:
      ENABLE_MODULES: 'text2vec-contextionary,generative-cohere,qna-openai'
```

Some modules require module-specific variables, for example for a backup-S3 we use the variable: BACKUP_S3_BUCKET. 
For Vectorizer there is a list of all vectorizer modules here: https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules

For default Vectorizer: 
```yaml
services:
  weaviate:
    environment:
      DEFAULT_VECTORIZER_MODULE: text2vec-huggingface
```

Finally we can create our own modules if needed: https://weaviate.io/developers/weaviate/modules/other-modules/custom-modules

# Backups

Integrated with AWS S3, GCS, Azure.
Can backup an entire instance or the selected classes. 

Add backup modules to docker file: 

```ENABLE_MODULES=backup-filesystem,text2vec-transformers```

To make backups:

```POST /v1/backups/{backend}```

Making a backup with filesystem:

In [None]:
#client = weaviate.Client('http://localhost:8080')
result = client.backup.create(
  backup_id='test_backup',
  backend='filesystem',
  include_classes=['Article', 'Product'],
  wait_for_completion=True,
)

print(result)

# Indexes

Weaviate use vector-first storage system to manage all storage operations with a vector index, which is needed to store the vectors that capture the semantic or context of the data which in turn allowing to do semantic search. 

HNSW is supported by Weaviate and is considered the default vector index type. HNSW is great for querying but less good in the building of the DB process. 

Configuring index type:
* vectorIndexType is the ANN to be used, by default it is Hierarchical Navigable Small World (HNSW)
* vectorIndexConfig allow to set parameters to the vector index such as:
    * distance (cosine, euclidian)
    * ef (high ef is high accuracy but slower search)
    * efConstruction
    * etc... https://weaviate.io/developers/weaviate/configuration/indexes

In [21]:
schema_index = {
  "class": "Article",
  "description": "string",
  "properties": [
    {
      "name": "title",
      "description": "string",
      "dataType": ["text"]
    }
  ],
  "vectorIndexType": "hnsw",
  "vectorIndexConfig": {
    "skip": False,
    "ef": 100,
    "efConstruction": 128,
    "maxConnections": 64,
  }
}

# Authentication

We can also manage authentication and authorization: https://weaviate.io/developers/weaviate/configuration/authentication

# Persistence

With Docker and Kubernetes, can persist data by mounting a volume to store the data outside the container, then when the container is reloaded so is the persist volume. 

To initiate the volumes it needs to be specified in the docker-compose file like that: 

```yaml
services:
  weaviate:
    volumes:
      - /var/weaviate:/var/lib/weaviate
    environment:
      CLUSTER_HOSTNAME: 'node1'
```

Example of a config file that supports persist volume:

```yaml
---
version: '3.4'
services:
  weaviate:
    command:
    - --host
    - 0.0.0.0
    - --port
    - '8080'
    - --scheme
    - http
    image: semitechnologies/weaviate:1.21.2
    ports:
    - 8080:8080
    restart: on-failure:0
    volumes:
      - /var/weaviate:/var/lib/weaviate # <== set a volume here
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: ''
      CLUSTER_HOSTNAME: 'node1' # <== this can be set to an arbitrary name
...
```

# Monitoring

Compatible with Prometheus and Grafana. To enable it in the docker-compose add: ```PROMETHEUS_MONITORING_ENABLED=true```, then it becomes accessible here: ```<hostname>:2112/metrics``` https://weaviate.io/developers/weaviate/configuration/monitoring

# Managing Data

## Creation of DB

In [22]:
class_name = "Web_data"
class_obj = {'class':class_name,
             'vectorizer': 'text2vec-transformers',
             'vectorIndexConfig':{
                 'distance':"cosine",
             },
            "properties": [
                {
                    "name": "title",
                    "dataType": ["text"]
                },
                {
                    "name": "chunk",
                    "dataType": ["text"]
                },
                {
                    "name": "chunk_no",
                    "dataType": ["int"]
                },
                {
                    "name": "url",
                    "dataType": ["text"],
                    "tokenization": "field"
                },
            ],
        }
client.schema.create_class(class_obj)

In [24]:
response=client.schema.get(class_name)
print(json.dumps(response, indent=2))

{
  "class": "Web_data",
  "invertedIndexConfig": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanupIntervalSeconds": 60,
    "stopwords": {
      "additions": null,
      "preset": "en",
      "removals": null
    }
  },
  "moduleConfig": {
    "text2vec-transformers": {
      "poolingStrategy": "masked_mean",
      "vectorizeClassName": true
    }
  },
  "multiTenancyConfig": {
    "enabled": false
  },
  "properties": [
    {
      "dataType": [
        "text"
      ],
      "indexFilterable": true,
      "indexSearchable": true,
      "moduleConfig": {
        "text2vec-transformers": {
          "skip": false,
          "vectorizePropertyName": false
        }
      },
      "name": "title",
      "tokenization": "word"
    },
    {
      "dataType": [
        "text"
      ],
      "indexFilterable": true,
      "indexSearchable": true,
      "moduleConfig": {
        "text2vec-transformers": {
          "skip": false,
          "vectorizePropertyName": false
   

In [25]:
uuid = client.data_object.create({
    'question': 'This vector DB is OSS & supports automatic property type inference on import',
    # 'answer': 'Weaviate',  # schema properties can be omitted
    'somePropNotInTheSchema': 123,  # will be automatically added as a number property
}, 'JeopardyQuestion')

print(uuid)  # the return value is the object's UUID

36f36ee9-088f-478a-b126-71529631d019


In [28]:
from weaviate.util import generate_uuid5

class_name = "YourClassName"  # Replace with your class name
data_objs = [
    {"title": f"Object {i+1}"} for i in range(5)
]
client.batch.configure(batch_size=100)  # Configure batch
with client.batch as batch:
    for data_obj in data_objs:
        batch.add_data_object(
            data_obj,
            class_name,
            uuid=generate_uuid5(data_obj)
            # tenant="tenantA"  # If multi-tenancy is enabled, specify the tenant to which the object will be added.
        )

In [30]:
client.data_object.get(class_name='YourClassName')

{'deprecations': [],
 'objects': [{'class': 'YourClassName',
   'creationTimeUnix': 1694617319061,
   'id': '0ca5ea3e-7661-48b5-9711-927cbf56fb73',
   'lastUpdateTimeUnix': 1694617319061,
   'properties': {'title': 'Object 3'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617410523,
   'id': '23bdb1f7-2149-5199-88ef-4d939e2ad2ed',
   'lastUpdateTimeUnix': 1694617410523,
   'properties': {'title': 'Object 4'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617319061,
   'id': '2cf9567a-fe6e-4020-a0c7-f25c80c90f87',
   'lastUpdateTimeUnix': 1694617319061,
   'properties': {'title': 'Object 5'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617410523,
   'id': '376747d4-5685-56f2-8cb2-d260291c1ff0',
   'lastUpdateTimeUnix': 1694617410523,
   'properties': {'title': 'Object 3'},
   'vectorWeights': None},
  {'class': 'YourClassName',
   'creationTimeUnix': 1694617319061,
   'id': 

In [6]:
class_obj = {
    "class": "Vec",
    "vectorizer": "text2vec-transformers",  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
}

client.schema.create_class(class_obj)

UnexpectedStatusCodeException: Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Vec" already exists'}]}.

In [8]:
client.data_object.get()

{'deprecations': None,
 'objects': [{'class': 'Vec',
   'creationTimeUnix': 1694529898324,
   'id': '10b92bef-cfe9-4762-8703-97dd1d8b9a77',
   'lastUpdateTimeUnix': 1694529898324,
   'properties': {'answer': 'the nose or snout',
    'category': 'ANIMALS',
    'question': 'The gavial looks very much like a crocodile except for this bodily feature'},
   'vectorWeights': None},
  {'class': 'Vec',
   'creationTimeUnix': 1694529898324,
   'id': '39aae306-204c-481d-8548-6257d67a1fcd',
   'lastUpdateTimeUnix': 1694529898324,
   'properties': {'answer': 'Sound barrier',
    'category': 'SCIENCE',
    'question': 'In 70-degree air, a plane traveling at about 1,130 feet per second breaks it'},
   'vectorWeights': None},
  {'class': 'Vec',
   'creationTimeUnix': 1694529898325,
   'id': '470426e8-0383-4e88-a66e-22e7b4d84dd4',
   'lastUpdateTimeUnix': 1694529898325,
   'properties': {'answer': 'the atmosphere',
    'category': 'SCIENCE',
    'question': 'Changes in the tropospheric layer of this ar

In [9]:
client.schema.get()

{'classes': [{'class': 'Vec',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'answer',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': "This property was generated by Weaviate's auto-schema feature on Tue Sep 12 14:44:58 2023",
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-transformers': {'skip': False,
       've

In [4]:
class_obj = {
    "class": "Vec",
    "vectorizer": "text2vec-huggingface",  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    "moduleConfig": {
        "text2vec-huggingface": {},
    }
}

client.schema.create_class(class_obj)

In [18]:
import requests
resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')
data = json.loads(resp.text)  # Load data

client.batch.configure(batch_size=100)  # Configure batch
with client.batch as batch:  # Initialize a batch process
    for i, d in enumerate(data):  # Batch import data
        print(f"importing question: {i+1}")
        properties = {
            "answer": d["Answer"],
            "question": d["Question"],
            "category": d["Category"],
        }
        batch.add_data_object(
            data_object=properties,
            class_name="Vec"
        )

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [20]:
import weaviate
import json

client = weaviate.Client(
    url = "http://localhost:8080",  # Replace with your endpoint
    #auth_client_secret=weaviate.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"),  # Replace w/ your Weaviate instance API key

)

response = (
    client.query
    .get("Question", ["question", "answer", "category"])
    .with_near_text({"concepts": ["biology"]})
    .with_limit(2)
    .do()
)

print(json.dumps(response, indent=4))

{
    "errors": [
        {
            "locations": [
                {
                    "column": 6,
                    "line": 1
                }
            ],
            "message": "Cannot query field \"Question\" on type \"GetObjectsObj\".",
            "path": null
        }
    ]
}
