# Load data into the local vector database

This notebook requires that you have downloaded a set of markdown documents into the `corpus` folder.

In my case, I used the Apache 2.0-licensed repository https://github.com/simonw/til

To download a copy, run the following command from the notebook's working directory (the code below looks for the relative path `corpus`):

```bash
git clone https://github.com/simonw/til.git corpus
```

In [None]:
import chromadb
import os

In [None]:
# Names and locations shared by the cells below.
corpus_dir = "corpus"
collection_name = "Corpus"
# NOTE(review): `device` is not referenced by any cell visible here — confirm
# it is still needed.
device = "cuda"

# Chroma client that persists its data under the relative "db/" path.
client = chromadb.PersistentClient(path="db/")

In [None]:
# Drop the collection left over from a previous run so it can be rebuilt.
# Delete only when a collection with this exact name exists: testing merely
# for "any collection" makes delete_collection fail when the database holds
# other collections but not this one.
# Depending on the chromadb version, list_collections() yields either
# Collection objects or plain name strings, so both shapes are handled.
existing_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
}
if collection_name in existing_names:
    print("Removing collection")
    client.delete_collection(name=collection_name)

In [None]:
# Create the collection that the cells below populate and query.
collection = client.create_collection(name=collection_name)

In [None]:
def add_file_to_collection(full_path, file_id):
    """Read one text file and add it to ``collection`` as a single document.

    Args:
        full_path: Path of the file to read. It is also stored as the
            document's ``source`` metadata.
        file_id: Integer used to build the document id; it is zero-padded to
            nine digits (``7`` becomes ``"000000007"``).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is locale-dependent and may not be UTF-8.
    with open(full_path, 'rt', encoding='utf-8') as f:
        doc = f.read()
    # The file is already closed here; no handle is held while the
    # collection processes the document.
    collection.add(
        documents=[doc],
        metadatas=[{"source": full_path}],
        ids=[f"{file_id:09}"],
    )

In [None]:
class FileIdGenerator:
    """Hands out consecutive integer ids, starting at 1."""

    # Last id handed out; every instance starts from this class-level 0.
    file_id = 0

    def get_id(self):
        """Advance the counter by one and return the new value."""
        next_id = self.file_id + 1
        self.file_id = next_id
        return next_id

def recurse_directory(directory, file_id_gen):
    """Walk ``directory`` recursively and add every Markdown file found.

    Each regular file whose name ends in ``.md`` (case-insensitive) is passed
    to ``add_file_to_collection`` together with a fresh id taken from
    ``file_id_gen``. Sub-directories are visited depth-first.

    Args:
        directory: Path of the directory to scan.
        file_id_gen: Object exposing ``get_id()``, called once per Markdown
            file to obtain that file's id.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # file-to-id assignment reproducible from one run to the next.
    for file_name in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, file_name)
        if os.path.isfile(full_path):
            if file_name.lower().endswith('.md'):
                add_file_to_collection(full_path, file_id_gen.get_id())
        elif os.path.isdir(full_path):
            # Descend only into real directories. Dangling symlinks and
            # special files are neither file nor directory and would make
            # os.listdir raise if treated as one.
            recurse_directory(full_path, file_id_gen)

# Load every Markdown file found under corpus_dir; the fresh generator
# numbers the documents starting from 1.
recurse_directory(corpus_dir, FileIdGenerator())

In [None]:
# Smoke test: ask the collection for the top two results for a sample question.
collection.query(query_texts=["How do I serve traffic to a subdomain?"], n_results=2)