Feat/indexing faissless (#173)

* fix: fix searcher always being reloaded * feat: implement torch kmeans * chore: lower cutoff * chore: move warning * chore: higher kmeans batch size * chore: argument support * chore: restore all default behaviour when `use_faiss` is True after having been false * chore: lint * chore: print exception if one occurs when using pytorch indexing * chore: make _original_train_kmeans robust to subsequent calls * nit: comment feat: rework kmeans to be closer to FAISS chore: store kmeans functions as class attributes fix: method assignment chore: more memory efficient lint chore: lower bsize, results unaffected feat: better batching, slower max doc count chore: batch size safe for 8gb GPUs chore: more elaborate warning chore: use external lib to support minibatching, revert to homebrew later * poetry lock * lint * chore: better batch size * 0.0.8 dependency prep
bclavie · Mar 18, 2024 · d27b693 · d27b693
1 parent f8c53cb
commit d27b693
Show file tree

Hide file tree

Showing 8 changed files with 1,954 additions and 1,163 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -17,19 +17,14 @@ jobs:
         with:
           python-version: 3.9
 
-      - name: Cache Poetry virtualenv
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/pypoetry/virtualenvs
-          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-poetry-
-
       - name: Install Poetry
         uses: snok/install-poetry@v1.3.1
 
+      - name: Clean poetry
+        run: rm poetry.lock
+
       - name: Install dependencies
-        run: poetry install --with dev
+        run: poetry install --with dev --no-cache
 
       - name: Run tests
         run: poetry run pytest tests/
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "RAGatouille"
-version = "0.0.7post11"
+version = "0.0.8"
 description = "Library to facilitate the use of state-of-the-art retrieval models in common RAG contexts."
 authors = ["Benjamin Clavie <ben@clavie.eu>"]
 license = "Apache-2.0"
@@ -9,20 +9,19 @@ packages = [{include = "ragatouille"}]
 repository = "https://github.com/bclavie/ragatouille"
 
 [tool.poetry.dependencies]
-python = ">=3.8.1,<4.0"
-ruff = "^0.1.9"
+python = ">=3.9,<4.0"
 faiss-cpu = "^1.7.4"
 transformers = "^4.36.2"
 voyager = "^2.0.2"
-aiohttp = "3.9.1"
 sentence-transformers = "^2.2.2"
-torch = "^2.0.1"
-llama-index = "^0.9.24"
+torch = ">=1.13"
+llama-index = ">=0.7"
 langchain_core = "^0.1.4"
 colbert-ai = "0.2.19"
 langchain = "^0.1.0"
 onnx = "^1.15.0"
 srsly = "2.4.8"
+fast-pytorch-kmeans= "0.2.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.0"

diff --git a/ragatouille/RAGPretrainedModel.py b/ragatouille/RAGPretrainedModel.py
@@ -180,6 +180,7 @@ def index(
         document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
         preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
         bsize: int = 32,
+        use_faiss: bool = False,
     ):
         """Build an index from a list of documents.
 
@@ -215,6 +216,7 @@ def index(
             max_document_length=max_document_length,
             overwrite=overwrite_index,
             bsize=bsize,
+            use_faiss=use_faiss,
         )
 
     def add_to_index(
@@ -227,6 +229,7 @@ def add_to_index(
         document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
         preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None,
         bsize: int = 32,
+        use_faiss: bool = False,
     ):
         """Add documents to an existing index.
 
@@ -258,6 +261,7 @@ def add_to_index(
             new_docid_metadata_map=new_docid_metadata_map,
             index_name=index_name,
             bsize=bsize,
+            use_faiss=use_faiss,
         )
 
     def delete_from_index(

diff --git a/ragatouille/__init__.py b/ragatouille/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.0.7post11"
+__version__ = "0.0.8"
 from .RAGPretrainedModel import RAGPretrainedModel
 from .RAGTrainer import RAGTrainer
 

diff --git a/ragatouille/models/colbert.py b/ragatouille/models/colbert.py
@@ -35,7 +35,7 @@ def __init__(
         self.pid_docid_map = None
         self.docid_pid_map = None
         self.docid_metadata_map = None
-        self.base_model_max_tokens = 512
+        self.base_model_max_tokens = 510
         if n_gpu == -1:
             n_gpu = 1 if torch.cuda.device_count() == 0 else torch.cuda.device_count()
 
@@ -86,7 +86,7 @@ def __init__(
             )
             self.base_model_max_tokens = (
                 self.inference_ckpt.bert.config.max_position_embeddings
-            )
+            ) - 4
 
         self.run_context = Run().context(self.run_config)
         self.run_context.__enter__()  # Manually enter the context
@@ -125,6 +125,7 @@ def add_to_index(
         new_docid_metadata_map: Optional[List[dict]] = None,
         index_name: Optional[str] = None,
         bsize: int = 32,
+        use_faiss: bool = False,
     ):
         self.index_name = index_name if index_name is not None else self.index_name
         if self.index_name is None:
@@ -181,6 +182,7 @@ def add_to_index(
             new_collection,
             verbose=self.verbose != 0,
             bsize=bsize,
+            use_faiss=use_faiss,
         )
         self.config = self.model_index.config
 
@@ -294,6 +296,7 @@ def index(
         max_document_length: int = 256,
         overwrite: Union[bool, str] = "reuse",
         bsize: int = 32,
+        use_faiss: bool = False,
     ):
         self.collection = collection
         self.config.doc_maxlen = max_document_length
@@ -341,6 +344,7 @@ def index(
             overwrite,
             verbose=self.verbose != 0,
             bsize=bsize,
+            use_faiss=use_faiss,
         )
         self.config = self.model_index.config
         self._save_index_metadata()
@@ -494,7 +498,11 @@ def _set_inference_max_tokens(
             not hasattr(self, "inference_ckpt_len_set")
             or self.inference_ckpt_len_set is False
         ):
-            if max_tokens == "auto" or max_tokens > self.base_model_max_tokens:
+            if max_tokens == "auto":
+                max_tokens = self.base_model_max_tokens
+            else:
+                max_tokens = int(max_tokens)
+            if max_tokens > self.base_model_max_tokens:
                 max_tokens = self.base_model_max_tokens
                 percentile_90 = np.percentile(
                     [len(x.split(" ")) for x in documents], 90
@@ -504,6 +512,7 @@ def _set_inference_max_tokens(
                     self.base_model_max_tokens,
                 )
                 max_tokens = max(256, max_tokens)
+
                 if max_tokens > 300:
                     print(
                         f"Your documents are roughly {percentile_90} tokens long at the 90th percentile!",

diff --git a/ragatouille/models/index.py b/ragatouille/models/index.py
@@ -1,13 +1,17 @@
 from abc import ABC, abstractmethod
+from copy import deepcopy
 from pathlib import Path
 from time import time
 from typing import Any, List, Literal, Optional, TypeVar, Union
 
 import srsly
 import torch
 from colbert import Indexer, IndexUpdater, Searcher
+from colbert.indexing.collection_indexer import CollectionIndexer
 from colbert.infra import ColBERTConfig
 
+from ragatouille.models import torch_kmeans
+
 IndexType = Literal["FLAT", "HNSW", "PLAID"]
 
 
@@ -126,6 +130,8 @@ class HNSWModelIndex(ModelIndex):
 class PLAIDModelIndex(ModelIndex):
     _DEFAULT_INDEX_BSIZE = 32
     index_type = "PLAID"
+    faiss_kmeans = staticmethod(deepcopy(CollectionIndexer._train_kmeans))
+    pytorch_kmeans = staticmethod(torch_kmeans._train_kmeans)
 
     def __init__(self, config: ColBERTConfig) -> None:
         super().__init__(config)
@@ -168,21 +174,6 @@ def build(
         bsize = kwargs.get("bsize", PLAIDModelIndex._DEFAULT_INDEX_BSIZE)
         assert isinstance(bsize, int)
 
-        if torch.cuda.is_available():
-            import faiss
-
-            if not hasattr(faiss, "StandardGpuResources"):
-                print(
-                    "________________________________________________________________________________\n"
-                    "WARNING! You have a GPU available, but only `faiss-cpu` is currently installed.\n",
-                    "This means that indexing will be slow. To make use of your GPU.\n"
-                    "Please install `faiss-gpu` by running:\n"
-                    "pip uninstall --y faiss-cpu & pip install faiss-gpu\n",
-                    "________________________________________________________________________________",
-                )
-                print("Will continue with CPU indexing in 5 seconds...")
-                time.sleep(5)
-
         nbits = 2
         if len(collection) < 5000:
             nbits = 8
@@ -192,22 +183,76 @@ def build(
             self.config, ColBERTConfig(nbits=nbits, index_bsize=bsize)
         )
 
+        # Instruct colbert-ai to disable forking if nranks == 1
+        self.config.avoid_fork_if_possible = True
+
         if len(collection) > 100000:
             self.config.kmeans_niters = 4
         elif len(collection) > 50000:
             self.config.kmeans_niters = 10
         else:
             self.config.kmeans_niters = 20
 
-        # Instruct colbert-ai to disable forking if nranks == 1
-        self.config.avoid_fork_if_possible = True
-        indexer = Indexer(
-            checkpoint=checkpoint,
-            config=self.config,
-            verbose=verbose,
+        # Monkey-patch colbert-ai to avoid using FAISS
+        monkey_patching = (
+            len(collection) < 100000 and kwargs.get("use_faiss", False) is False
         )
-        indexer.configure(avoid_fork_if_possible=True)
-        indexer.index(name=index_name, collection=collection, overwrite=overwrite)
+        if monkey_patching:
+            print(
+                "---- WARNING! You are using PLAID with an experimental replacement for FAISS for greater compatibility ----"
+            )
+            print("This is a behaviour change from RAGatouille 0.8.0 onwards.")
+            print(
+                "This works fine for most users and smallish datasets, but can be considerably slower than FAISS and could cause worse results in some situations."
+            )
+            print(
+                "If you're confident with FAISS working on your machine, pass use_faiss=True to revert to the FAISS-using behaviour."
+            )
+            print("--------------------")
+            CollectionIndexer._train_kmeans = self.pytorch_kmeans
+
+            # Try to keep runtime stable -- these are values that empirically didn't degrade performance at all on 3 benchmarks.
+            # More tests required before warning can be removed.
+            try:
+                indexer = Indexer(
+                    checkpoint=checkpoint,
+                    config=self.config,
+                    verbose=verbose,
+                )
+                indexer.configure(avoid_fork_if_possible=True)
+                indexer.index(
+                    name=index_name, collection=collection, overwrite=overwrite
+                )
+            except Exception as err:
+                print(
+                    f"PyTorch-based indexing did not succeed with error: {err}",
+                    "! Reverting to using FAISS and attempting again...",
+                )
+                monkey_patching = False
+        if monkey_patching is False:
+            CollectionIndexer._train_kmeans = self.faiss_kmeans
+            if torch.cuda.is_available():
+                import faiss
+
+                if not hasattr(faiss, "StandardGpuResources"):
+                    print(
+                        "________________________________________________________________________________\n"
+                        "WARNING! You have a GPU available, but only `faiss-cpu` is currently installed.\n",
+                        "This means that indexing will be slow. To make use of your GPU.\n"
+                        "Please install `faiss-gpu` by running:\n"
+                        "pip uninstall --y faiss-cpu & pip install faiss-gpu\n",
+                        "________________________________________________________________________________",
+                    )
+                    print("Will continue with CPU indexing in 5 seconds...")
+                    time.sleep(5)
+            indexer = Indexer(
+                checkpoint=checkpoint,
+                config=self.config,
+                verbose=verbose,
+            )
+            indexer.configure(avoid_fork_if_possible=True)
+            indexer.index(name=index_name, collection=collection, overwrite=overwrite)
+
         return self
 
     def _load_searcher(