Commit ba35aef

handle batched embeddings
1 parent 07a7837 commit ba35aef

File tree: 2 files changed (+111, -35 lines)


llama_cpp/_internals.py

Lines changed: 22 additions & 0 deletions
@@ -506,6 +506,14 @@ def __del__(self):
         self._llama_batch_free(self.batch)
         self.batch = None

+    def n_tokens(self) -> int:
+        assert self.batch is not None
+        return self.batch.n_tokens
+
+    def reset(self):
+        assert self.batch is not None
+        self.batch.n_tokens = 0
+
     def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
         assert self.batch is not None
         n_tokens = len(batch)
@@ -518,6 +526,20 @@ def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
             self.batch.logits[i] = logits_all
         self.batch.logits[n_tokens - 1] = True

+    def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
+        assert self.batch is not None
+        n_tokens = len(batch)
+        n_tokens0 = self.batch.n_tokens
+        self.batch.n_tokens += n_tokens
+        for i in range(n_tokens):
+            j = n_tokens0 + i
+            self.batch.token[j] = batch[i]
+            self.batch.pos[j] = i
+            self.batch.seq_id[j][0] = seq_id
+            self.batch.n_seq_id[j] = 1
+            self.batch.logits[j] = logits_all
+        self.batch.logits[n_tokens0 + n_tokens - 1] = True  # last token of this sequence

 class _LlamaTokenDataArray:
     def __init__(self, *, n_vocab: int):
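The new add_sequence packs several prompts into one llama_batch: tokens from all sequences sit flat in the same buffer, pos restarts at zero for each sequence, and seq_id keeps the sequences apart for the decoder. A minimal pure-Python sketch of that packing logic follows; ToyBatch is a hypothetical stand-in for the C struct, not part of the library:

    # ToyBatch mimics only the llama_batch fields that add_sequence touches.
    from typing import Sequence

    class ToyBatch:
        def __init__(self, capacity: int):
            self.n_tokens = 0
            self.token = [0] * capacity                    # all sequences, packed flat
            self.pos = [0] * capacity                      # position within each sequence
            self.seq_id = [[0] for _ in range(capacity)]
            self.n_seq_id = [0] * capacity
            self.logits = [False] * capacity               # which tokens produce output

        def reset(self) -> None:
            self.n_tokens = 0

        def add_sequence(self, tokens: Sequence[int], seq_id: int) -> None:
            start = self.n_tokens
            self.n_tokens += len(tokens)
            for i, tok in enumerate(tokens):
                j = start + i
                self.token[j] = tok
                self.pos[j] = i                            # restarts at 0 per sequence
                self.seq_id[j][0] = seq_id
                self.n_seq_id[j] = 1
                self.logits[j] = False
            self.logits[start + len(tokens) - 1] = True    # output only the last token

    batch = ToyBatch(capacity=8)
    batch.add_sequence([11, 12, 13], seq_id=0)
    batch.add_sequence([21, 22], seq_id=1)
    print(batch.pos[:batch.n_tokens])     # [0, 1, 2, 0, 1]
    print(batch.logits[:batch.n_tokens])  # [False, False, True, False, True]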

llama_cpp/llama.py

Lines changed: 89 additions & 35 deletions
@@ -717,10 +717,44 @@ def create_embedding(
         Returns:
             An embedding object.
         """
-        assert self._ctx.ctx is not None
         assert self._model.model is not None
         model_name: str = model if model is not None else self.model_path

+        # get numeric embeddings
+        embeds, total_tokens = self.embed(input, return_count=True)
+
+        # convert to CreateEmbeddingResponse
+        data = [
+            {
+                "object": "embedding",
+                "embedding": emb,
+                "index": idx,
+            }
+            for idx, emb in enumerate(embeds)
+        ]
+
+        return {
+            "object": "list",
+            "data": data,
+            "model": model_name,
+            "usage": {
+                "prompt_tokens": total_tokens,
+                "total_tokens": total_tokens,
+            },
+        }
+
+    def embed(
+        self,
+        input: Union[str, List[str]],
+        normalize: bool = True,
+        truncate: bool = True,
+        return_count: bool = False,
+    ):
+        """Embed a string or a list of strings.
+
+        Args:
+            input: The utf-8 encoded string (or list of strings) to embed.
+            normalize: Whether to L2-normalize each embedding.
+            truncate: Whether to truncate each input to the context window.
+            return_count: Whether to also return the total number of tokens processed.
+
+        Returns:
+            A list of embeddings, plus the token count when return_count is True.
+        """
+        assert self._ctx.ctx is not None
+        n_embd = self.n_embd()
+        n_ctx = self.n_ctx()
+
         if self.context_params.embedding == False:
             raise RuntimeError(
                 "Llama model must be created with embedding=True to call this method"
@@ -734,48 +768,68 @@ def create_embedding(
         else:
             inputs = input

+        # L2-normalization helper (named so it cannot shadow the `normalize` flag)
+        def _normalize(x):
+            norm = np.linalg.norm(x)
+            return [v / norm for v in x]
+
+        # reset batch
+        self._batch.reset()
+
+        # decode and fetch embeddings
-        data: List[Embedding] = []
+        data: List[List[float]] = []
+
+        def decode_batch(n_seq: int):
+            llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
+            self._ctx.decode(self._batch)
+            self._batch.reset()
+
+            # store embeddings
+            for i in range(n_seq):
+                embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[:n_embd]
+                if normalize:
+                    embedding = _normalize(embedding)
+                data.append(embedding)
+
+        # init state
         total_tokens = 0
-        for index, input in enumerate(inputs):
-            tokens = self.tokenize(input.encode("utf-8"), special=True)
-            self.reset()
-            self.eval(tokens)
+        p_batch = 0  # sequences in the pending batch
+        t_batch = 0  # tokens in the pending batch
+
+        # accumulate batches and encode
+        for text in inputs:
+            tokens = self.tokenize(text.encode("utf-8"))
+            if truncate:
+                tokens = tokens[:n_ctx]
             n_tokens = len(tokens)
-            total_tokens += n_tokens
-            embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[
-                : llama_cpp.llama_n_embd(self._model.model)
-            ]

-            data.append(
-                {
-                    "object": "embedding",
-                    "embedding": embedding,
-                    "index": index,
-                }
-            )
-        if self.verbose:
-            llama_cpp.llama_print_timings(self._ctx.ctx)
+            # check for overrun
+            if n_tokens > n_ctx:
+                raise ValueError(
+                    f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
+                )

-        return {
-            "object": "list",
-            "data": data,
-            "model": model_name,
-            "usage": {
-                "prompt_tokens": total_tokens,
-                "total_tokens": total_tokens,
-            },
-        }
+            # flush the pending batch if this sequence will not fit
+            if n_tokens + t_batch > n_ctx:
+                decode_batch(p_batch)
+                total_tokens += t_batch
+                p_batch = 0
+                t_batch = 0

-    def embed(self, input: str) -> List[float]:
-        """Embed a string.
+            # add to batch
+            self._batch.add_sequence(tokens, p_batch, False)
+            p_batch += 1
+            t_batch += n_tokens

-        Args:
-            input: The utf-8 encoded string to embed.
+        # handle last batch
+        decode_batch(p_batch)
+        total_tokens += t_batch

-        Returns:
-            A list of embeddings
-        """
-        return list(map(float, self.create_embedding(input)["data"][0]["embedding"]))
+        if self.verbose:
+            llama_cpp.llama_print_timings(self._ctx.ctx)
+
+        if return_count:
+            return data, total_tokens
+        else:
+            return data

     def _create_completion(
         self,
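The accumulate-and-flush loop above is greedy: a sequence joins the pending batch unless it would push the batch past n_ctx tokens, in which case the pending batch is decoded first; a final decode handles whatever is left. A self-contained sketch of just that scheduling decision (plan_batches is illustrative, not a library function):

    from typing import List

    def plan_batches(lengths: List[int], n_ctx: int) -> List[List[int]]:
        batches: List[List[int]] = []
        current: List[int] = []
        used = 0
        for n in lengths:
            if n > n_ctx:  # mirrors the overrun check in embed()
                raise ValueError(f"Requested tokens ({n}) exceed context window of {n_ctx}")
            if used + n > n_ctx:  # flush before adding, like decode_batch(p_batch)
                batches.append(current)
                current, used = [], 0
            current.append(n)
            used += n
        batches.append(current)  # handle last batch
        return batches

    print(plan_batches([100, 200, 300, 250], n_ctx=512))
    # [[100, 200], [300], [250]]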
