From d52ab0ec093422610a5f69f836429028238aa986 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Tue, 4 Jun 2024 20:16:28 +0200
Subject: [PATCH] Chore: replace logging strings in rolling_window.py by printf-style strings to improve performance through lazy evaluation

---
 semantic_router/splitters/rolling_window.py | 56 +++++++++++++--------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index dc393b55..0e90eeba 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -85,10 +85,10 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
         token_count = tiktoken_length(docs[0])
         if token_count > self.max_split_tokens:
             logger.info(
-                f"Single document exceeds the maximum token limit "
-                f"of {self.max_split_tokens}. "
-                "Splitting to sentences before semantically splitting."
+                "Single document exceeds the maximum token limit of %s. Splitting to sentences before semantically splitting.",
+                self.max_split_tokens,
             )
+
             docs = split_to_sentences(docs[0])
         encoded_docs = self._encode_documents(docs)
         similarities = self._calculate_similarity_scores(encoded_docs)
@@ -125,7 +125,7 @@ def _encode_documents(self, docs: List[str]) -> np.ndarray:
                 batch_embeddings = self.encoder(batch_docs)
                 embeddings.extend(batch_embeddings)
             except Exception as e:
-                logger.error(f"Error encoding documents {batch_docs}: {e}")
+                logger.error("Error encoding documents %s: %s", batch_docs, e)
                 raise
 
         return np.array(embeddings)
@@ -145,12 +145,14 @@ def _find_split_indices(self, similarities: List[float]) -> List[int]:
         split_indices = []
         for idx, score in enumerate(similarities):
-            logger.debug(f"Similarity score at index {idx}: {score}")
+            logger.debug("Similarity score at index %d: %f", idx, score)
             if score < self.calculated_threshold:
                 logger.debug(
-                    f"Adding to split_indices due to score < threshold: "
-                    f"{score} < {self.calculated_threshold}"
+                    "Adding to split_indices due to score < threshold: %f < %f",
+                    score,
+                    self.calculated_threshold,
                 )
+                # Split after the document at idx
                 split_indices.append(idx + 1)
         return split_indices
@@ -173,7 +175,9 @@ def _find_optimal_threshold(self, docs: List[str], similarity_scores: List[float
             self.calculated_threshold = (low + high) / 2
             split_indices = self._find_split_indices(similarity_scores)
             logger.debug(
-                f"Iteration {iteration}: Trying threshold: {self.calculated_threshold}"
+                "Iteration %d: Trying threshold: %f",
+                iteration,
+                self.calculated_threshold,
             )
 
             # Calculate the token counts for each split using the cumulative sums
@@ -187,8 +191,9 @@ def _find_optimal_threshold(self, docs: List[str], similarity_scores: List[float
             # Calculate the median token count for the splits
             median_tokens = np.median(split_token_counts)
             logger.debug(
-                f"Iteration {iteration}: Median tokens per split: {median_tokens}"
+                "Iteration %d: Median tokens per split: %d", iteration, median_tokens
             )
+
             if (
                 self.min_split_tokens - self.split_tokens_tolerance
                 <= median_tokens
@@ -205,9 +210,11 @@ def _find_optimal_threshold(self, docs: List[str], similarity_scores: List[float
             iteration += 1
 
         logger.debug(
-            f"Optimal threshold {self.calculated_threshold} found "
-            f"with median tokens ({median_tokens}) in target range "
-            f"({self.min_split_tokens}-{self.max_split_tokens})."
+            "Optimal threshold %f found with median tokens (%d) in target range (%d-%d).",
+            self.calculated_threshold,
+            median_tokens,
+            self.min_split_tokens,
+            self.max_split_tokens,
         )
         return self.calculated_threshold
 
@@ -234,8 +241,9 @@ def _split_documents(
 
         for doc_idx, doc in enumerate(docs):
             doc_token_count = token_counts[doc_idx]
-            logger.debug(f"Accumulative token count: {current_tokens_count} tokens")
-            logger.debug(f"Document token count: {doc_token_count} tokens")
+            logger.debug("Accumulative token count: %d tokens", current_tokens_count)
+            logger.debug("Document token count: %d tokens", doc_token_count)
+
             # Check if current index is a split point based on similarity
             if doc_idx + 1 in split_indices:
                 if (
@@ -260,9 +268,11 @@ def _split_documents(
                         )
                     )
                     logger.debug(
-                        f"Split finalized with {current_tokens_count} tokens due to "
-                        f"threshold {self.calculated_threshold}."
+                        "Split finalized with %d tokens due to threshold %f.",
+                        current_tokens_count,
+                        self.calculated_threshold,
                     )
+
                     current_split, current_tokens_count = [], 0
                     splits_by_threshold += 1
                     continue  # Move to the next document after splitting
@@ -280,8 +290,9 @@ def _split_documents(
                     )
                     splits_by_max_chunk_size += 1
                     logger.debug(
-                        f"Split finalized with {current_tokens_count} tokens due to "
-                        f"exceeding token limit of {self.max_split_tokens}."
+                        "Split finalized with %d tokens due to exceeding token limit of %d.",
+                        current_tokens_count,
+                        self.max_split_tokens,
                     )
                     current_split, current_tokens_count = [], 0
 
@@ -300,8 +311,8 @@ def _split_documents(
             )
             splits_by_last_split += 1
             logger.debug(
-                f"Final split added with {current_tokens_count} "
-                "tokens due to remaining documents."
+                "Final split added with %d tokens due to remaining documents.",
+                current_tokens_count,
             )
 
         # Validation to ensure no tokens are lost during the split
@@ -311,8 +322,11 @@ def _split_documents(
         )
         if original_token_count != split_token_count:
             logger.error(
-                f"Token count mismatch: {original_token_count} != {split_token_count}"
+                "Token count mismatch: %d != %d",
+                original_token_count,
+                split_token_count,
             )
+
             raise ValueError(
                 f"Token count mismatch: {original_token_count} != {split_token_count}"
             )