Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Added fix to encode documents within rolling window #256

Merged
merged 11 commits on Apr 27, 2024
27 changes: 21 additions & 6 deletions semantic_router/splitters/rolling_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,27 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
return splits

def _encode_documents(
    self, docs: List[str], max_docs_per_batch: int = 2000
) -> np.ndarray:
    """
    Encode a list of documents into embeddings, calling the encoder in batches.

    The documents are sent to ``self.encoder`` in consecutive slices of at most
    ``max_docs_per_batch`` items so that a single call never receives the whole
    list. The default of 2000 was chosen to stay below the input-array length
    limit of the OpenAI embeddings endpoint (noted by the original author as
    ``len(array) < 2048``).

    :param docs: List of text documents to be encoded.
    :param max_docs_per_batch: Maximum number of documents passed to the
        encoder in one call. Must be a positive integer.
    :return: A numpy array built from the embeddings of all batches, in the
        same order as ``docs``. An empty ``docs`` list yields an empty array.
    :raises ValueError: If ``max_docs_per_batch`` is smaller than 1.
    :raises Exception: Any error raised while encoding a batch is logged and
        re-raised unchanged.
    """
    # A zero step would make range() fail with an unrelated message and a
    # negative step would silently encode nothing, so reject both up front.
    if max_docs_per_batch < 1:
        raise ValueError(
            f"max_docs_per_batch must be a positive integer, got {max_docs_per_batch}"
        )

    embeddings = []

    for start in range(0, len(docs), max_docs_per_batch):
        batch_docs = docs[start : start + max_docs_per_batch]
        try:
            batch_embeddings = self.encoder(batch_docs)
            embeddings.extend(batch_embeddings)
        except Exception as e:
            # Log which batch failed, then let the caller decide how to react.
            logger.error(f"Error encoding documents {batch_docs}: {e}")
            raise

    return np.array(embeddings)

def _calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]:
raw_similarities = []
Expand Down
Loading