-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
107 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
"""STF-IDF Retriever. | ||
Based on https://github.com/artitw/text2text""" | ||
|
||
from __future__ import annotations | ||
|
||
from typing import Any, Dict, Iterable, List, Optional | ||
|
||
from langchain.callbacks.manager import ( | ||
AsyncCallbackManagerForRetrieverRun, | ||
CallbackManagerForRetrieverRun, | ||
) | ||
from langchain.schema import BaseRetriever, Document | ||
|
||
|
||
class STFIDFRetriever(BaseRetriever):
    """Retriever that looks documents up through a text2text ``Indexer``.

    The index is queried with the raw query string, and the ids it returns
    are used as positions into ``docs``.
    """

    # Index object produced by ``text2text.Indexer().transform``; this class
    # only relies on its ``search`` and ``add`` methods.
    index: Any
    # Documents in insertion order; ids returned by ``index.search`` are used
    # as positions in this list.
    docs: List[Document]
    # Number of results requested from the index for each query.
    k: int = 4

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    @classmethod
    def from_texts(
        cls,
        texts: Iterable[str],
        metadatas: Optional[Iterable[dict]] = None,
        **kwargs: Any,
    ) -> STFIDFRetriever:
        """Build a retriever from raw texts.

        Args:
            texts: Texts to index. Any iterable is accepted, including
                one-shot generators.
            metadatas: Optional metadata dicts, one per text. When omitted or
                empty, every document gets its own empty dict.
            **kwargs: Extra fields forwarded to the constructor (e.g. ``k``).

        Returns:
            A retriever whose ``docs`` line up one-to-one with ``texts``.

        Raises:
            ImportError: If the ``text2text`` package is not installed.
            ValueError: If ``metadatas`` is non-empty but its length differs
                from the number of texts.
        """
        try:
            import text2text as t2t
        except ImportError as err:
            raise ImportError(
                "Could not import text2text, please install with `pip install "
                "text2text`."
            ) from err

        # Materialize once: ``texts`` is consumed both by the indexer and when
        # building the documents, which would exhaust a generator.
        text_list = list(texts)
        metadata_list = list(metadatas) if metadatas is not None else []
        if not metadata_list:
            metadata_list = [{} for _ in text_list]
        if len(metadata_list) != len(text_list):
            # zip() would silently truncate and leave ``docs`` out of step
            # with the ids stored in the index.
            raise ValueError(
                f"Got {len(text_list)} texts but {len(metadata_list)} metadatas; "
                "they must have the same length."
            )

        index = t2t.Indexer().transform(text_list)
        docs = [
            Document(page_content=text, metadata=metadata)
            for text, metadata in zip(text_list, metadata_list)
        ]
        return cls(index=index, docs=docs, **kwargs)

    @classmethod
    def from_documents(
        cls,
        documents: Iterable[Document],
        *,
        tfidf_params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> STFIDFRetriever:
        """Build a retriever from existing ``Document`` objects.

        Args:
            documents: Documents whose ``page_content`` is indexed and whose
                ``metadata`` is carried over.
            tfidf_params: Accepted for signature compatibility; currently
                ignored by this retriever.
            **kwargs: Extra fields forwarded to ``from_texts``.

        Returns:
            A retriever built via ``from_texts``.
        """
        # Plain comprehensions instead of ``zip(*...)`` so that an empty
        # ``documents`` does not fail on tuple unpacking.
        doc_list = list(documents)
        return cls.from_texts(
            texts=[doc.page_content for doc in doc_list],
            metadatas=[doc.metadata for doc in doc_list],
            **kwargs,
        )

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return the documents the index ranks closest to ``query``.

        Args:
            query: Query string handed to the index.
            run_manager: Callback manager for this run; not used here.

        Returns:
            Up to ``k`` documents, in the order the index returned their ids.
        """
        _, pred_ids = self.index.search([query], k=self.k)
        # Negative ids are skipped; presumably the index uses them as
        # placeholders when fewer than ``k`` hits exist (confirm with index).
        return [self.docs[i] for i in pred_ids[0] if i >= 0]

    async def _aget_relevant_documents(
        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Async retrieval is not supported; always raises."""
        raise NotImplementedError

    async def aadd_documents(
        self, documents: List[Document], **kwargs: Any
    ) -> None:
        """Add more documents to the index and to ``docs``.

        Args:
            documents: Documents to append; their ``page_content`` is indexed.
            **kwargs: Accepted but unused.
        """
        if not documents:
            return
        texts = [doc.page_content for doc in documents]
        # Update the index first so a failure there does not leave ``docs``
        # holding entries the index knows nothing about.
        self.index.add(texts)
        self.docs.extend(documents)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import pytest | ||
|
||
from text2text.langchain.stfidf import STFIDFRetriever | ||
from langchain.schema import Document | ||
|
||
|
||
@pytest.mark.requires("langchain")
def test_from_texts() -> None:
    """Building from raw strings yields one stored document per text."""
    corpus = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    retriever = STFIDFRetriever.from_texts(texts=corpus)
    stored_count = len(retriever.docs)
    assert stored_count == 3
|
||
|
||
@pytest.mark.requires("langchain")
def test_retrieval_with_stfidf_params() -> None:
    """A retriever built with ``k=2`` returns two documents for a query."""
    input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    stfidf_retriever = STFIDFRetriever.from_texts(texts=input_texts, k=2)
    # Go through the public entry point: the private
    # ``_get_relevant_documents`` requires a keyword-only ``run_manager`` and
    # raises TypeError when called with just the query.
    results = stfidf_retriever.get_relevant_documents("pen")
    assert len(results) == 2
|
||
@pytest.mark.requires("langchain")
def test_from_documents() -> None:
    """Building from ``Document`` objects keeps every document."""
    contents = ("I have a pen.", "Do you have a pen?", "I have a bag.")
    source_docs = [Document(page_content=content) for content in contents]
    retriever = STFIDFRetriever.from_documents(documents=source_docs)
    assert len(retriever.docs) == 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters