In [36]:
import httpx
from bs4 import BeautifulSoup
from langchain.schema import Document
from pydantic import Field
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
import re
import httpx
from bs4 import BeautifulSoup
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers.ensemble import EnsembleRetriever
from langchain_openai import OpenAIEmbeddings # HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.retrievers.document_compressors.embeddings_filter import EmbeddingsFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.schema import Document

In [4]:
from NCBI import call_api

2024-03-14 19:55:23.606 | INFO     | schema_agents.config:__init__:44 - Config loading done.
2024-03-14 19:55:23.607 | INFO     | schema_agents.config:__init__:55 - Set OPENAI_API_BASE in case of network issues


In [5]:
# Start with an empty corpus; `documents` is rebound further down once the
# article text has been extracted.
documents = []
# NCBI E-utilities efetch request for a single PubMed Central record (PMC1790863).
pubmed_query_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC1790863"
# `call_api` comes from the local NCBI module (not shown here); presumably it
# returns the raw response bytes, which are decoded with the default codec — TODO confirm.
pubmed_response = call_api(pubmed_query_url).decode()

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC1790863


In [15]:
# Keyword (BM25) retriever built directly from `documents`, attaching the same
# {"source": "pubmed"} metadata to every entry.
# NOTE(review): read top to bottom, `documents` is still the empty list assigned
# in the fetch cell when this line runs, and it is later rebound to a list of
# Document objects rather than strings. This cell looks superseded by the
# BM25Retriever.from_documents(split_docs, ...) cell further down — confirm
# whether it should be removed.
bm25_retriever = BM25Retriever.from_texts(documents, metadatas=[{"source": "pubmed"}] * len(documents))
# Number of results to return per query.
bm25_retriever.k = 5

In [17]:
# Embedding client constructed with default arguments; used by the FAISS
# vector store built from `documents` below.
embedding = OpenAIEmbeddings()

In [18]:
# Dense vector index over `documents`, each entry tagged with {"source": "pubmed"}.
# NOTE(review): in file order `documents` is still empty at this point, so this
# index would be built from no texts — confirm whether this cell is still needed
# now that a FAISS retriever is built from `split_docs` later on.
faiss_vectorstore = FAISS.from_texts(documents, embedding=embedding, metadatas=[{"source": "pubmed"}] * len(documents))

In [19]:
# Retriever view over the FAISS store, configured to return 5 results (k=5).
# Note that `faiss_retriever` is rebound later from `split_docs`.
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs = {'k' : 5})

In [21]:
# Combine the BM25 and FAISS retrievers with equal weights.
# `ensemble_retriever` is rebound later to an ensemble that uses the
# compression retriever instead of the plain FAISS retriever.
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever], weights= [0.5, 0.5])

In [9]:
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# ElementTree is the supported module and exposes the same API.
from xml.etree import ElementTree as ET

# Parse the efetch XML and locate the first <body> element anywhere in the tree.
tree = ET.fromstring(pubmed_response)
article_body = tree.find(".//body")
if article_body is None:
    # Fail here with a clear message rather than with an AttributeError when
    # the text is extracted from a missing element later on.
    raise ValueError("No <body> element found in the PMC efetch response")

In [12]:
def extract_text(element):
    """Return all text inside ``element``, concatenated in document order.

    The result contains the element's own text, followed for each child by
    the child's (recursively collected) text and then the child's tail text.
    Markup is dropped; whitespace is kept exactly as it appears in the tree.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        A single string; empty if the element holds no text at all.

    Note:
        ``Element.itertext()`` skips the inner text of comment and
        processing-instruction nodes. The default parser used by
        ``ET.fromstring`` does not keep such nodes, so parsed documents
        yield the same text as a manual text/tail walk.
    """
    # itertext() walks text and tail values in document order; joining once
    # avoids repeated string concatenation at every level of the tree.
    return "".join(element.itertext())

In [14]:
# Flatten the article body to plain text and trim surrounding whitespace.
article_text = extract_text(article_body).strip()
# Keep a copy on disk. The encoding is pinned to UTF-8 because the platform
# default may be unable to encode non-ASCII characters in the article text.
with open("pubmedtool.txt", 'w', encoding="utf-8") as f:
    f.write(article_text)

In [21]:
# Wrap the whole article in a single Document tagged with its source; this
# rebinds `documents` (previously an empty list) for the splitter below.
documents = [Document(page_content=article_text, metadata={"source": "pubmed"})]

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [28]:
# Target chunk size passed to the text splitter (presumably measured in
# characters — TODO confirm against the splitter's length function).
chunk_size = 500
# Number of documents to keep per query; used for the BM25 retriever's `k`
# and for the final cut-off of the ensemble results.
num_results = 5

In [24]:
# Split the article into chunks of `chunk_size` with an overlap of 10. The
# separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", ", ", " ", ""],
    chunk_size=chunk_size,
    chunk_overlap=10,
)
# Chunked Documents that both retrievers below are built from.
split_docs = text_splitter.split_documents(documents)

In [26]:
# Embedding client used for the FAISS index and the compression filters below.
# NOTE(review): this repeats the earlier `embedding = OpenAIEmbeddings()`
# assignment; one of the two cells is redundant.
embedding = OpenAIEmbeddings()

In [29]:
# Dense retriever over the chunked article. `k` follows `num_results` (instead
# of a separate hard-coded 5) so it stays in step with the BM25 retriever and
# the final result cut-off.
faiss_retriever = FAISS.from_documents(split_docs, embedding).as_retriever(search_kwargs={"k": num_results})

In [33]:
def preprocess_text(text: str) -> str:
    """Normalise whitespace in ``text``.

    A space is inserted before every newline, any run of three or more
    spaces is then collapsed to a single space, and leading and trailing
    whitespace is removed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    padded = text.replace("\n", " \n")
    collapsed = re.sub(r" {3,}", " ", padded)
    return collapsed.strip()

In [34]:
# Keyword (BM25) retriever over the chunked article.
# BM25Retriever expects `preprocess_func` to return a list of tokens.
# `preprocess_text` returns a cleaned *string*, which BM25 would iterate
# character by character, so the cleaned text is split on whitespace here.
# The same function is applied to queries at retrieval time.
bm25_retriever = BM25Retriever.from_documents(
    split_docs,
    preprocess_func=lambda text: preprocess_text(text).split(),
)
# Number of results to return per query.
bm25_retriever.k = num_results

In [38]:
# Threshold handed to EmbeddingsFilter below (presumably the minimum
# query/document similarity for a document to be kept — TODO confirm).
similarity_threshold = 0.5

In [54]:
# Two post-retrieval stages, applied in this order to the FAISS results:
# a redundancy filter, then a filter driven by `similarity_threshold`
# (k=None, so no fixed result count is requested from it).
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding)
embeddings_filter = EmbeddingsFilter(
    embeddings=embedding,
    k=None,
    similarity_threshold=similarity_threshold,
)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, embeddings_filter],
)

# Wrap the FAISS retriever so its results pass through the pipeline above.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor,
    base_retriever=faiss_retriever,
)

# Final retriever: BM25 and the filtered FAISS retriever, weighted equally.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, compression_retriever],
    weights=[0.5, 0.5],
)

In [55]:
# Example question to run against the indexed article.
query = 'What methods were used in the paper?'

In [56]:
# Query the ensemble asynchronously (top-level `await` relies on the notebook's
# running event loop). Despite the name, `compressed_docs` also contains the
# BM25 results, which do not pass through the compression pipeline.
compressed_docs = await ensemble_retriever.aget_relevant_documents(query)
# Ensemble may return more than "num_results" results, so cut off excess ones
res = compressed_docs[:num_results]