In [1]:
import httpx
from bs4 import BeautifulSoup
from langchain.schema import Document
from pydantic import Field
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

In [3]:
from NCBI import call_api

In [6]:
documents = []
pubmed_query_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC1790863"
pubmed_response = call_api(pubmed_query_url).decode()

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC1790863


In [13]:
documents = [pubmed_response]

In [14]:
documents

['<?xml version="1.0" ?>\n<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd">\n<pmc-articleset><article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article">\n  <?properties open_access?>\n  <front>\n    <journal-meta>\n      <journal-id journal-id-type="nlm-ta">PLoS One</journal-id>\n      <journal-id journal-id-type="iso-abbrev">PLoS ONE</journal-id>\n      <journal-id journal-id-type="publisher-id">plos</journal-id>\n      <journal-id journal-id-type="pmc">plosone</journal-id>\n      <journal-title-group>\n        <journal-title>PLoS ONE</journal-title>\n      </journal-title-group>\n      <issn pub-type="epub">1932-6203</issn>\n      <publisher>\n        <publisher-name>Public Library of Science</publisher-name>\n        <publisher-loc>San Francisco, USA</publisher-loc>\n      </publisher>\n    </journal-meta>\n    <article-meta>\n

In [15]:
bm25_retriever = BM25Retriever.from_texts(documents, metadatas=[{"source": "pubmed"}] * len(documents))
bm25_retriever.k = 5

In [17]:
embedding = OpenAIEmbeddings()

In [18]:
faiss_vectorstore = FAISS.from_texts(documents, embedding=embedding, metadatas=[{"source": "pubmed"}] * len(documents))

In [19]:
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs = {'k' : 5})

In [21]:
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever], weights= [0.5, 0.5])

In [24]:
docs = ensemble_retriever.invoke("Authors")


In [28]:
with open('temp.xml', 'w') as f:
    f.write(docs[0].page_content)

In [29]:
with open("paper.xml", 'w') as f:
    f.write(pubmed_response)

In [30]:
from langchain_community.tools.pubmed.tool import PubmedQueryRun

In [31]:
tool = PubmedQueryRun()

In [33]:
response = tool.invoke("Malignant Breat Neoplasm")

In [35]:
with open('pubmedtool.txt', 'w') as f:
    f.write(response)

In [36]:
from langchain_community.utilities.pubmed import PubMedAPIWrapper

In [37]:
pmw = PubMedAPIWrapper()

In [39]:
g = pmw.lazy_load("Maligant Breast Neoplasm")

In [40]:
r = pmw.load("Maligant Breast Neoplasm")

In [41]:
r

[{'uid': '34527149',
  'Title': 'Utility of ultrasound guided versus conventional fine needle aspiration cytology in diagnosing breast malignancies among patients with palpable breast lumps at Bugando Medical Centre, Mwanza Tanzania.',
  'Published': '2021-06-16',
  'Copyright Information': 'Copyright: Tresphory Bonephace Kamushaga et al.',
  'Summary': 'INTRODUCTION: breast lump is the commonest presentation for both benign and maligant breast conditions. Both ultrasound guided and conventional fine needle aspiration cytology (FNAC) have been used for diagnosing of breast malignancy among patients with palpable breast lumps. This study compared diagnostic utility of ultrasound guided versus conventional FNAC in diagnosing breast malignancies among patients with palpable breast lumps at Bugando Medical Centre.\nMETHODS: this was a hospital based cross sectional study with a follow up component that combined both retrospective data (from January 2017 to June 2018) and prospective data (

In [44]:
with open('pubmedtool.txt', 'w') as f:
    f.write(r[0]['Summary'])


In [45]:
from pydantic import BaseModel

In [51]:
from typing import Any
class TestClass(BaseModel):
    parse: Any

t = TestClass(parse = "r")

In [52]:
from langchain_community.document_loaders import PubMedLoader

In [53]:
loader = PubMedLoader("chatgpt")

In [54]:
docs = loader.load()

In [62]:
docs[0].metadata

{'uid': '38482764',
 'Title': '[Applications, techniques, and best practices for using ChatGPT].',
 'Published': '--',
 'Copyright Information': ''}

In [63]:
from xml.etree import cElementTree as ET

In [71]:
tree = ET.parse('temp.xml')
root = tree.getroot()

In [72]:
for page in root.findall('body'):
    print(page.find('sec').text)

In [76]:
root.find(".//body")

<Element 'body' at 0x1579930b0>

In [77]:
# Extracting the body of the article
article_body = root.find(".//body")

# Initialize a list to hold all text from the body
article_body_text = []

# Iterate through each section in the body and append its contents
for sec in article_body:
    section_title = sec.find("title").text if sec.find("title") is not None else "No title"
    section_paragraphs = sec.findall("p")
    section_text = f"Section Title: {section_title}\n" + "\n".join(p.text for p in section_paragraphs if p.text)
    article_body_text.append(section_text)

# Join all section texts into a single string
article_body_text_combined = "\n\n".join(article_body_text)

# Output the extracted body text
article_body_text_combined

"Section Title: Introduction\nA persistent question in biology is how organismal complexity changes through the course of evolution \nPhenotypic complexity quantifies the number of genetically uncorrelated phenotypic traits contributing to an organism's fitness. A phenotypic trait contributes to an organism's fitness only to the extent that natural selection acts upon that trait. Thus an organismal phenotype that is no longer under selection (for example during an evolutionary transition from a generalist to specialist lifestyle), although expressed by the organism, contributes nothing to organismal complexity. Secondly, if two phenotypes contribute to complexity, they must be genetically separable: some mutations must exist that affect one phenotype but not the other. If no such mutations exist, then although we may perceive two phenotypes under selection, these phenotypes contribute only a single trait toward determining phenotypic complexity. As an example consider the affinity of a

In [78]:
article_body_text

["Section Title: Introduction\nA persistent question in biology is how organismal complexity changes through the course of evolution \nPhenotypic complexity quantifies the number of genetically uncorrelated phenotypic traits contributing to an organism's fitness. A phenotypic trait contributes to an organism's fitness only to the extent that natural selection acts upon that trait. Thus an organismal phenotype that is no longer under selection (for example during an evolutionary transition from a generalist to specialist lifestyle), although expressed by the organism, contributes nothing to organismal complexity. Secondly, if two phenotypes contribute to complexity, they must be genetically separable: some mutations must exist that affect one phenotype but not the other. If no such mutations exist, then although we may perceive two phenotypes under selection, these phenotypes contribute only a single trait toward determining phenotypic complexity. As an example consider the affinity of 

In [79]:
def extract_text(element):
    text = ""
    if element.text:
        text += element.text
    for child in element:
        text += extract_text(child)
        if child.tail:
            text += child.tail
    return text

In [83]:
with open("pubmedtool.txt", 'w') as f:
    f.write(extract_text(article_body).strip())