# Prepare documents for RAG

## Libraries

In [82]:
import langchain_community
import langchain_text_splitters
from langchain_community.document_loaders import PyPDFLoader, pdf
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
import os
import pprint
import re
from langchain_core.documents import Document
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field
import json
import uuid
import chromadb
from chromadb.config import Settings
import unicodedata
from langchain_google_genai import GoogleGenerativeAI
import uuid
# from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle as pkl
import requests
import subprocess
import pandas as pd
from bs4 import BeautifulSoup

## Google option

#### Config

In [2]:
import os
os.getcwd()

'/home/antonioparragaleo/Documents/RAG'

In [3]:
with open('api_google.txt') as f:
    
    api_key = json.load(f)

#### Functions

In [4]:
def clean_text(text):
    # Normalize weird Unicode characters to their closest ASCII equivalent
    text = unicodedata.normalize("NFKC", text)
    # Replace non-breaking hyphens and dashes with ASCII hyphen
    text = re.sub(r'[\u2010-\u2015]', '-', text)
     # Remove form feed characters (\x0c), common page breaks
    text = text.replace('\x0c', ' ')
    # Replace non-breaking spaces (\xa0) with regular space
    text = text.replace('\xa0', ' ')
    # 3. Fix hyphenated line breaks (e.g., treat-\nment -> treatment)
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
    # Remove numbered citations like (1), (9, 10), (5‚Äì7), etc.
    text = re.sub(r"\(\s?\d+(?:\s?(?:,|-)\s?\d+)*\s?\)", "", text)
    # Remove mid-sentence line breaks: "word\nword" -> "word word"
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Replace multiple newlines with just two (preserve paragraphs)
    text = re.sub(r'\n{2,}', '\n\n', text)
    return text.strip()

def get_system_message_rag(content):
        return f"""You are an expert consultant helping executive advisors to get relevant information from scientific articles and code related to reproduction and bioinformatics.

        Generate your response by following the steps below:
        1. Recursively break down the question into smaller questions to better understand it.
        2. For each question/directive:
            2a. Select the most relevant information from the context in light of the conversation history.
        3. Generate a draft response using selected information.
        4. Remove duplicate content from draft response.
        5. Generate your final response after adjusting it to increase accuracy and relevance.
        6. Do not try to summarize the answers, explain it properly.
        7. When you provide information, you must also provide the reference of the article.
        8. Do not look up on internet.
        9. Only show your final response! 
        
        Constraints:
        1. DO NOT PROVIDE ANY EXPLANATION OR DETAILS OR MENTION THAT YOU WERE GIVEN CONTEXT. Only do that when questions are related to coding.
        2. Don't mention that you are not able to find the answer in the provided context.
        3. Ignore the part of the content that only contains references.
        3. Don't make up the answers by yourself.
        4. Try your best to provide answer from the given context.

        CONTENT:
        {content}
        """

def get_ques_response_prompt(question, context):
    return f"""
    Context\n:
    {context}
    ==============================================================
    Based on the above context, please provide the answer to the following question\n:
    {question}
    """


#### Model

In [5]:
llm = GoogleGenerativeAI(model="gemini-2.5-flash",api_key=api_key['key'],temperature=0.2)

#### Load PDFs

In [44]:
paths = []
for root, dirs, files in os.walk("Data2"):
    for f in files:
        if f.endswith('.pdf'):
            paths.append(os.path.join(root,f))

We create a folder with each article to pre-process the references

In [45]:
[os.mkdir(re.sub(r'.pdf','',p)) for p in paths]

[None, None]

In [46]:
articles = []
for file in paths:
    loader = loader = PyPDFLoader(file, mode="single")
    doc = loader.load()[0] # As this function provides a list, we select the first element.
    doc.page_content = clean_text(doc.page_content)
    
    articles.append(doc)

#### Json generation

In [47]:
class paper(BaseModel):

    PaperTitle: str = Field(description="The full title of the research paper")
    Publication: str = Field(description="Year: The year the paper was published")
    Authors: str = Field(description="The full names of all authors of the paper")
    Email: str = Field(description="The email address of the author (if provided)")
    Abstract: str = Field(description="The full text of the paper's abstract.")
    Introduction: str = Field(description="The full text fo the paper's introduction. Bear in mind that it can have other names such as background.")
    Methods: str = Field(description="The full text fo the paper's methods. Don't take the information from abstract.")
    Results: str = Field(description="The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.")
    Discussion: str = Field(description="The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It should be entitled 'Discussion'")
    Conclusion: str = Field(description="The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.")
    URL: str = Field(description="the link where you can find the article. This link is also known as DOI. This url is quite relevant and you must find it. It usually starts with https://doi.org/")
    Journal: str = Field(description="provide the name of the journal, e.g Nature.")

In [48]:
parser = JsonOutputParser(pydantic_object=paper)

prompt = PromptTemplate(
    template="""
    You are an expert in analyzing scientific research papers. Please carefully read the provided research paper above and extract the following key information:
Extract these nine (10) properties from the research paper:

Paper Title: The full title of the research paper

Publication Year: The year the paper was published

Authors: The full names of all authors of the paper

Email: The email address of the author (if provided)

Abstract: The full text of the paper's abstract

Introduction: The full text fo the paper's introduction. Bear in mind that it can have other names such as background.

Methods: The full text fo the paper's methods. Don't take the information from abstract.

Results: The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.

Discussion:The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It shoudl be entitled "Discussion"

Conclusion: The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.

URL: the link where you can find the article. This link is also known as DOI.

Journal: provide the name of the journal, e.g Nature.


Guidelines:


The extracted information should be factual and accurate to the document. Be extremely concise, except for the Abstract, Introduction, Methods, Results. Discussion and Conclusion which should be copied in full.
The extracted entities should be self-contained and easily understood without the rest of the paper. If a property is missing from the paper, please leave the field empty rather than guessing.
Answer in JSON format. The JSON should contain 9 keys: "PaperTitle", "PublicationYear", "Authors", "Email", "Abstract", "Introduction", "Methods","Results", "URL", "Reference".


Format instructions: \n{format_instructions}\n

    
The article is this:\n{query}\n""",

    input_variables=["query"],

    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [49]:
chain = prompt | llm | parser

In [50]:
my_json = []
for doc in articles:
    my_json.append(chain.invoke({"query": doc.page_content}))

In [None]:
# with open('info_articles.pkl','wb') as f:
#     pkl.dump(my_json,f)

In [85]:
# already DOI (To avoid repeat)

includedDOI = [ re.sub(r'https://doi.org/', '',a['URL']) for a in my_json]

## References

In [86]:
my_json_ref = []
pmid = {}
HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
for i, article in enumerate(my_json):
    path_name = re.sub(r'.pdf','',paths[i]).split('/')[1]
    pmid[path_name] = []

    if 'https://' in article['URL']:
        doi = re.sub(r'https://doi.org/', '', article['URL'])
        response = requests.get('https://api.crossref.org/works/'+doi)
        list_reference = response.json()['message'].get('reference',None)
        article['List_references'] = list(set([x.get('DOI') for x in list_reference if x.get('DOI') and x.get('DOI') not in includedDOI]))

        includedDOI.extend(article['List_references'])

        # We retrieve the pmid from each DOI

        for a in article['List_references']:
            
            pmid[path_name].append(get_pmid_from_doi(a))

        # We try to get the pdf

        command = [
        "python3", "-m", "pubmed2pdf", "pdf",
        "--out", os.path.join("Data2",path_name),
        "--pmids", f"{', '.join(pmid[path_name])}",
        "--verbose",
        "--errors", os.path.join("Data2",path_name,"pubmed2pdf_log.txt")
        ]
        subprocess.run(command)

        # We process pdfs

        for pdf in [os.path.join("Data2",path_name,f) for f in os.listdir(os.path.join("Data2",path_name)) if f.endswith('.pdf')]:

            loader = loader = PyPDFLoader(file, mode="single")
            doc = loader.load()[0] # As this function provides a list, we select the first element.
            doc.page_content = clean_text(doc.page_content)

            my_json_ref.append(chain.invoke({"query": doc.page_content}))
        

        # We try to get the html otherwise

        to_retrieve = pd.read_csv(os.path.join("Data2",path_name,"pubmed2pdf_log.txt"), header=None,dtype='str') # We get those that resulted in a mistake
        to_retrieve.columns = ["PMID"]
        to_retrieve = to_retrieve['PMID'].tolist()
        to_retrieve.extend([re.sub(r'.html','',f)  for f in os.listdir(os.path.join("Data2",path_name)) if f.endswith('.html')]) # We add those in html format since they do not have information
        
        [os.remove(os.path.join("Data2",path_name,f))  for f in os.listdir(os.path.join("Data2",path_name)) if f.endswith('.html')] # Then, we remove from directory
        
        
        for pmid in to_retrieve:
            pmcid = get_pmcid_from_pmid(pmid)

            try:
                response = requests.get(url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/", headers=HEADERS)
            except:
                print(f' The article with {pmid} is not avaiable')

            soup = BeautifulSoup(response.text, "html.parser")
            my_json_ref.append(chain.invoke({"query": soup.find("article").get_text(separator="\n", strip=True)}))

2025-08-10 22:03:28,424 - DEBUG - pubmed2pdf.cli - Full log mode activated
2025-08-10 22:03:28,424 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18192189
2025-08-10 22:03:29,182 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 18192189 failed from error list index out of range
2025-08-10 22:03:29,182 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23994285
2025-08-10 22:03:30,565 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 23994285 failed from error direct_pdf_link() takes 1 positional argument but 3 were given
2025-08-10 22:03:30,565 - INFO - pubmed2pdf.cli - Trying to fetch pmid 10592173
2025-08-10 22:03:31,277 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 10592173 failed from error list index out of range
2025-08-10 22:03:31,277 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18174356
2025-08-10 22:03:31,983 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 18174356 failed from error list index out of range
2025-08-10 22:03:31,983 - INFO - pubmed2pdf.cli - Trying to fetch 

Done downloading. All downloaded can be found in Data2/Parraga-Leo2023


KeyboardInterrupt: 

In [90]:
my_json_ref

[{'PaperTitle': 'Predicting risk of endometrial failure: a biomarker signature that identifies a novel disruption independent of endometrial timing in patients undergoing hormonal replacement cycles',
  'Publication': '2024',
  'Authors': 'Patricia Diaz-Gimeno, Ph.D., Patricia Sebastian-Leon, Ph.D., Katharina Spath, Ph.D., Diana Marti-Garcia, M.Sc., Josefa Maria Sanchez-Reyes, Ph.D., Maria del Carmen Vidal, Ph.D., Almudena Devesa-Peiro, Ph.D., Immaculada Sanchez-Ribas, Ph.D., Asunta Martinez-Martinez, M.Sc., Nuria Pellicer, M.D., Ph.D., Dagan Wells, Ph.D., and Antonio Pellicer, M.D., Ph.D.',
  'Email': 'patricia.diaz@ivirma.com or patricia_diaz@iislafe.es',
  'Abstract': 'Objective: To propose a new gene expression signature that identifies endometrial disruptions independent of endometrial luteal phase timing and predicts if patients are at risk of endometrial failure. Design: Multicentric, prospective study. Setting: Reproductive medicine research department in a public hospital affi

## Create database

In [166]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "!", "?", " "]  # smart splitting
)

In [167]:
my_json[1]

{'PaperTitle': 'SARS-CoV-2 infection risk assessment in the endometrium: viral infection-related gene expression across the menstrual cycle',
 'Publication': '2020',
 'Authors': 'Ismael Henarejos-Castillo, Patricia Sebastian-Leon, Almudena Devesa-Peiro, Antonio Pellicer, Patricia Diaz-Gimeno',
 'Email': 'patricia.diaz@ivirma.com',
 'Abstract': 'Objective: To determine the susceptibility of the endometrium to infection by- and thereby potential damage from- SARS-CoV-2. Design: Analysis of SARS-Cov-2 infection-related gene expression from endometrial transcriptomic data sets. Setting: Infertility research department affiliated with a public hospital. Patient(s): Gene expression data fromfive studies in 112 patients with normal endometrium collected throughout the menstrual cycle. Intervention(s): None. Main Outcome Measure(s):Gene expression and correlation between viral infectivity genes and age throughout the menstrual cycle. Result(s): Gene expression was high forTMPRSS4, CTSL, CTSB, 

In [168]:
info_paper = []

for j in my_json:

    for key, value in j.items():

        if len(value) > 1200:
            chunks = splitter.split_text(value)

            for i, c in enumerate(chunks):

                info_paper.append(
                    {
                        "chunk_index":i,
                        "content":c,
                        "parent":key,
                        "split":True,
                        "DOI":j.get("URL"),
                        "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Journal')+", "+j.get('Publication')
                    }
                )
        else:

            info_paper.append(
                    {
                        "chunk_index":0,
                        "content":value,
                        "parent":key,
                        "split":False,
                        "DOI":j.get("URL"),
                        "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Journal')+", "+j.get('Publication')
                    }
                )

In [None]:
# documents = [
#     Document(
#         page_content=chunk["content"],
#         metadata={
#             "parent": chunk["parent"],
#             "chunk_index": chunk["chunk_index"],
#             "DOI":chunk["DOI"],
#             "Reference":chunk["Reference"]
            
#         }
#     )
#     for chunk in info_paper
# ]

In [83]:
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# client = chromadb.HttpClient(host='localhost', port=7000, settings=Settings(allow_reset=True))

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [85]:
client = chromadb.PersistentClient(path="./chroma_RAG")

In [None]:
#client.delete_collection(name="prueba2")
collection = client.get_or_create_collection(
    name="prueba2"
)

In [173]:
# 2. Prepare documents, metadata, and IDs
texts = [chunk["content"] for chunk in info_paper]
metadatas = [{"parent": chunk["parent"], "chunk_index": chunk["chunk_index"],"DOI": chunk["DOI"], "Reference": chunk["Reference"]} for chunk in info_paper]
ids = [str(uuid.uuid1()) for _ in metadatas]
#ids = [f'{md["parent"]}_{md["chunk_index"]}' for md in metadatas]  # unique ID per chunk

In [174]:
collection.add(
    documents=texts,
    metadatas=metadatas,
    ids=ids
)

In [175]:
# tell LangChain to use our client and collection name
db = Chroma(
    client=client,
    collection_name="prueba",
    embedding_function=embedding_function,
)

In [77]:
a = collection.query(
        query_texts=[query],
        n_results=4
    )
parent = a['metadatas'][0][1].get('parent')
ref = a['metadatas'][0][1].get('Reference')

b = collection.get(
    where={

        "$and" :[
             {"Reference":ref},
             {"parent":parent}
        ]
        },
    include=["documents"]
        
    )

pprint.pprint("".join(b['documents']))

('A key factor for reproductive success in assisted reproduction treatments is '
 'the status of the maternal endometrium during embryo implantation and fetal '
 'development. The endometrial cycle is reflected by the cyclic structural and '
 'functional changes of the endometrium across the menstrual cycle, '
 'particularly during the mid-secretory phase, to prepare for the‚Äò‚Äòwindow of '
 'implantation‚Äô‚Äô (WOI) . After successful embryo implantation, the decidua '
 '(specialized layer of the endometrium) actively encapsulates the '
 'trophectoderm to support placentation and provide adequate vascularization '
 'for optimal fetal growth . Thus, approaches that predict and prevent '
 'endometrial-factor infertility could substantially improve assisted '
 'reproduction treatment outcomes by supporting the establishment and '
 'maintenance of pregnancy. There is a lack of consensus on the minimum number '
 'of implantation failures with good-quality embryos derived from ovum '
 'don

In [97]:
for r in a['metadatas'][0]:
     if r['parent'] not in ['Journal','URL']:
        print( "_".join([r['parent'],r['Reference'],str(r['chunk_index']+1)]))

Introduction_Patricia Diaz-Gimeno et al.,Fertil Steril¬Æ, 2024_1
Methods_Patricia Diaz-Gimeno et al.,Fertil Steril¬Æ, 2024_2
Results_Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023_8


In [127]:
a['metadatas']

[[{'parent': 'Journal',
   'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023',
   'chunk_index': 0,
   'DOI': 'https://doi.org/10.1186/s12958-023-01131-4'},
  {'DOI': 'https://doi.org/10.1016/j.fertnstert.2024.03.015',
   'chunk_index': 0,
   'parent': 'Introduction',
   'Reference': 'Patricia Diaz-Gimeno et al.,Fertil Steril¬Æ, 2024'},
  {'Reference': 'Patricia Diaz-Gimeno et al.,Fertil Steril¬Æ, 2024',
   'DOI': 'https://doi.org/10.1016/j.fertnstert.2024.03.015',
   'parent': 'Methods',
   'chunk_index': 1},
  {'parent': 'Results',
   'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023',
   'DOI': 'https://doi.org/10.1186/s12958-023-01131-4',
   'chunk_index': 7}]]

In [149]:
ideal_chunks

[['A key factor for reproductive success in assisted reproduction treatments is the status of the maternal endometrium during embryo implantation and fetal development. The endometrial cycle is reflected by the cyclic structural and functional changes of the endometrium across the menstrual cycle, particularly during the mid-secretory phase, to prepare for the‚Äò‚Äòwindow of implantation‚Äô‚Äô (WOI) . After successful embryo implantation, the decidua (specialized layer of the endometrium) actively encapsulates the trophectoderm to support placentation and provide adequate vascularization for optimal fetal growth . Thus, approaches that predict and prevent endometrial-factor infertility could substantially improve assisted reproduction treatment outcomes by supporting the establishment and maintenance of pregnancy',
  '. There is a lack of consensus on the minimum number of implantation failures with good-quality embryos derived from ovum donation or with confirmed euploid karyotypes re

In [157]:
results = db.similarity_search(query=query,k=4)

selected_index = []
ideal_chunks = []
meta_selected = []

for doc in results:

    r = doc.metadata

    if r['parent'] not in ['Journal','URL']:

        if "_".join([r['parent'],r['Reference'],str(r['chunk_index']+1)]) not in selected_index and "_".join([r['parent'],r['Reference'],str(r['chunk_index']-1)]) not in selected_index:
            
            ii = "_".join([r['parent'],r['Reference'],str(r['chunk_index'])])
            selected_index.append(ii)

            candidates = collection.get(
            where= {"$and" :[
                        {"Reference":r['Reference']},
                        {"parent":r['parent']}
                ]
            })

            max_index = len(candidates['metadatas'])-1

            meta_selected.append(candidates['metadatas'])
            ideal_chunks.append([doc for doc,meta in zip(candidates['documents'], candidates['metadatas'])
                        if meta['chunk_index'] in [r["chunk_index"], max(r["chunk_index"]-1,0), min(r["chunk_index"] + 1,max_index)]])

context = []
print(ideal_chunks)

for text, meta in zip(ideal_chunks, meta_selected):
    context.append(f'Reference:{meta[0]["Reference"]}\n\nLink (DOI)\n: {meta[0]["DOI"]}\n\nSummary:\n\n{"".join(text)}\n\n')

print(context)

[['A key factor for reproductive success in assisted reproduction treatments is the status of the maternal endometrium during embryo implantation and fetal development. The endometrial cycle is reflected by the cyclic structural and functional changes of the endometrium across the menstrual cycle, particularly during the mid-secretory phase, to prepare for the‚Äò‚Äòwindow of implantation‚Äô‚Äô (WOI) . After successful embryo implantation, the decidua (specialized layer of the endometrium) actively encapsulates the trophectoderm to support placentation and provide adequate vascularization for optimal fetal growth . Thus, approaches that predict and prevent endometrial-factor infertility could substantially improve assisted reproduction treatment outcomes by supporting the establishment and maintenance of pregnancy', '. There is a lack of consensus on the minimum number of implantation failures with good-quality embryos derived from ovum donation or with confirmed euploid karyotypes requ

In [122]:
min(1,len(candidates['metadatas'])-1)

1

In [109]:
collection.get(
    where={"$and":[
           {"Reference":'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023'},
           {"parent":"Results"},
           {"chunk_index":0}
    ]}
)

{'ids': ['bd725078-6b0e-11f0-8e08-00155de32492'],
 'embeddings': None,
 'documents': ['Results Gene signatures associated to endometrial progression and function Of the 19 gene lists used for our analysis, eleven were obtained from published studies evaluating control patients throughout the menstrual cycle [4, 17-19, 34-40], and the other eight were derived from studies comparing patients with RIF or unexplained infertility to controls [21, 41-47] (Table 1). Unifying all the aforementioned signatures, we compiled 3,608 genes related to endometrial progression and function. Hormonal regulation of endometrial progression is largely driven by progesterone We identified 7,540 and 698 genes related to estrogen and progesterone hormones, respectively. However, as determined by the relative contribution of each type of hormone within each gene list (Fig. 1A), 17/19 (89%) signatures favoured regulation by progesterone rather than estrogen. These differences were significant in 47% of the sign

In [11]:
query = "How many phases does the endoemtrium have?"
docs = db.similarity_search(query,k=4)

In [80]:
docs

[Document(metadata={'chunk_index': 7, 'parent': 'Discussion', 'DOI': 'https://doi.org/10.1186/s12958-023-01131-4', 'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023'}, page_content='. Considering these attributed functions and its variable expression throughout the menstrual cycle, we propose that CTCF exerts an inhibitory role in endometrial tissue during the PF phase of the menstrual cycle. With the significant downregulation of CTCF in the secretory endometrium, dually validated in-silico and experimentally herein, its inhibited genes would be derepressed and become transcriptionally active during the WOI. This interpretation supports previous findings from our group demonstrating that, during the WOI, a global transcriptional derepression may be required for implantation and early embryo development [63]. Transcriptional derepression has been associated with multiple human disease states and should be investigated further within the context of en

In [30]:
texts_re = []
metadata_re = []
for doc in docs:
    texts_re.append(doc.page_content)
    metadata_re.append(doc.metadata)

In [31]:
metadata_re

[{'parent': 'Discussion',
  'chunk_index': 7,
  'DOI': 'https://doi.org/10.1186/s12958-023-01131-4'},
 {'DOI': 'https://doi.org/10.1186/s12958-023-01131-4',
  'parent': 'Abstract',
  'chunk_index': 3},
 {'parent': 'Results',
  'DOI': 'https://doi.org/10.1186/s12958-023-01131-4',
  'chunk_index': 7},
 {'parent': 'Results',
  'chunk_index': 5,
  'DOI': 'https://doi.org/10.1186/s12958-023-01131-4'}]

In [32]:
# context = "\n\n---\n\n".join(texts_re)

context = []
for text, meta in zip(texts_re, metadata_re):

  context.append(f'The reference of article is {meta["DOI"]}, its information is:\n {text}')

print("\n\n".join(context))

The reference of article is https://doi.org/10.1186/s12958-023-01131-4, its information is:
 . Considering these attributed functions and its variable expression throughout the menstrual cycle, we propose that CTCF exerts an inhibitory role in endometrial tissue during the PF phase of the menstrual cycle. With the significant downregulation of CTCF in the secretory endometrium, dually validated in-silico and experimentally herein, its inhibited genes would be derepressed and become transcriptionally active during the WOI. This interpretation supports previous findings from our group demonstrating that, during the WOI, a global transcriptional derepression may be required for implantation and early embryo development [63]. Transcriptional derepression has been associated with multiple human disease states and should be investigated further within the context of endometrial-factor infertility. Despite previous associations of CTCF with endometriosis [67], its implication in endometrial p

In [113]:
prompt = get_ques_response_prompt(question=query, context=get_system_message_rag("\n\n".join(context)))

answer = llm.invoke(prompt)

In [114]:
answer

'CTCF is a conserved zinc finger protein with well-characterized regulatory functions throughout the human body. It acts as a transcriptional repressor in RNA polymerase II (Pol II) pausing, imprinting, and X-chromosome inactivation. Additionally, CTCF functions as an insulator, blocking the interaction between enhancers and the promoters of neighboring genes (https://doi.org/10.1186/s12958‚Äë023‚Äë01131‚Äë4).\n\nIn endometrial tissue, CTCF is proposed to exert an inhibitory role during the PF phase of the menstrual cycle. Its significant downregulation in the secretory endometrium leads to the derepression of its inhibited genes, which then become transcriptionally active during the Window of Implantation (WOI) (https://doi.org/10.1186/s12958‚Äë023‚Äë01131‚Äë4).\n\nCTCF is highlighted as a CCCTC-binding factor that functions as a transcriptomic repressor and has the most influential regulation of endometrial progression and function across 95% of studies (https://doi.org/10.1186/s1295

## Download reference of each paper

In [None]:
with open("info_articles.pkl","rb") as f:
    my_json = pkl.load(f)
my_json

[{'PaperTitle': 'Deciphering a shared transcriptomic regulation and the relative contribution of each regulator type through endometrial gene expression signatures',
  'Publication': '2023',
  'Authors': 'Antonio Parraga-Leo, Patricia Sebastian-Leon, Almudena Devesa-Peiro, Diana Marti-Garcia, Nuria Pellicer, Jose Remohi, Francisco Dominguez, Patricia Diaz-Gimeno',
  'Email': 'patricia.diaz@ivirma.com',
  'Abstract': 'Backgorund While various endometrial biomarkers have been characterized at the transcriptomic and functional level, there is generally a poor overlap among studies, making it unclear to what extent their upstream regulators (e.g., ovarian hormones, transcription factors (TFs) and microRNAs (miRNAs)) realistically contribute to menstrual cycle progression and function. Unmasking the intricacies of the molecular interactions in the endometrium from a novel systemic point of view will help gain a more accurate perspective of endometrial regulation and a better explanation the

In [None]:
pmid = {}
url_pmc = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{a}/"

for article in my_json:

    pmid[article['PaperTitle']] = []

    if 'https://' in my_json[0]['URL']:
        doi = re.sub(r'https://doi.org/', '', article['URL'])
        response = requests.get('https://api.crossref.org/works/'+doi)
        list_reference = response.json()['message'].get('reference',None)
        article['List_references'] = [x.get('DOI') for x in list_reference if x.get('DOI')]

        # We retrieve the pmid from each DOI

        for a in  article['List_references']:
            
            pmid[article['PaperTitle']].append(get_pmid_from_doi(a))

        # We try to get the pdf

        command = [
        "python3", "-m", "pubmed2pdf", "pdf",
        "--out", "Data2/",
        "--pmids", f"{', '.join(pmid[article['PaperTitle']])}",
        "--verbose",
        "--errors", "Data2/pubmed2pdf_log.txt"
        ]
        subprocess.run(command)

        # We try to get the html otherwise

        to_retrieve = pd.read_csv("Data2/pubmed2pdf_log.txt", header=None,dtype='str') # We get those that resulted in a mistake
        to_retrieve.columns = ["PMID"]
        to_retrieve = to_retrieve['PMID'].tolist()
        to_retrieve.extend([re.sub(r'.html','',f)  for f in os.listdir('./Data2') if f.endswith('.html')]) # We add those in html format
        [os.remove(os.path.join('Data2',f))  for f in os.listdir('./Data2') if f.endswith('.html')] # Then, we remove from directory
        
        pmcid = []
        for pmid in to_retrieve:
            pmcid.append(get_pmcid_from_pmid(pmid))
        
        response = requests.get(url_pmc, headers=HEADERS)
        soup = BeautifulSoup(response.text, "html.parser")
        soup.find("article").get_text(separator="\n", strip=True)




2025-08-10 19:08:04,763 - DEBUG - pubmed2pdf.cli - Full log mode activated
2025-08-10 19:08:04,764 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16832043
2025-08-10 19:08:05,439 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 16832043 failed from error list index out of range
2025-08-10 19:08:05,440 - INFO - pubmed2pdf.cli - Trying to fetch pmid 20729534
2025-08-10 19:08:06,127 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 20729534 failed from error list index out of range
2025-08-10 19:08:06,127 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30929718
2025-08-10 19:08:07,885 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 30929718 failed from error direct_pdf_link() takes 1 positional argument but 3 were given
2025-08-10 19:08:07,885 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16306079
2025-08-10 19:08:08,525 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 16306079 failed from error list index out of range
2025-08-10 19:08:08,525 - INFO - pubmed2pdf.cli - Trying to fetch 

Done downloading. All downloaded can be found in Data2/


2025-08-10 19:09:48,571 - DEBUG - pubmed2pdf.cli - Full log mode activated
2025-08-10 19:09:48,572 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1155504
2025-08-10 19:09:49,824 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 1155504 failed from error direct_pdf_link() takes 1 positional argument but 3 were given
2025-08-10 19:09:49,825 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1424330
2025-08-10 19:09:50,989 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 1424330 failed from error Invalid URL 'wenmhi4L9dPogJ7fBmofTSqFqabAb1VqRbFzzuvnOkth6PgfEnsViH5uoEhxes7n': No scheme supplied. Perhaps you meant https://wenmhi4L9dPogJ7fBmofTSqFqabAb1VqRbFzzuvnOkth6PgfEnsViH5uoEhxes7n?
2025-08-10 19:09:50,989 - INFO - pubmed2pdf.cli - Trying to fetch pmid 10362823
2025-08-10 19:09:51,701 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 10362823 failed from error list index out of range
2025-08-10 19:09:51,701 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15353123
2025-08-10 19:09:53,993 - I

Done downloading. All downloaded can be found in Data2/


2025-08-10 19:10:51,291 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 12490681 failed from error list index out of range


{'Deciphering a shared transcriptomic regulation and the relative contribution of each regulator type through endometrial gene expression signatures': ['16832043',
  '20729534',
  '30929718',
  '16306079',
  '29102484',
  '30624659',
  '29452422',
  '23994285',
  '23290997',
  '30951376',
  '21849299',
  '24882617',
  '18192189',
  '31462323',
  '12200466',
  '12529417',
  '15878921',
  '35085395',
  '19933690',
  '33576824',
  '19617889',
  '10592173',
  '10802651',
  '31340985',
  '29156006',
  '26415722',
  '26395145',
  '32641214',
  '25605792',
  '11846609',
  '28855728',
  '20619403',
  '12021176',
  '15501903',
  '15666095',
  '12728018',
  '30081718',
  '18539642',
  '23555582',
  '22025212',
  '28523980',
  '16672246',
  '7962413',
  '23933037',
  '26804062',
  '32482256',
  '35147192',
  '22902743',
  '29663566',
  '28229988',
  '30037059',
  '22911744',
  '34117589',
  '31511876',
  '24100212',
  '16807381',
  '33830236',
  '21964334',
  '19563753',
  '35574918',
  '18174356

In [None]:
len(article['List_references'])
[os.remove(os.path.join('Data2',f))  for f in os.listdir('./Data2') if f.endswith('.html')]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [58]:
def get_pmid_from_doi(doi, api_key=None):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{doi}[DOI]",
        "retmode": "json"
    }
    if api_key:
        params["api_key"] = api_key

    response = requests.get(url, params=params)
    result = response.json()
    pmids = result.get("esearchresult", {}).get("idlist", [])
    return pmids[0] if pmids else 'None'



def get_pmcid_from_pmid(pmid, api_key=None):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    params = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "retmode": "json"
    }
    if api_key:
        params["api_key"] = api_key

    response = requests.get(url, params=params)
    data = response.json()

    try:
        linksets = data["linksets"]
        if linksets and "linksetdbs" in linksets[0]:
            pmcid_link = linksets[0]["linksetdbs"][0]["links"][0]
            return pmcid_link
        else:
            return None
    except Exception:
        return None
    
def download_pmc_pdf(pmcid, filename=None):
    pmc_number = pmcid.replace("PMC", "")
    pdf_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_number}/pdf/"

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }

    response = requests.get(pdf_url, headers=headers)
    if "pdf" in response.headers.get("Content-Type", ""):
        if not filename:
            filename = f"{pmcid}.pdf"
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"‚úÖ PDF downloaded as {filename}")
    else:
        print(f"‚ö†Ô∏è PDF not directly available at {pdf_url}")
        

def download_pdf_from_pmid(pmid, api_key=None):
    print(f"üîé Checking PMC for PMID: {pmid}")
    pmcid = get_pmcid_from_pmid(pmid, api_key)
    if pmcid:
        print(f"‚úÖ Found PMC ID: {pmcid}")
        download_pmc_pdf(pmcid)
    else:
        print("‚ùå No PMC version available for this PubMed article.")

In [None]:
pmids = []
for article in my_json:
    for doi in article['List_references']:
        pmid = get_pmid_from_doi(doi=doi)
        pmids.append(pmid)

In [None]:
for pmid in list(set(pmids)):
    with open('Data2/pmids.txt',"a") as f:
        f.write(pmid+'\n')

In [None]:
import subprocess
command = [
    "python3", "-m", "pubmed2pdf", "pdf",
    "--out", "Data2/",
    "--pmidsfile", "Data2/pmids.txt",
    "--verbose",
    "--errors", "Data2/pubmed2pdf_log.txt"
]
subprocess.run(command)

2025-08-02 16:38:25,860 - DEBUG - pubmed2pdf.cli - Full log mode activated
2025-08-02 16:38:25,861 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28923940
2025-08-02 16:38:26,583 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 28923940 failed from error list index out of range
2025-08-02 16:38:26,583 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18539642
2025-08-02 16:38:27,254 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 18539642 failed from error list index out of range
2025-08-02 16:38:27,254 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26415722
2025-08-02 16:38:28,786 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 26415722 failed from error list index out of range
2025-08-02 16:38:28,786 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31540845
2025-08-02 16:38:30,006 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 31540845 failed from error direct_pdf_link() takes 1 positional argument but 3 were given
2025-08-02 16:38:30,006 - INFO - pubmed2pdf.cli - Trying to fetch 

Done downloading. All downloaded can be found in Data2/


CompletedProcess(args=['python3', '-m', 'pubmed2pdf', 'pdf', '--out', 'Data2/', '--pmidsfile', 'Data2/pmids.txt', '--verbose', '--errors', 'Data2/pubmed2pdf_log.txt'], returncode=0)

In [None]:


to_retrieve = pd.read_csv("Data2/pubmed2pdf_log.txt", header=None,dtype='str')
to_retrieve.columns = ["PMID"]
to_retrieve['PMID']

0      1155504
1      1424330
2     10362823
3     16306079
4     10221616
5     28863933
6     28923940
7     33313697
8     36822566
9     24269084
10    24581986
11    30624659
12    33077239
13    33576824
14    34875061
15    29452422
16    20619403
17    23102856
18    34199109
19    12421895
20    22256780
21    24581625
22    36070983
23    32723696
24    33067123
25    36472596
26    11752295
27    25605792
28         NaN
29         NaN
30         NaN
31    10592173
32    33036008
33    28370781
34    34947871
35    35929523
36    27441287
37    31540845
38    33830236
39    29315421
40    35092277
41    32482256
42    12490681
Name: PMID, dtype: object

In [None]:
a = get_pmcid_from_pmid(to_retrieve['PMID'][4])
a

'12282335'

In [None]:
url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{a}/"
response = requests.get(url, headers=HEADERS)

In [None]:
soup = BeautifulSoup(response.text, "html.parser")
print(soup.find("article").get_text(separator="\n", strip=True))

Am J Obstet Gynecol
. Author manuscript; available in PMC: 2025 Jul 22.
Published in final edited form as:
Am J Obstet Gynecol. 2025 Apr;232(4 Suppl):S105‚ÄìS123. doi:
10.1016/j.ajog.2024.08.043
Search in PMC
Search in PubMed
View in NLM Catalog
Add to search
Endometriosis and Adenomyosis Unveiled Through Single-cell Glasses
Linda C Giudice
Linda C Giudice
1
Center for Reproductive Sciences, Department of Obstetrics, Gynecology & Reproductive Sciences, University of California, San Francisco, San Francisco, CA 94143
Find articles by
Linda C Giudice
1
,
Binya Liu
Binya Liu
1
Center for Reproductive Sciences, Department of Obstetrics, Gynecology & Reproductive Sciences, University of California, San Francisco, San Francisco, CA 94143
Find articles by
Binya Liu
1
,
Juan C Irwin
Juan C Irwin
1
Center for Reproductive Sciences, Department of Obstetrics, Gynecology & Reproductive Sciences, University of California, San Francisco, San Francisco, CA 94143
Find articles by
Juan C Irwin
1
Author

In [None]:
prueba = chain.invoke({"query": soup.find("article").get_text(separator="\n", strip=True)})

In [None]:
prueba

{'PaperTitle': 'Endometriosis and Adenomyosis Unveiled Through Single-cell Glasses',
 'Publication': '2025',
 'Authors': 'Linda C Giudice, Binya Liu, Juan C Irwin',
 'Email': 'Linda.Giudice@ucsf.edu',
 'Abstract': 'Single cell technologies are expanding our understanding of endometriosis and adenomyosis, sister disorders of the uterine endometrium that contain similar complements of lesion cell types but located in different niches ‚Äì outside and inside the endometrium, respectively. Both diseases cause significant morbidity and impaired quality of life among those affected, and current therapies mitigate most symptoms although with highly variable efficacy, duration of effect, and frequent intolerable side effects. Thus, there is a pressing need for transformative approaches to develop individualized therapies for the variety of presentations of endometriosis and adenomyosis symptoms and heterogeneity of lesion types histologically and architecturally. Single cell technologies are tr

In [None]:
pprint.pprint(prueba['Results'])

('Endometriosis through single cell glasses Subsequent to the single cell '
 'cartography of human endometrium published by Wang et al 19 (see Chapter 3), '
 'several groups have recently reported eutopic endometrial scRNAseq data of '
 'patients with endometriosis, endometriosis lesions and control eutopic '
 'endometrium 20 , 31 ‚Äì 36 ( Table 1 ). Types of samples, inclusion/exclusion '
 'criteria, choice of controls, study designs, numbers of cells sequenced, and '
 'number of reads (i.e., base pairs sequenced) are key to data integrity and '
 'interpretation and to comparative analyses across tissues and studies. They '
 'have provided insights into origins of ectopic disease, the heterogeneity of '
 'cell types/subtypes, unique clusters and signatures of endometrium and '
 'ectopic lesions informing mechanisms and pathways involved in cellular '
 'dysfunctions relevant to pain and fertility compromise, novel cell-cell '
 'communications, relationships between lesions and eutopic 

## Others

In [None]:
# Load several documents

storage = []

for root, dirs, files in os.walk("Data",): # for server all the path

    for file in files:
        print(file)

        loader = PyPDFLoader(os.path.join(root,file))
        pages = loader.load_and_split()

        # split it into chunks
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
        docs = text_splitter.split_documents(pages)
        storage.extend(docs) # This  is a list of lists


Devesa-Peiro2020.pdf
Henarejos-Castillo2020.pdf
Sebastian-Leon2021.pdf
Marti-Garcia2024(1).pdf
Henarejos-Castillo2021.pdf
Henarejos-Castillo2022.pdf
Diaz-Gimeno2024.pdf
Devesa-Peiro2021.pdf
Henarejos-Castillo2024.pdf
parraga-leo_2023.pdf
Marti-Garcia2024.pdf
Diaz-Gimeno2022.pdf
Devesa-Peiro2022.pdf
Sebastian-Leon2018.pdf
Diaz-Gimeno2017.pdf


In [83]:
# Load several documents

storage = []

for root, dirs, files in os.walk("Data",): # for server all the path

    for file in files:

        loader = PyPDFLoader(os.path.join(root,file),mode='page')
        pages = loader.load() # List of pages
        
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        docs = text_splitter.split_documents(pages)

        storage.extend(docs) # Add several items at the same time



In [13]:
# load the document and split it into pages
loader = PyPDFLoader("Publications/Parraga-Leo2023.pdf")
pages = loader.load_and_split()

# split it into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
storage = text_splitter.split_documents(pages)

In [14]:
pprint.pprint(storage[2].page_content)

('out the menstrual cycle (FDR < 0.05), dually validated in-silico and through '
 'endometrial biopsies, corroborated their \n'
 'potential regulatory roles in the endometrium.\n'
 '*Correspondence:\n'
 'Patricia Diaz‚ÄëGimeno\n'
 'patricia.diaz@ivirma.com; patricia_diaz@iislafe.es\n'
 'Full list of author information is available at the end of the article')


In [134]:
model = ChatOllama(
    model="gemma3:4b",
    temperature=0.1,
)

In [171]:
class paper(BaseModel):
    text: str = Field(description="Main text that explains the reuslts, introductions, methods or conclusions")
    doi: str = Field(description="link with the url of scientifc article known as doi.")
    citation: str = Field(description="add the citation of the manuscript is presented in the text.") 

In [172]:
parser = JsonOutputParser(pydantic_object=paper)

prompt = PromptTemplate(
    template="""
    You are an expert analyzing scientifc research papers. Read the text carefully to provide the request information.
    Remove the potential information that is not related with the scientific article.

    If it appears a link with the doi of the artcile, take it. Otherwise write down UNKOWN.

    Help me to extract the information and create an json file with it. If you don't know the
    answer, say UNKNOWN.
    
    Format instructions: \n{format_instructions}\n
    
    Case:\n{query}\n""",

    input_variables=["query"],
    
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [173]:
chain = prompt | model | parser

In [174]:
my_json = []

for docs in storage:
    print(docs.page_content)

    my_json.append(chain.invoke({"query": docs.page_content}))


#chain.invoke({"query": storage[0].page_content})

Parraga‚ÄëLeo¬†et¬†al. 
Reproductive Biology and Endocrinology           (2023) 21:84  
https://doi.org/10.1186/s12958‚Äë023‚Äë01131‚Äë4
RESEARCH Open Access
¬© The Author(s) 2023. Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which 
permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the 
original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or 
other third party material in this article are included in the article‚Äôs Creative Commons licence, unless indicated otherwise in a credit line 
to the material. If material is not included in the article‚Äôs Creative Commons licence and your intended use is not permitted by statutory 
regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this 
licence, visit htt

In [175]:
my_json

# my_json['author'] = "parraga-leo"
# my_json['title'] = storage[0].metadata['title']
# my_json

[{'text': 'A total of 3,608 distinct genes from the 19 gene lists were associated with endometrial progression',
  'doi': '10.1186/s12958‚Äë023‚Äë01131‚Äë4',
  'citation': 'Parraga‚ÄëLeo et al. (2023)'},
 {'text': 'The lists‚Äô regulation was significantly favoured by TFs (89% (17/19) of gene lists) and progesterone (47% (8 /19) of gene lists), rather than miRNAs (5% (1/19) of gene lists) or estrogen (0% (0/19) of gene lists), respectively (FDR < 0.05). Exceptionally, two gene lists that were previously associated with implantation failure and unexplained infertility were less hormone‚Äëdependent, but primarily regulated by estrogen. Although endometrial progression genes were mainly targeted by hormones rather than non‚Äëhormonal contributors (odds ratio = 91.94, FDR < 0.05), we identified 311 TFs and 595 miRNAs not previously associated with ovarian hormones. We highlight CTCF, GATA6, hsa‚ÄëmiR‚Äë15a‚Äë5p, hsa‚ÄëmiR‚Äë218‚Äë5p, hsa‚ÄëmiR‚Äë107, hsa‚ÄëmiR‚Äë103a‚Äë3p, and hsa‚ÄëmiR‚Äë

In [4]:
# load the document and split it into pages
loader = PyPDFLoader("/data/local/aparraga/Bioinformatician/RAG/Publications/Parraga-Leo2023.pdf")
pages = loader.load_and_split()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
storage = text_splitter.split_documents(pages)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# create the chroma client
import uuid
import chromadb
from chromadb.config import Settings

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

client = chromadb.HttpClient(host='localhost', port=7000, settings=Settings(allow_reset=True))
# client.list_collections()
# client.reset()  # resets the database
collection = client.get_or_create_collection("tfm_APL")
# collection = client.get_collection('tfm')

for i, doc in enumerate(my_json): #storage:
    print(doc['text'])
    collection.add(
        ids=[str(uuid.uuid1())], metadatas=storage[i].metadata, documents=str(doc['text'])
    )

# tell LangChain to use our client and collection name
db = Chroma(
    client=client,
    collection_name="tfm",
    embedding_function=embedding_function,
)

A total of 3,608 distinct genes from the 19 gene lists were associated with endometrial progression
The lists‚Äô regulation was significantly favoured by TFs (89% (17/19) of gene lists) and progesterone (47% (8 /19) of gene lists), rather than miRNAs (5% (1/19) of gene lists) or estrogen (0% (0/19) of gene lists), respectively (FDR < 0.05). Exceptionally, two gene lists that were previously associated with implantation failure and unexplained infertility were less hormone‚Äëdependent, but primarily regulated by estrogen. Although endometrial progression genes were mainly targeted by hormones rather than non‚Äëhormonal contributors (odds ratio = 91.94, FDR < 0.05), we identified 311 TFs and 595 miRNAs not previously associated with ovarian hormones. We highlight CTCF, GATA6, hsa‚ÄëmiR‚Äë15a‚Äë5p, hsa‚ÄëmiR‚Äë218‚Äë5p, hsa‚ÄëmiR‚Äë107, hsa‚ÄëmiR‚Äë103a‚Äë3p, and hsa‚ÄëmiR‚Äë128‚Äë3p, as overlapping novel master regulators of endometrial function. The gene expression changes of selected r

In [None]:
# from ollama import Client
# client = Client(host='http://localhost:11434')
# stream = client.chat(model='gemma3:12b', messages=[
# {"role": "system", "content": get_system_message_rag(fullcontent)},            
# {"role": "user", "content": get_ques_response_prompt(query)}
# ],stream=True)