# Prepare documents for RAG

## Libraries

In [2]:
import langchain_community
import langchain_text_splitters
from langchain_community.document_loaders import PyPDFLoader, pdf
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
import os
import pprint
import re
from langchain_core.documents import Document
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field
import json
import uuid
import chromadb
from chromadb.config import Settings
import unicodedata
from langchain_google_genai import GoogleGenerativeAI
import uuid
# from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle as pkl
import requests
import subprocess
import pandas as pd
from bs4 import BeautifulSoup
import tqdm
import time
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
import datetime
import time

  from .autonotebook import tqdm as notebook_tqdm


## Google option

#### Config

In [3]:
import os
os.getcwd()

'/home/antonioparragaleo/Documents/RAG'

In [4]:
with open('api_google.txt') as f:
    
    api_key = json.load(f)

#### Functions

In [64]:
def clean_text(text):
    # Normalize weird Unicode characters to their closest ASCII equivalent
    text = unicodedata.normalize("NFKC", text)
    # Replace non-breaking hyphens and dashes with ASCII hyphen
    text = re.sub(r'[\u2010-\u2015]', '-', text)
     # Remove form feed characters (\x0c), common page breaks
    text = text.replace('\x0c', ' ')
    # Replace non-breaking spaces (\xa0) with regular space
    text = text.replace('\xa0', ' ')
    # 3. Fix hyphenated line breaks (e.g., treat-\nment -> treatment)
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
    # Remove numbered citations like (1), (9, 10), (5–7), etc.
    text = re.sub(r"\(\s?\d+(?:\s?(?:,|-)\s?\d+)*\s?\)", "", text)
    # Remove mid-sentence line breaks: "word\nword" -> "word word"
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Replace multiple newlines with just two (preserve paragraphs)
    text = re.sub(r'\n{2,}', '\n\n', text)
    return text.strip()

def get_system_message_rag(content):
        return f"""You are an expert consultant helping executive advisors to get relevant information from scientific articles and code related to reproduction and bioinformatics.

        Generate your response by following the steps below:
        1. Recursively break down the question into smaller questions to better understand it.
        2. For each question/directive:
            2a. Select the most relevant information from the context in light of the conversation history.
        3. Generate a draft response using selected information.
        4. Remove duplicate content from draft response.
        5. Generate your final response after adjusting it to increase accuracy and relevance.
        6. Do not try to summarize the answers, explain it properly.
        7. When you provide information, you must also provide the reference of the article.
        8. Do not look up on internet.
        9. Only show your final response! 
        
        Constraints:
        1. DO NOT PROVIDE ANY EXPLANATION OR DETAILS OR MENTION THAT YOU WERE GIVEN CONTEXT. Only do that when questions are related to coding.
        2. Don't mention that you are not able to find the answer in the provided context.
        3. Ignore the part of the content that only contains references.
        3. Don't make up the answers by yourself.
        4. Try your best to provide answer from the given context.

        CONTENT:
        {content}
        """

def get_ques_response_prompt(question, context):
    return f"""
    Context\n:
    {context}
    ==============================================================
    Based on the above context, please provide the answer to the following question\n:
    {question}
    """

def get_pmid_from_doi(doi, api_key=None):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{doi}[DOI]",
        "retmode": "json"
    }
    if api_key:
        params["api_key"] = api_key

    response = requests.get(url, params=params)
    result = response.json()
    pmids = result.get("esearchresult", {}).get("idlist", [])
    return pmids[0] if pmids else 'None'



def get_pmcid_from_pmid(pmid, api_key=None):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    params = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "retmode": "json"
    }
    if api_key:
        params["api_key"] = api_key

    response = requests.get(url, params=params)
    

    try:
        data = response.json()
        pmcid_link = None
        linksets = data['linksets'][0]['linksetdbs']

        for j in linksets:
            if j.get('linkname') =='pubmed_pmc':
                pmcid_link = j['links'][0]
                return pmcid_link
    except Exception:
        return None

def clean_doi(doi):
    """Remove prefixes from DOI links and standardize format."""
    return re.sub("(https://doi\.org/|http://dx\.doi\.org/)", "", doi.strip())

## Timing

In [6]:
time_management = {}

#### Model

In [7]:
llm = GoogleGenerativeAI(model="gemini-2.0-flash",api_key=api_key['key'],temperature=0.2)
# from langchain_google_genai import ChatGoogleGenerativeAI
# llm = ChatGoogleGenerativeAI(model="gemma-3-27b-it",api_key=api_key['key'], temperature=0.2)

#### Load PDFs

In [8]:
paths = []
paths_folder_create = []
for root, dirs, files in os.walk("Data"):
    for f in files:
        if f.endswith('.pdf'):
            paths.append(os.path.join(root,f))
            paths_folder_create.append(os.path.join(os.getcwd(),"ToDataBase",f))

We create a folder with each article to pre-process the references

In [137]:
os.mkdir(os.path.join(os.getcwd(),"ToDataBase"))
[os.mkdir(re.sub(r'.pdf','',p)) for p in paths_folder_create]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [9]:
articles = []
for file in paths:
    loader = loader = PyPDFLoader(file, mode="single")
    doc = loader.load()[0] # As this function provides a list, we select the first element.
    doc.page_content = clean_text(doc.page_content)
    
    articles.append(doc)

#### Json generation

In [10]:
class paper(BaseModel):

    PaperTitle: str = Field(description="The full title of the research paper")
    Publication: str = Field(description="Year: The year the paper was published")
    Authors: str = Field(description="The full names of all authors of the paper")
    Email: str = Field(description="The email address of the author (if provided)")
    Abstract: str = Field(description="The full text of the paper's abstract.")
    Introduction: str = Field(description="The full text fo the paper's introduction. Bear in mind that it can have other names such as background.")
    Methods: str = Field(description="The full text fo the paper's methods. Don't take the information from abstract.")
    Results: str = Field(description="The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.")
    Discussion: str = Field(description="The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It should be entitled 'Discussion'")
    Conclusion: str = Field(description="The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.")
    DOI: str = Field(description="Provide the Digital Object Identifier (DOI) for the paper. If a DOI is not available, provide a URL for the paper. This field is compulsory. Example of a DOI format: 10.1007/s10814-017-9105-3, 10.1016/j.cell.2023.08.001, 10.1038/nature12345.")
    Journal: str = Field(description="provide the name of the journal, e.g Nature.")

In [11]:
parser = JsonOutputParser(pydantic_object=paper)

prompt = PromptTemplate(
    template="""
You are an expert in analyzing scientific research papers. Please carefully read the provided research paper above and extract the following key information:
Extract these nine (10) properties from the research paper:

Paper Title: The full title of the research paper

Publication Year: The year the paper was published

Authors: The full names of all authors of the paper

Email: The email address of the author (if provided)

Abstract: The full text of the paper's abstract

Introduction: The full text fo the paper's introduction. Bear in mind that it can have other names such as background.

Methods: The full text fo the paper's methods. Don't take the information from abstract.

Results: The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.

Discussion:The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It shoudl be entitled "Discussion"

Conclusion: The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.

DOI: Provide the Digital Object Identifier (DOI) for the paper. If a DOI is not available, provide a URL for the paper. This field is compulsory. Example of a DOI format: 10.1007/s10814-017-9105-3, 10.1016/j.cell.2023.08.001, 10.1038/nature12345.

Journal: provide the name of the journal, e.g Nature.


Guidelines:


The extracted information should be factual and accurate to the document. Be extremely concise, except for the Abstract, Introduction, Methods, Results. Discussion and Conclusion which should be copied in full.
The extracted entities should be self-contained and easily understood without the rest of the paper. If a property is missing from the paper, please leave the field empty rather than guessing.
Answer in JSON format. The JSON should contain 9 keys: "PaperTitle", "PublicationYear", "Authors", "Email", "Abstract", "Introduction", "Methods","Results", "DOI", "Reference". You MUST add the DOI since is key to identify the article.

Format instructions: \n{format_instructions}\n
    
The article is this:\n{query}\n""",

    input_variables=["query"],

    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [67]:
chain = prompt | llm | parser

doi_prompt = PromptTemplate(
    template=(
        """Extract the Digital Object Identifier (DOI) from the following paper.
        "The DOI must be returned in its raw form (e.g., 10.1007/s10814-017-9105-3,
        "10.1016/j.cell.2023.08.001, 10.1038/nature12345).
        "Return only the DOI string with no additional text.
        "If no DOI is found, return None.\n\n
        "Paper:\n{query}"""
    ),
    input_variables=["query"]
)

chain_doi = doi_prompt | llm

In [None]:
start = time.time()
my_json = []

for doc in tqdm.tqdm(articles, total=len(articles)):
    info_article = chain.invoke({"query": doc.page_content})

    if 'DOI' not in info_article.keys():
        previous_doi= chain_doi.invoke({"query":doc.page_content})
        if 'https://doi.org/' not in previous_doi:
            info_article['DOI'] = 'https://doi.org/' + previous_doi
        else:
            info_article['DOI'] = previous_doi
    
    my_json.append(info_article)

time_management["ParserMain"] = time.time()-start

with open('info_articles_main.pkl','wb') as f:
    pkl.dump(my_json,f)

100%|██████████| 18/18 [11:53<00:00, 39.62s/it]


## References

In [89]:
# already DOI (To avoid repeat)

includedDOI = [ re.sub('(https://doi\.org/|http://dx\.doi\.org/)', '',a['DOI']) for a in my_json]

In [105]:
my_json_ref = []
pmid_dict = {}
# tocheck={}
not_available = []

HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }

for i, article in enumerate(my_json):
    
    path_name = re.sub(r'.pdf','',paths[i]).split('/')[1]
    pmid_dict[path_name] = []
    #tocheck[path_name] = []

    print(f"We process the article: {path_name}")

    doi = clean_doi(article['DOI'])
    
    cross_ref = requests.get('https://api.crossref.org/works/'+doi)
    list_reference = cross_ref.json()['message'].get('reference',None)
    article['List_references'] = list(set([x.get('DOI') for x in list_reference if x.get('DOI') and x.get('DOI') not in includedDOI]))

        # for x in list_reference:

        #     if x.get('DOI'):

        #         title = x.get('article-title')
        #         doi = x.get("DOI")
        #         pmid = get_pmid_from_doi(doi)
        #         pmcid = get_pmcid_from_pmid(pmid)
        #         year = x.get("year")
        #         tocheck[path_name].append({title:[doi,pmid, year,pmcid]})


        # tocheck[path_name] = [{'List_references' : list(set([(x.get('DOI')) for x in list_reference if x.get('DOI') and x.get('DOI') not in includedDOI]))}]
        # tocheck[path_name] = [{'List_references' : list(set([(x.get('DOI')) for x in list_reference if x.get('DOI') and x.get('DOI') not in includedDOI]))}]

        # We retrieve the pmid from each DOI

    for ref_doi in article['List_references']:
            
        pmid_dict[path_name].append(get_pmid_from_doi(ref_doi))

    # We try to get the pdf

    print("Getting PDFs-----------------------------\n")

    output_dir = os.path.join("ToDataBase", path_name)

    if len(pmid_dict[path_name]) > 0:

        command = [
        "python3", "-m", "pubmed2pdf", "pdf",
        "--out", output_dir,
        "--pmids", f"{', '.join(pmid_dict[path_name])}",
        "--maxtries","1",
        "--errors", os.path.join(output_dir,"pubmed2pdf_log.txt")
        ]
        subprocess.run(command)

        # We process pdfs

        pdfs2retrieve = [os.path.join(output_dir,f) for f in os.listdir(output_dir) if f.endswith('.pdf')]

        print("Turn PDFs into JSON--------------------\n")

        for pdf in tqdm.tqdm(pdfs2retrieve, total=len(pdfs2retrieve)):

            loader = loader = PyPDFLoader(pdf, mode="single")
            doc = loader.load()[0] # As this function provides a list, we select the first element.
            doc.page_content = clean_text(doc.page_content)

            try:
                info_article = chain.invoke({"query": doc.page_content})
                doi_candidate = info_article.get("DOI",None)

                if not doi_candidate:

                    retry_doi = chain_doi.invoke({"query":doc.page_content})
                    info_article['DOI'] = 'https://doi.org/' + retry_doi if 'https://doi.org/' not in retry_doi else retry_doi

                else:
                    info_article['DOI'] = 'https://doi.org/' + info_article['DOI'] if 'https://doi.org/' not in info_article['DOI'] else info_article['DOI']

                my_json_ref.append(info_article)
            except:
                not_available.append(re.sub(r".pdf","",os.path.basename(pdf)))

        # We try to get the html otherwise

        print("Getting xml from PMC--------------------\n")

        log_file = os.path.join(output_dir,"pubmed2pdf_log.txt")
        if os.path.exists(log_file):
            to_retrieve = pd.read_csv(log_file, header=None,dtype='str')[0].tolist() # We get those files that could not be retrieved
        else:
            to_retrieve = []

        to_retrieve.extend([re.sub(r'.html','',f)  for f in os.listdir(output_dir) if f.endswith('.html')]) # We add those in html format since they do not have information


        # Clean information
        for f in os.listdir(output_dir):
            if f.endswith('html'):
                os.remove(os.path.join(output_dir,f)) # Then, we remove from directory
        
        print("Turn XML into JSON--------------------\n")
        
        for pmid in tqdm.tqdm(to_retrieve, total=len(to_retrieve)):

            try:
                pmcid = get_pmcid_from_pmid(pmid)
                response = requests.get(url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/", headers=HEADERS)
                soup = BeautifulSoup(response.text, "html.parser")
                text_paper = soup.find("article").get_text(separator="\n", strip=True)
                info_article = chain.invoke({"query": text_paper})

                doi_candidate = info_article.get("DOI",None)
                if not doi_candidate:
                    retry_doi= chain_doi.invoke({"query": text_paper})
                    info_article['DOI'] = 'https://doi.org/' + retry_doi if 'https://doi.org/' not in retry_doi else retry_doi
                        
                else:
                    info_article['DOI'] = 'https://doi.org/' + info_article['DOI'] if 'https://doi.org/' not in info_article['DOI'] else info_article['DOI']

                my_json_ref.append(info_article)
                
            except:
                not_available.append(pmid)

            time.sleep(1)
    

    includedDOI.extend(article['List_references'])
    with open("info_articles_ref_final.pkl","wb") as f:
        pkl.dump(my_json_ref, f)

We process the article: Devesa-Peiro2020
Getting PDFs-----------------------------

We process the article: Sebastian-Leon2025
Getting PDFs-----------------------------

We process the article: Henarejos-Castillo2020
Getting PDFs-----------------------------

We process the article: Parraga-Leo2023
Getting PDFs-----------------------------

We process the article: Garcia-Acero2025
Getting PDFs-----------------------------

We process the article: Sebastian-Leon2021
Getting PDFs-----------------------------

We process the article: Henarejos-Castillo2021
Getting PDFs-----------------------------



2025-08-17 21:24:52,774 - INFO - pubmed2pdf.cli - Trying to fetch pmid 21989058
2025-08-17 21:24:53,518 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27901055
2025-08-17 21:24:53,518 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24020646
2025-08-17 21:24:53,518 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25227694
2025-08-17 21:24:55,788 - INFO - pubmed2pdf.cli - Trying to fetch pmid 11125122
2025-08-17 21:24:56,503 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 21:24:57,337 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26243799
2025-08-17 21:24:58,065 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30276597
2025-08-17 21:25:00,376 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32461654
2025-08-17 21:25:00,376 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26620551
2025-08-17 21:25:01,311 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 21:25:02,131 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16722528
2025-08-17 21:25:02,131 - INFO - pubmed2pdf.cli 

Done downloading. All downloaded can be found in ToDataBase/Henarejos-Castillo2021
Turn PDFs into JSON--------------------



100%|██████████| 20/20 [09:05<00:00, 27.29s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 62/62 [20:21<00:00, 19.71s/it]


We process the article: Henarejos-Castillo2022
Getting PDFs-----------------------------



2025-08-17 21:56:26,230 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18586725
2025-08-17 21:56:26,911 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19628854
2025-08-17 21:56:27,900 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30817321
2025-08-17 21:56:28,605 - INFO - pubmed2pdf.cli - Trying to fetch pmid 17960529
2025-08-17 21:56:32,202 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31885807
2025-08-17 21:56:33,032 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32898168
2025-08-17 21:56:36,176 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32918567
2025-08-17 21:56:38,758 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22287627
2025-08-17 21:56:39,504 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32552854
2025-08-17 21:56:40,864 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30612956
2025-08-17 21:56:43,235 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29259471
2025-08-17 21:56:44,503 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33568179
2025-08-17 21:56:47,152 - INFO - pubmed2

Done downloading. All downloaded can be found in ToDataBase/Henarejos-Castillo2022
Turn PDFs into JSON--------------------



100%|██████████| 26/26 [12:39<00:00, 29.21s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 53/53 [18:13<00:00, 20.63s/it]


We process the article: Diaz-Gimeno2024
Getting PDFs-----------------------------



2025-08-17 22:29:39,976 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33880419
2025-08-17 22:29:41,121 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36822566
2025-08-17 22:29:42,415 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18546601
2025-08-17 22:29:44,658 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36472596
2025-08-17 22:29:45,352 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28370781
2025-08-17 22:29:46,334 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35929523
2025-08-17 22:29:47,132 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33036008
2025-08-17 22:29:48,727 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26109056
2025-08-17 22:29:50,228 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32723696
2025-08-17 22:29:51,467 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24581625
2025-08-17 22:29:52,774 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29737471
2025-08-17 22:29:55,961 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 22:29:56,712 - INFO - pubmed2pdf.

Done downloading. All downloaded can be found in ToDataBase/Diaz-Gimeno2024
Turn PDFs into JSON--------------------



100%|██████████| 3/3 [02:09<00:00, 43.30s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 32/32 [08:47<00:00, 16.49s/it]


We process the article: Devesa-Peiro2021
Getting PDFs-----------------------------



2025-08-17 22:41:40,580 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16707507
2025-08-17 22:41:42,320 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27323161
2025-08-17 22:41:43,472 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 22:41:44,302 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22819144
2025-08-17 22:41:45,575 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26856931
2025-08-17 22:41:46,857 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29576469
2025-08-17 22:41:48,184 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26366788
2025-08-17 22:41:49,534 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30588329
2025-08-17 22:41:52,025 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27998009
2025-08-17 22:41:52,740 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19910308
2025-08-17 22:41:53,436 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30477193
2025-08-17 22:41:54,306 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25243856
2025-08-17 22:41:55,031 - INFO - pubmed2pdf.

Done downloading. All downloaded can be found in ToDataBase/Devesa-Peiro2021
Turn PDFs into JSON--------------------



100%|██████████| 4/4 [03:03<00:00, 45.83s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 26/26 [05:43<00:00, 13.20s/it]


We process the article: Henarejos-Castillo2024
Getting PDFs-----------------------------



2025-08-17 22:51:34,072 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33025164
2025-08-17 22:51:36,722 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36055201
2025-08-17 22:51:37,967 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32943985
2025-08-17 22:51:39,633 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24995866
2025-08-17 22:51:40,818 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19810025
2025-08-17 22:51:41,656 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28574608
2025-08-17 22:51:42,454 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29566152
2025-08-17 22:51:44,199 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29425284
2025-08-17 22:51:44,905 - INFO - pubmed2pdf.cli - Trying to fetch pmid 34597585
2025-08-17 22:51:56,971 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30207912
2025-08-17 22:51:57,652 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25326635
2025-08-17 22:51:58,385 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15769979
2025-08-17 22:51:59,057 - INFO - pubmed2

Done downloading. All downloaded can be found in ToDataBase/Henarejos-Castillo2024
Turn PDFs into JSON--------------------



100%|██████████| 13/13 [08:26<00:00, 38.93s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 45/45 [18:56<00:00, 25.26s/it]


We process the article: Sanchez-Reyes2025
Getting PDFs-----------------------------



2025-08-17 23:20:48,511 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35085989
2025-08-17 23:20:49,870 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28627518
2025-08-17 23:20:52,818 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33489905
2025-08-17 23:20:55,638 - INFO - pubmed2pdf.cli - Trying to fetch pmid 38906211
2025-08-17 23:20:57,151 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35409214
2025-08-17 23:20:57,830 - INFO - pubmed2pdf.cli - Trying to fetch pmid 2903323
2025-08-17 23:21:00,118 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32992208
2025-08-17 23:21:01,469 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24227677
2025-08-17 23:21:02,182 - INFO - pubmed2pdf.cli - Trying to fetch pmid 37355665
2025-08-17 23:21:03,827 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23104886
2025-08-17 23:21:04,500 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31467207
2025-08-17 23:21:05,484 - INFO - pubmed2pdf.cli - Trying to fetch pmid 12215322
2025-08-17 23:21:06,756 - INFO - pubmed2p

Done downloading. All downloaded can be found in ToDataBase/Sanchez-Reyes2025
Turn PDFs into JSON--------------------



100%|██████████| 3/3 [01:41<00:00, 33.81s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 11/11 [01:39<00:00,  9.03s/it]


We process the article: Marti-Garcia2024
Getting PDFs-----------------------------



2025-08-17 23:24:47,325 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29250769
2025-08-17 23:24:48,023 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35351010
2025-08-17 23:24:49,415 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1521649
2025-08-17 23:24:50,722 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1838082
2025-08-17 23:24:52,045 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15368600
2025-08-17 23:24:53,835 - INFO - pubmed2pdf.cli - Trying to fetch pmid 21435901
2025-08-17 23:24:55,210 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31085094
2025-08-17 23:24:56,452 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19948745
2025-08-17 23:24:57,169 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25298042
2025-08-17 23:24:59,493 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30283331
2025-08-17 23:25:03,714 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 23:25:04,487 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28854727
2025-08-17 23:25:05,174 - INFO - pubmed2pdf.cl

Done downloading. All downloaded can be found in ToDataBase/Marti-Garcia2024
Turn PDFs into JSON--------------------



100%|██████████| 4/4 [02:04<00:00, 31.10s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 29/29 [08:27<00:00, 17.50s/it]


We process the article: Marti-Garcia2024_review
Getting PDFs-----------------------------



2025-08-17 23:37:02,776 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33258951
2025-08-17 23:37:03,484 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23754297
2025-08-17 23:37:04,182 - INFO - pubmed2pdf.cli - Trying to fetch pmid 9241295
2025-08-17 23:37:05,045 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 23:37:05,801 - INFO - pubmed2pdf.cli - Trying to fetch pmid 34061977
2025-08-17 23:37:06,856 - INFO - pubmed2pdf.cli - Trying to fetch pmid 6053626
2025-08-17 23:37:07,965 - INFO - pubmed2pdf.cli - Trying to fetch pmid 4347566
2025-08-17 23:37:08,694 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25785919
2025-08-17 23:37:09,388 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18454478
2025-08-17 23:37:11,506 - INFO - pubmed2pdf.cli - Trying to fetch pmid 8801134
2025-08-17 23:37:12,812 - INFO - pubmed2pdf.cli - Trying to fetch pmid 12641627
2025-08-17 23:37:13,506 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27178763
2025-08-17 23:37:14,701 - INFO - pubmed2pdf.cli 

Done downloading. All downloaded can be found in ToDataBase/Marti-Garcia2024_review
Turn PDFs into JSON--------------------



100%|██████████| 24/24 [14:26<00:00, 36.11s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 100/100 [13:14<00:00,  7.95s/it]


We process the article: Diaz-Gimeno2022
Getting PDFs-----------------------------



2025-08-18 00:07:39,716 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19246470
2025-08-18 00:07:40,422 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22285995
2025-08-18 00:07:41,098 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28443690
2025-08-18 00:07:41,776 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:07:42,585 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18077318
2025-08-18 00:07:43,269 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:07:44,051 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27386492

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(req.content, 'lxml')
2025-08-18 00:07:48,478 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:

Done downloading. All downloaded can be found in ToDataBase/Diaz-Gimeno2022
Turn PDFs into JSON--------------------



100%|██████████| 1/1 [00:28<00:00, 28.22s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 13/13 [03:19<00:00, 15.33s/it]


We process the article: Devesa-Peiro2022
Getting PDFs-----------------------------



2025-08-18 00:11:59,858 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31806903
2025-08-18 00:12:02,097 - INFO - pubmed2pdf.cli - Trying to fetch pmid 21245076
2025-08-18 00:12:02,778 - INFO - pubmed2pdf.cli - Trying to fetch pmid 17604715
2025-08-18 00:12:04,163 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23725226
2025-08-18 00:12:05,627 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15797956
2025-08-18 00:12:06,282 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:12:07,041 - INFO - pubmed2pdf.cli - Trying to fetch pmid 8903775
2025-08-18 00:12:08,291 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32173784
2025-08-18 00:12:10,755 - INFO - pubmed2pdf.cli - Trying to fetch pmid 12969699
2025-08-18 00:12:11,984 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24894503
2025-08-18 00:12:13,657 - INFO - pubmed2pdf.cli - Trying to fetch pmid 14559028
2025-08-18 00:12:14,900 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27271600
2025-08-18 00:12:15,582 - INFO - pubmed2pdf.c

Done downloading. All downloaded can be found in ToDataBase/Devesa-Peiro2022
Turn PDFs into JSON--------------------



100%|██████████| 5/5 [02:56<00:00, 35.37s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 19/19 [05:17<00:00, 16.71s/it]


We process the article: Sebastian-Leon2018
Getting PDFs-----------------------------



2025-08-18 00:21:11,348 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:21:12,143 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24737781
2025-08-18 00:21:12,869 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24082038
2025-08-18 00:21:13,581 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15539444
2025-08-18 00:21:14,327 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18035563
2025-08-18 00:21:15,620 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23756099
2025-08-18 00:21:17,027 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15924536
2025-08-18 00:21:18,020 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22080510
2025-08-18 00:21:18,696 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:21:19,460 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22683339
2025-08-18 00:21:20,767 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23664094
2025-08-18 00:21:22,064 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28710396
2025-08-18 00:21:24,632 - INFO - pubmed2pdf.cli 

Done downloading. All downloaded can be found in ToDataBase/Sebastian-Leon2018
Turn PDFs into JSON--------------------



100%|██████████| 2/2 [01:11<00:00, 35.53s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 19/19 [02:24<00:00,  7.61s/it]


We process the article: Diaz-Gimeno2017
Getting PDFs-----------------------------



2025-08-18 00:25:18,509 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27128483
2025-08-18 00:25:21,327 - INFO - pubmed2pdf.cli - Trying to fetch pmid 11120680
2025-08-18 00:25:22,005 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:25:22,764 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:25:24,516 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16005454
2025-08-18 00:25:25,886 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18252602
2025-08-18 00:25:27,146 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:25:27,907 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26385059
2025-08-18 00:25:29,064 - INFO - pubmed2pdf.cli - Trying to fetch pmid 20689021
2025-08-18 00:25:29,812 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24706003
2025-08-18 00:25:30,940 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27122490


Done downloading. All downloaded can be found in ToDataBase/Diaz-Gimeno2017
Turn PDFs into JSON--------------------



100%|██████████| 1/1 [00:41<00:00, 41.76s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 10/10 [01:43<00:00, 10.39s/it]


## Create database

In [32]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "!", "?", " "]  # smart splitting
)

In [None]:
info_paper = []

for j in my_json_final:

    for key, value in j.items():
    
        if key in ['Abstract', 'Introduction', 'Methods', 'Results', 'Discussion', 'Conclusion',] and value != "":

            if len(value) > 1200:
                chunks = splitter.split_text(value)

                for i, c in enumerate(chunks):

                    info_paper.append(
                        {
                            "chunk_index":i,
                            "content":c,
                            "parent":key,
                            "split":True,
                            "DOI":j.get("URL"),
                            "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Journal')+", "+j.get('Publication')
                        }
                    )
            else:

                info_paper.append(
                        {
                            "chunk_index":0,
                            "content":value,
                            "parent":key,
                            "split":False,
                            "DOI":j.get("URL"),
                            "Reference": j.get('Authors').split(",")[0]+" et al.,"+j.get('Journal')+", "+j.get('Publication')
                        }
                    )

In [52]:
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# client = chromadb.HttpClient(host='localhost', port=7000, settings=Settings(allow_reset=True))
emb_f= SentenceTransformer("avsolatorio/GIST-small-Embedding-v0", device="cuda")
embedding_function2 = HuggingFaceEmbeddings(model_name="avsolatorio/GIST-small-Embedding-v0")

In [35]:
# os.makedirs("./chroma_RAG", exist_ok=True)
# client = chromadb.HttpClient(host='localhost', port=7000, settings=Settings(allow_reset=True))
client = chromadb.PersistentClient(path="./chroma_RAG")

In [51]:
# client.delete_collection(name="ReproRAG")
collection = client.get_or_create_collection(
    name="ReproRAG",
    embedding_function=embedding_function,
    metadata={"Description": "Database containing scientific articles in reproductive field",
              "Created":str(datetime.datetime.today())},
    configuration={
         "hnsw": {"space": "cosine"}
    }

)

AttributeError: 'function' object has no attribute 'name'

In [55]:
# 2. Prepare documents, metadata, and IDs
texts = [chunk["content"] for chunk in info_paper]
metadatas = [{"parent": chunk["parent"], "chunk_index": chunk["chunk_index"],"DOI": chunk["DOI"], "Reference": chunk["Reference"]} for chunk in info_paper]
ids = [str(uuid.uuid1()) for _ in metadatas]

In [91]:
db = Chroma.from_texts(
    texts=texts,
    embedding=embedding_function2,
    metadatas=metadatas,
    ids=ids,
    collection_name="ReproRAG",
    persist_directory="./chromaRepro"
)

In [208]:
batch_size = 100  # or smaller if needed

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_vectors = vectors[i:i+batch_size]
    batch_metadatas = metadatas[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]
    
    collection.add(
        documents=batch_texts,
        embeddings=batch_vectors,
        metadatas=batch_metadatas,
        ids=batch_ids
    )

In [212]:
# tell LangChain to use our client and collection name
db = Chroma(
    client=client,
    collection_name="ReproRAG",
    embedding_function=embedding_function2,
)

In [53]:
help(Chroma)

Help on class Chroma in module langchain_community.vectorstores.chroma:

class Chroma(langchain_core.vectorstores.base.VectorStore)
 |  Chroma(collection_name: 'str' = 'langchain', embedding_function: 'Optional[Embeddings]' = None, persist_directory: 'Optional[str]' = None, client_settings: 'Optional[chromadb.config.Settings]' = None, collection_metadata: 'Optional[Dict]' = None, client: 'Optional[chromadb.Client]' = None, relevance_score_fn: 'Optional[Callable[[float], float]]' = None) -> 'None'
 |  
 |  .. deprecated:: 0.2.9 Use ``:class:`~langchain_chroma.Chroma``` instead. It will not be removed until langchain-community==1.0.
 |  
 |  `ChromaDB` vector store.
 |  
 |  To use, you should have the ``chromadb`` python package installed.
 |  
 |  Example:
 |      .. code-block:: python
 |  
 |              from langchain_community.vectorstores import Chroma
 |              from langchain_community.embeddings.openai import OpenAIEmbeddings
 |  
 |              embeddings = OpenAIEmbedd

In [57]:
db.similarity_search("What is the function of CTCF in endometrium?", 4)

[Document(metadata={'chunk_index': 7, 'DOI': 'https://doi.org/10.1186/s12958-023-01131-4', 'parent': 'Discussion', 'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023'}, page_content='. Consider-ing these attributed functions and its variable expression throughout the menstrual cycle, we propose that CTCF exerts an inhibitory role in endometrial tissue during the PF phase of the menstrual cycle. With the significant downregulation of CTCF in the secretory endometrium, dually validated in-silico and experimentally herein, its inhibited genes would be derepressed and become tran-scriptionally active during the WOI. This interpretation supports previous findings from our group demonstrat-ing that, during the WOI, a global transcriptional dere-pression may be required for implantation and early embryo development [63]. Transcriptional derepression has been associated with multiple human disease states and should be investigated further within the context o

In [68]:
a = db.similarity_search("What is the function of CTCF in endometrium?", 4)
print(a[0].metadata)
parent = a[0].metadata.get('parent')
print(parent)
ref = a[0].metadata.get('Reference')

b = db.get(
    where={

        "$and" :[
             {"Reference":ref},
             {"parent":parent}
        ]
        },
    include=["documents"]
        
    )

pprint.pprint("".join(b['documents']))

{'parent': 'Discussion', 'DOI': 'https://doi.org/10.1186/s12958-023-01131-4', 'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023', 'chunk_index': 7}
Discussion
('This data-driven approach exposed the common tran-scriptional regulators '
 'among 19 studies who proposed variable biomarkers of endometrial progression '
 'and function. In this study, we focused on understanding the relative '
 'contribution of both the hormonal and nonhormonal regulation, from an '
 'alternative holistic per-spective. We applied data-driven hypothesis '
 'research that, unlike the traditional scientific method, allowed us to '
 'generate new hypotheses based on all available bio-logically-relevant '
 'knowledge [16], observing the molec-ular relationships from a wider scale '
 'view. Besides, some traditional molecular procedures such as PCRs are also '
 'performed to corroborate the insight uncovered with this approach. We '
 'highlighted a larger influence of progestero

In [97]:
for r in a['metadatas'][0]:
     if r['parent'] not in ['Journal','URL']:
        print( "_".join([r['parent'],r['Reference'],str(r['chunk_index']+1)]))

Introduction_Patricia Diaz-Gimeno et al.,Fertil Steril®, 2024_1
Methods_Patricia Diaz-Gimeno et al.,Fertil Steril®, 2024_2
Results_Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023_8


In [127]:
a['metadatas']

[[{'parent': 'Journal',
   'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023',
   'chunk_index': 0,
   'DOI': 'https://doi.org/10.1186/s12958-023-01131-4'},
  {'DOI': 'https://doi.org/10.1016/j.fertnstert.2024.03.015',
   'chunk_index': 0,
   'parent': 'Introduction',
   'Reference': 'Patricia Diaz-Gimeno et al.,Fertil Steril®, 2024'},
  {'Reference': 'Patricia Diaz-Gimeno et al.,Fertil Steril®, 2024',
   'DOI': 'https://doi.org/10.1016/j.fertnstert.2024.03.015',
   'parent': 'Methods',
   'chunk_index': 1},
  {'parent': 'Results',
   'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023',
   'DOI': 'https://doi.org/10.1186/s12958-023-01131-4',
   'chunk_index': 7}]]

In [149]:
ideal_chunks

[['A key factor for reproductive success in assisted reproduction treatments is the status of the maternal endometrium during embryo implantation and fetal development. The endometrial cycle is reflected by the cyclic structural and functional changes of the endometrium across the menstrual cycle, particularly during the mid-secretory phase, to prepare for the‘‘window of implantation’’ (WOI) . After successful embryo implantation, the decidua (specialized layer of the endometrium) actively encapsulates the trophectoderm to support placentation and provide adequate vascularization for optimal fetal growth . Thus, approaches that predict and prevent endometrial-factor infertility could substantially improve assisted reproduction treatment outcomes by supporting the establishment and maintenance of pregnancy',
  '. There is a lack of consensus on the minimum number of implantation failures with good-quality embryos derived from ovum donation or with confirmed euploid karyotypes required t

In [69]:
query = "What is the function of CTCF in endometrium?"

In [74]:
results = db.similarity_search(query=query,k=4)

selected_index = []
ideal_chunks = []
meta_selected = []

for doc in results:

    r = doc.metadata

    if r['parent'] not in ['Journal','URL']:

        if "_".join([r['parent'],r['Reference'],str(r['chunk_index']+1)]) not in selected_index and "_".join([r['parent'],r['Reference'],str(r['chunk_index']-1)]) not in selected_index:
            
            ii = "_".join([r['parent'],r['Reference'],str(r['chunk_index'])])
            selected_index.append(ii)

            candidates = db.get(
            where= {"$and" :[
                        {"Reference":r['Reference']},
                        {"parent":r['parent']}
                ]
            })

            print(f"Candidates\n: {candidates['metadatas']['chunk_index']}")
            max_index = len(candidates['metadatas'])-1

            meta_selected.append(candidates['metadatas'])
            ideal_chunks.append([doc for doc,meta in zip(candidates['documents'], candidates['metadatas'])
                        if meta['chunk_index'] in [r["chunk_index"], max(r["chunk_index"]-1,0), min(r["chunk_index"] + 1,max_index)]])

context = []
print(ideal_chunks)

for text, meta in zip(ideal_chunks, meta_selected):
    context.append(f'Reference:{meta[0]["Reference"]}\n\nLink (DOI)\n: {meta[0]["DOI"]}\n\nSummary:\n\n{"".join(text)}\n\n')

print(context)

TypeError: list indices must be integers or slices, not str

In [73]:
min(1,len(candidates['metadatas'])-1)

1

In [86]:
for doc  in results:
    r = doc.metadata
    if r['parent'] not in ['Journal','URL']:
        
        if "_".join([r['parent'],r['Reference'],str(r['chunk_index']+1)]) not in selected_index and "_".join([r['parent'],r['Reference'],str(r['chunk_index']-1)]) not in selected_index:

            ii = "_".join([r['parent'],r['Reference'],str(r['chunk_index'])])
            selected_index.append(ii)

            candidates = db.get(
            where= {"$and" :[
                        {"Reference":r['Reference']},
                        {"parent":r['parent']}
                ]
            })

            max_index = len(candidates['metadatas'])-1
            meta_selected.append(candidates['metadatas'])
            ideal_chunks.append([doc for doc,meta in zip(candidates['documents'], candidates['metadatas'])
                        if meta['chunk_index'] in [r["chunk_index"], max(r["chunk_index"]-1,0), min(r["chunk_index"] + 1,max_index)]])

print(max_index)
print(len(ideal_chunks))

7
4


In [89]:
results[0]

Document(metadata={'DOI': 'https://doi.org/10.1186/s12958-023-01131-4', 'chunk_index': 7, 'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023', 'parent': 'Discussion'}, page_content='. Consider-ing these attributed functions and its variable expression throughout the menstrual cycle, we propose that CTCF exerts an inhibitory role in endometrial tissue during the PF phase of the menstrual cycle. With the significant downregulation of CTCF in the secretory endometrium, dually validated in-silico and experimentally herein, its inhibited genes would be derepressed and become tran-scriptionally active during the WOI. This interpretation supports previous findings from our group demonstrat-ing that, during the WOI, a global transcriptional dere-pression may be required for implantation and early embryo development [63]. Transcriptional derepression has been associated with multiple human disease states and should be investigated further within the context of

In [88]:
ideal_chunks[0]

['. SP1, which similarly regulated 17/19 signatures (FDR < 0.05), acts as a downstream paracrine target of progesterone to regulate estrogen inactivation, and could have a predominant role dur-ing the WOI [62, 63]. By focusing on the most univer-sal, overlapping TFs that were not previously related to implantation, we ultimately prioritized CTCF and GATA6 as novel key regulators of endometrial pro-gression, however, their specific molecular actions in endometrial progression and function were beyond the scope of this project and merit further investigation. CTCF is a conserved zinc finger protein whose regu-latory functions are well characterized throughout the human body [64]. CTCF acts as a transcriptional repres-sor in RNA polymerase II (Pol II) pausing and imprint-ing and X-chromosome inactivation [65], as well as an insulator, blocking the interaction between enhancers and the promoters of neighbouring genes [66]',
 '. Consider-ing these attributed functions and its variable expre

In [109]:
collection.get(
    where={"$and":[
           {"Reference":'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023'},
           {"parent":"Results"},
           {"chunk_index":0}
    ]}
)

{'ids': ['bd725078-6b0e-11f0-8e08-00155de32492'],
 'embeddings': None,
 'documents': ['Results Gene signatures associated to endometrial progression and function Of the 19 gene lists used for our analysis, eleven were obtained from published studies evaluating control patients throughout the menstrual cycle [4, 17-19, 34-40], and the other eight were derived from studies comparing patients with RIF or unexplained infertility to controls [21, 41-47] (Table 1). Unifying all the aforementioned signatures, we compiled 3,608 genes related to endometrial progression and function. Hormonal regulation of endometrial progression is largely driven by progesterone We identified 7,540 and 698 genes related to estrogen and progesterone hormones, respectively. However, as determined by the relative contribution of each type of hormone within each gene list (Fig. 1A), 17/19 (89%) signatures favoured regulation by progesterone rather than estrogen. These differences were significant in 47% of the sign

In [11]:
query = "How many phases does the endoemtrium have?"
docs = db.similarity_search(query,k=4)

In [80]:
docs

[Document(metadata={'chunk_index': 7, 'parent': 'Discussion', 'DOI': 'https://doi.org/10.1186/s12958-023-01131-4', 'Reference': 'Antonio Parraga-Leo et al.,Reproductive Biology and Endocrinology, 2023'}, page_content='. Considering these attributed functions and its variable expression throughout the menstrual cycle, we propose that CTCF exerts an inhibitory role in endometrial tissue during the PF phase of the menstrual cycle. With the significant downregulation of CTCF in the secretory endometrium, dually validated in-silico and experimentally herein, its inhibited genes would be derepressed and become transcriptionally active during the WOI. This interpretation supports previous findings from our group demonstrating that, during the WOI, a global transcriptional derepression may be required for implantation and early embryo development [63]. Transcriptional derepression has been associated with multiple human disease states and should be investigated further within the context of en

In [30]:
texts_re = []
metadata_re = []
for doc in docs:
    texts_re.append(doc.page_content)
    metadata_re.append(doc.metadata)

In [31]:
metadata_re

[{'parent': 'Discussion',
  'chunk_index': 7,
  'DOI': 'https://doi.org/10.1186/s12958-023-01131-4'},
 {'DOI': 'https://doi.org/10.1186/s12958-023-01131-4',
  'parent': 'Abstract',
  'chunk_index': 3},
 {'parent': 'Results',
  'DOI': 'https://doi.org/10.1186/s12958-023-01131-4',
  'chunk_index': 7},
 {'parent': 'Results',
  'chunk_index': 5,
  'DOI': 'https://doi.org/10.1186/s12958-023-01131-4'}]

In [32]:
# context = "\n\n---\n\n".join(texts_re)

context = []
for text, meta in zip(texts_re, metadata_re):

  context.append(f'The reference of article is {meta["DOI"]}, its information is:\n {text}')

print("\n\n".join(context))

The reference of article is https://doi.org/10.1186/s12958-023-01131-4, its information is:
 . Considering these attributed functions and its variable expression throughout the menstrual cycle, we propose that CTCF exerts an inhibitory role in endometrial tissue during the PF phase of the menstrual cycle. With the significant downregulation of CTCF in the secretory endometrium, dually validated in-silico and experimentally herein, its inhibited genes would be derepressed and become transcriptionally active during the WOI. This interpretation supports previous findings from our group demonstrating that, during the WOI, a global transcriptional derepression may be required for implantation and early embryo development [63]. Transcriptional derepression has been associated with multiple human disease states and should be investigated further within the context of endometrial-factor infertility. Despite previous associations of CTCF with endometriosis [67], its implication in endometrial p

In [113]:
prompt = get_ques_response_prompt(question=query, context=get_system_message_rag("\n\n".join(context)))

answer = llm.invoke(prompt)

In [114]:
answer

'CTCF is a conserved zinc finger protein with well-characterized regulatory functions throughout the human body. It acts as a transcriptional repressor in RNA polymerase II (Pol II) pausing, imprinting, and X-chromosome inactivation. Additionally, CTCF functions as an insulator, blocking the interaction between enhancers and the promoters of neighboring genes (https://doi.org/10.1186/s12958‑023‑01131‑4).\n\nIn endometrial tissue, CTCF is proposed to exert an inhibitory role during the PF phase of the menstrual cycle. Its significant downregulation in the secretory endometrium leads to the derepression of its inhibited genes, which then become transcriptionally active during the Window of Implantation (WOI) (https://doi.org/10.1186/s12958‑023‑01131‑4).\n\nCTCF is highlighted as a CCCTC-binding factor that functions as a transcriptomic repressor and has the most influential regulation of endometrial progression and function across 95% of studies (https://doi.org/10.1186/s12958‑023‑01131‑

## Download reference of each paper

In [14]:
with open("info_articles.pkl","rb") as f:
    my_json = pkl.load(f)

In [None]:
pmid = {}
url_pmc = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{a}/"

for article in my_json:

    pmid[article['PaperTitle']] = []

    if 'https://' in my_json[0]['URL']:
        doi = re.sub(r'https://doi.org/', '', article['URL'])
        response = requests.get('https://api.crossref.org/works/'+doi)
        list_reference = response.json()['message'].get('reference',None)
        article['List_references'] = [x.get('DOI') for x in list_reference if x.get('DOI')]

        # We retrieve the pmid from each DOI

        for a in  article['List_references']:
            
            pmid[article['PaperTitle']].append(get_pmid_from_doi(a))

        # We try to get the pdf

        command = [
        "python3", "-m", "pubmed2pdf", "pdf",
        "--out", "Data2/",
        "--pmids", f"{', '.join(pmid[article['PaperTitle']])}",
        "--verbose",
        "--errors", "Data2/pubmed2pdf_log.txt"
        ]
        subprocess.run(command)

        # We try to get the html otherwise

        to_retrieve = pd.read_csv("Data2/pubmed2pdf_log.txt", header=None,dtype='str') # We get those that resulted in a mistake
        to_retrieve.columns = ["PMID"]
        to_retrieve = to_retrieve['PMID'].tolist()
        to_retrieve.extend([re.sub(r'.html','',f)  for f in os.listdir('./Data2') if f.endswith('.html')]) # We add those in html format
        [os.remove(os.path.join('Data2',f))  for f in os.listdir('./Data2') if f.endswith('.html')] # Then, we remove from directory
        
        pmcid = []
        for pmid in to_retrieve:
            pmcid.append(get_pmcid_from_pmid(pmid))
        
        response = requests.get(url_pmc, headers=HEADERS)
        soup = BeautifulSoup(response.text, "html.parser")
        soup.find("article").get_text(separator="\n", strip=True)




2025-08-10 19:08:04,763 - DEBUG - pubmed2pdf.cli - Full log mode activated
2025-08-10 19:08:04,764 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16832043
2025-08-10 19:08:05,439 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 16832043 failed from error list index out of range
2025-08-10 19:08:05,440 - INFO - pubmed2pdf.cli - Trying to fetch pmid 20729534
2025-08-10 19:08:06,127 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 20729534 failed from error list index out of range
2025-08-10 19:08:06,127 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30929718
2025-08-10 19:08:07,885 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 30929718 failed from error direct_pdf_link() takes 1 positional argument but 3 were given
2025-08-10 19:08:07,885 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16306079
2025-08-10 19:08:08,525 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 16306079 failed from error list index out of range
2025-08-10 19:08:08,525 - INFO - pubmed2pdf.cli - Trying to fetch 

Done downloading. All downloaded can be found in Data2/


2025-08-10 19:09:48,571 - DEBUG - pubmed2pdf.cli - Full log mode activated
2025-08-10 19:09:48,572 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1155504
2025-08-10 19:09:49,824 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 1155504 failed from error direct_pdf_link() takes 1 positional argument but 3 were given
2025-08-10 19:09:49,825 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1424330
2025-08-10 19:09:50,989 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 1424330 failed from error Invalid URL 'wenmhi4L9dPogJ7fBmofTSqFqabAb1VqRbFzzuvnOkth6PgfEnsViH5uoEhxes7n': No scheme supplied. Perhaps you meant https://wenmhi4L9dPogJ7fBmofTSqFqabAb1VqRbFzzuvnOkth6PgfEnsViH5uoEhxes7n?
2025-08-10 19:09:50,989 - INFO - pubmed2pdf.cli - Trying to fetch pmid 10362823
2025-08-10 19:09:51,701 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 10362823 failed from error list index out of range
2025-08-10 19:09:51,701 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15353123
2025-08-10 19:09:53,993 - I

Done downloading. All downloaded can be found in Data2/


2025-08-10 19:10:51,291 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 12490681 failed from error list index out of range


{'Deciphering a shared transcriptomic regulation and the relative contribution of each regulator type through endometrial gene expression signatures': ['16832043',
  '20729534',
  '30929718',
  '16306079',
  '29102484',
  '30624659',
  '29452422',
  '23994285',
  '23290997',
  '30951376',
  '21849299',
  '24882617',
  '18192189',
  '31462323',
  '12200466',
  '12529417',
  '15878921',
  '35085395',
  '19933690',
  '33576824',
  '19617889',
  '10592173',
  '10802651',
  '31340985',
  '29156006',
  '26415722',
  '26395145',
  '32641214',
  '25605792',
  '11846609',
  '28855728',
  '20619403',
  '12021176',
  '15501903',
  '15666095',
  '12728018',
  '30081718',
  '18539642',
  '23555582',
  '22025212',
  '28523980',
  '16672246',
  '7962413',
  '23933037',
  '26804062',
  '32482256',
  '35147192',
  '22902743',
  '29663566',
  '28229988',
  '30037059',
  '22911744',
  '34117589',
  '31511876',
  '24100212',
  '16807381',
  '33830236',
  '21964334',
  '19563753',
  '35574918',
  '18174356

In [None]:
len(article['List_references'])
[os.remove(os.path.join('Data2',f))  for f in os.listdir('./Data2') if f.endswith('.html')]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [58]:
def get_pmid_from_doi(doi, api_key=None):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f"{doi}[DOI]",
        "retmode": "json"
    }
    if api_key:
        params["api_key"] = api_key

    response = requests.get(url, params=params)
    result = response.json()
    pmids = result.get("esearchresult", {}).get("idlist", [])
    return pmids[0] if pmids else 'None'



def get_pmcid_from_pmid(pmid, api_key=None):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    params = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "retmode": "json"
    }
    if api_key:
        params["api_key"] = api_key

    response = requests.get(url, params=params)
    data = response.json()

    try:
        linksets = data["linksets"]
        if linksets and "linksetdbs" in linksets[0]:
            pmcid_link = linksets[0]["linksetdbs"][0]["links"][0]
            return pmcid_link
        else:
            return None
    except Exception:
        return None
    
def download_pmc_pdf(pmcid, filename=None):
    pmc_number = pmcid.replace("PMC", "")
    pdf_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_number}/pdf/"

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }

    response = requests.get(pdf_url, headers=headers)
    if "pdf" in response.headers.get("Content-Type", ""):
        if not filename:
            filename = f"{pmcid}.pdf"
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"✅ PDF downloaded as {filename}")
    else:
        print(f"⚠️ PDF not directly available at {pdf_url}")
        

def download_pdf_from_pmid(pmid, api_key=None):
    print(f"🔎 Checking PMC for PMID: {pmid}")
    pmcid = get_pmcid_from_pmid(pmid, api_key)
    if pmcid:
        print(f"✅ Found PMC ID: {pmcid}")
        download_pmc_pdf(pmcid)
    else:
        print("❌ No PMC version available for this PubMed article.")

In [None]:
pmids = []
for article in my_json:
    for doi in article['List_references']:
        pmid = get_pmid_from_doi(doi=doi)
        pmids.append(pmid)

In [None]:
for pmid in list(set(pmids)):
    with open('Data2/pmids.txt',"a") as f:
        f.write(pmid+'\n')

In [None]:
import subprocess
command = [
    "python3", "-m", "pubmed2pdf", "pdf",
    "--out", "Data2/",
    "--pmidsfile", "Data2/pmids.txt",
    "--verbose",
    "--errors", "Data2/pubmed2pdf_log.txt"
]
subprocess.run(command)

2025-08-02 16:38:25,860 - DEBUG - pubmed2pdf.cli - Full log mode activated
2025-08-02 16:38:25,861 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28923940
2025-08-02 16:38:26,583 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 28923940 failed from error list index out of range
2025-08-02 16:38:26,583 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18539642
2025-08-02 16:38:27,254 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 18539642 failed from error list index out of range
2025-08-02 16:38:27,254 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26415722
2025-08-02 16:38:28,786 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 26415722 failed from error list index out of range
2025-08-02 16:38:28,786 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31540845
2025-08-02 16:38:30,006 - DEBUG - pubmed2pdf.cli - ** fetching of reprint 31540845 failed from error direct_pdf_link() takes 1 positional argument but 3 were given
2025-08-02 16:38:30,006 - INFO - pubmed2pdf.cli - Trying to fetch 

Done downloading. All downloaded can be found in Data2/


CompletedProcess(args=['python3', '-m', 'pubmed2pdf', 'pdf', '--out', 'Data2/', '--pmidsfile', 'Data2/pmids.txt', '--verbose', '--errors', 'Data2/pubmed2pdf_log.txt'], returncode=0)

In [None]:


to_retrieve = pd.read_csv("Data2/pubmed2pdf_log.txt", header=None,dtype='str')
to_retrieve.columns = ["PMID"]
to_retrieve['PMID']

0      1155504
1      1424330
2     10362823
3     16306079
4     10221616
5     28863933
6     28923940
7     33313697
8     36822566
9     24269084
10    24581986
11    30624659
12    33077239
13    33576824
14    34875061
15    29452422
16    20619403
17    23102856
18    34199109
19    12421895
20    22256780
21    24581625
22    36070983
23    32723696
24    33067123
25    36472596
26    11752295
27    25605792
28         NaN
29         NaN
30         NaN
31    10592173
32    33036008
33    28370781
34    34947871
35    35929523
36    27441287
37    31540845
38    33830236
39    29315421
40    35092277
41    32482256
42    12490681
Name: PMID, dtype: object

In [None]:
a = get_pmcid_from_pmid(to_retrieve['PMID'][4])
a

'12282335'

In [None]:
url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{a}/"
response = requests.get(url, headers=HEADERS)

In [None]:
soup = BeautifulSoup(response.text, "html.parser")
print(soup.find("article").get_text(separator="\n", strip=True))

Am J Obstet Gynecol
. Author manuscript; available in PMC: 2025 Jul 22.
Published in final edited form as:
Am J Obstet Gynecol. 2025 Apr;232(4 Suppl):S105–S123. doi:
10.1016/j.ajog.2024.08.043
Search in PMC
Search in PubMed
View in NLM Catalog
Add to search
Endometriosis and Adenomyosis Unveiled Through Single-cell Glasses
Linda C Giudice
Linda C Giudice
1
Center for Reproductive Sciences, Department of Obstetrics, Gynecology & Reproductive Sciences, University of California, San Francisco, San Francisco, CA 94143
Find articles by
Linda C Giudice
1
,
Binya Liu
Binya Liu
1
Center for Reproductive Sciences, Department of Obstetrics, Gynecology & Reproductive Sciences, University of California, San Francisco, San Francisco, CA 94143
Find articles by
Binya Liu
1
,
Juan C Irwin
Juan C Irwin
1
Center for Reproductive Sciences, Department of Obstetrics, Gynecology & Reproductive Sciences, University of California, San Francisco, San Francisco, CA 94143
Find articles by
Juan C Irwin
1
Author i

In [None]:
prueba = chain.invoke({"query": soup.find("article").get_text(separator="\n", strip=True)})

In [None]:
prueba

{'PaperTitle': 'Endometriosis and Adenomyosis Unveiled Through Single-cell Glasses',
 'Publication': '2025',
 'Authors': 'Linda C Giudice, Binya Liu, Juan C Irwin',
 'Email': 'Linda.Giudice@ucsf.edu',
 'Abstract': 'Single cell technologies are expanding our understanding of endometriosis and adenomyosis, sister disorders of the uterine endometrium that contain similar complements of lesion cell types but located in different niches – outside and inside the endometrium, respectively. Both diseases cause significant morbidity and impaired quality of life among those affected, and current therapies mitigate most symptoms although with highly variable efficacy, duration of effect, and frequent intolerable side effects. Thus, there is a pressing need for transformative approaches to develop individualized therapies for the variety of presentations of endometriosis and adenomyosis symptoms and heterogeneity of lesion types histologically and architecturally. Single cell technologies are tran

In [None]:
pprint.pprint(prueba['Results'])

('Endometriosis through single cell glasses Subsequent to the single cell '
 'cartography of human endometrium published by Wang et al 19 (see Chapter 3), '
 'several groups have recently reported eutopic endometrial scRNAseq data of '
 'patients with endometriosis, endometriosis lesions and control eutopic '
 'endometrium 20 , 31 – 36 ( Table 1 ). Types of samples, inclusion/exclusion '
 'criteria, choice of controls, study designs, numbers of cells sequenced, and '
 'number of reads (i.e., base pairs sequenced) are key to data integrity and '
 'interpretation and to comparative analyses across tissues and studies. They '
 'have provided insights into origins of ectopic disease, the heterogeneity of '
 'cell types/subtypes, unique clusters and signatures of endometrium and '
 'ectopic lesions informing mechanisms and pathways involved in cellular '
 'dysfunctions relevant to pain and fertility compromise, novel cell-cell '
 'communications, relationships between lesions and eutopic ti

## Others

In [None]:
# Load several documents

storage = []

for root, dirs, files in os.walk("Data",): # for server all the path

    for file in files:
        print(file)

        loader = PyPDFLoader(os.path.join(root,file))
        pages = loader.load_and_split()

        # split it into chunks
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
        docs = text_splitter.split_documents(pages)
        storage.extend(docs) # This  is a list of lists


Devesa-Peiro2020.pdf
Henarejos-Castillo2020.pdf
Sebastian-Leon2021.pdf
Marti-Garcia2024(1).pdf
Henarejos-Castillo2021.pdf
Henarejos-Castillo2022.pdf
Diaz-Gimeno2024.pdf
Devesa-Peiro2021.pdf
Henarejos-Castillo2024.pdf
parraga-leo_2023.pdf
Marti-Garcia2024.pdf
Diaz-Gimeno2022.pdf
Devesa-Peiro2022.pdf
Sebastian-Leon2018.pdf
Diaz-Gimeno2017.pdf


In [83]:
# Load several documents

storage = []

for root, dirs, files in os.walk("Data",): # for server all the path

    for file in files:

        loader = PyPDFLoader(os.path.join(root,file),mode='page')
        pages = loader.load() # List of pages
        
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        docs = text_splitter.split_documents(pages)

        storage.extend(docs) # Add several items at the same time



In [13]:
# load the document and split it into pages
loader = PyPDFLoader("Publications/Parraga-Leo2023.pdf")
pages = loader.load_and_split()

# split it into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
storage = text_splitter.split_documents(pages)

In [14]:
pprint.pprint(storage[2].page_content)

('out the menstrual cycle (FDR < 0.05), dually validated in-silico and through '
 'endometrial biopsies, corroborated their \n'
 'potential regulatory roles in the endometrium.\n'
 '*Correspondence:\n'
 'Patricia Diaz‑Gimeno\n'
 'patricia.diaz@ivirma.com; patricia_diaz@iislafe.es\n'
 'Full list of author information is available at the end of the article')


In [134]:
model = ChatOllama(
    model="gemma3:4b",
    temperature=0.1,
)

In [171]:
class paper(BaseModel):
    text: str = Field(description="Main text that explains the reuslts, introductions, methods or conclusions")
    doi: str = Field(description="link with the url of scientifc article known as doi.")
    citation: str = Field(description="add the citation of the manuscript is presented in the text.") 

In [172]:
parser = JsonOutputParser(pydantic_object=paper)

prompt = PromptTemplate(
    template="""
    You are an expert analyzing scientifc research papers. Read the text carefully to provide the request information.
    Remove the potential information that is not related with the scientific article.

    If it appears a link with the doi of the artcile, take it. Otherwise write down UNKOWN.

    Help me to extract the information and create an json file with it. If you don't know the
    answer, say UNKNOWN.
    
    Format instructions: \n{format_instructions}\n
    
    Case:\n{query}\n""",

    input_variables=["query"],
    
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [173]:
chain = prompt | model | parser

In [174]:
my_json = []

for docs in storage:
    print(docs.page_content)

    my_json.append(chain.invoke({"query": docs.page_content}))


#chain.invoke({"query": storage[0].page_content})

Parraga‑Leo et al. 
Reproductive Biology and Endocrinology           (2023) 21:84  
https://doi.org/10.1186/s12958‑023‑01131‑4
RESEARCH Open Access
© The Author(s) 2023. Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which 
permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the 
original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or 
other third party material in this article are included in the article’s Creative Commons licence, unless indicated otherwise in a credit line 
to the material. If material is not included in the article’s Creative Commons licence and your intended use is not permitted by statutory 
regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this 
licence, visit http://creativecom

In [175]:
my_json

# my_json['author'] = "parraga-leo"
# my_json['title'] = storage[0].metadata['title']
# my_json

[{'text': 'A total of 3,608 distinct genes from the 19 gene lists were associated with endometrial progression',
  'doi': '10.1186/s12958‑023‑01131‑4',
  'citation': 'Parraga‑Leo et al. (2023)'},
 {'text': 'The lists’ regulation was significantly favoured by TFs (89% (17/19) of gene lists) and progesterone (47% (8 /19) of gene lists), rather than miRNAs (5% (1/19) of gene lists) or estrogen (0% (0/19) of gene lists), respectively (FDR < 0.05). Exceptionally, two gene lists that were previously associated with implantation failure and unexplained infertility were less hormone‑dependent, but primarily regulated by estrogen. Although endometrial progression genes were mainly targeted by hormones rather than non‑hormonal contributors (odds ratio = 91.94, FDR < 0.05), we identified 311 TFs and 595 miRNAs not previously associated with ovarian hormones. We highlight CTCF, GATA6, hsa‑miR‑15a‑5p, hsa‑miR‑218‑5p, hsa‑miR‑107, hsa‑miR‑103a‑3p, and hsa‑miR‑128‑3p, as overlapping novel master regu

In [4]:
# load the document and split it into pages
loader = PyPDFLoader("/data/local/aparraga/Bioinformatician/RAG/Publications/Parraga-Leo2023.pdf")
pages = loader.load_and_split()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
storage = text_splitter.split_documents(pages)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# create the chroma client
import uuid
import chromadb
from chromadb.config import Settings

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

client = chromadb.HttpClient(host='localhost', port=7000, settings=Settings(allow_reset=True))
# client.list_collections()
# client.reset()  # resets the database
collection = client.get_or_create_collection("tfm_APL")
# collection = client.get_collection('tfm')

for i, doc in enumerate(my_json): #storage:
    print(doc['text'])
    collection.add(
        ids=[str(uuid.uuid1())], metadatas=storage[i].metadata, documents=str(doc['text'])
    )

# tell LangChain to use our client and collection name
db = Chroma(
    client=client,
    collection_name="tfm",
    embedding_function=embedding_function,
)

A total of 3,608 distinct genes from the 19 gene lists were associated with endometrial progression
The lists’ regulation was significantly favoured by TFs (89% (17/19) of gene lists) and progesterone (47% (8 /19) of gene lists), rather than miRNAs (5% (1/19) of gene lists) or estrogen (0% (0/19) of gene lists), respectively (FDR < 0.05). Exceptionally, two gene lists that were previously associated with implantation failure and unexplained infertility were less hormone‑dependent, but primarily regulated by estrogen. Although endometrial progression genes were mainly targeted by hormones rather than non‑hormonal contributors (odds ratio = 91.94, FDR < 0.05), we identified 311 TFs and 595 miRNAs not previously associated with ovarian hormones. We highlight CTCF, GATA6, hsa‑miR‑15a‑5p, hsa‑miR‑218‑5p, hsa‑miR‑107, hsa‑miR‑103a‑3p, and hsa‑miR‑128‑3p, as overlapping novel master regulators of endometrial function. The gene expression changes of selected regulators through‑out the menstrua

In [None]:
# from ollama import Client
# client = Client(host='http://localhost:11434')
# stream = client.chat(model='gemma3:12b', messages=[
# {"role": "system", "content": get_system_message_rag(fullcontent)},            
# {"role": "user", "content": get_ques_response_prompt(query)}
# ],stream=True)