# Retrieving scientific articles for database

## Libraries

In [8]:
from langchain_community.document_loaders import PyPDFLoader, pdf
import os
import re
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
import json
from langchain_google_genai import GoogleGenerativeAI
import pickle as pkl
import requests
import subprocess
import pandas as pd
from bs4 import BeautifulSoup
import tqdm
import time
import helper_functions as hf

## Config

In [3]:
os.getcwd()

'/data/local/aparraga/Bioinformatician/RAG'

In [4]:
with open('api_google.txt') as f:
    
    api_key = json.load(f)

## Functions

## Timing

In [6]:
time_management = {}

## LLM Model

In [None]:
llm = GoogleGenerativeAI(model="gemini-2.0-flash",api_key=api_key['key'],temperature=0.2)

## Loading PDFs (articles from our research group)

In [6]:
paths = []
paths_folder_create = []

for root, dirs, files in os.walk("Data"):
    for f in files:
        if f.endswith('.pdf'):
            paths.append(os.path.join(root,f))
            paths_folder_create.append(os.path.join(os.getcwd(),"ToDataBase",f))

We create a folder for each article to pre-process the references

In [7]:
os.makedirs(os.path.join(os.getcwd(), "ToDataBase"), exist_ok=True)
[os.makedirs(re.sub(r'.pdf', '', p), exist_ok=True) for p in paths_folder_create]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [None]:
# We read the articles from our scientific group with PyPDFLoader in single mode (all content)

articles = []

for file in paths:
    loader = loader = PyPDFLoader(file, mode="single")
    doc = loader.load()[0] # As this function provides a list, we select the first element.
    doc.page_content = hf.clean_text(doc.page_content)
    
    articles.append(doc)

### Json generation

In [10]:
class paper(BaseModel):

    PaperTitle: str = Field(description="The full title of the research paper")
    Publication: str = Field(description="Year: The year the paper was published")
    Authors: str = Field(description="The full names of all authors of the paper")
    Email: str = Field(description="The email address of the author (if provided)")
    Abstract: str = Field(description="The full text of the paper's abstract.")
    Introduction: str = Field(description="The full text fo the paper's introduction. Bear in mind that it can have other names such as background.")
    Methods: str = Field(description="The full text fo the paper's methods. Don't take the information from abstract.")
    Results: str = Field(description="The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.")
    Discussion: str = Field(description="The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It should be entitled 'Discussion'")
    Conclusion: str = Field(description="The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.")
    DOI: str = Field(description="Provide the Digital Object Identifier (DOI) for the paper. If a DOI is not available, provide a URL for the paper. This field is compulsory. Example of a DOI format: 10.1007/s10814-017-9105-3, 10.1016/j.cell.2023.08.001, 10.1038/nature12345.")
    Journal: str = Field(description="provide the name of the journal, e.g Nature.")

In [11]:
parser = JsonOutputParser(pydantic_object=paper)

prompt = PromptTemplate(
    template="""
You are an expert in analyzing scientific research papers. Please carefully read the provided research paper above and extract the following key information:
Extract these nine (10) properties from the research paper:

Paper Title: The full title of the research paper

Publication Year: The year the paper was published

Authors: The full names of all authors of the paper

Email: The email address of the author (if provided)

Abstract: The full text of the paper's abstract

Introduction: The full text fo the paper's introduction. Bear in mind that it can have other names such as background.

Methods: The full text fo the paper's methods. Don't take the information from abstract.

Results: The full text fo the paper's results. Don't take the information from abstract. Please take all possible text of results, this section could be divided into different sections.

Discussion:The full text fo the paper's discussion if provided. Otherwise leave the filed blank. It shoudl be entitled "Discussion"

Conclusion: The full text fo the paper's conclusion if provided. Otherwise leave the filed blank. Please don't take this information form abstract.

DOI: Provide the Digital Object Identifier (DOI) for the paper. If a DOI is not available, provide a URL for the paper. This field is compulsory. Example of a DOI format: 10.1007/s10814-017-9105-3, 10.1016/j.cell.2023.08.001, 10.1038/nature12345.

Journal: provide the name of the journal, e.g Nature.


Guidelines:


The extracted information should be factual and accurate to the document. Be extremely concise, except for the Abstract, Introduction, Methods, Results. Discussion and Conclusion which should be copied in full.
The extracted entities should be self-contained and easily understood without the rest of the paper. If a property is missing from the paper, please leave the field empty rather than guessing.
Answer in JSON format. The JSON should contain 9 keys: "PaperTitle", "PublicationYear", "Authors", "Email", "Abstract", "Introduction", "Methods","Results", "DOI", "Reference". You MUST add the DOI since is key to identify the article.

Format instructions: \n{format_instructions}\n
    
The article is this:\n{query}\n""",

    input_variables=["query"],

    partial_variables={"format_instructions": parser.get_format_instructions()}
)

To ensure all DOI are retrieved with perform a doble check.

In [67]:
chain = prompt | llm | parser

doi_prompt = PromptTemplate(
    template=(
        """Extract the Digital Object Identifier (DOI) from the following paper.
        "The DOI must be returned in its raw form (e.g., 10.1007/s10814-017-9105-3,
        "10.1016/j.cell.2023.08.001, 10.1038/nature12345).
        "Return only the DOI string with no additional text.
        "If no DOI is found, return None.\n\n
        "Paper:\n{query}"""
    ),
    input_variables=["query"]
)

chain_doi = doi_prompt | llm

In [None]:
my_json = []

for doc in tqdm.tqdm(articles, total=len(articles)):
    info_article = chain.invoke({"query": doc.page_content})

    if 'DOI' not in info_article.keys():
        previous_doi= chain_doi.invoke({"query":doc.page_content})
        if 'https://doi.org/' not in previous_doi:
            info_article['DOI'] = 'https://doi.org/' + previous_doi
        else:
            info_article['DOI'] = previous_doi
    
    my_json.append(info_article)

with open('info_articles_main.pkl','wb') as f:
    pkl.dump(my_json,f)

100%|██████████| 18/18 [11:53<00:00, 39.62s/it]


## Getting references of each article

In [None]:
# Already DOI (To avoid repeatin previously included articles)

includedDOI = [ re.sub('(https://doi\.org/|http://dx\.doi\.org/)', '',a['DOI']) for a in my_json]

In [None]:
my_json_ref = []
pmid_dict = {}
not_available = []

HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }

for i, article in enumerate(my_json):
    
    path_name = re.sub(r'.pdf','',paths[i]).split('/')[1]
    pmid_dict[path_name] = []

    print(f"We process the article: {path_name}")

    doi = hf.clean_doi(article['DOI'])
    
    cross_ref = requests.get('https://api.crossref.org/works/'+doi)
    list_reference = cross_ref.json()['message'].get('reference',None)
    article['List_references'] = list(set([x.get('DOI') for x in list_reference if x.get('DOI') and x.get('DOI') not in includedDOI]))

    # We retrieve the pmid from each DOI

    for ref_doi in article['List_references']:
            
        pmid_dict[path_name].append(hf.get_pmid_from_doi(ref_doi))

    # We try to get the pdf

    print("Getting PDFs-----------------------------\n")

    output_dir = os.path.join("ToDataBase", path_name)

    if len(pmid_dict[path_name]) > 0:

        command = [
        "python3", "-m", "pubmed2pdf", "pdf",
        "--out", output_dir,
        "--pmids", f"{', '.join(pmid_dict[path_name])}",
        "--maxtries","1",
        "--errors", os.path.join(output_dir,"pubmed2pdf_log.txt")
        ]
        subprocess.run(command)

        # We process pdfs

        pdfs2retrieve = [os.path.join(output_dir,f) for f in os.listdir(output_dir) if f.endswith('.pdf')]

        print("Turn PDFs into JSON--------------------\n")

        for pdf in tqdm.tqdm(pdfs2retrieve, total=len(pdfs2retrieve)):

            loader = loader = PyPDFLoader(pdf, mode="single")
            doc = loader.load()[0] # As this function provides a list, we select the first element.
            doc.page_content = hf.clean_text(doc.page_content)

            try:
                info_article = chain.invoke({"query": doc.page_content})
                doi_candidate = info_article.get("DOI",None)

                if not doi_candidate:

                    retry_doi = chain_doi.invoke({"query":doc.page_content})
                    info_article['DOI'] = 'https://doi.org/' + retry_doi if 'https://doi.org/' not in retry_doi else retry_doi

                else:
                    info_article['DOI'] = 'https://doi.org/' + info_article['DOI'] if 'https://doi.org/' not in info_article['DOI'] else info_article['DOI']

                my_json_ref.append(info_article)
            except Exception:
                not_available.append(re.sub(r".pdf","",os.path.basename(pdf)))

        # We try to get the html otherwise

        print("Getting xml from PMC--------------------\n")

        log_file = os.path.join(output_dir,"pubmed2pdf_log.txt")
        if os.path.exists(log_file):
            to_retrieve = pd.read_csv(log_file, header=None,dtype='str')[0].tolist() # We get those files that could not be retrieved
        else:
            to_retrieve = []

        to_retrieve.extend([re.sub(r'.html','',f)  for f in os.listdir(output_dir) if f.endswith('.html')]) # We add those in html format since they do not have information


        # Clean information
        for f in os.listdir(output_dir):
            if f.endswith('html'):
                os.remove(os.path.join(output_dir,f)) # Then, we remove from directory
        
        print("Turn XML into JSON--------------------\n")
        
        for pmid in tqdm.tqdm(to_retrieve, total=len(to_retrieve)):

            try:
                pmcid = hf.get_pmcid_from_pmid(pmid)
                response = requests.get(url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/", headers=HEADERS)
                soup = BeautifulSoup(response.text, "html.parser")
                text_paper = soup.find("article").get_text(separator="\n", strip=True)
                info_article = chain.invoke({"query": text_paper})

                doi_candidate = info_article.get("DOI",None)
                if not doi_candidate:
                    retry_doi= chain_doi.invoke({"query": text_paper})
                    info_article['DOI'] = 'https://doi.org/' + retry_doi if 'https://doi.org/' not in retry_doi else retry_doi
                        
                else:
                    info_article['DOI'] = 'https://doi.org/' + info_article['DOI'] if 'https://doi.org/' not in info_article['DOI'] else info_article['DOI']

                my_json_ref.append(info_article)
                
            except Exception:
                not_available.append(pmid)

            time.sleep(1)
    

    includedDOI.extend(article['List_references'])

    # Save the results

    with open("info_articles_ref_final.pkl","wb") as f:
        pkl.dump(my_json_ref, f)

We process the article: Devesa-Peiro2020
Getting PDFs-----------------------------

We process the article: Sebastian-Leon2025
Getting PDFs-----------------------------

We process the article: Henarejos-Castillo2020
Getting PDFs-----------------------------

We process the article: Parraga-Leo2023
Getting PDFs-----------------------------

We process the article: Garcia-Acero2025
Getting PDFs-----------------------------

We process the article: Sebastian-Leon2021
Getting PDFs-----------------------------

We process the article: Henarejos-Castillo2021
Getting PDFs-----------------------------



2025-08-17 21:24:52,774 - INFO - pubmed2pdf.cli - Trying to fetch pmid 21989058
2025-08-17 21:24:53,518 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27901055
2025-08-17 21:24:53,518 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24020646
2025-08-17 21:24:53,518 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25227694
2025-08-17 21:24:55,788 - INFO - pubmed2pdf.cli - Trying to fetch pmid 11125122
2025-08-17 21:24:56,503 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 21:24:57,337 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26243799
2025-08-17 21:24:58,065 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30276597
2025-08-17 21:25:00,376 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32461654
2025-08-17 21:25:00,376 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26620551
2025-08-17 21:25:01,311 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 21:25:02,131 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16722528
2025-08-17 21:25:02,131 - INFO - pubmed2pdf.cli 

Done downloading. All downloaded can be found in ToDataBase/Henarejos-Castillo2021
Turn PDFs into JSON--------------------



100%|██████████| 20/20 [09:05<00:00, 27.29s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 62/62 [20:21<00:00, 19.71s/it]


We process the article: Henarejos-Castillo2022
Getting PDFs-----------------------------



2025-08-17 21:56:26,230 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18586725
2025-08-17 21:56:26,911 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19628854
2025-08-17 21:56:27,900 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30817321
2025-08-17 21:56:28,605 - INFO - pubmed2pdf.cli - Trying to fetch pmid 17960529
2025-08-17 21:56:32,202 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31885807
2025-08-17 21:56:33,032 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32898168
2025-08-17 21:56:36,176 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32918567
2025-08-17 21:56:38,758 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22287627
2025-08-17 21:56:39,504 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32552854
2025-08-17 21:56:40,864 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30612956
2025-08-17 21:56:43,235 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29259471
2025-08-17 21:56:44,503 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33568179
2025-08-17 21:56:47,152 - INFO - pubmed2

Done downloading. All downloaded can be found in ToDataBase/Henarejos-Castillo2022
Turn PDFs into JSON--------------------



100%|██████████| 26/26 [12:39<00:00, 29.21s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 53/53 [18:13<00:00, 20.63s/it]


We process the article: Diaz-Gimeno2024
Getting PDFs-----------------------------



2025-08-17 22:29:39,976 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33880419
2025-08-17 22:29:41,121 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36822566
2025-08-17 22:29:42,415 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18546601
2025-08-17 22:29:44,658 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36472596
2025-08-17 22:29:45,352 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28370781
2025-08-17 22:29:46,334 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35929523
2025-08-17 22:29:47,132 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33036008
2025-08-17 22:29:48,727 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26109056
2025-08-17 22:29:50,228 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32723696
2025-08-17 22:29:51,467 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24581625
2025-08-17 22:29:52,774 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29737471
2025-08-17 22:29:55,961 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 22:29:56,712 - INFO - pubmed2pdf.

Done downloading. All downloaded can be found in ToDataBase/Diaz-Gimeno2024
Turn PDFs into JSON--------------------



100%|██████████| 3/3 [02:09<00:00, 43.30s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 32/32 [08:47<00:00, 16.49s/it]


We process the article: Devesa-Peiro2021
Getting PDFs-----------------------------



2025-08-17 22:41:40,580 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16707507
2025-08-17 22:41:42,320 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27323161
2025-08-17 22:41:43,472 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 22:41:44,302 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22819144
2025-08-17 22:41:45,575 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26856931
2025-08-17 22:41:46,857 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29576469
2025-08-17 22:41:48,184 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26366788
2025-08-17 22:41:49,534 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30588329
2025-08-17 22:41:52,025 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27998009
2025-08-17 22:41:52,740 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19910308
2025-08-17 22:41:53,436 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30477193
2025-08-17 22:41:54,306 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25243856
2025-08-17 22:41:55,031 - INFO - pubmed2pdf.

Done downloading. All downloaded can be found in ToDataBase/Devesa-Peiro2021
Turn PDFs into JSON--------------------



100%|██████████| 4/4 [03:03<00:00, 45.83s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 26/26 [05:43<00:00, 13.20s/it]


We process the article: Henarejos-Castillo2024
Getting PDFs-----------------------------



2025-08-17 22:51:34,072 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33025164
2025-08-17 22:51:36,722 - INFO - pubmed2pdf.cli - Trying to fetch pmid 36055201
2025-08-17 22:51:37,967 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32943985
2025-08-17 22:51:39,633 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24995866
2025-08-17 22:51:40,818 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19810025
2025-08-17 22:51:41,656 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28574608
2025-08-17 22:51:42,454 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29566152
2025-08-17 22:51:44,199 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29425284
2025-08-17 22:51:44,905 - INFO - pubmed2pdf.cli - Trying to fetch pmid 34597585
2025-08-17 22:51:56,971 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30207912
2025-08-17 22:51:57,652 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25326635
2025-08-17 22:51:58,385 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15769979
2025-08-17 22:51:59,057 - INFO - pubmed2

Done downloading. All downloaded can be found in ToDataBase/Henarejos-Castillo2024
Turn PDFs into JSON--------------------



100%|██████████| 13/13 [08:26<00:00, 38.93s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 45/45 [18:56<00:00, 25.26s/it]


We process the article: Sanchez-Reyes2025
Getting PDFs-----------------------------



2025-08-17 23:20:48,511 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35085989
2025-08-17 23:20:49,870 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28627518
2025-08-17 23:20:52,818 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33489905
2025-08-17 23:20:55,638 - INFO - pubmed2pdf.cli - Trying to fetch pmid 38906211
2025-08-17 23:20:57,151 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35409214
2025-08-17 23:20:57,830 - INFO - pubmed2pdf.cli - Trying to fetch pmid 2903323
2025-08-17 23:21:00,118 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32992208
2025-08-17 23:21:01,469 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24227677
2025-08-17 23:21:02,182 - INFO - pubmed2pdf.cli - Trying to fetch pmid 37355665
2025-08-17 23:21:03,827 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23104886
2025-08-17 23:21:04,500 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31467207
2025-08-17 23:21:05,484 - INFO - pubmed2pdf.cli - Trying to fetch pmid 12215322
2025-08-17 23:21:06,756 - INFO - pubmed2p

Done downloading. All downloaded can be found in ToDataBase/Sanchez-Reyes2025
Turn PDFs into JSON--------------------



100%|██████████| 3/3 [01:41<00:00, 33.81s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 11/11 [01:39<00:00,  9.03s/it]


We process the article: Marti-Garcia2024
Getting PDFs-----------------------------



2025-08-17 23:24:47,325 - INFO - pubmed2pdf.cli - Trying to fetch pmid 29250769
2025-08-17 23:24:48,023 - INFO - pubmed2pdf.cli - Trying to fetch pmid 35351010
2025-08-17 23:24:49,415 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1521649
2025-08-17 23:24:50,722 - INFO - pubmed2pdf.cli - Trying to fetch pmid 1838082
2025-08-17 23:24:52,045 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15368600
2025-08-17 23:24:53,835 - INFO - pubmed2pdf.cli - Trying to fetch pmid 21435901
2025-08-17 23:24:55,210 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31085094
2025-08-17 23:24:56,452 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19948745
2025-08-17 23:24:57,169 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25298042
2025-08-17 23:24:59,493 - INFO - pubmed2pdf.cli - Trying to fetch pmid 30283331
2025-08-17 23:25:03,714 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 23:25:04,487 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28854727
2025-08-17 23:25:05,174 - INFO - pubmed2pdf.cl

Done downloading. All downloaded can be found in ToDataBase/Marti-Garcia2024
Turn PDFs into JSON--------------------



100%|██████████| 4/4 [02:04<00:00, 31.10s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 29/29 [08:27<00:00, 17.50s/it]


We process the article: Marti-Garcia2024_review
Getting PDFs-----------------------------



2025-08-17 23:37:02,776 - INFO - pubmed2pdf.cli - Trying to fetch pmid 33258951
2025-08-17 23:37:03,484 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23754297
2025-08-17 23:37:04,182 - INFO - pubmed2pdf.cli - Trying to fetch pmid 9241295
2025-08-17 23:37:05,045 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-17 23:37:05,801 - INFO - pubmed2pdf.cli - Trying to fetch pmid 34061977
2025-08-17 23:37:06,856 - INFO - pubmed2pdf.cli - Trying to fetch pmid 6053626
2025-08-17 23:37:07,965 - INFO - pubmed2pdf.cli - Trying to fetch pmid 4347566
2025-08-17 23:37:08,694 - INFO - pubmed2pdf.cli - Trying to fetch pmid 25785919
2025-08-17 23:37:09,388 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18454478
2025-08-17 23:37:11,506 - INFO - pubmed2pdf.cli - Trying to fetch pmid 8801134
2025-08-17 23:37:12,812 - INFO - pubmed2pdf.cli - Trying to fetch pmid 12641627
2025-08-17 23:37:13,506 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27178763
2025-08-17 23:37:14,701 - INFO - pubmed2pdf.cli 

Done downloading. All downloaded can be found in ToDataBase/Marti-Garcia2024_review
Turn PDFs into JSON--------------------



100%|██████████| 24/24 [14:26<00:00, 36.11s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 100/100 [13:14<00:00,  7.95s/it]


We process the article: Diaz-Gimeno2022
Getting PDFs-----------------------------



2025-08-18 00:07:39,716 - INFO - pubmed2pdf.cli - Trying to fetch pmid 19246470
2025-08-18 00:07:40,422 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22285995
2025-08-18 00:07:41,098 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28443690
2025-08-18 00:07:41,776 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:07:42,585 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18077318
2025-08-18 00:07:43,269 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:07:44,051 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27386492

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(req.content, 'lxml')
2025-08-18 00:07:48,478 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:

Done downloading. All downloaded can be found in ToDataBase/Diaz-Gimeno2022
Turn PDFs into JSON--------------------



100%|██████████| 1/1 [00:28<00:00, 28.22s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 13/13 [03:19<00:00, 15.33s/it]


We process the article: Devesa-Peiro2022
Getting PDFs-----------------------------



2025-08-18 00:11:59,858 - INFO - pubmed2pdf.cli - Trying to fetch pmid 31806903
2025-08-18 00:12:02,097 - INFO - pubmed2pdf.cli - Trying to fetch pmid 21245076
2025-08-18 00:12:02,778 - INFO - pubmed2pdf.cli - Trying to fetch pmid 17604715
2025-08-18 00:12:04,163 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23725226
2025-08-18 00:12:05,627 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15797956
2025-08-18 00:12:06,282 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:12:07,041 - INFO - pubmed2pdf.cli - Trying to fetch pmid 8903775
2025-08-18 00:12:08,291 - INFO - pubmed2pdf.cli - Trying to fetch pmid 32173784
2025-08-18 00:12:10,755 - INFO - pubmed2pdf.cli - Trying to fetch pmid 12969699
2025-08-18 00:12:11,984 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24894503
2025-08-18 00:12:13,657 - INFO - pubmed2pdf.cli - Trying to fetch pmid 14559028
2025-08-18 00:12:14,900 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27271600
2025-08-18 00:12:15,582 - INFO - pubmed2pdf.c

Done downloading. All downloaded can be found in ToDataBase/Devesa-Peiro2022
Turn PDFs into JSON--------------------



100%|██████████| 5/5 [02:56<00:00, 35.37s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 19/19 [05:17<00:00, 16.71s/it]


We process the article: Sebastian-Leon2018
Getting PDFs-----------------------------



2025-08-18 00:21:11,348 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:21:12,143 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24737781
2025-08-18 00:21:12,869 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24082038
2025-08-18 00:21:13,581 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15539444
2025-08-18 00:21:14,327 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18035563
2025-08-18 00:21:15,620 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23756099
2025-08-18 00:21:17,027 - INFO - pubmed2pdf.cli - Trying to fetch pmid 15924536
2025-08-18 00:21:18,020 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22080510
2025-08-18 00:21:18,696 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:21:19,460 - INFO - pubmed2pdf.cli - Trying to fetch pmid 22683339
2025-08-18 00:21:20,767 - INFO - pubmed2pdf.cli - Trying to fetch pmid 23664094
2025-08-18 00:21:22,064 - INFO - pubmed2pdf.cli - Trying to fetch pmid 28710396
2025-08-18 00:21:24,632 - INFO - pubmed2pdf.cli 

Done downloading. All downloaded can be found in ToDataBase/Sebastian-Leon2018
Turn PDFs into JSON--------------------



100%|██████████| 2/2 [01:11<00:00, 35.53s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 19/19 [02:24<00:00,  7.61s/it]


We process the article: Diaz-Gimeno2017
Getting PDFs-----------------------------



2025-08-18 00:25:18,509 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27128483
2025-08-18 00:25:21,327 - INFO - pubmed2pdf.cli - Trying to fetch pmid 11120680
2025-08-18 00:25:22,005 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:25:22,764 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:25:24,516 - INFO - pubmed2pdf.cli - Trying to fetch pmid 16005454
2025-08-18 00:25:25,886 - INFO - pubmed2pdf.cli - Trying to fetch pmid 18252602
2025-08-18 00:25:27,146 - INFO - pubmed2pdf.cli - Trying to fetch pmid None
2025-08-18 00:25:27,907 - INFO - pubmed2pdf.cli - Trying to fetch pmid 26385059
2025-08-18 00:25:29,064 - INFO - pubmed2pdf.cli - Trying to fetch pmid 20689021
2025-08-18 00:25:29,812 - INFO - pubmed2pdf.cli - Trying to fetch pmid 24706003
2025-08-18 00:25:30,940 - INFO - pubmed2pdf.cli - Trying to fetch pmid 27122490


Done downloading. All downloaded can be found in ToDataBase/Diaz-Gimeno2017
Turn PDFs into JSON--------------------



100%|██████████| 1/1 [00:41<00:00, 41.76s/it]


Getting xml from PMC--------------------

Turn XML into JSON--------------------



100%|██████████| 10/10 [01:43<00:00, 10.39s/it]
