## Download full-text articles

In [None]:
# Clear cache.
! rm -rf ./litcovid_pmcid_lists/*
! rm -rf ./litcovid_pmc_fulltext/xml/*

In [6]:
# Get pmid from LitCovid.
id_list_path = "./litcovid_pmid/litcovid_16052022.tsv"
csv_directory_path = "./litcovid_pmcid_lists"
download_directory_path="./litcovid_pmc_fulltext/xml"

In [7]:
import pandas as pd
import time
import os
import requests

In [8]:
pubmed_id_df = pd.read_csv(id_list_path, sep='\t')
pubmed_id_df = pubmed_id_df.astype({'pmid': str})
display(pubmed_id_df)

# Sample a subset from all articles indexed in LitCovid.
# subset_size = 2000
# pubmed_id_sub_df = pubmed_id_df.sample(n=subset_size)

# Use the whole set from LitCovid.
subset_size = pubmed_id_df.shape[0]
pubmed_id_sub_df = pubmed_id_df
display(pubmed_id_sub_df)

Unnamed: 0,pmid,title_e,journal
0,35568078,SARS-CoV-2 vaccination in the immunocompromise...,J Allergy Clin Immunol
1,35568309,SARS-CoV-2 and Influenza A virus: Dual diagnos...,Int J Surg
2,35566568,Pandemic Preparedness: The Importance of Adequ...,J Clin Med
3,35563611,Rationale for 1068 nm Photobiomodulation Thera...,Int J Mol Sci
4,35566073,Drug Repurposing for COVID-19: A Review and a ...,Molecules
...,...,...,...
250773,35373191,Involving Urban Single Low-Income African Amer...,Fam Med Prim Care Open Access
250774,34235420,Could fighting airborne transmission be the ne...,City Environ Interact
250775,33935437,A retrospective analysis of the effect of the ...,Ayu
250776,34703640,Use of public data to describe COVID-19 contac...,Western Pac Surveill Response J


Unnamed: 0,pmid,title_e,journal
0,35568078,SARS-CoV-2 vaccination in the immunocompromise...,J Allergy Clin Immunol
1,35568309,SARS-CoV-2 and Influenza A virus: Dual diagnos...,Int J Surg
2,35566568,Pandemic Preparedness: The Importance of Adequ...,J Clin Med
3,35563611,Rationale for 1068 nm Photobiomodulation Thera...,Int J Mol Sci
4,35566073,Drug Repurposing for COVID-19: A Review and a ...,Molecules
...,...,...,...
250773,35373191,Involving Urban Single Low-Income African Amer...,Fam Med Prim Care Open Access
250774,34235420,Could fighting airborne transmission be the ne...,City Environ Interact
250775,33935437,A retrospective analysis of the effect of the ...,Ayu
250776,34703640,Use of public data to describe COVID-19 contac...,Western Pac Surveill Response J


In [9]:
# Split into sub-lists (200 articles per sub-list) for ID conversion.

pmid_list = pubmed_id_sub_df['pmid'].to_list()

id_strings = []
sample_num = len(pmid_list)
nested_list_num = int(sample_num / 200) if sample_num % 200 == 0 else int(sample_num / 200) + 1

for i in range(nested_list_num):
    start = 200 * i
    end = 200 * (i + 1)
    id_string = ','.join(pmid_list[start:end])
    id_strings.append(id_string)
    
print("Generated %d nested lists for ID conversion." % len(id_strings))

Generated 1254 nested lists for ID conversion.


In [10]:
# Convert into PMCID and save as CSV files.
base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

for i, id_string in enumerate(id_strings):
    url = base_url + "?ids={}&format=csv".format(id_string)
    response = requests.get(url)
    csv_filename = "pmcid_{}_{}.csv".format(subset_size, i+1)
    csv_path = os.path.join(csv_directory_path, csv_filename)
    with open(csv_path, 'w') as csvfile:
        csvfile.write(response.text)
    time.sleep(0.5)
    if i != 0 and i % 100 == 0:
        print("Saved PMCID sublist %d as %s." % (i, csv_filename))
print("All PMCID converted and saved to specified directory.")

Saved PMCID sublist 100 as pmcid_250778_101.csv.
Saved PMCID sublist 200 as pmcid_250778_201.csv.
Saved PMCID sublist 300 as pmcid_250778_301.csv.
Saved PMCID sublist 400 as pmcid_250778_401.csv.
Saved PMCID sublist 500 as pmcid_250778_501.csv.
Saved PMCID sublist 600 as pmcid_250778_601.csv.
Saved PMCID sublist 700 as pmcid_250778_701.csv.
Saved PMCID sublist 800 as pmcid_250778_801.csv.
Saved PMCID sublist 900 as pmcid_250778_901.csv.
Saved PMCID sublist 1000 as pmcid_250778_1001.csv.
Saved PMCID sublist 1100 as pmcid_250778_1101.csv.
Saved PMCID sublist 1200 as pmcid_250778_1201.csv.
All PMCID converted and saved to specified directory.


## Get all PMC articles.

In [11]:
# Combined all.

pmcids = []
for csvfile in os.listdir(csv_directory_path):
    if csvfile.endswith(".csv"):
        csv_path = os.path.join(csv_directory_path, csvfile)
        pmcid_df = pd.read_csv(csv_path)
        pmcid_sublist = pmcid_df['PMCID'].dropna().str.replace("PMC", "").to_list()
        pmcids.extend(pmcid_sublist)

pmcids = list(set(pmcids))
print("Find %d articles indexed in PubMed Central." % len(pmcids))

Find 199521 articles indexed in PubMed Central.


In [None]:
# Download PMC full-text.
downlad_base_url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
download_count = 0

for i, pmcid in enumerate(pmcids):
    url = downlad_base_url + "?db=pmc&id={}&retmode=xml".format(pmcid)
    response = requests.get(url)
    xml_filename = "PMC{}.xml".format(pmcid)
    xml_path = os.path.join(download_directory_path, xml_filename)
    if os.path.exists(xml_path):
        continue
    else:
        with open(xml_path, 'w') as xmlfile:
            xmlfile.write(response.text)
        time.sleep(0.5)
        download_count += 1
    if download_count != 0 and download_count % 1000 == 0:
        print("Downloaded %d full-text articles from PubMed Central." % download_count)
        
print("All PMC articles are saved as XML files in the specified directory.")

In [9]:
# PMC full-text configuration.
max_download_num = 2000
reset_point = 26674

In [10]:
# Download PMC full-text.
downlad_base_url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

for i, pmcid in enumerate(pmcids, reset_point):
    url = downlad_base_url + "?db=pmc&id={}&retmode=xml".format(pmcid)
    response = requests.get(url)
    xml_filename = "PMC{}.xml".format(pmcid)
    xml_path = os.path.join(download_directory_path, xml_filename)
    with open(xml_path, 'w') as xmlfile:
        xmlfile.write(response.text)
    time.sleep(0.5)
    if i != 0 and i % 1000 == 0:
        print("Downloaded %d full-text articles from PubMed Central." % i)
    if i - reset_point + 1 == max_download_num:
        print("Reached the maximum limit %d for downloads. %d articles downloaded." % (max_download_num, i))
        break
        
print("All PMC articles are saved as XML files in the specified directory.")
    

Downloaded 25000 full-text articles from PubMed Central.
Downloaded 26000 full-text articles from PubMed Central.
Reached the maximum limit 2000 for downloads. 26673 articles downloaded.
All PMC articles are saved as XML files in the specified directory.


In [None]:
# Parse out citation context.
print(len(os.listdir(download_directory_path)))