# Imports and command line arguments

In [None]:
import argparse
import sys
import os
import requests
import time
import csv
from bs4 import BeautifulSoup
import re
import urllib.parse

In [None]:
# Command-line options.  The help strings describe what the rest of this
# notebook actually does with each value: the input file is parsed as
# comma-separated "doi,pmid,pmcid" lines, failures are written as tab-separated
# identifiers, and maxRetries bounds the attempts made with each finder.
parser = argparse.ArgumentParser()
parser._optionals.title = "Flag Arguments"
parser.add_argument(
    '-pmf',
    help="Input file listing the papers to fetch, one per line, as comma-separated "
         "'doi,pmid,pmcid' (write 'none' for an identifier that is missing).  "
         "Default: open_access_pmids.txt",
    default='open_access_pmids.txt',
)
parser.add_argument(
    '-out',
    help="Output directory for fetched articles.  Default: fetched_pdfs",
    default="fetched_pdfs",
)
parser.add_argument(
    '-errors',
    help="Output file path for papers which failed to fetch (tab-separated doi, pmid, pmcid).  "
         "Default: unfetched_pmids.tsv",
    default="unfetched_pmids.tsv",
)
parser.add_argument(
    '-maxRetries',
    help="Max number of attempts per finder for each article.  Default: 3",
    default=3,
    type=int,
)
# An explicit empty argv is parsed, so the defaults above are always used and
# the process's real command line is ignored.
args = vars(parser.parse_args([]))

In [None]:
# Show the parsed options (a plain dict) before any work is done.
print(args)

In [None]:
# Make sure the output directory exists.  makedirs also creates missing parent
# directories (os.mkdir would fail on a nested path), and the message is only
# printed once the directory has really been created.
if not os.path.exists(args['out']):
    os.makedirs(args['out'], exist_ok=True)
    print("Output directory of {0} did not exist.  Created the directory.".format(args['out']))

### Debug space.  Clear before commit

# Functions

In [None]:
def getMainUrl(url):
    """Return the first three "/"-separated pieces of *url*, rejoined with "/".

    For an absolute URL such as ``https://host/some/path`` this yields the
    ``https://host`` prefix.  Strings with fewer than three pieces are
    returned unchanged.
    """
    leading_pieces = url.split("/", 3)[:3]
    return "/".join(leading_pieces)

In [None]:
def download_pdf(pdf_url, out_dir, filename):
    """Stream *pdf_url* into ``out_dir/filename``.

    The body is first written to ``<filename>.part`` and only renamed to the
    final name once the whole transfer succeeded, so an interrupted download
    never leaves a truncated file under the real name (and never clobbers a
    previously downloaded copy).  The HTTP response is used as a context
    manager so the streamed connection is always released.

    Args:
        pdf_url: URL to download.
        out_dir: Destination directory; created if missing.
        filename: File name to save under, inside *out_dir*.

    Returns:
        True when the file was written completely, False on any error (the
        error is printed, not raised).  The content type of the response is
        not checked.
    """
    target = os.path.join(out_dir, filename)
    partial = target + ".part"
    try:
        with requests.get(pdf_url, stream=True, timeout=20) as r:
            r.raise_for_status()
            os.makedirs(out_dir, exist_ok=True)
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic on the same filesystem: the final name appears only when complete.
        os.replace(partial, target)
        print(f"Downloaded: {filename}")
        return True
    except Exception as e:
        # Best-effort cleanup of whatever was written before the failure.
        try:
            os.remove(partial)
        except OSError:
            pass
        print(f"Failed to download {filename}: {e}")
        return False

In [None]:
def get_landing_url(paper):
    """Build the landing-page URL for *paper* from its best available identifier.

    Identifiers are tried in the order DOI, PMID, PMCID; the first one that is
    truthy and not the literal string "none" wins.

    Args:
        paper: Mapping with the keys "doi", "pmid" and "pmcid".

    Returns:
        The landing-page URL, or None when no usable identifier is present.
    """
    url_templates = (
        ("doi", "https://doi.org/{}"),
        ("pmid", "https://pubmed.ncbi.nlm.nih.gov/{}/"),
        ("pmcid", "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/"),
    )
    for key, template in url_templates:
        identifier = paper[key]
        if identifier and identifier != "none":
            return template.format(identifier)
    return None

In [None]:
def html_finder_wrapper(paper, html_finder_func, headers=None):
    """Fetch a paper's landing page and hand it to an HTML-scraping finder.

    Args:
        paper: Mapping understood by ``get_landing_url``.
        html_finder_func: Callable ``(response, soup, headers)`` returning a
            PDF URL or None.
        headers: Optional HTTP headers for the landing-page request; also
            forwarded to *html_finder_func*.

    Returns:
        Whatever *html_finder_func* returns, or None when the paper has no
        landing URL or anything raises (the error is printed, not raised).
    """
    landing_page = get_landing_url(paper)
    if not landing_page:
        return None

    try:
        response = requests.get(landing_page, headers=headers, timeout=15)
        response.raise_for_status()
        parsed_page = BeautifulSoup(response.content, "html.parser")
        pdf_url = html_finder_func(response, parsed_page, headers)
    except Exception as err:
        print(f"{html_finder_func.__name__} error: {err}")
        pdf_url = None
    return pdf_url

In [None]:
def finder_europepmc(paper):
    """Ask the Europe PMC REST search for a PDF full-text link for the paper's PMID.

    Args:
        paper: Mapping with a "pmid" key.

    Returns:
        The URL of the first full-text link whose ``documentStyle`` is "pdf",
        or None when there is no PMID, no such link, or the lookup fails (the
        error is printed, not raised).
    """
    pmid = paper["pmid"]
    if not pmid:
        return None

    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:{pmid}&resultType=core&format=json"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        hits = response.json().get("resultList", {}).get("result", [])
        for hit in hits:
            if "fullTextUrlList" not in hit:
                continue
            for link in hit["fullTextUrlList"]["fullTextUrl"]:
                if link.get("documentStyle") == "pdf":
                    return link["url"]
    except Exception as err:
        print(f"EuropePMC error: {err}")
    return None

In [None]:
def finder_unpaywall(paper, email=None):
    """Look up the best open-access PDF location for the paper's DOI on Unpaywall.

    The contact e-mail sent with the request used to be a hard-coded
    placeholder.  It can now be supplied via the *email* argument or the
    ``UNPAYWALL_EMAIL`` environment variable; the old placeholder is kept as
    the last-resort fallback so existing callers behave as before.  The DOI is
    percent-encoded (keeping "/") so characters such as "#", "?" or "<" in a
    DOI cannot corrupt the request URL.

    Args:
        paper: Mapping with a "doi" key.
        email: Optional contact e-mail for the Unpaywall API.

    Returns:
        ``best_oa_location.url_for_pdf`` from the response, or None when
        there is no DOI, no such field, or the lookup fails (the error is
        printed, not raised).
    """
    doi = paper["doi"]
    if not doi:
        return None
    contact = email or os.environ.get("UNPAYWALL_EMAIL") or "YOUR_EMAIL@example.com"
    url = "https://api.unpaywall.org/v2/{}?email={}".format(
        urllib.parse.quote(doi, safe="/"),
        urllib.parse.quote(contact, safe="@"),
    )
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        data = r.json()
        loc = data.get("best_oa_location")
        if loc and loc.get("url_for_pdf"):
            return loc["url_for_pdf"]
    except Exception as e:
        print(f"Unpaywall error: {e}")
    return None

In [None]:
def finder_core(paper, api_key=None):
    """Query the CORE v2 article endpoint for a download URL for the paper's DOI.

    The API key used to be a hard-coded placeholder.  It can now be supplied
    via the *api_key* argument or the ``CORE_API_KEY`` environment variable;
    the old placeholder remains the last-resort fallback so existing callers
    behave as before.  DOI and key are percent-encoded before being placed in
    the URL.

    Args:
        paper: Mapping with a "doi" key.
        api_key: Optional CORE API key.

    Returns:
        ``data.downloadUrl`` from the response, or None when there is no DOI,
        no such field, or the lookup fails (the error is printed, not raised).
    """
    doi = paper["doi"]
    if not doi:
        return None
    key = api_key or os.environ.get("CORE_API_KEY") or "YOUR_CORE_API_KEY"
    url = (
        "https://core.ac.uk:443/api-v2/articles/get/{}"
        "?metadata=true&fulltext=true&citations=false&similar=false"
        "&duplicate=false&urls=true&apiKey={}"
    ).format(urllib.parse.quote(doi, safe="/"), urllib.parse.quote(key, safe=""))
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        data = r.json()
        if data.get("data") and data["data"].get("downloadUrl"):
            return data["data"]["downloadUrl"]
    except Exception as e:
        print(f"CORE error: {e}")
    return None

In [None]:
def _first_pdf_link(node):
    """Return the first http(s) string ending in ".pdf" found anywhere in *node*.

    *node* is decoded JSON (nested dicts/lists/scalars).  The walk is
    depth-first in document order and makes no assumption about key names.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, str):
            lowered = current.lower()
            path_part = lowered.split("?", 1)[0]
            if lowered.startswith(("http://", "https://")) and path_part.endswith(".pdf"):
                return current
        elif isinstance(current, dict):
            pending.extend(reversed(list(current.values())))
        elif isinstance(current, list):
            pending.extend(reversed(current))
    return None


def finder_openaire(paper):
    """Search OpenAIRE for the paper's DOI and return a PDF link from the response.

    Previously this function performed the request but discarded the response
    and always returned None.  It now scans the decoded JSON for the first
    http(s) URL whose path ends in ".pdf".
    NOTE(review): the scan is deliberately schema-agnostic; confirm against
    the OpenAIRE response format whether a more specific field should be used.

    Args:
        paper: Mapping with a "doi" key.

    Returns:
        A PDF URL, or None when there is no DOI, no PDF-looking link in the
        response, or the lookup fails (the error is printed, not raised).
    """
    doi = paper["doi"]
    if not doi:
        return None
    url = "https://api.openaire.eu/search/publications?doi={}&format=json".format(
        urllib.parse.quote(doi, safe="/")
    )
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        return _first_pdf_link(r.json())
    except Exception as e:
        print(f"OpenAIRE error: {e}")
    return None

In [None]:
def finder_arxiv(paper):
    """Derive the arXiv PDF URL from an arXiv-issued DOI.

    arXiv DOIs have the form ``10.48550/arXiv.<id>``.  DOIs are
    case-insensitive, so the prefix is matched without regard to case (the
    original exact-case check missed e.g. ``10.48550/arxiv.2101.00001``).

    Args:
        paper: Mapping with a "doi" key.

    Returns:
        ``https://arxiv.org/pdf/<id>.pdf``, or None when the DOI is missing,
        is not an arXiv DOI, or carries no identifier after the prefix.
    """
    prefix = "10.48550/arxiv."
    doi = paper["doi"]
    if not doi or not doi.lower().startswith(prefix):
        return None
    arxiv_id = doi[len(prefix):]
    if not arxiv_id:
        return None
    return f"https://arxiv.org/pdf/{arxiv_id}.pdf"

In [None]:
def finder_springer(paper):
    """Build the link.springer.com PDF URL for the paper's DOI.

    No request is made and the DOI's publisher is not checked: any truthy DOI
    yields a URL.

    Args:
        paper: Mapping with a "doi" key.

    Returns:
        The candidate PDF URL, or None when the DOI is missing/empty.
    """
    doi = paper["doi"]
    return f"https://link.springer.com/content/pdf/{doi}.pdf" if doi else None


In [None]:
def acsPublications(req, soup, headers):
    """Find the "High-Res PDF" / "Low-Res PDF" link on an ACS landing page.

    The link's href is resolved against the page URL with ``urljoin``, so
    root-relative, page-relative and absolute hrefs all produce a valid URL
    (the previous ``scheme://host + href`` concatenation was only correct for
    root-relative hrefs).  Anchors without an href are skipped.

    Args:
        req: Response for the landing page; only ``req.url`` is read.
        soup: Parsed landing page.
        headers: Unused; present for the common finder signature.

    Returns:
        The absolute PDF URL, or None when no matching link exists.
    """
    for anchor in soup.find_all('a'):
        title = anchor.get('title')
        href = anchor.get('href')
        if not isinstance(title, str) or not href:
            continue
        lowered = title.lower()
        if 'high-res pdf' in lowered or 'low-res pdf' in lowered:
            print("** fetching reprint using the 'acsPublications' finder...")
            return urllib.parse.urljoin(req.url, href)
    return None

def finder_acsPublications(paper):
    """Run the ACS scraper on *paper*'s landing page; return a PDF URL or None."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    return html_finder_wrapper(paper, acsPublications, headers)


In [None]:
def genericCitationLabelled(req, soup, headers):
    """Return the PDF URL advertised by a ``<meta name="citation_pdf_url">`` tag.

    The first such tag with a non-empty ``content`` is used, and the value is
    resolved against the page URL so that a relative ``content`` still yields
    a fetchable absolute URL (absolute values pass through unchanged).

    Args:
        req: Response for the landing page; only ``req.url`` is read.
        soup: Parsed landing page.
        headers: Unused; present for the common finder signature.

    Returns:
        The absolute PDF URL, or None when no usable tag exists.
    """
    for tag in soup.find_all('meta', attrs={'name': 'citation_pdf_url'}):
        content = tag.get('content')
        if content:
            print("** fetching reprint using the 'generic citation labelled' finder...")
            return urllib.parse.urljoin(req.url, content)
    return None
def finder_genericCitationLabelled(paper):
    """Run the citation_pdf_url scraper on *paper*'s landing page; return a PDF URL or None."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    return html_finder_wrapper(paper, genericCitationLabelled, headers)


In [None]:
def fetch_paper_pdf(paper, out_dir, finders, error_file, max_retries=3):
    """Try each finder in turn until one yields a PDF that downloads successfully.

    Every finder is attempted up to *max_retries* times, pausing one second
    after each unsuccessful attempt.  When all finders are exhausted, the
    paper's identifiers are appended to *error_file* as one tab-separated
    line (missing identifiers written as "none").

    The output file name is derived from the DOI (else PMID, else PMCID, else
    "unknown").  Besides "/", every other character that is illegal in file
    names on common platforms (``\\ : * ? " < > |``) is replaced by "_", so
    DOIs containing such characters can still be saved.

    Args:
        paper: Mapping with the keys "doi", "pmid" and "pmcid".
        out_dir: Directory the PDF is saved into.
        finders: Iterable of callables ``finder(paper) -> url or None``.
        error_file: Path of the file that collects failed papers.
        max_retries: Attempts per finder.

    Returns:
        True as soon as a download succeeds, otherwise False.
    """
    base_name = paper["doi"] or paper["pmid"] or paper["pmcid"] or "unknown"
    filename = re.sub(r'[\\/:*?"<>|]', "_", base_name) + ".pdf"
    print(f"\n=== Processing: DOI={paper['doi']}, PMID={paper['pmid']}, PMCID={paper['pmcid']} ===")
    for finder in finders:
        for attempt in range(1, max_retries + 1):
            print(f"  Trying finder: {finder.__name__} (attempt {attempt}/{max_retries})")
            pdf_url = finder(paper)
            if pdf_url:
                print(f"    Found PDF URL: {pdf_url}")
                if download_pdf(pdf_url, out_dir, filename):
                    print(f"    SUCCESS: Downloaded {filename} using {finder.__name__}")
                    return True
                print(f"    ERROR: Failed to download from {pdf_url}")
            else:
                print(f"    No PDF URL found by {finder.__name__}")
            # Brief pause before the next attempt / next finder.
            time.sleep(1)
    with open(error_file, "a") as ef:
        ef.write(f"{paper['doi'] or 'none'}\t{paper['pmid'] or 'none'}\t{paper['pmcid'] or 'none'}\n")
    print(f"  FAILED: No PDF found for {filename} after trying all finders.")
    return False

# Finders

In [None]:
def futureMedicine(req, soup, headers):
    """Find the first link whose href contains "/doi/pdf" on the landing page.

    The href is resolved against the page URL with ``urljoin``; the previous
    ``scheme://host + href`` concatenation produced a broken URL whenever the
    href was already absolute or was page-relative.

    Args:
        req: Response for the landing page; only ``req.url`` is read.
        soup: Parsed landing page.
        headers: Unused; present for the common finder signature.

    Returns:
        The absolute PDF URL, or None when no matching link exists.
    """
    possibleLinks = soup.find_all('a', attrs={'href': re.compile("/doi/pdf")})
    for link in possibleLinks:
        href = link.get('href')
        if href:
            print("** fetching reprint using the 'future medicine' finder...")
            return urllib.parse.urljoin(req.url, href)
    return None

def finder_futureMedicine(paper):
    """Run the Future Medicine scraper on *paper*'s landing page; return a PDF URL or None."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    return html_finder_wrapper(paper, futureMedicine, headers)

In [None]:
def nejm(req, soup, headers):
    """Find the link marked ``data-download-type="article pdf"`` on the landing page.

    The attribute is compared case-insensitively.  The href is resolved
    against the page URL with ``urljoin`` so absolute and relative hrefs both
    work, and anchors without an href are skipped instead of raising.

    Args:
        req: Response for the landing page; only ``req.url`` is read.
        soup: Parsed landing page.
        headers: Unused; present for the common finder signature.

    Returns:
        The absolute PDF URL, or None when no matching link exists.
    """
    for anchor in soup.find_all('a'):
        download_type = anchor.get('data-download-type')
        href = anchor.get('href')
        if isinstance(download_type, str) and download_type.lower() == 'article pdf' and href:
            print("** fetching reprint using the 'NEJM' finder...")
            return urllib.parse.urljoin(req.url, href)

    return None

In [None]:
def pubmed_central_v1(req, soup, headers):
    """Pick the first PDF link on the page that is not an "ePDF" viewer link.

    Candidate anchors come from ``soup.find_all('a', re.compile('pdf'))``.
    NOTE(review): the pattern is passed as find_all's second positional
    argument; per the BeautifulSoup docs a non-dict value there filters on
    the tag's CSS class -- confirm that is the intent.

    Anchors whose title contains "epdf" are excluded (this lets the finder
    also work for Wiley pages).  An anchor without a title attribute no
    longer raises -- it is treated as having an empty title -- and the href
    is resolved against the page URL with ``urljoin`` so absolute and
    relative hrefs both work.

    Args:
        req: Response for the landing page; only ``req.url`` is read.
        soup: Parsed landing page.
        headers: Unused; present for the common finder signature.

    Returns:
        The absolute PDF URL, or None when no suitable link exists.
    """
    for anchor in soup.find_all('a', re.compile('pdf')):
        title = anchor.get('title') or ''
        href = anchor.get('href')
        if 'epdf' in title.lower() or not href:
            continue
        print("** fetching reprint using the 'pubmed central' finder...")
        return urllib.parse.urljoin(req.url, href)

    return None

def finder_pub1(paper):
    """Run ``pubmed_central_v1`` on *paper*'s landing page; return a PDF URL or None."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    return html_finder_wrapper(paper, pubmed_central_v1, headers)

In [None]:
def pubmed_central_v2(req, soup, headers):
    """Return the first link on the page whose href contains "/pmc/articles".

    The href is resolved against ``https://www.ncbi.nlm.nih.gov/`` with
    ``urljoin``.  The previous string formatting produced a double slash for
    root-relative hrefs and a broken URL when the href was already absolute.

    Args:
        req: Unused; present for the common finder signature.
        soup: Parsed landing page.
        headers: Unused; present for the common finder signature.

    Returns:
        The absolute URL of the first matching link, or None when there is
        none.
    """
    possibleLinks = soup.find_all('a', attrs={'href': re.compile('/pmc/articles')})
    for link in possibleLinks:
        href = link.get('href')
        if href:
            print("** fetching reprint using the 'pubmed central' finder...")
            return urllib.parse.urljoin("https://www.ncbi.nlm.nih.gov/", href)

    return None

def finder_pub2(paper):
    """Run ``pubmed_central_v2`` on *paper*'s landing page; return a URL or None."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    return html_finder_wrapper(paper, pubmed_central_v2, headers)

In [None]:
def science_direct(req, soup, headers):
    """Follow the ScienceDirect redirect page to the article and extract its PDF link.

    Steps, as implemented below:
      1. Read the URL-encoded target from the first ``<input>``'s ``value``
         on the landing page and fetch it.
      2. On that page, read ``<meta name="citation_pdf_url">`` and fetch it.
      3. Return the href of the first ``<a>`` on the resulting page.

    Each step now returns None when the expected element is missing instead
    of raising, and the two follow-up requests carry a timeout so a stalled
    server cannot hang the run.

    Args:
        req: Response for the landing page (not read; rebound locally).
        soup: Parsed landing page.
        headers: HTTP headers used for the follow-up requests.

    Returns:
        The PDF URL, or None when any step finds nothing.
    """
    inputs = soup.find_all('input')
    encoded_target = inputs[0].get('value') if inputs else None
    if not encoded_target:
        return None

    newUri = urllib.parse.unquote(encoded_target)
    req = requests.get(newUri, allow_redirects=True, headers=headers, timeout=15)
    soup = BeautifulSoup(req.content, 'html.parser')

    possibleLinks = soup.find_all('meta', attrs={'name': 'citation_pdf_url'})
    if not possibleLinks or not possibleLinks[0].get('content'):
        return None

    print("** fetching reprint using the 'science_direct' finder...")
    req = requests.get(possibleLinks[0].get('content'), headers=headers, timeout=15)
    soup = BeautifulSoup(req.content, 'html.parser')

    anchors = soup.find_all('a')
    if not anchors:
        return None
    return anchors[0].get('href')

def finder_scienceDirect(paper):
    """Run the ScienceDirect scraper on *paper*'s landing page; return a PDF URL or None."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    return html_finder_wrapper(paper, science_direct, headers)

# Main

In [None]:
# Legacy, name-based finder list.  It is commented out and therefore never
# executed; it is kept only as a record of the earlier finder names.
# finders=[
#          'genericCitationLabelled',
#          'pubmed_central_v2',
#          'acsPublications',
#          'uchicagoPress',
#          'nejm',
#          'futureMedicine',
#          'science_direct',
#          'direct_pdf_link',
# ]
# Active finders: callables taking a paper dict and returning a PDF URL or None.
# fetch_paper_pdf tries them in exactly this order and stops at the first
# successful download, so earlier entries take precedence.
# NOTE(review): finder_futureMedicine is defined in this file but is not listed
# here -- confirm whether that omission is intentional.
finders = [
        finder_europepmc,
        finder_unpaywall,
        finder_core,
        finder_openaire,
        finder_arxiv,
        finder_springer,
        finder_acsPublications,
        finder_genericCitationLabelled,
        finder_pub1,
        finder_pub2,
        finder_scienceDirect
        # Add more finders here...
    ]

In [None]:
# Load the papers to fetch.  Each non-blank line of the input file must hold
# exactly three comma-separated identifiers -- doi, pmid, pmcid -- where an
# empty field or the literal "none" means "not available" (stored as None).
# Blank lines are ignored and malformed lines are reported and skipped, so a
# single bad line no longer aborts the whole run with an unpacking error.
papers = []
with open(args['pmf'], "r", newline="") as f:
    for line_number, row in enumerate(csv.reader(f), start=1):
        fields = [field.strip() for field in row]
        if not any(fields):
            continue
        if len(fields) != 3:
            print(f"Skipping line {line_number} of {args['pmf']}: expected 'doi,pmid,pmcid' but found {len(fields)} field(s).")
            continue
        doi, pmid, pmcid = [field if field and field != "none" else None for field in fields]
        papers.append({"doi": doi, "pmid": pmid, "pmcid": pmcid})



In [None]:
# args = {
#     'pmf': 'open_access_pmids.txt',
#     'pmids': '%#$',
#     'out':'fetched_pdfs',
#     'errors': 'unfetched_pmids.tsv',
#     'maxRetries': 3
# }

In [None]:
# Fetch every loaded paper in input order, using the configured output
# directory, finder list, error file and per-finder retry limit.
for paper in papers:
    fetch_paper_pdf(
        paper,
        args['out'],
        finders,
        args['errors'],
        args['maxRetries'],
    )