In [2]:
import pandas as pd
import time
import datetime
import requests
import json
import subprocess

In [3]:
# Make a GET request to the NCBI E-utilities esearch endpoint to get the list of PMC ids for a given search term
# https://www.ncbi.nlm.nih.gov/pmc/?term=open%20access%5Bfilter%5D
def get_pmcids(term, timeout=30):
    """Return the PubMed Central ids that match ``term`` (free full text only).

    The query is sent to the NCBI E-utilities ``esearch`` endpoint with
    ``db=pmc`` and ``retmode=json``; ``AND free fulltext[filter]`` is
    appended to the term. The request does not set ``retmax``, so the
    server's default page size applies.

    Parameters
    ----------
    term : str
        Search term (e.g. a cell-line name). ``None``, NaN and blank
        strings yield an empty result without contacting the server.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        The ``idlist`` from the esearch response, or ``[]`` when the
        request fails, the status is not 200, or the response carries no
        id list. Failures are reported with ``print`` rather than raised.
    """
    # A missing pandas cell arrives as float NaN, the only float that is
    # not equal to itself; do not search for the literal text "nan".
    if term is None or (isinstance(term, float) and term != term):
        return []
    query = str(term).strip()
    if not query:
        return []

    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # Let requests build the query string so characters such as '&', '#',
    # '+' or spaces inside the term are percent-encoded instead of
    # corrupting the URL.
    params = {
        'db': 'pmc',
        'term': f'{query} AND free fulltext[filter]',
        'retmode': 'json',
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error: request for {query!r} failed - {e}")
        return []

    # Check if the request was successful (status code 200)
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return []

    try:
        json_response = response.json()
    except ValueError as e:
        # A 200 response whose body is not valid JSON.
        print(f"Error: could not decode response for {query!r} - {e}")
        return []

    # The id list may be absent (e.g. an error payload); treat as no hits.
    return json_response.get('esearchresult', {}).get('idlist', [])

In [4]:
def get_pdf(pmcid, wget_path='/usr/local/bin/wget'):
    """Download the PDF of one PubMed Central article with ``wget``.

    The file is written to ``<pmcid>.pdf`` in the current working
    directory. Progress and failures are reported with ``print``; a
    non-zero wget exit status is caught and reported, not raised.

    Parameters
    ----------
    pmcid : str or int
        Article id as returned by the esearch ``idlist``.
    wget_path : str, optional
        Path (or bare name, resolved via PATH) of the wget executable.

    Returns
    -------
    None
    """
    pdf_url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/'
    print(pdf_url)

    # The command is run without a shell, so arguments must NOT carry
    # shell-style quotes: anything inside the string is sent literally.
    wget_command = [
        wget_path,
        '--user-agent=Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        '-l1',
        '--no-parent',
        '-A.pdf',
        f'-O{pmcid}.pdf',
        pdf_url,
    ]

    # Execute the wget command; check=True turns a non-zero exit status
    # into CalledProcessError.
    try:
        subprocess.run(wget_command, check=True)
        print(f"{pmcid} PDF downloaded successfully!")
    except subprocess.CalledProcessError as e:
        print(" ".join(wget_command), "failed")
        print(f"Failed to download {pmcid} PDF. Error: {e}")

    return

In [5]:
# Gzip-compressed CSV read from a path relative to the working directory.
F = "depmap/Model.csv.gz"

# Load the whole table; later cells read the 'CellLineName' column from it.
df = pd.read_csv(filepath_or_buffer=F)

In [8]:
# This is the main routine.
#
# It reads values from a pandas column,
# uses each one to query for PubMed Central ids,
# and loops over the returned PMC ids fetching PDFs.
#
# The query to PubMed Central is filtered for
# free and full text.

# Number of distinct cell-line names to process in this run.
# 1 keeps this a quick trial; set to None to process every name.
MAX_TERMS = 1
# Pause after each search so terms with no hits do not fire back-to-back
# requests at the search endpoint.
SEARCH_DELAY_S = 0.5
# Pause after each PDF download.
DOWNLOAD_DELAY_S = 1

# Skip missing names and avoid searching the same name twice.
terms = df['CellLineName'].dropna().drop_duplicates()
if MAX_TERMS is not None:
    terms = terms.head(MAX_TERMS)

start = datetime.datetime.now()
for term in terms:
    pmcids = get_pmcids(term)
    time.sleep(SEARCH_DELAY_S)
    for pmcid in pmcids:
        get_pdf(pmcid)
        time.sleep(DOWNLOAD_DELAY_S)
stop = datetime.datetime.now()

http://www.ncbi.nlm.nih.gov/pmc/articles/11073418/pdf/
11073418 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11062918/pdf/
11062918 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11038523/pdf/
11038523 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11038245/pdf/
11038245 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11031052/pdf/
11031052 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11030086/pdf/
11030086 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11029159/pdf/
11029159 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11018308/pdf/
11018308 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/11012648/pdf/
11012648 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/10969092/pdf/
10969092 PDF downloaded successfully!
http://www.ncbi.nlm.nih.gov/pmc/articles/10960176/pdf/
10960176 PDF do

In [9]:
# Report when the download loop began, when it ended, and how long it ran.
elapsed = stop - start
for stamp in (start, stop, elapsed):
    print(stamp)

2024-05-08 09:07:01.204327
2024-05-08 09:10:03.192171
0:03:01.987844


In [10]:
# NOTE(review): looks like a back-of-envelope estimate of the full run time —
# presumably ~1000 search terms x ~3 minutes per term, divided by 60 to get
# hours. Confirm what the 1000 and the 3 stand for.
1000*3/60

50.0