<a href="https://colab.research.google.com/github/YujiSue/python/blob/master/PubMedControl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup cell: install the browser driver and the Selenium bindings.
# Install the chromium-chromedriver system package.
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin (presumably so it can be found
# by name on PATH -- confirm against the runtime image in use).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Install the Selenium Python package (version is not pinned).
!pip install selenium

In [None]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import json
import re
import threading

# Module-level setup: a headless Chrome driver for page scraping and a set of
# browser-like HTTP headers for plain `requests` calls.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Do not pass the driver path positionally: in current Selenium 4 releases the
# first positional parameter of webdriver.Chrome is `options`, so
# Chrome('chromedriver', options=options) raises TypeError. Selenium 3 already
# defaults the executable path to 'chromedriver', so omitting it is equivalent
# there, and the binary is looked up on PATH.
driver = webdriver.Chrome(options=options)
# Wait up to 10 seconds when locating elements before giving up.
driver.implicitly_wait(10)
# Start from requests' default headers and override only the User-Agent.
headers = requests.utils.default_headers()
headers["User-Agent"] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'

class PubMedSearcher:
  """Query PubMed through the NCBI E-utilities and collect article summaries.

  Attributes:
    que: The most recently built esearch URL (set by `makeQue`).
    param: Query parameters sent to esearch; starts with `db` and `retmode`.
    reflist: Mapping of PubMed id (str) -> esummary record for that id.
    base: The esearch endpoint URL, ending in '?'.
  """

  def __init__(self):
    self.que = ''
    self.param = { 'db': 'pubmed', 'retmode': 'json' }
    self.reflist = {}
    self.base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?'

  def setParam(self, key, value):
    """Set (or overwrite) one esearch query parameter."""
    self.param[key] = value

  def makeQue(self):
    """Build the esearch URL from `self.base` and `self.param` into `self.que`.

    Keys and values are percent-encoded, so search terms containing spaces,
    '&', '#' or '=' no longer corrupt the query string, and non-string values
    (e.g. an int `retmax`) are accepted. '/' is left unescaped so date values
    such as '2019/04/01' appear unchanged in the URL.
    """
    # Local import keeps this class self-contained without touching the
    # file-level import block.
    from urllib.parse import urlencode
    self.que = self.base + urlencode(self.param, safe='/')

  def getSummary(self, id):
    """Fetch the esummary record for one PubMed id and store it in `reflist`.

    If the response has no 'result' entry for the id (for example an error
    payload), the raw response is printed and nothing is stored.

    Args:
      id: PubMed id, as str or int.
    """
    pmid = str(id)
    query = {'db': 'pubmed', 'retmode': 'json', 'id': pmid}
    # Forward the API key configured for esearch so both endpoints are called
    # under the same credentials; `search` paces these calls at 4 per second.
    api_key = self.param.get('api_key')
    if api_key:
      query['api_key'] = api_key
    response = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi', params=query)
    summary = response.json()
    result = summary.get('result')
    if result is None or pmid not in result:
      print(summary)
    else:
      self.reflist[pmid] = result[pmid]

  def search(self):
    """Run the configured search and fetch a summary for every matching id.

    First asks esearch for the hit count, prints it, then requests that many
    ids and calls `getSummary` on each, sleeping 0.25 s between calls.
    """
    self.makeQue()
    response = requests.get(self.que + '&rettype=count')
    total = response.json()['esearchresult']['count']
    print('count:', total)
    if int(total) <= 0:
      return
    # NOTE(review): the server may cap how many ids a single esearch call
    # returns regardless of retmax -- confirm for very large result sets.
    response = requests.get(self.que + '&retmax=' + str(total))
    for pmid in response.json()['esearchresult']['idlist']:
      self.getSummary(pmid)
      time.sleep(0.25)

  def clear(self):
    """Discard all collected summaries."""
    self.reflist = {}

  def load(self, path):
    """Replace `reflist` with the JSON content of the file at `path`."""
    # Context manager closes the handle; UTF-8 matches what `output` writes.
    with open(path, 'r', encoding='utf-8') as f:
      self.reflist = json.load(f)

  def output(self, path):
    """Write `reflist` to `path` as indented JSON, keeping non-ASCII text."""
    # ensure_ascii=False emits raw non-ASCII characters, so fix the encoding
    # instead of depending on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
      json.dump(self.reflist, f, indent=4, ensure_ascii=False)

def elsevierLink(url, doi):
  """Look up an Elsevier object link for a DOI.

  Requests the object metadata for `doi` from the Elsevier content API and
  returns the '@href' of the second entry in the response's coredata link
  list. The `url` argument is accepted but not used.
  """
  endpoint = 'https://api.elsevier.com/content/object/doi/' + doi + '?httpAccept=application/json'
  payload = requests.get(endpoint).json()
  links = payload['attachment-metadata-response']['coredata']['link']
  return links[1]['@href']

def checkRedirect(url):
  """Return the target named in the response's 'Link' header, if any.

  Fetches `url` with the module-level `headers`. Without a 'Link' header the
  original `url` is returned. Otherwise the text between the first '<' and
  the next '>' is returned; when the header contains no '<' the whole header
  value is returned as-is.
  """
  link = requests.get(url, headers=headers).headers.get('Link')
  if link is None:
    return url
  start = link.find('<')
  if start == -1:
    return link
  # If no '>' follows, find() yields -1 and the slice stops one character
  # short of the end of the header value.
  stop = link.find('>', start + 1)
  return link[start + 1:stop]

In [None]:
# Search PubMed
searcher = PubMedSearcher()
# Placeholder: replace '<Your key>' with your own API key before running.
searcher.setParam('api_key', '<Your key>')
# Search query
searcher.setParam('term', 'elegans')
# Date range
searcher.setParam('mindate', '2019/04/01')
searcher.setParam('maxdate', '2020/03/31')
# Run the search
searcher.search()
# Write the collected summaries to a JSON file
searcher.output('elegans_2019_reflist.json')