<a href="https://colab.research.google.com/github/YujiSue/python/blob/master/PubMedControl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup: install the chromium-chromedriver package, copy the
# chromedriver binary into /usr/bin, and install the selenium Python package.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

In [None]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import json
import re
import threading

# Chrome options for the Selenium driver: headless mode, sandbox disabled,
# and /dev/shm usage disabled.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# NOTE(review): the driver path is passed positionally here; newer Selenium
# releases presumably no longer accept that form - confirm against the
# installed version. `driver` is not referenced again in the visible code.
driver = webdriver.Chrome('chromedriver',options=options)
driver.implicitly_wait(10)
# Default `requests` headers with the User-Agent replaced by a desktop Firefox
# string; passed to the requests.get calls in checkRedirect and JournalSearch.
headers = requests.utils.default_headers()
headers["User-Agent"] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'

class PubMedSearcher:
  """Collect PubMed article summaries through the NCBI E-utilities JSON API.

  Query parameters are accumulated with setParam(), turned into an ESearch
  URL by makeQue(), and search() stores one ESummary record per hit in
  ``reflist``, keyed by the PubMed ID as a string.
  """

  def __init__(self):
    # Full ESearch URL; rebuilt by makeQue().
    self.que = ''
    # Query-string parameters sent to ESearch.
    self.param = { 'db': 'pubmed', 'retmode': 'json' }
    # PubMed ID (str) -> ESummary record (dict).
    self.reflist = {}
    self.base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?'

  def setParam(self, key, value):
    """Set (or overwrite) one ESearch query parameter."""
    self.param[key] = value

  def makeQue(self):
    """Rebuild ``self.que`` from ``self.base`` and the current parameters.

    Values are URL-encoded, so search terms containing spaces, '&' or other
    reserved characters produce a valid query string, and non-string values
    (e.g. an integer ``retmax``) are accepted. '/' is left unescaped so date
    parameters such as '2019/04/01' appear verbatim.
    """
    from urllib.parse import urlencode
    self.que = self.base + urlencode(self.param, safe='/')

  def getSummary(self, id):
    """Fetch the ESummary record for one PubMed ID and store it in ``reflist``.

    When the response has no 'result' entry, the whole response is printed
    and nothing is stored.
    """
    key = str(id)
    response = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&id='+key)
    summary = response.json()
    result = summary.get('result')
    if result is None:
      print(summary)
    else:
      self.reflist[key] = result[key]

  def search(self):
    """Run the configured search and fetch a summary for every hit.

    A first request asks only for the hit count; if it is non-zero, a second
    request retrieves that many IDs and getSummary() is called for each one.
    """
    self.makeQue()
    response = requests.get(self.que+'&rettype=count')
    total = response.json()['esearchresult']['count']
    print('count:',total)
    if 0 < int(total):
      response = requests.get(self.que+'&retmax='+str(total))
      ids = response.json()['esearchresult']['idlist']
      for val in ids:
        self.getSummary(val)
        # Pause between summary requests (presumably to respect the NCBI
        # request-rate limit).
        time.sleep(0.25)

  def clear(self):
    """Discard all stored summaries."""
    self.reflist = {}

  def load(self, path):
    """Replace ``reflist`` with the JSON content of ``path``.

    The file is read as UTF-8, matching output(), and is closed before
    returning.
    """
    with open(path, 'r', encoding='utf-8') as f:
      self.reflist = json.load(f)

  def output(self, path):
    """Write ``reflist`` to ``path`` as indented JSON.

    Non-ASCII characters are written unescaped, so the file is encoded as
    UTF-8 explicitly instead of relying on the locale's default encoding.
    """
    with open(path, 'w', encoding='utf-8') as f:
      json.dump(self.reflist, f, indent=4, ensure_ascii=False)

def elsevierLink(url, doi):
  """Return the '@href' of the second link listed in Elsevier's metadata for ``doi``.

  The Elsevier content-object endpoint is queried for JSON; ``url`` is
  accepted but not used.
  """
  endpoint = 'https://api.elsevier.com/content/object/doi/'+doi+'?httpAccept=application/json'
  payload = requests.get(endpoint).json()
  coredata = payload['attachment-metadata-response']['coredata']
  return coredata['link'][1]['@href']

def checkRedirect(url):
  """Return the target named by the response's 'Link' header, or ``url`` itself.

  ``url`` is fetched with the module-level ``headers``. Without a 'Link'
  header the original ``url`` is returned. Otherwise the text between the
  first '<' and the following '>' is returned, or the whole header value
  when it contains no '<'.
  """
  link = requests.get(url, headers=headers).headers.get('Link')
  if link is None:
    return url
  start = link.find('<')
  if start == -1:
    return link
  stop = link.find('>', start+1)
  return link[start+1:stop]

class JournalSearch:
  """Find an article's full-text pages via PubMed and scan them with a regex."""

  # Attempts made for each HTTP step before giving up on it.
  _MAX_TRIALS = 3

  def __init__(self):
    self.base = 'https://pubmed.ncbi.nlm.nih.gov/'

  def search(self, id, key, eid):
    """Return the matches of ``key`` in the first full-text page that has any.

    Args:
      id: PubMed ID (string); appended to ``self.base`` to form the page URL.
      key: regular-expression pattern handed to ``re.findall``.
      eid: identifier forwarded to elsevierLink() for links whose href
        contains 'elsevier'.

    Returns:
      The ``re.findall`` result for the first full-text site yielding at
      least one match; otherwise the result of the last successful fetch,
      or an empty list when nothing could be fetched.
    """
    words = []
    fulltext = {}
    # Step 1: collect the full-text links shown on the PubMed page.
    for _ in range(self._MAX_TRIALS):
      try:
        response = requests.get(self.base+id, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        for item in soup.select('div.full-text-links-list a'):
          if 'elsevier' in item['href']:
            fulltext[item['title']] = elsevierLink(item['href'], eid)
          else:
            fulltext[item['title']] = checkRedirect(item['href'])
          print(id,':',fulltext[item['title']])
        break
      except Exception as e:
        print('ID:', id, '\t', e)
    # Step 2: scan each site in turn. The attempt counter restarts for every
    # site; a single counter shared across sites could step past the limit
    # and leave a persistently failing site retrying forever.
    for site in fulltext:
      for trial in range(1, self._MAX_TRIALS + 1):
        try:
          response = requests.get(fulltext[site], headers=headers)
          words = re.findall(key, response.text)
          print(id,':',words)
          break
        except Exception as e:
          # Only the final failed attempt for a site is reported.
          if trial == self._MAX_TRIALS:
            print('ID:', id, '\t', e)
      if words:
        break
    return words

def worker(tid, count, reflist, header, prefix):
  """Write one CSV file describing a slice of ``reflist``.

  Handles the entries at positions ``tid*count`` up to (not including)
  ``tid*count + count`` in ``reflist``'s iteration order. For each entry the
  full text is searched through JournalSearch for 'tm' followed by three or
  more digits; the matches are stored on the entry under 'myquery' and
  written to the 'KeyWords' column.

  Args:
    tid: zero-based slice index; also used, zero-padded to two digits, in
      the output file name.
    count: number of entries per slice.
    reflist: mapping of PubMed ID -> summary record; each record needs
      'title', 'fulljournalname', 'pubdate' and 'elocationid', while
      'authors' and 'pubtype' are optional.
    header: when true, the column-header row is written first.
    prefix: output path prefix; the file is ``prefix + NN + '.csv'``.
  """
  import csv
  from itertools import islice

  beg = tid*count
  end = beg+count
  # Escaped italic/subscript/superscript tags removed from titles.
  markup = ('&lt;i&gt;', '&lt;/i&gt;', '&lt;sub&gt;', '&lt;/sub&gt;', '&lt;sup&gt;', '&lt;/sup&gt;')
  rows = []
  if header:
    rows.append(['ID', 'Link', 'Title', 'Author', 'Journal', 'PubDate', 'Type', 'KeyWords'])
  jsearcher = JournalSearch()
  for done, refid in enumerate(islice(reflist, beg, end), 1):
    refdata = reflist[refid]
    title = refdata['title']
    for tag in markup:
      title = title.replace(tag, '')
    authors = ','.join(author['name'] for author in (refdata.get('authors') or []))
    journal = refdata['fulljournalname']
    pubdate = refdata['pubdate']
    pubtype = '-'.join(refdata.get('pubtype') or [])
    # Raw string: a plain literal containing a backslash-d is an invalid
    # escape sequence.
    refdata['myquery'] = jsearcher.search(refid, r'tm\d{3,}', refdata['elocationid'])
    rows.append([refid, 'https://pubmed.ncbi.nlm.nih.gov/'+refid, title, authors,
                 journal, pubdate, pubtype, ','.join(refdata['myquery'])])
    print(done,'/',count)
  # csv.writer doubles any '"' inside a field, so quotes in titles or names
  # no longer corrupt the row. The file is opened as before (default newline
  # handling, '\n' row terminator) so ordinary rows are written unchanged.
  with open(prefix+str(tid).rjust(2, '0')+'.csv', 'w') as f:
    csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n').writerows(rows)

In [None]:
# Search PubMed
searcher = PubMedSearcher()
searcher.setParam('api_key', '<Your ID>')
# Search query
searcher.setParam('term', 'elegans')
# Date range
searcher.setParam('mindate', '2019/04/01')
searcher.setParam('maxdate', '2020/03/31')
# Run the search
searcher.search()
# Write the results to a file
searcher.output('elegans_2019_reflist.json')

In [None]:
# Load the previously written file.
# When running straight on from the cell above, comment out the next two lines.
searcher = PubMedSearcher()
searcher.load('elegans_2019_reflist.json')

prefix = 'result2019-'
# Multithreading: each worker handles `count` consecutive references and
# writes its own CSV file named from `prefix` and its tid.
total = len(searcher.reflist)
count = 20
threads = []
maxthreads = int(total/count)+1
# NOTE(review): the full loop is commented out and only tid 3 is run here,
# while the merge cell opens a file for every tid in range(maxthreads) -
# confirm the other per-thread files already exist before merging.
#for tid in range(maxthreads):
for tid in range(3,4):
  # Only the worker with tid 0 writes the CSV header row.
  threads.append(threading.Thread(target=worker, args=(tid, count, searcher.reflist, False if 0 < tid else True, prefix)))
for thread in threads:
  thread.start()
for thread in threads:
  thread.join()

In [None]:
# Merge the per-thread CSV files, in tid order, into a single CSV named after
# `prefix` without its last character.
parts = []
for tid in range(maxthreads):
  # Close each input as soon as it has been read.
  with open(prefix+str(tid).rjust(2, '0')+'.csv') as fr:
    res = fr.read()
  parts.append(res)
integrated = ''.join(parts)
# The with-block closes the output, so the merged data is flushed to disk
# even though `fw` stays referenced as a notebook global.
with open(prefix[:-1]+'.csv', 'w') as fw:
  fw.write(integrated)