<a href="https://colab.research.google.com/github/aivscovid19/data_pipeline/blob/gulnoza/IbmcRuMiner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set up and update

In [None]:
!apt-get update && apt-get upgrade
!apt install chromium-chromedriver
!pip install centaurMiner==0.0.8

!apt autoremove

## Define miner

In [None]:
import centaurminer as mining

class IbmcRuMiner():
  '''
  Miner for http://pbmc.ibmc.msk.ru/

  Groups the two site-specific pieces defined for this journal:
  `IbmcLocations` (where each field is found on a page) and
  `IbmcEngine` (how selected fields are post-processed).
  '''
  def __init__(self):
    # Nothing to initialise: the class holds no per-instance state.
    pass

  class IbmcLocations(mining.PageLocations):
    '''
    IbmcLocations class sets instructions to find an element on
    `http://pbmc.ibmc.msk.ru/`
    '''

    # Constant value: URL of the journal's author rules page.
    license = "http://pbmc.ibmc.msk.ru/ru/authors-rules-ru/"
    # Fields looked up through `mining.MetaData` by their `citation_*` name.
    keywords = mining.MetaData("citation_keywords")
    abstract = mining.Element("xpath", "//td[@class='arti'][@style='text-align:justify;']")
    source = mining.MetaData("citation_journal_title")
    organization_affiliated = mining.MetaData("citation_author_institution")
    # NOTE(review): this XPath filters on style 'align:justify;' while
    # `abstract` filters on 'text-align:justify;' -- confirm against the page
    # markup that the difference is intentional.
    pubmed_link = mining.Element("xpath", "//td[@class='arti'][@style='align:justify;']//a[@target='_blank']").get_attribute('href')
    # NOTE(review): same XPath as `pubmed_link` minus the style filter, so it
    # also matches every link `pubmed_link` matches -- verify the right link
    # is selected.
    translated_link = mining.Element("xpath", "//td[@class='arti']//a[@target='_blank']").get_attribute('href')

  class IbmcEngine(mining.MiningEngine):
    '''
    IbmcEngine class sets instructions on how to mine data from
    `http://pbmc.ibmc.msk.ru/`
    '''

    def get_date_publication(self, element):
      ''' Changes date format from YYYY/MM/DD to YYYY-MM-DD.

      If the lookup yields an empty value (e.g. `None`), that value is
      returned unchanged instead of failing on `.replace`. '''
      raw_date = self.get(element)
      if not raw_date:
        return raw_date
      return raw_date.replace('/', '-')

    def get_authors(self, element):
      ''' Gets several `author` fields and wraps them
      inside `html` like tags. '''
      authors = self.get(element, several=True)
      return mining.TagList(authors, tag='author')

    def get_organization_affiliated(self, element):
      ''' Gets several `organization_affiliated` fields and wraps them
      inside `html` like tags. '''
      organizations = self.get(element, several=True)
      return mining.TagList(organizations, tag='org')
