<a href="https://colab.research.google.com/github/khodizoda/ai_vs_covid19/blob/master/Rxiv_Miners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set up and update

In [None]:
!apt-get update && apt-get upgrade
!apt install chromium-chromedriver
!pip install centaurMiner==0.0.8

!apt autoremove

## Define Arxiv miner

In [23]:
import centaurminer as mining

In [24]:
class ArxivMiner():
  '''
  Miner for https://arxiv.org/

  Groups the two pieces needed to mine an arXiv page: ``ArxivLocations``
  (where each field is looked up) and ``ArxivEngine`` (how extracted values
  are post-processed).
  '''
  def __init__(self):
    # No instance state to set up; the class only namespaces the nested
    # definitions. `self` is required so that `ArxivMiner()` can be
    # instantiated without raising TypeError.
    pass

  class ArxivLocations(mining.PageLocations):
    """
      Locations of the schema fields on an arXiv article page.

      Most fields are declared through ``mining.MetaData(<name>)``; the
      ``citation_*`` names are presumably the page's ``<meta>`` tag names
      (confirm against centaurminer). Fields declared with an empty name
      are presumably placeholders for data not exposed by the page - confirm.
    """
    references = mining.MetaData("")
    title = mining.MetaData("citation_title")
    doi = mining.MetaData("citation_doi")
    authors = mining.MetaData("citation_author")
    # The abstract is the only field located through a CSS selector.
    abstract = mining.Element("css_selector", "blockquote.abstract.mathjax")
    # NOTE(review): 'Arxiv' is passed as a MetaData name, like the citation_*
    # entries. If a constant source label was intended, this lookup may not
    # produce it - confirm.
    source = mining.MetaData('Arxiv')
    body = mining.MetaData("")
    source_impact_factor = mining.MetaData("")
    category = mining.MetaData("")
    # Plain string (not a mining locator), unlike the other fields.
    license = "https://arxiv.org/licenses/nonexclusive-distrib/1.0/license.html"
    quantity_of_citations = mining.MetaData("")
    organization = mining.MetaData("")
    keywords = mining.MetaData("")
    extra_link = mining.MetaData("citation_pdf_url")

  class ArxivEngine(mining.MiningEngine):
    """
      Mining engine for arXiv with custom handling of authors and dates.
    """
    def get_authors(self, element):
      ''' Fetches `element` with several=True and wraps the result in a TagList tagged "author". '''
      return mining.TagList(self.get(element, several=True), tag='author')

    def get_date_publication(self, element):
      ''' Changes date format from YYYY/MM/DD to YYYY-MM-DD.

      A falsy result from `self.get` (nothing extracted) is returned
      unchanged instead of failing on `.replace`.
      '''
      raw_date = self.get(element)
      if not raw_date:
        return raw_date
      return raw_date.replace('/', '-')


## Define Biorxiv miner

In [25]:
class BiorxivMiner():
  '''
  Miner for https://www.biorxiv.org/

  Groups the two pieces needed to mine a bioRxiv page: ``BiorxivLocations``
  (where each field is looked up) and ``BiorxivEngine`` (how extracted values
  are post-processed).
  '''
  def __init__(self):
    # No instance state to set up; the class only namespaces the nested
    # definitions. `self` is required so that `BiorxivMiner()` can be
    # instantiated without raising TypeError.
    pass

  class BiorxivLocations(mining.PageLocations):
    """
      Locations of the schema fields on a bioRxiv article page.

      Only the fields listed here are set in this class; any other field
      (e.g. title, authors) presumably comes from ``mining.PageLocations``
      defaults - confirm. Fields declared with an empty name are presumably
      placeholders for data not exposed by the page - confirm.
    """
    references = mining.MetaData("citation_reference")
    source = mining.MetaData('citation_journal_title')
    body = mining.MetaData("")
    source_impact_factor = mining.MetaData("")
    category = mining.MetaData("")
    # Plain string (not a mining locator), unlike the other fields.
    license = "It is made available under a CC-BY 4.0 International license."
    quantity_of_citations = mining.MetaData("")
    organization = mining.MetaData("citation_author_institution")
    keywords = mining.MetaData("")
    extra_link = mining.MetaData("citation_pdf_url")

  class BiorxivEngine(mining.MiningEngine):
    """
      Mining engine for bioRxiv; multi-valued fields are returned as TagLists.
    """
    def get_authors(self, element):
      ''' Fetches `element` with several=True and wraps the result in a TagList tagged "author". '''
      return mining.TagList(self.get(element, several=True), tag='author')

    def get_references(self, element):
      ''' Fetches `element` with several=True and wraps the result in a TagList tagged "reference". '''
      return mining.TagList(self.get(element, several=True), tag='reference')

    def get_organization(self, element):
      ''' Fetches `element` with several=True and wraps the result in a TagList tagged "organization". '''
      return mining.TagList(self.get(element, several=True), tag='organization')


## Define Medrxiv miner

In [26]:
class MedrxivMiner():
  '''
  Miner for https://www.medrxiv.org/

  Groups the two pieces needed to mine a medRxiv page: ``MedrxivLocations``
  (where each field is looked up) and ``MedrxivEngine`` (how extracted values
  are post-processed).
  '''
  def __init__(self):
    # No instance state to set up; the class only namespaces the nested
    # definitions. `self` is required so that `MedrxivMiner()` can be
    # instantiated without raising TypeError.
    pass

  class MedrxivLocations(mining.PageLocations):
    """
      Locations of the schema fields on a medRxiv article page.

      Every field is declared through ``mining.MetaData(<name>)``. Fields
      declared with an empty name are presumably placeholders for data not
      exposed by the page - confirm.
    """
    references = mining.MetaData("")
    title = mining.MetaData("citation_title")
    doi = mining.MetaData("citation_doi")
    authors = mining.MetaData("citation_author")
    abstract = mining.MetaData("citation_abstract")
    date_publication = mining.MetaData("article:published_time")
    body = mining.MetaData("")
    source = mining.MetaData("citation_journal_title")
    source_impact_factor = mining.MetaData("")
    search_keyword = mining.MetaData("")
    category = mining.MetaData("")
    license = mining.MetaData("")
    quantity_of_citations = mining.MetaData("")
    organization = mining.MetaData("citation_author_institution")
    keywords = mining.MetaData("")
    extra_link = mining.MetaData("citation_pdf_url")

  class MedrxivEngine(mining.MiningEngine):
    """
      Mining engine for medRxiv; multi-valued fields are returned as TagLists.
    """
    def get_organization(self, element):
      ''' Fetches `element` with several=True and wraps the result in a TagList tagged "organization". '''
      return mining.TagList(self.get(element, several=True), tag='organization')

    def get_authors(self, element):
      ''' Fetches `element` with several=True and wraps the result in a TagList tagged "author". '''
      return mining.TagList(self.get(element, several=True), tag='author')
