<a href="https://colab.research.google.com/github/aivscovid19/data_pipeline/blob/gulnoza/PbmcSiteWorker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 ## **Set up and update**


In [None]:
from google.colab import auth
credentials = auth.authenticate_user()

!apt-get update && apt-get upgrade
!apt install chromium-chromedriver
!pip install centaurMiner==0.0.8
!pip install import_ipynb

!apt autoremove

# import JobDispatcher module
!curl 'https://raw.githubusercontent.com/aivscovid19/data_pipeline/gulnoza/JobDispatcher.ipynb' > job_dispatcher.ipynb

## Define PbmcSiteWorker class:

In [50]:
import import_ipynb
from job_dispatcher import JobDispatcher
import centaurminer as mining
import random, time
import pandas as pd
from pandas.io import gbq

class PbmcSiteWorker():
  '''
  PbmcSiteWorker class scrapes articles from
  Biomedical Chemistry (`http://pbmc.ibmc.msk.ru/`)
  and uploads scraped data to a given BigQuery table.

  Attributes:
    credentials: Credentials, either from user_account or service_account,
                 to authenticate to Google Cloud APIs. Passed unchanged to
                 `DataFrame.to_gbq` and to `JobDispatcher`.
    project_id (str): A project_id on Google Cloud Platform.
    url_table (str): A url_table to use to retrieve urls_dataframe from,
                      in form of `dataset_id.table_id`.
    article_table (str): An article_table to use to upload scraped data to,
                      in form of `dataset_id.table_id`.
    driver_path (str): A driver path to a chromium-chromedriver
    max_threshold (int): Number of scraped pages after which the pending
                      batch is uploaded. Set to 50.
    min_delay (float): Fixed part, in seconds, of the pause made after
                      every scraped page. Set to 0.1.
    max_delay (float): Upper bound, in seconds, of the random part of that
                      pause. Set to 2.
    article_schema (list of dict): Schema handed to `to_gbq` as
                      `table_schema` for the article table.
  '''

  def __init__(self, credentials, project_id, url_table, article_table,
                driver_path=None):
    ''' Stores the connection settings and the scraping defaults. '''
    self.max_threshold = 50
    self.min_delay = 0.1
    self.max_delay = 2
    self.credentials = credentials
    self.project_id = project_id
    self.url_table = url_table
    self.article_table = article_table
    self.driver_path = driver_path
    # Field names are runtime column names; they are kept exactly as they
    # are (including the spelling of `date_aquisition`).
    self.article_schema = [
        {'name': 'abstract',                'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'authors',                 'type': 'STRING'                    },
        {'name': 'date_publication',        'type': 'DATE'                      },
        {'name': 'doi',                     'type': 'STRING'                    },
        {'name': 'extra_link',              'type': 'STRING'                    },
        {'name': 'keywords',                'type': 'STRING'                    },
        {'name': 'license',                 'type': 'STRING'                    },
        {'name': 'organization_affiliated', 'type': 'STRING'                    },
        {'name': 'pubmed_link',             'type': 'STRING'                    },
        {'name': 'source',                  'type': 'STRING'                    },
        {'name': 'title',                   'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'translated_link',         'type': 'STRING',                   },
        {'name': 'url',                     'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'date_aquisition',         'type': 'DATE'                      },
    ]

  class PbmcLocations(mining.PageLocations):
    '''
    PbmcLocations class sets instructions to find an element on
    `http://pbmc.ibmc.msk.ru/`
    '''

    license = "http://pbmc.ibmc.msk.ru/ru/authors-rules-ru/"
    keywords = mining.MetaData("citation_keywords")
    abstract = mining.Element("xpath", "//td[@class='arti'][@style='text-align:justify;']")
    source = mining.MetaData("citation_journal_title")
    organization_affiliated = mining.MetaData("citation_author_institution")
    # NOTE(review): this xpath filters on style `align:justify;` while
    # `abstract` above uses `text-align:justify;` - confirm against the page
    # markup that the difference is intended.
    pubmed_link = mining.Element("xpath", "//td[@class='arti'][@style='align:justify;']//a[@target='_blank']").get_attribute('href')
    translated_link = mining.Element("xpath", "//td[@class='arti']//a[@target='_blank']").get_attribute('href')

  class PbmcEngine(mining.MiningEngine):
    '''
    PbmcEngine class mines data from `http://pbmc.ibmc.msk.ru/`
    '''

    def get_date_publication(self, element):
      ''' Changes date format from YYYY/MM/DD to YYYY-MM-DD.

      A missing (falsy) value is returned unchanged instead of raising
      AttributeError on `.replace`.
      '''
      raw_date = self.get(element)
      if not raw_date:
        # presumably `get` yields None when the page has no date - TODO confirm
        return raw_date
      return raw_date.replace('/', '-')

    def get_organization_affiliated(self, element):
      ''' Gets several `organizations_affiliated` fields and wraps them 
      inside `html` like tags. '''
      return mining.TagList(self.get(element, several=True), tag='org')

  @staticmethod
  def _has_required_fields(article):
    ''' Returns True when both `title` and `abstract` are present and
    non-empty, i.e. the REQUIRED text fields of `article_schema` are set. '''
    return all(article[field] is not None and len(article[field]) != 0
               for field in ('title', 'abstract'))

  def _upload_articles(self, articles):
    ''' Appends the valid articles of one batch to `article_table`.

    Articles failing `_has_required_fields` are dropped. When nothing is
    left, no upload is attempted at all.
    '''
    valid_articles = [a for a in articles if self._has_required_fields(a)]
    if not valid_articles:
      # An empty DataFrame carries no columns, so there is nothing
      # meaningful to append.
      return
    articles_df = pd.DataFrame(valid_articles)
    articles_df.to_gbq(destination_table=f'{self.article_table}',
              project_id=self.project_id,
              if_exists='append',
              table_schema=self.article_schema,
              credentials=self.credentials)

  def scrape_data(self, urls_df, limit=100):
    ''' Scrapes every url of the given dataframe and uploads the results
    to the BigQuery article table in batches, then calls
    `update_job_status` of `JobDispatcher` with `urls_df` (according to the
    original author this sets the `status` of the job to `done`).

    A batch is uploaded after every `max_threshold` pages, after `limit`
    pages, and after the last url.

    Attributes:
      urls_df (pandas dataframe): A urls dataframe; its `article_url`
                                  column is the list of urls to scrape.
      limit (int): Page count at which the pending batch is also flushed.
                   Default is 100.
                   NOTE(review): despite its name, `limit` never stopped
                   the loop - all urls of `urls_df` are scraped. That is
                   kept as is, because `update_job_status` receives the
                   full `urls_df`; confirm the intent before truncating.
    '''

    miner = self.PbmcEngine(self.PbmcLocations, driver_path=self.driver_path)

    urls = list(urls_df['article_url'])
    total = len(urls)
    pending = []
    for count, url in enumerate(urls, 1):
      miner.gather(url)
      pending.append(miner.results)
      # Flush periodically: the previous `count == max_threshold` test was
      # true only once, so later batches could grow without bound.
      if (count % self.max_threshold == 0 or count == limit or count == total):
        self._upload_articles(pending)
        pending = []
      # Random pause in [min_delay, min_delay + max_delay) between pages.
      time.sleep(self.min_delay + self.max_delay * random.random())
    # update job_status
    JobDispatcher(self.credentials, self.project_id, self.url_table).update_job_status(urls_df)


## TEST

In [52]:
import uuid

# Parameters of this test run.
limit = 25
worker_id = uuid.uuid1()
driver_path = '/usr/lib/chromium-browser/chromedriver'

# BigQuery locations shared by the dispatcher and the worker.
project_id = 'for-gulnoza'
url_table = 'pbmc_v2.url_builder_med'
article_table = 'pbmc_v2.articles_med_10_02'

# Register a job for this worker; the returned dataframe is what gets scraped.
dispatcher = JobDispatcher(credentials, project_id, url_table)
urls_df = dispatcher.register_job(worker_id, limit)

# Scrape the registered urls and upload the articles.
worker = PbmcSiteWorker(credentials, project_id, url_table, article_table,
                        driver_path=driver_path)
worker.scrape_data(urls_df, limit)

print(urls_df)





1it [00:02,  2.43s/it]


Headless: True
driver path: /usr/lib/chromium-browser/chromedriver





0it [00:00, ?it/s][A[A[A

                                             abstract  ... date_aquisition
0   Клеточные ганглиозиды участвуют во взаимодейст...  ...      2020-10-02
1   Натрийуретические пептиды - семейство пептидны...  ...      2020-10-02
2   Исследована применимость разработанных нами ко...  ...      2020-10-02
3   Методом микротонкослойной хроматографии изучал...  ...      2020-10-02
4   Липиды составляют 20% от веса лиофильно высуше...  ...      2020-10-02
5   В работе представлен способ определения флавон...  ...      2020-10-02
6   Показано, что секреты слюнных желез медицински...  ...      2020-10-02
7   Нанобиотехнологии - это новое направление в на...  ...      2020-10-02
8   Использование в качестве модельных гликирующих...  ...      2020-10-02
9   Рассматривается состояние дел в России и за ру...  ...      2020-10-02
10  В статье рассмотрены различные аспекты использ...  ...      2020-10-02
11  В результате фракционирования высокоочищенного...  ...      2020-10-02
12  Метаболомика — одна и

1it [00:02,  2.29s/it]



1it [00:02,  2.55s/it]

Done
                                          article_url  ...                             worker_id
0   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-200...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
1   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-200...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
2   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-201...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
3   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-199...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
4   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-200...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
5   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-200...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
6   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-201...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
7   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-200...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
8   http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-201...  ...  625a4a34-04d3-11eb-bb65-0242ac1c0002
9   http://pbmc.ibmc.msk.


