<a href="https://colab.research.google.com/github/aivscovid19/data_pipeline/blob/gulnoza/PbmcSiteWorker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 ## **Set up and update**


In [None]:
from google.colab import auth
credentials = auth.authenticate_user()

!apt-get update && apt-get upgrade
!apt install chromium-chromedriver
!pip install centaurMiner==0.0.8
!pip install import_ipynb

!apt autoremove

# import JobDispatcher module
!curl 'https://raw.githubusercontent.com/aivscovid19/data_pipeline/gulnoza/JobDispatcher.ipynb' > job_dispatcher.ipynb

## Define the PbmcSiteWorker class

In [62]:
from job_dispatcher import JobDispatcher
import centaurminer as mining
import random, time
import pandas as pd
from pandas.io import gbq

class PbmcSiteWorker():
  """Scrapes article pages listed in a job DataFrame and appends them to BigQuery.

  The worker drives a centaurminer engine over every URL in the
  ``article_url`` column of the job, uploads the scraped records to
  ``article_table`` in batches, and finally asks ``JobDispatcher`` to update
  the status of the job rows.
  """

  def __init__(self, credentials, project_id, url_table, article_table,
                driver_path=None):
    """Stores connection settings and scraping parameters.

    Args:
      credentials: Credentials object forwarded to ``DataFrame.to_gbq`` and
        to ``JobDispatcher``.
      project_id: Google Cloud project id used for the BigQuery calls.
      url_table: Name of the table handed to ``JobDispatcher``.
      article_table: Destination table for the scraped articles.
      driver_path: Optional path to the web driver, forwarded to the
        mining engine.
    """
    # Number of scraped articles per BigQuery upload.
    self.max_threshold = 50
    # Pause between page requests, in seconds: a fixed part (min_delay) plus
    # a random part drawn from [0, max_delay).
    self.min_delay = 0.1
    self.max_delay = 2
    self.credentials = credentials
    self.project_id = project_id
    self.url_table = url_table
    self.article_table = article_table
    self.driver_path = driver_path
    # Schema passed to to_gbq as `table_schema`. The column names (including
    # the spelling 'date_aquisition') are runtime identifiers and must not be
    # changed here without migrating the destination table.
    self.article_schema = [
        {'name': 'abstract',                'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'authors',                 'type': 'STRING'                    },
        {'name': 'date_publication',        'type': 'DATE'                      },
        {'name': 'doi',                     'type': 'STRING'                    },
        {'name': 'extra_link',              'type': 'STRING'                    },
        {'name': 'keywords',                'type': 'STRING'                    },
        {'name': 'license',                 'type': 'STRING'                    },
        {'name': 'organization_affiliated', 'type': 'STRING'                    },
        {'name': 'pubmed_link',             'type': 'STRING'                    },
        {'name': 'source',                  'type': 'STRING'                    },
        {'name': 'title',                   'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'translated_link',         'type': 'STRING',                   },
        {'name': 'url',                     'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'date_aquisition',         'type': 'DATE'                      },
    ]

  class PbmcLocations(mining.PageLocations):
    """Field locators for PBMC article pages, extending mining.PageLocations.

    `license` is a fixed URL string; the remaining fields are located either
    through a named meta tag (mining.MetaData) or an XPath (mining.Element).
    """
    license = "http://pbmc.ibmc.msk.ru/ru/authors-rules-ru/"
    keywords = mining.MetaData("citation_keywords")
    abstract = mining.Element("xpath", "//td[@class='arti'][@style='text-align:justify;']")
    source = mining.MetaData("citation_journal_title")
    organization_affiliated = mining.MetaData("citation_author_institution")
    # NOTE(review): this XPath matches style 'align:justify;' while `abstract`
    # matches 'text-align:justify;' -- confirm the difference is intentional.
    pubmed_link = mining.Element("xpath", "//td[@class='arti'][@style='align:justify;']//a[@target='_blank']").get_attribute('href')
    translated_link = mining.Element("xpath", "//td[@class='arti']//a[@target='_blank']").get_attribute('href')

  class PbmcEngine(mining.MiningEngine):
    """Mining engine with PBMC-specific post-processing for two fields."""

    def get_date_publication(self, element):
      """Returns the located value with every '/' replaced by '-'."""
      return (self.get(element).replace('/', '-'))

    def get_organization_affiliated(self, element):
      """Returns all located values wrapped in a mining.TagList."""
      return mining.TagList(self.get(element, several=True))

  def scrape_data(self, urls_df, limit):
    """Scrapes every URL in the job and uploads the results to BigQuery.

    Records are appended to ``self.article_table`` every
    ``self.max_threshold`` articles and once more for the final partial
    batch. After all URLs are processed the job rows are handed to
    ``JobDispatcher.update_job_status``.

    Args:
      urls_df: DataFrame with an ``article_url`` column listing the pages
        to scrape.
      limit: Unused; kept so existing callers keep working.

    Returns:
      A DataFrame with one row per scraped article. It is empty when
      ``urls_df`` contains no URLs.
    """
    # The engine and locations classes are attributes of this class, so they
    # are reached through `self`; bare names are not visible inside a method.
    miner = self.PbmcEngine(self.PbmcLocations, driver_path=self.driver_path)

    urls = urls_df['article_url'].tolist()

    data = []
    prev_count = 0
    for count, url in enumerate(urls, 1):
      miner.gather(url)
      data.append(miner.results)
      # Flush on every full batch and on the last URL, so that no upload
      # grows beyond max_threshold records.
      if count % self.max_threshold == 0 or count == len(urls):
        batch_df = pd.DataFrame(data[prev_count:count])
        batch_df.to_gbq(destination_table=f'{self.article_table}',
                        project_id=self.project_id,
                        if_exists='append',
                        table_schema=self.article_schema,
                        credentials=self.credentials)
        prev_count = count
      # Randomised pause between requests.
      time.sleep(self.min_delay + self.max_delay * random.random())

    # update job_status
    JobDispatcher(self.credentials, self.project_id, self.url_table).update_job_status(urls_df)
    # Return everything that was scraped, not only the last uploaded batch.
    return pd.DataFrame(data)

## TEST

In [63]:
import uuid

# Job parameters for a small smoke run.
worker_id = uuid.uuid1()
limit = 5
driver_path = '/usr/lib/chromium-browser/chromedriver'

# Shared BigQuery identifiers, named once instead of being repeated inline.
project_id = 'for-gulnoza'
url_table = 'pbmc_v2.url_builder_med'
article_table = 'pbmc_v2.articles_med'

# Step 1: register a job for this worker and receive its URLs.
dispatcher = JobDispatcher(credentials, project_id, url_table)
urls_df = dispatcher.register_job(worker_id, limit)

# Step 2: scrape the job's URLs and upload the articles.
worker = PbmcSiteWorker(credentials, project_id, url_table, article_table,
                        driver_path=driver_path)
articles_df = worker.scrape_data(urls_df, limit)

print(urls_df)
print(articles_df)

1it [00:03,  3.39s/it]


Headless: True
driver path: /usr/lib/chromium-browser/chromedriver


1it [00:05,  5.45s/it]
1it [00:06,  6.21s/it]

Done
                                         article_url  ...                             worker_id
0  http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-200...  ...  f99d7dd2-02dd-11eb-8613-0242ac1c0002
1  http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-201...  ...  f99d7dd2-02dd-11eb-8613-0242ac1c0002
2  http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-201...  ...  f99d7dd2-02dd-11eb-8613-0242ac1c0002
3  http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-200...  ...  f99d7dd2-02dd-11eb-8613-0242ac1c0002
4  http://pbmc.ibmc.msk.ru/ru/article-ru/PBMC-201...  ...  f99d7dd2-02dd-11eb-8613-0242ac1c0002

[5 rows x 8 columns]
                                            abstract  ... date_aquisition
0  Isatin (indol-2,3-dione), an endogenous biofac...  ...      2020-09-30
1  В статье рассмотрены различные аспекты использ...  ...      2020-09-30
2  ПНФ катализирует обратимую реакцию фосфоролиза...  ...      2020-09-30
3  Электрохимические сенсоры с углеродными нанотр...  ...      2020-09-30
4  Одной из важных целей ом


