In [1]:
import logging

# Logger used by the download loop for progress and warning messages.
download_logger = logging.getLogger('download')
download_logger.setLevel(logging.DEBUG)
# mode='w' truncates download.log every time this cell is run.
download_handler = logging.FileHandler('download.log', mode='w')
download_handler.setLevel(logging.DEBUG)
# One ';'-separated record per line: timestamp;logger name;level;message
download_formatter = logging.Formatter('%(asctime)s;%(name)s;%(levelname)s;%(message)s')
download_handler.setFormatter(download_formatter)
download_logger.addHandler(download_handler)

In [2]:
import requests
import os
import time
import numpy as np
import sys

def ratelimit(seconds=1):
    """Pause before the next call so the remote server is not flooded.

    Keyword arguments:
    seconds -- how long to sleep, in seconds. Defaults to 1, the delay that
               used to be hard-coded, so existing ``ratelimit()`` calls behave
               exactly as before.
    """
    time.sleep(seconds)

class Connector():
    """Reliable, logged access to web pages through ``requests`` or Selenium.

    Every attempt made by :meth:`get` appends one ';'-separated row to the
    log file, using the columns listed in ``_LOG_COLUMNS``.
    """

    # Column names of the log file, in the order log_message() emits them.
    _LOG_COLUMNS = ('id', 'project', 'connector_type', 't', 'delta_t', 'url',
                    'redirect_url', 'response_size', 'response_code', 'success', 'error')

    def __init__(self, logfile, overwrite_log=False, connector_type='requests', session=False, path2selenium='', n_tries=5, timeout=30):
        """This Class implements a method for reliable connection to the internet and monitoring.
        It handles simple errors due to connection problems, and logs a range of information for basic quality assessments

        Keyword arguments:
        logfile -- path to the logfile
        overwrite_log -- bool, defining if logfile should be cleared (rarely the case).
        connector_type -- use the 'requests' module or 'selenium'. The two behave differently, since the selenium webdriver does not have a similar response object when using the get method, and monitoring the behavior cannot be automated in the same way.
        session -- requests.session object. For defining custom headers and proxies.
        path2selenium -- str, sets the path to the geckodriver needed when using selenium.
        n_tries -- int, defines the number of retries the *get* method will try to avoid random connection errors.
        timeout -- int, seconds the get request will wait for the server to respond, again to avoid connection errors.
        """
        self.n_tries = n_tries  # how many times get() attempts a request before giving up
        self.timeout = timeout  # maximum time to wait for a server response
        if connector_type == 'selenium':
            assert path2selenium != '', "You need to specify the path to you geckodriver if you want to use Selenium"
            from selenium import webdriver
            # HINT: download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases
            assert os.path.isfile(path2selenium), 'You need to insert a valid path2selenium the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
            self.browser = webdriver.Firefox(executable_path=path2selenium)  # start the browser with a path to the geckodriver.

        self.connector_type = connector_type
        self.session = session if session else requests.session()
        self.logfilename = logfile

        # Decide this BEFORE the FileHandler exists: the handler creates the
        # file as soon as it is constructed, so testing for the file afterwards
        # always finds one and the header of a brand-new log was never written.
        start_fresh = bool(overwrite_log) or not os.path.isfile(logfile)

        # A dedicated, non-propagating logger per log file. Attaching the
        # handler to the root logger made every record emitted by any other
        # logger in the process end up inside this ';'-separated log.
        self.project_logger = logging.getLogger('Connector.{}'.format(os.path.abspath(logfile)))
        self.project_logger.setLevel(logging.INFO)
        self.project_logger.propagate = False
        # Creating a second Connector for the same file (e.g. re-running a
        # notebook cell) must not stack handlers, or every row is written twice.
        for stale_handler in list(self.project_logger.handlers):
            self.project_logger.removeHandler(stale_handler)
            stale_handler.close()

        self.handler = logging.FileHandler(logfile, mode='w' if start_fresh else 'a')
        self.handler.setLevel(logging.INFO)
        self.formatter = logging.Formatter('%(message)s')
        self.handler.setFormatter(self.formatter)
        self.project_logger.addHandler(self.handler)

        if start_fresh:
            # Write header to log file
            self.project_logger.info(msg=';'.join(self._LOG_COLUMNS))

        self.id = self._next_id(logfile)

    @staticmethod
    def _next_id(logfile):
        """Return the id after the last numbered row of *logfile*, or 0 if there is none."""
        with open(logfile, 'r') as f:
            lines = f.readlines()
        # Walk backwards and skip anything that is not a data row (the header,
        # or stray text lines), i.e. whose first field is not an integer.
        for line in reversed(lines):
            if not line[:1].isdigit():
                continue
            try:
                return int(line.split(';')[0]) + 1
            except ValueError:
                continue
        return 0

    def log_message(self, project, t, delta_t, url, redirect_url, response_size, response_code, success, error):
        """Format one log row; the field order matches ``_LOG_COLUMNS``."""
        return '{};{};{};{};{};{};{};{};{};{};{}'.format(self.id, project, self.connector_type, t, delta_t, url, redirect_url, response_size, response_code, success, error)

    def _inc_id(self):
        """Advance the call id; done once per attempt, before the row is logged."""
        self.id += 1

    def get(self, url, project_name):
        """Method for connecting reliably to the internet, with multiple tries and simple error handling, as well as default logging function.
        Input url and the project name for the log (i.e. is it part of mapping the domain, or is it the part of the final stage in the data collection).

        Keyword arguments:
        url -- str, url
        project_name -- str, Name used for analyzing the log. Use case could be the 'Mapping of domain','Meta_data_collection','main data collection'.

        Returns:
        'requests' connector -- the tuple (response, call_id) of the first attempt
            that does not raise. If all ``n_tries`` attempts raise, None is
            returned, so callers that unpack the result must handle that case.
        any other connector -- the call_id only; read the page from ``self.browser``.
        """
        project_name = project_name.replace(';', '-')  # make sure the default csv separator is not in the project_name.
        if self.connector_type == 'requests':
            for _ in range(self.n_tries):
                ratelimit()
                t = time.time()
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    redirect_url = response.url  # current url, after potential redirects
                    dt = time.time() - t  # time spent waiting for the server and downloading content
                    size = len(response.text)  # size of the decoded response body
                    response_code = response.status_code
                except Exception as e:
                    # Log the failed attempt as its own row, then try again.
                    self._inc_id()
                    self.project_logger.info(
                        msg=self.log_message(project_name, t, time.time() - t, url, '', 0, '', False, str(e)))
                else:
                    self._inc_id()
                    self.project_logger.info(
                        msg=self.log_message(project_name, t, dt, url, redirect_url, size, response_code, True, ''))
                    return response, self.id  # response and unique identifier of the call
            # Every attempt raised; each failure has already been logged.
            return None
        else:
            ratelimit()
            t = time.time()
            self.browser.get(url)  # use selenium get method
            self._inc_id()
            redirect_url = self.browser.current_url
            dt = time.time() - t  # NOTE: not necessarily the complete load time.
            size = len(self.browser.page_source)  # NOTE: not necessarily correct, the page could still be loading.
            # success, response_code and error are left blank: selenium gives no response object to inspect.
            self.project_logger.info(
                msg=self.log_message(project_name, t, dt, url, redirect_url, size, '', '', ''))
            # Using selenium it will not return a response object, instead you should call the browser object of the connector.
            return self.id

In [3]:
### Reads the price sheet (which holds the product numbers) from an excel file
import pandas as pd

# 'Varenummer' is converted with str -- presumably to keep leading zeros in product numbers; confirm against the sheet.
df_lmpriser = pd.read_excel('lmpriser_eSundhed_190812.xlsx', sheet_name='DATA', converters={'Varenummer': str})

In [4]:
# Display the first rows to inspect the columns of the sheet
df_lmpriser.head()

Unnamed: 0,ATC,Lægemiddel,Varenummer,Pakning,Styrke,Form,Firma,Indikator,20140623,20140707,...,20190408,20190422,20190506,20190520,20190603,20190617,20190701,20190715,20190729,20190812
0,A01AA01,Bifluorid,42846,4 g + solvens,,dentalsuspension,Voco,AIP,407.36,407.36,...,,,,,,,,,,
1,A01AA01,Bifluorid,42846,4 g + solvens,,dentalsuspension,Voco,AUP,568.4,568.4,...,,,,,,,,,,
2,A01AA01,Bifluorid,42846,4 g + solvens,,dentalsuspension,Voco,DDD,,,...,,,,,,,,,,
3,A01AA01,Bifluorid,42846,4 g + solvens,,dentalsuspension,Voco,AUP_pr_DDD,,,...,,,,,,,,,,
4,A01AA01,Bifluorid,43158,10 g,,dentalsuspension,Voco,AIP,602.07,602.07,...,,,,,,,,,,


In [5]:
# Copy the product number column into its own Series
df_product_numbers = df_lmpriser['Varenummer'].copy()
# Delete the original dataframe to save memory
del df_lmpriser
# Peek into the Series
df_product_numbers.head()

0    042846
1    042846
2    042846
3    042846
4    043158
Name: Varenummer, dtype: object

In [6]:
# Ensure unique product numbers
product_number_array = df_product_numbers.unique()
# Split the product numbers into 3 chunks of (nearly) equal size
product_number_arrays = np.array_split(product_number_array, 3)
# Insert the correct index here (0, 1 or 2), it defines which chunk of product numbers to process
INDEX = 1
# raise NotImplementedError

# Rebind df_product_numbers to a Series that holds only the selected chunk
df_product_numbers = pd.Series(product_number_arrays[INDEX])

In [7]:
# Display the first product numbers of the selected chunk
df_product_numbers.head()

0    075227
1    101812
2    182729
3    372663
4    561154
dtype: object

In [8]:
# Take all product numbers in "our" slice of the product numbers
# ([:] selects every element; narrow the slice here to process only a subset)
product_numbers_slice = df_product_numbers[:]

In [9]:
# Number of product numbers the download loop will iterate over
product_numbers_slice.shape

(5418,)

In [10]:
# Shared connector for the API calls; overwrite_log=True clears med-prices.log each time this cell runs.
conn = Connector('med-prices.log', overwrite_log=True)

def get_product_details(product_number):
    """Request the JSON detail record of one product from the medicinpriser.dk API.

    The request goes through the module-level ``conn`` and is logged under
    the project name 'med-scrape'. Only the response object is returned; the
    call id that ``conn.get`` also hands back is discarded.
    """
    url_template = 'http://api.medicinpriser.dk/v1/produkter/detaljer/{}?format=json'
    request_url = url_template.format(product_number)
    response, _ = conn.get(request_url, 'med-scrape')
    return response

In [11]:
# Start by defining a project name; it is used as the name of the output folder
project = 'medicin_prices'
import os # general package for interacting with the system,
# among other things to automate folder creation
def maybe_create_dir(path):
    """Create the directory *path* unless it already exists.

    Missing parent directories are created as well, and an already existing
    directory is not an error. ``os.makedirs(..., exist_ok=True)`` replaces
    the separate "check, then create" steps, which could fail if the
    directory appeared in between and could not create nested paths.

    Raises FileExistsError if *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

# Create the project folder itself
maybe_create_dir(project)

subfolders = ['raw_data','parsed_data']

# Paths used by later cells: raw downloads go to raw_data; parsed_data is created here for parsed output
raw_data = os.path.join(project, subfolders[0])
parsed_data = os.path.join(project, subfolders[1])

# Create both subfolders inside the project folder
for directory in subfolders: 
    maybe_create_dir(os.path.join(project, directory))

In [12]:
import json

def dump_json(product):
    """Write *product* as JSON to ``<raw_data>/<Varenummer>.json``.

    Keyword arguments:
    product -- dict with at least the key 'Varenummer'; the whole dict is dumped.

    The product number is coerced to ``str`` before the file name is built,
    as ``with_retry`` also does, so a numeric 'Varenummer' does not raise a
    TypeError on concatenation. An existing file is overwritten.
    """
    vnr = str(product['Varenummer'])
    file_path = os.path.join(raw_data, vnr + '.json')
    with open(file_path, 'w') as f:
        json.dump(product, f)

In [13]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys

import time

# Run Firefox headless, i.e. without opening a browser window
options = Options()
options.headless = True
# Module-level browser used by get_medication_price_grid_html; the geckodriver binary is expected at ./geckodriver
driver = webdriver.Firefox(options=options, executable_path='./geckodriver')

def get_medication_price_grid_html(vnr: str, sleep_for=20):
    """Return the HTML of the price-development page for product *vnr*.

    Drives the module-level Firefox ``driver``: opens the product page,
    follows the price-development link, replaces the start date with
    01-01-1998, submits the form, waits, and hands back the page source.

    Keyword arguments:
    vnr -- str, product number appended to the page url.
    sleep_for -- seconds to wait after submitting before the page source is read (default 20).
    """
    product_page = 'https://medicinpriser.dk/Default.aspx?id=15&vnr=' + vnr
    driver.get(product_page)

    # Switch to the price-development view.
    driver.find_element_by_id("ctl00_TopBar_Pridudvikling").click()

    # Overwrite the "date from" field, confirm with <enter>, then submit the form.
    start_date_box = driver.find_element_by_id('ctl00_ctl07_ctl00_DateFrom')
    start_date_box.click()
    start_date_box.clear()
    for keystrokes in ('01-01-1998', Keys.ENTER):
        start_date_box.send_keys(keystrokes)
    start_date_box.submit()

    # Fixed wait -- presumably to let the page reload with the new date range.
    time.sleep(sleep_for)
    return driver.page_source

In [14]:
import re

# A product number: six consecutive digits anywhere in a column label.
pno_re = re.compile('[0-9]{6}')

def rename_columns(df):
    """Rename the columns of a price-grid dataframe in place.

    The first two columns become 'datetime' and 'tilskudspris'. Every further
    column is renamed to the six-digit product number found in its label. A
    label without such a number is kept unchanged; previously this raised an
    AttributeError because ``pno_re.search`` returned None.

    Returns None; *df* itself is modified.
    """
    renamed = ['datetime', 'tilskudspris']
    for column in df.columns[2:]:
        # str() so that non-string labels can be searched as well.
        match = pno_re.search(str(column))
        renamed.append(match.group() if match else column)
    df.columns = renamed

In [15]:
import re
from bs4 import BeautifulSoup as bs

def get_price_grid(html_table):
    """Return the single <table> whose id contains 'PriceGrid', or None.

    None is returned both when no such table exists in *html_table* and when
    more than one is found.
    """
    matches = bs(html_table).findAll('table', attrs={'id': re.compile(r'.*PriceGrid.*')})
    return matches[0] if len(matches) == 1 else None

In [16]:
def make_html(table_html):
    """Wrap *table_html* in a minimal <html><body> document.

    Before wrapping, every ',' in the markup is turned into '.' and every '-'
    is removed. This applies to the whole string, tags and attributes included.
    """
    cleaned = str(table_html)
    for old, new in ((',', '.'), ('-', '')):
        cleaned = cleaned.replace(old, new)
    return "<{h}><{b}>{s}</{b}></{h}>".format(h='html', b='body', s=cleaned)

In [17]:
from pandas import DataFrame

In [18]:
from selenium.common.exceptions import WebDriverException

def get_dataframe(product_number: str, sleep_for=5):
    """Download the price grid of *product_number* and return it as a dataframe.

    Keyword arguments:
    product_number -- str, product number to look up.
    sleep_for -- seconds to wait for the page after submitting the date form.

    Returns the dataframe with renamed columns (see ``rename_columns``), or
    None when the browser raised a WebDriverException, when the page holds no
    unique price grid table, or when no table could be parsed from it.
    """
    try:
        html = get_medication_price_grid_html(product_number, sleep_for=sleep_for)
    except WebDriverException as wde:
        download_logger.warning(wde)
        # No page source was fetched. Falling through used to raise
        # UnboundLocalError on `html`; report a miss so the caller can retry.
        return None
    table = get_price_grid(html)
    if table is None:
        return None
    maybe_table = pd.read_html(make_html(table))
    if len(maybe_table) > 0:
        df = maybe_table[0]
        rename_columns(df)
        return df
    return None

In [19]:
def get_series(product_number: str, sleep_for=5) -> 'pd.Series':
    """Download the price grid of *product_number* and return only its own price column.

    Keyword arguments:
    product_number -- str, product number to look up; also the name of the column returned.
    sleep_for -- seconds to wait for the page after submitting the date form.

    Returns the column named *product_number*, or None when the browser raised
    a WebDriverException, when the page holds no unique price grid table, when
    no table could be parsed, or when the grid has no column for the product.
    """
    try:
        html = get_medication_price_grid_html(product_number, sleep_for=sleep_for)
    except WebDriverException as wde:
        download_logger.warning(wde)
        # No page source was fetched. Falling through used to raise
        # UnboundLocalError on `html`; report a miss so the caller can retry.
        return None
    table = get_price_grid(html)
    if table is None:
        return None
    maybe_table = pd.read_html(make_html(table))
    if len(maybe_table) > 0:
        df = maybe_table[0]
        rename_columns(df)
        if product_number not in df.columns:
            # A KeyError here would abort the whole download loop; treat it as a miss.
            download_logger.warning('Price grid for %s has no column for that product', product_number)
            return None
        return df[product_number]
    return None

In [None]:
def with_retry(product_number, func, retries=5, sleep_for=5):
    """Call *func* for a product until it yields a result or the attempts run out.

    Each attempt first fetches the product details. If that response is not
    ok, None is returned at once. Otherwise *func* is called with the product
    number (as str, taken from the details) and a wait of ``sleep_for`` times
    the attempt number, so the wait grows with every attempt.

    Keyword arguments:
    product_number -- product number to look up.
    func -- callable taking (product_number_str, sleep_for=...) and returning None on failure.
    retries -- number of retries after the first attempt (retries + 1 attempts in total).
    sleep_for -- base wait in seconds handed to *func*.

    Returns the first non-None result of *func*, else None.
    """
    attempt = 1
    while attempt <= retries + 1:
        details = get_product_details(product_number)
        if not details.ok:
            return None
        vnr = str(json.loads(details.text)['Varenummer'])
        result = func(vnr, sleep_for=sleep_for * attempt)
        if result is not None:
            return result
        attempt += 1
    return None

In [21]:
def find_missing(p_no, p_nos, df):
    """Return the entries of *p_nos* that are not column labels of *df*.

    Keyword arguments:
    p_no -- not used by the function; kept so existing calls keep working.
    p_nos -- iterable of product numbers to look for.
    df -- dataframe whose columns are checked.

    The result is a list that preserves the order of *p_nos*.
    """
    return [candidate for candidate in p_nos if candidate not in df.columns]

In [22]:
def has_csv_file(product_number) -> bool:
    """Tell whether ``<raw_data>/<product_number>.csv`` already exists as a file."""
    csv_path = os.path.join(raw_data, product_number + '.csv')
    return os.path.isfile(csv_path)

In [None]:
# Main download loop: for every product number in our slice, store the product
# details as JSON and the price grid (plus missing substitute products) as CSV.
# Products that already have a CSV file in raw_data are skipped.
retries = 5

for product_number in product_numbers_slice:
    tries = 0
    # Create a product number string
    product_number_str = str(product_number)
    # An existing csv file counts as already downloaded
    downloaded = has_csv_file(product_number_str)
    # tries starts at 0, so a product gets up to retries + 1 attempts
    while tries <= retries and not downloaded:
        # Increment the number of tries we have used on this product number
        tries += 1
        product_details = get_product_details(product_number)
        if product_details.ok:
            product_json = json.loads(product_details.text)
            dump_json(product_json)
        else:
            # Give up on this product (no further retries) when the details request fails
            fail_msg = "Not able to download: {}, response code: {}".format(product_number_str, product_details.status_code)
            download_logger.warning(fail_msg)
            print(fail_msg)
            break
        # The wait handed to the browser grows with every attempt: 5, 10, 15, ... seconds
        df = get_dataframe(product_number_str, sleep_for=5*tries)
        if df is not None:
            # The download counts as complete only if the first 'datetime' value ends with '1997'
            # NOTE(review): presumably the earliest date the site returns -- confirm
            if str(df['datetime'].iloc[0]).endswith('1997'):
                success_msg = "Downloaded data for: {}".format(product_number_str)
                download_logger.info(success_msg)
                print(success_msg)
                downloaded = True
            else:
                warning_msg = 'Data for {} was incomplete, redownloading'.format(product_number_str)
                download_logger.warning(warning_msg)
                print(warning_msg)
                downloaded = False
            # Substitute products listed in the details but absent from the price grid are fetched one by one
            substitude_product_numbers = [p['Varenummer'] for p in product_json['Substitutioner']]
            missing_product_numbers = find_missing(product_number, substitude_product_numbers, df)
            if len(missing_product_numbers) > 0:
                missing_msg = "Downloading missing substitute product: {}".format(','.join(missing_product_numbers))
                download_logger.info(missing_msg)
                print(missing_msg)
            for number in missing_product_numbers:
                missing_series = with_retry(number, get_series)
                if missing_series is not None:
                    df[number] = missing_series
            # Keep only rows that have at least two non-NA values
            df = df.dropna(axis=0, thresh=2)
            # Save the data into a csv file; this also happens for an incomplete download,
            # in which case the next attempt writes the file again
            df.to_csv(os.path.join(raw_data, product_number_str + '.csv'))
                    