In [4]:
from datetime import date, datetime, timedelta


import pandas as pd
import numpy as np

from matplotlib.pylab import plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter


import seaborn as sns
import plotly_express as px

import chart_studio.plotly as py
import chart_studio

from loguru import logger
import pycountry

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import SCORERS, median_absolute_error

from sklearn.ensemble import GradientBoostingRegressor

from requests_html import HTMLSession

import tabula

%matplotlib inline

# Loading the reports

In [273]:

# The URLs of the pages listing the reports
urls = [
    'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/',
    'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/july2020/',
    'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/june2020/',
    'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/may2020/',
    'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/april2020/'
]

def get_report_links(urls):
    
    session = HTMLSession()

    all_report_links = []
    
    # Get the report links from each page.
    for url in urls:
        
        r = session.get(url)
        
        for link in r.html.absolute_links:
            if 'pdf' in link:
                # Need to fix the urls
                all_report_links.append(link.replace(' ', '%20'))
                
    return all_report_links

In [275]:
report_links = get_report_links(urls)
report_links

['https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_(NPHET)_Website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_(NPHET)_v1.0_20200813_website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_20082020%20-Website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_(NPHET)%20_website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epi

In [263]:
def extract_cfr_table(report_url):
    
    tables = tabula.read_pdf(report_url, pages='all', multiple_tables = True)
    
    # Find the right table ...
    for table in tables:
        
        if table.iloc[:,0].map(str).str.contains('Case fatality').any() & (len(table.columns)>5):
            return extract_cfr_table_v1(table.dropna())
            
def extract_cfr_table_v1(cfr_table):
    
    cfr_table.columns = [
        'metric', 
        'Aged5', 'Aged5to14', 'Aged15tp24', 'Aged25tp34', 'Aged35tp44', 
        'Aged45tp54', 'Aged55tp64', 'Aged65tp74', 'Aged75to84', 'Aged85up', 
        'Unknown'
    ]
    
    cfr_table['metric'] = [
        'num_cases', 
        'num_hospitalised', 'pct_hospitalised',
        'num_icu', 'pct_icu',
        'num_deaths', 'pct_cfr'
    ]
    
    cfr_table = cfr_table.set_index('metric').T
    cfr_table['frac_cfr'] = cfr_table['num_deaths'].map(lambda n: int(n.replace(',', '')))/cfr_table['num_cases'].map(lambda n: int(n.replace(',', '')))
    
    return cfr_table.T

In [271]:
extract_cfr_table(report_links[36])

Got stderr: Aug 24, 2020 10:20:18 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Arial are not implemented in PDFBox and will be ignored
Aug 24, 2020 10:20:18 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Arial are not implemented in PDFBox and will be ignored
Aug 24, 2020 10:20:18 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Arial are not implemented in PDFBox and will be ignored

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,Aged5,Aged5to14,Aged15tp24,Aged25tp34,Aged35tp44,Aged45tp54,Aged55tp64,Aged65tp74,Aged75to84,Aged85up,Unknown
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
num_cases,171.0,327.0,1891.0,4268.0,4480.0,4567.0,3249.0,1803.0,2291.0,2369.0,22.0
num_hospitalised,21.0,16.0,69.0,196.0,265.0,444.0,492.0,580.0,735.0,473.0,2.0
pct_hospitalised,12.28,4.89,3.65,4.59,5.92,9.72,15.14,32.17,32.08,19.97,9.09
num_icu,0.0,2.0,5.0,15.0,35.0,91.0,126.0,110.0,45.0,6.0,0.0
pct_icu,0.0,0.61,0.26,0.35,0.78,1.99,3.88,6.1,1.96,0.25,0.0
num_deaths,0.0,0.0,1.0,5.0,11.0,23.0,63.0,218.0,506.0,647.0,0.0
pct_cfr,0.0,0.0,0.05,0.12,0.25,0.5,1.94,12.09,22.09,27.31,0.0
frac_cfr,0.0,0.0,0.000528821,0.00117151,0.00245536,0.00503613,0.0193906,0.12091,0.220864,0.273111,0.0


In [272]:
report_links

['https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_(NPHET)_Website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_(NPHET)_v1.0_20200813_website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_20082020%20-Website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epidemiology_report_(NPHET)%20_website.pdf',
 'https://www.hpsc.ie/a-z/respiratory/coronavirus/novelcoronavirus/casesinireland/epidemiologyofcovid-19inireland/COVID-19_Daily_epi

In [226]:
tables = tabula.read_pdf(
    report_links[4], pages='all', multiple_tables = True, pandas_options={'header': 1})


Got stderr: Aug 24, 2020 10:03:29 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Arial are not implemented in PDFBox and will be ignored
Aug 24, 2020 10:03:29 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Arial are not implemented in PDFBox and will be ignored
Aug 24, 2020 10:03:29 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font Arial are not implemented in PDFBox and will be ignored



In [231]:
tables[8].iloc[:,0]

0    Deaths among COVID-19 cases\rTable 7: Summary ...
Name: Unnamed: 0, dtype: object