## "Table 3" extraction from ISS weekly covid-19 reports
https://www.epicentro.iss.it/coronavirus/sars-cov-2-sorveglianza-dati

See example pdf: 
https://www.epicentro.iss.it/coronavirus/bollettino/Bollettino-sorveglianza-integrata-COVID-19_8-settembre-2021.pdf

Requirements: Java 8+ (needed by tabula), Python 3.6+ with the `tabula-py`, `pandas` and `numpy` packages

In [1]:
from tabula import read_pdf
import pandas as pd
import numpy as np

Download the ISS bulletin into the `dati` (data) folder, then point `in_file` in the next cell at the downloaded file: https://www.epicentro.iss.it/coronavirus/bollettino/Bollettino-sorveglianza-integrata-COVID-19_22-settembre-2021.pdf

In [2]:
# input file: path of the ISS bulletin PDF to parse
in_file = 'bollettino-22set.pdf' # replace with pdf name

# report date, written as day/month/year. The explicit format keeps pandas from
# reading ambiguous dates month-first (e.g. '08/09/2021' would become 9 August).
rep_date = pd.to_datetime('22/09/2021', format='%d/%m/%Y') # replace with report date

# Extract the tables found on the bulletin page that holds table 3
table_page = 19 # set page containing table 3
raw_tb = read_pdf(in_file, stream=True, pages=table_page)
# show the first extracted table
raw_tb[0]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,VACCINATI CON,VACCINATI CON.1
0,GRUPPO,FASCIA DI ETÀ,NON VACCINATI,,
1,,,,CICLO INCOMPLETO,CICLO COMPLETO
2,,12-39,"6.122.626 (35,1%)","3.199.510 (18,4%)","8.113.419 (46,5%)"
3,Popolazione,40-59,"4.469.554 (24,2%)","1.372.996 (7,4%)","12.596.107 (68,3%)"
4,(4/09/2021),60-79,"1.705.446 (12,6%)","422.136 (3,1%)","11.445.197 (84,3%)"
5,,80+,"283.500 (6,2%)","101.926 (2,2%)","4.177.484 (91,6%)"
6,,12-39,"49.207 (72,9%)","7.216 (10,7%)","11.040 (16,4%)"
7,Diagnosi di,,,,
8,,40-59,"26.851 (57,9%)","3.406 (7,3%)","16.128 (34,8%)"
9,Sars-CoV-2,,,,


In [3]:
# Keep only the third-last and the last column of the extracted table
# (in the table displayed above these hold the "NON VACCINATI" and the
# "CICLO COMPLETO" figures).
columns_to_keep = raw_tb[0].columns[[-3,-1]]

# Text removed from every cell so that only the absolute count is left:
#   \((.*)          an opening parenthesis and everything after it (the percentage)
#   [^a-z-0-9]      any character that is not a lowercase letter, a hyphen or a
#                   digit (thousands separators, spaces, upper-case header text)
#   \d+-\d+|\d+\+   age-band labels such as 12-39 or 80+
# Written as a raw string because '\(', '\d' and '\+' are not valid escape
# sequences in a normal Python string literal.
to_exclude = r'\((.*)|[^a-z-0-9]|\d+-\d+|\d+\+'

# strip the unwanted text; cells that end up empty become NaN
df = raw_tb[0][columns_to_keep].replace(to_exclude, '', regex=True).replace('', np.nan)
# drop the rows that are empty in both columns, then cast the counts to integers
df = df.dropna(subset=columns_to_keep, how='all').fillna(0).astype(np.int64)
df.columns = ['Non vaccinati', 'Immunizzati']
df

Unnamed: 0,Non vaccinati,Immunizzati
2,6122626,8113419
3,4469554,12596107
4,1705446,11445197
5,283500,4177484
6,49207,11040
8,26851,16128
10,8275,11646
11,1800,5236
12,1646,118
13,2539,304


In [4]:
# Totals per group: add up the rows of every group, column by column

step_ = 4 # number of consecutive rows (one per age band) that make up a group

n_groups = len(df) // step_ # only complete groups of step_ rows are summed
results = [
    df[col].iloc[g * step_:(g + 1) * step_].sum()
    for g in range(n_groups)
    for col in df.columns
]
results

[12581126, 36332207, 86133, 44050, 6782, 2456, 775, 173, 877, 509]

In [5]:
# read the original general data csv from apalladi's repo
# https://github.com/apalladi/covid_vaccini_monitoraggio/tree/main/dati

url = 'https://raw.githubusercontent.com/apalladi/covid_vaccini_monitoraggio/main/dati/dati_ISS_complessivi.csv'
# The 'data' column is turned into datetimes by pandas' built-in date parsing,
# which handles year-first dates. The `date_parser` argument is avoided because
# it is deprecated in read_csv since pandas 2.0.
df_0 = pd.read_csv(url, sep=';', parse_dates=['data'], index_col='data')
df_0

Unnamed: 0_level_0,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-09-15,13334156,35599815,96900,44990,6841,2331,786,163,770,405
2021-09-08,14507530,34292078,104890,43775,6579,2118,712,157,571,280
2021-09-01,15656647,32287644,107937,40380,5988,1788,641,127,383,187
2021-08-25,16708830,29628678,104405,33894,5162,1349,500,95,246,92
2021-08-18,17568325,26501452,95261,24978,4052,859,334,61,177,48
2021-08-11,20322716,22345659,80585,18887,3067,627,247,34,143,34
2021-08-04,21289761,19570464,60267,12333,2234,404,169,23,128,34
2021-07-28,22879167,17389604,40729,7277,1619,280,123,16,169,38
2021-07-21,24745853,15384196,26284,3805,1483,224,117,8,250,46
2021-07-14,27065063,13929401,21089,2310,1880,240,147,7,357,68


In [6]:
# Store this report's totals under its date, then order the table so that
# the most recent report comes first
df_0.loc[rep_date] = results
df_0 = df_0.sort_index(ascending=False)
df_0

Unnamed: 0_level_0,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-09-22,12581126,36332207,86133,44050,6782,2456,775,173,877,509
2021-09-15,13334156,35599815,96900,44990,6841,2331,786,163,770,405
2021-09-08,14507530,34292078,104890,43775,6579,2118,712,157,571,280
2021-09-01,15656647,32287644,107937,40380,5988,1788,641,127,383,187
2021-08-25,16708830,29628678,104405,33894,5162,1349,500,95,246,92
2021-08-18,17568325,26501452,95261,24978,4052,859,334,61,177,48
2021-08-11,20322716,22345659,80585,18887,3067,627,247,34,143,34
2021-08-04,21289761,19570464,60267,12333,2234,404,169,23,128,34
2021-07-28,22879167,17389604,40729,7277,1619,280,123,16,169,38
2021-07-21,24745853,15384196,26284,3805,1483,224,117,8,250,46


In [7]:
# write the updated overall table to a semicolon-separated file
df_0.to_csv(path_or_buf='dati_ISS_complessivi.csv', sep=';')

In [8]:
# Per-age figures: the age band at position k of `ages` is taken from
# rows k, k + step_, k + 2*step_, ... of df; each selection is flattened
# row by row into a single array
ages = ['12-39', '40-59', '60-79', '80+']
results_ = {
    age: df.iloc[offset::step_].stack().to_numpy()
    for offset, age in enumerate(ages)
}
results_

{'12-39': array([6122626, 8113419,   49207,   11040,    1646,     118,      76,
              4,      13,       2], dtype=int64),
 '40-59': array([ 4469554, 12596107,    26851,    16128,     2539,      304,
             300,       17,      127,       16], dtype=int64),
 '60-79': array([ 1705446, 11445197,     8275,    11646,     1883,      827,
             355,       95,      399,      127], dtype=int64),
 '80+': array([ 283500, 4177484,    1800,    5236,     714,    1207,      44,
             57,     338,     364], dtype=int64)}

In [9]:
# Build the per-age table: one row per age band, labelled with the same
# columns as the overall table, and with the index named 'età'
df_1 = (
    pd.DataFrame(results_)
    .T
    .set_axis(df_0.columns, axis='columns')
    .rename_axis('età')
)
df_1.head()

Unnamed: 0_level_0,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
età,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12-39,6122626,8113419,49207,11040,1646,118,76,4,13,2
40-59,4469554,12596107,26851,16128,2539,304,300,17,127,16
60-79,1705446,11445197,8275,11646,1883,827,355,95,399,127
80+,283500,4177484,1800,5236,714,1207,44,57,338,364


In [10]:
# one file per report: the report's calendar date is part of the file name
out_name = f'data_iss_età_{rep_date.date()}.csv'
df_1.to_csv(out_name, sep=';')