## "Table 3" extraction from ISS weekly covid-19 reports
https://www.epicentro.iss.it/coronavirus/sars-cov-2-sorveglianza-dati

See example pdf: 
https://www.epicentro.iss.it/coronavirus/bollettino/Bollettino-sorveglianza-integrata-COVID-19_8-settembre-2021.pdf

Requirements: Java 8+, Python 3.6+

In [37]:
from tabula import read_pdf
from pandas import to_datetime, read_csv, DataFrame
from numpy import nan, int64, arange

Download the ISS bulletin into the data folder (the code below expects it to be saved as `bollettino-15set.pdf`): https://www.epicentro.iss.it/coronavirus/bollettino/Bollettino-sorveglianza-integrata-COVID-19_15-settembre-2021.pdf

In [38]:
# input file
in_file = 'bollettino-15set.pdf' # replace with pdf name

# report date, written day/month/year.
# The explicit format matters: without it an ambiguous date such as
# '08/09/2021' would be parsed month-first (9 August instead of 8 September).
rep_date = to_datetime('15/09/2021', format='%d/%m/%Y') # replace with report date

# ...and read the raw table
# the result is indexed with [0] below to get the extracted table
raw_tb = read_pdf(in_file, pages=19, stream=True) # set page containing table 3
raw_tb[0]

Unnamed: 0.1,VACCINATI CON GRUPPO FASCIA DI ETÀ,NON VACCINATI,VACCINATI CON,Unnamed: 0
0,,,CICLO INCOMPLETO,CICLO COMPLETO
1,12-39,"6.602.412 (37,9%)","3.128.125 (17,9%)","7.705.018 (44,2%)"
2,Popolazione 40-59,"4.684.966 (25,4%)","1.379.625 (7,5%)","12.382.591 (67,1%)"
3,(28/08/2021) 60-79,1.765.005 (13%),"463.925 (3,4%)","11.343.849 (83,6%)"
4,80+,"281.773 (6,2%)","104.255 (2,3%)","4.168.357 (91,5%)"
5,12-39,"57.228 (73,8%)","8.599 (11,1%)","11.700 (15,1%)"
6,Diagnosi di,,,
7,,"29.184 (58,9%)","3.794 (7,7%)","16.554 (33,4%)"
8,Sars-CoV-2 40-59,,,
9,(13/08/2021- 60-79,"8.629 (40,3%)","1.171 (5,5%)","11.620 (54,2%)"


In [39]:
# keep the last and the third last column
columns_to_keep = raw_tb[0].columns[[-3,-1]]

# Pattern of the text to strip from each cell, in order of alternation:
#   \((.*)        an opening parenthesis and everything after it
#   [^a-z-0-9]    any character that is not a lowercase letter, '-' or a digit
#   \d+-\d+       "N-N" labels
#   \d+\+         "N+" labels
# Written as a raw string: `\(`, `\d` and `\+` are regex escapes, not valid
# string escapes, and newer Python versions warn about them in plain strings.
to_exclude = r'\((.*)|[^a-z-0-9]|\d+-\d+|\d+\+'

df = (
    raw_tb[0][columns_to_keep]
    .replace(to_exclude, '', regex=True)        # strip everything but the counts
    .replace('', nan)                           # emptied cells become missing values
    .dropna(subset=columns_to_keep, how='all')  # drop rows with no value in either column
    .fillna(0)                                  # a single missing value counts as 0
    .astype(int64)
)
df.columns = ['Non vaccinati', 'Immunizzati']
df

Unnamed: 0,Non vaccinati,Immunizzati
1,6602412,7705018
2,4684966,12382591
3,1765005,11343849
4,281773,4168357
5,57228,11700
7,29184,16554
9,8629,11620
10,1859,5116
11,1705,111
12,2528,283


In [40]:
# get data
# sum value by age/event

step_ = 4 # number of consecutive rows that make up one group

# Fail loudly if the extracted table does not split into whole groups:
# otherwise trailing rows would be silently ignored and the sums below could
# be built from misaligned rows.
if len(df) % step_ != 0:
    raise ValueError(
        f'expected a number of rows multiple of {step_}, got {len(df)}: '
        'check the table extracted from the pdf'
    )

# for every group of step_ rows, sum each column; the values are ordered
# group by group, and within a group by column
results = [
    df[col].iloc[start:start + step_].sum()
    for start in range(0, len(df), step_)
    for col in df.columns
]
results

[13334156, 35599815, 96900, 44990, 6841, 2331, 786, 163, 770, 405]

In [41]:
# read the original general data csv from apalladi's repo
# https://github.com/apalladi/covid_vaccini_monitoraggio/tree/main/dati

url = 'https://raw.githubusercontent.com/apalladi/covid_vaccini_monitoraggio/main/dati/dati_ISS_complessivi.csv'

# The 'data' column holds day/month/year dates and is used as the index.
# The index is converted after loading, with an explicit format, instead of
# going through read_csv's `date_parser` argument, which is deprecated in
# recent pandas versions.
df_0 = read_csv(url, sep=';', index_col='data')
df_0.index = to_datetime(df_0.index, format='%d/%m/%Y')
df_0

Unnamed: 0_level_0,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-09-08,14507530,34292078,104890,43775,6579,2118,712,157,571,280
2021-09-01,15656647,32287644,107937,40380,5988,1788,641,127,383,187
2021-08-25,16708830,29628678,104405,33894,5162,1349,500,95,246,92
2021-08-18,17568325,26501452,95261,24978,4052,859,334,61,177,48
2021-08-11,20322716,22345659,80585,18887,3067,627,247,34,143,34
2021-08-04,21289761,19570464,60267,12333,2234,404,169,23,128,34
2021-07-28,22879167,17389604,40729,7277,1619,280,123,16,169,38
2021-07-21,24745853,15384196,26284,3805,1483,224,117,8,250,46
2021-07-14,27065063,13929401,21089,2310,1880,240,147,7,357,68


In [42]:
# insert the row for the report date (overwriting it if already present),
# then order the table from the most recent date to the oldest
df_0.loc[rep_date] = results
df_0 = df_0.sort_index(ascending=False)
df_0

Unnamed: 0_level_0,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-09-15,13334156,35599815,96900,44990,6841,2331,786,163,770,405
2021-09-08,14507530,34292078,104890,43775,6579,2118,712,157,571,280
2021-09-01,15656647,32287644,107937,40380,5988,1788,641,127,383,187
2021-08-25,16708830,29628678,104405,33894,5162,1349,500,95,246,92
2021-08-18,17568325,26501452,95261,24978,4052,859,334,61,177,48
2021-08-11,20322716,22345659,80585,18887,3067,627,247,34,143,34
2021-08-04,21289761,19570464,60267,12333,2234,404,169,23,128,34
2021-07-28,22879167,17389604,40729,7277,1619,280,123,16,169,38
2021-07-21,24745853,15384196,26284,3805,1483,224,117,8,250,46
2021-07-14,27065063,13929401,21089,2310,1880,240,147,7,357,68


In [43]:
# save to a csv
# Dates are written day/month/year, the same format used when the file is
# read back with '%d/%m/%Y'; the default ISO output could not be parsed by
# that reader.
df_0.to_csv('dati_ISS_complessivi.csv', sep=';', date_format='%d/%m/%Y')

In [44]:
# get data by age
ages = ['12-39', '40-59', '60-79', '80+']

# an age band's rows are found every step_ rows, starting at the band's
# position in `ages`; the selected rows are flattened row by row
results_ = {}
for offset, age in enumerate(ages):
    results_[age] = df.iloc[offset::step_].to_numpy().ravel()
results_

{'12-39': array([6602412, 7705018,   57228,   11700,    1705,     111,      73,
              4,       9,       2]),
 '40-59': array([ 4684966, 12382591,    29184,    16554,     2528,      283,
             295,       15,      114,       15]),
 '60-79': array([ 1765005, 11343849,     8629,    11620,     1903,      792,
             366,       88,      353,      100]),
 '80+': array([ 281773, 4168357,    1859,    5116,     705,    1145,      52,
             56,     294,     288])}

In [45]:
# build a table with one row per age band, reusing the column labels of the
# general table and naming the index 'età'
df_1 = (
    DataFrame(results_)
    .T
    .set_axis(df_0.columns, axis=1)
    .rename_axis('età')
)
df_1.head()

Unnamed: 0_level_0,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
età,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12-39,6602412,7705018,57228,11700,1705,111,73,4,9,2
40-59,4684966,12382591,29184,16554,2528,283,295,15,114,15
60-79,1765005,11343849,8629,11620,1903,792,366,88,353,100
80+,281773,4168357,1859,5116,705,1145,52,56,294,288


In [46]:
# write the per-age table to a csv whose name carries the report date
out_name = f'data_iss_età_{rep_date.date()}.csv'
df_1.to_csv(out_name, sep=';')