## "Table 3" extraction from ISS weekly covid-19 reports
https://www.epicentro.iss.it/coronavirus/sars-cov-2-sorveglianza-dati

Requirements: Java 8+, Python 3.6+

See: 
https://www.epicentro.iss.it/coronavirus/bollettino/Bollettino-sorveglianza-integrata-COVID-19_8-settembre-2021.pdf

In [1]:
from tabula import read_pdf
import pandas as pd
import numpy as np

In [2]:
# input file
in_file = 'foo.pdf' # replace with pdf name

# report data
date = '08/09/2021'

In [3]:
# set page containing table 3
page = 17

# ...and read
raw = read_pdf(in_file, pages=page, stream=True)
raw[0].head()

Unnamed: 0.1,Unnamed: 0,FASCIA DI,Unnamed: 1,VACCINATI CON,VACCINATI CON.1
0,GRUPPO,,NON VACCINATI,,
1,,ETÀ,,CICLO INCOMPLETO,CICLO COMPLETO
2,,12-39,"7.378.291 (42,3%)",3.129.839 (18%),"6.927.425 (39,7%)"
3,Popolazione,40-59,"4.990.358 (27,1%)","1.422.424 (7,7%)","12.034.678 (65,2%)"
4,(21/08/2021),60-79,"1.847.629 (13,6%)","552.988 (4,1%)","11.172.162 (82,3%)"


In [4]:
# keep the last and the third last column
columns_to_keep = [raw[0].columns[-3], raw[0].columns[-1]]
df = raw[0][columns_to_keep].replace('(\.|\(.*)','', regex=True).dropna().astype(np.int64)
df.columns = ['Non vaccinati', 'Immunizzati']
df

Unnamed: 0,Non vaccinati,Immunizzati
2,7378291,6927425
3,4990358,12034678
4,1847629,11172162
5,291252,4157813
6,64612,11719
8,29879,15993
9,8631,11201
10,1768,4862
11,1711,105
12,2459,258


In [5]:
# get data
results = [date]
for i in np.arange(0, 4*4+1, 4): #+1 to take into account the last index
    results.extend([df['Non vaccinati'][i:i+4].sum(), 
                    df['Immunizzati'][i:i+4].sum()])
results

['08/09/2021',
 14507530,
 34292078,
 104890,
 43775,
 6579,
 2118,
 712,
 157,
 571,
 280]

In [6]:
# read the original general data csv from apalladi's repo
# https://github.com/apalladi/covid_vaccini_monitoraggio/tree/main/dati
url_csv_0 = 'https://raw.githubusercontent.com/apalladi/covid_vaccini_monitoraggio/main/dati/dati_ISS_complessivi.csv'
df_0 = pd.read_csv(url_csv_0, sep=';')
df_0.head()

Unnamed: 0,data,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
0,08/09/2021,14507530,34292078,104890,43775,6579,2118,712,157,571,280
1,01/09/2021,15656647,32287644,107937,40380,5988,1788,641,127,383,187
2,25/8/2021,16708830,29628678,104405,33894,5162,1349,500,95,246,92
3,18/8/2021,17568325,26501452,95261,24978,4052,859,334,61,177,48
4,11/8/2021,20322716,22345659,80585,18887,3067,627,247,34,143,34


In [7]:
# add new row to the df
df_0.loc[-1] = results
df_0.index = df_0.index+1
df_0.sort_index(inplace=True)
df_0.head()

Unnamed: 0,data,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
0,08/09/2021,14507530,34292078,104890,43775,6579,2118,712,157,571,280
1,08/09/2021,14507530,34292078,104890,43775,6579,2118,712,157,571,280
2,01/09/2021,15656647,32287644,107937,40380,5988,1788,641,127,383,187
3,25/8/2021,16708830,29628678,104405,33894,5162,1349,500,95,246,92
4,18/8/2021,17568325,26501452,95261,24978,4052,859,334,61,177,48


In [8]:
# get data by age
results_ = {'12-39': [],
            '40-59': [],
            '60-79': [],
            '80+': []}

for i, key in enumerate(results_):
    for start_index in np.arange(i, i+4*4+1, 4): #+1 to take into account the last index
        results_[key].extend([df['Non vaccinati'][start_index:start_index+1].values[0], df['Immunizzati'][start_index:start_index+1].values[0]])

results_

{'12-39': [7378291, 6927425, 64612, 11719, 1711, 105, 63, 3, 7, 0],
 '40-59': [4990358, 12034678, 29879, 15993, 2459, 258, 275, 17, 85, 10],
 '60-79': [1847629, 11172162, 8631, 11201, 1771, 736, 331, 91, 257, 62],
 '80+': [291252, 4157813, 1768, 4862, 638, 1019, 43, 46, 222, 208]}

In [9]:
columns = ['non vaccinati', 
          'vaccinati completo',
          'casi non vaccinati',
          'casi vaccinati',
          'ospedalizzati non vaccinati',
          'ospedalizzati vaccinati',
          'terapia intensiva non vaccinati',
          'terapia intensiva vaccinati',
          'decessi non vaccinati',
          'decessi vaccinati']

# load dict as df
df_1 = pd.DataFrame(results_).T
df_1.columns = columns
df_1.index.rename('età', inplace=True)
df_1.head()

Unnamed: 0_level_0,non vaccinati,vaccinati completo,casi non vaccinati,casi vaccinati,ospedalizzati non vaccinati,ospedalizzati vaccinati,terapia intensiva non vaccinati,terapia intensiva vaccinati,decessi non vaccinati,decessi vaccinati
età,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12-39,7378291,6927425,64612,11719,1711,105,63,3,7,0
40-59,4990358,12034678,29879,15993,2459,258,275,17,85,10
60-79,1847629,11172162,8631,11201,1771,736,331,91,257,62
80+,291252,4157813,1768,4862,638,1019,43,46,222,208


In [10]:
# save to csv
out_date = date.replace('/', '-')
out_name = f'data_iss_età_{out_date}.csv'
df_1.to_csv(out_name, encoding='utf-8-sig')