**Query** <br>
Per ogni regione, per ogni settimana, individuare il minimo, il massimo e la media aritmetica di nuovi positivi che si sono registrati in quella settiimana.

**Dataset** <br>
Il dataset è organizzato secondo il seguente schema: <br>
data | stato| codice_regione | denominazione_regione | lat | long | ricoverati_con_sintomi | terapia_intensiva | totale_ospedalizzati | isolamento_domiciliare | totale_positivi | variazione_totale_positivi | nuovi_positivi | dimessi_guariti | deceduti | totale_casi | tamponi | casi_testati | note_it | note_en

In [1]:
%%capture
!pip install pyspark

In [2]:
from datetime import datetime
from pyspark import SparkContext

In [3]:
# DOWNLOAD FILE CSV
!wget https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv

--2021-01-18 10:19:38--  https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 974734 (952K) [text/plain]
Saving to: ‘dpc-covid19-ita-regioni.csv’


2021-01-18 10:19:38 (28.0 MB/s) - ‘dpc-covid19-ita-regioni.csv’ saved [974734/974734]



In [4]:
sc = SparkContext("local[2]", "covid2")

In [5]:
# IMPORTO DATASET RIMUOVENDO L'HEADER
rawData = sc.textFile("dpc-covid19-ita-regioni.csv")
header = rawData.first()
rawData = rawData.filter(lambda line: line != header)

In [6]:
rawData.take(5)

['2020-02-24T18:00:00,ITA,13,Abruzzo,42.35122196,13.39843823,0,0,0,0,0,0,0,0,0,,,0,5,,,,,',
 '2020-02-24T18:00:00,ITA,17,Basilicata,40.63947052,15.80514834,0,0,0,0,0,0,0,0,0,,,0,0,,,,,',
 '2020-02-24T18:00:00,ITA,18,Calabria,38.90597598,16.59440194,0,0,0,0,0,0,0,0,0,,,0,1,,,,,',
 '2020-02-24T18:00:00,ITA,15,Campania,40.83956555,14.25084984,0,0,0,0,0,0,0,0,0,,,0,10,,,,,',
 '2020-02-24T18:00:00,ITA,08,Emilia-Romagna,44.49436681,11.341720800000001,10,2,12,6,18,0,18,0,0,,,18,148,,,,,']

In [7]:
data = rawData.map(lambda line:line.split(',')).map(lambda x : ((str(x[3]),datetime.strptime(x[0],'%Y-%m-%dT%H:%M:%S').isocalendar()[1]), int(x[12])))

In [8]:
data.take(100)

[(('Abruzzo', 9), 0),
 (('Basilicata', 9), 0),
 (('Calabria', 9), 0),
 (('Campania', 9), 0),
 (('Emilia-Romagna', 9), 18),
 (('Friuli Venezia Giulia', 9), 0),
 (('Lazio', 9), 2),
 (('Liguria', 9), 0),
 (('Lombardia', 9), 166),
 (('Marche', 9), 0),
 (('Molise', 9), 0),
 (('P.A. Bolzano', 9), 0),
 (('P.A. Trento', 9), 0),
 (('Piemonte', 9), 3),
 (('Puglia', 9), 0),
 (('Sardegna', 9), 0),
 (('Sicilia', 9), 0),
 (('Toscana', 9), 0),
 (('Umbria', 9), 0),
 (("Valle d'Aosta", 9), 0),
 (('Veneto', 9), 32),
 (('Abruzzo', 9), 0),
 (('Basilicata', 9), 0),
 (('Calabria', 9), 0),
 (('Campania', 9), 0),
 (('Emilia-Romagna', 9), 8),
 (('Friuli Venezia Giulia', 9), 0),
 (('Lazio', 9), 0),
 (('Liguria', 9), 1),
 (('Lombardia', 9), 68),
 (('Marche', 9), 0),
 (('Molise', 9), 0),
 (('P.A. Bolzano', 9), 1),
 (('P.A. Trento', 9), 0),
 (('Piemonte', 9), 0),
 (('Puglia', 9), 0),
 (('Sardegna', 9), 0),
 (('Sicilia', 9), 3),
 (('Toscana', 9), 2),
 (('Umbria', 9), 0),
 (("Valle d'Aosta", 9), 0),
 (('Veneto', 9),

In [9]:
query = data.groupByKey().mapValues(lambda x: (round(sum(x)/len(x),1),min(x),max(x))).sortByKey()

In [10]:
query.take(200)

[(('Abruzzo', 1), (257.6, 121, 400)),
 (('Abruzzo', 2), (223.3, 120, 314)),
 (('Abruzzo', 9), (0.7, 0, 3)),
 (('Abruzzo', 10), (1.7, 0, 6)),
 (('Abruzzo', 11), (17.1, 0, 46)),
 (('Abruzzo', 12), (64.3, 34, 122)),
 (('Abruzzo', 13), (100.9, 26, 160)),
 (('Abruzzo', 14), (58.6, 35, 75)),
 (('Abruzzo', 15), (65.3, 18, 106)),
 (('Abruzzo', 16), (51.6, 29, 97)),
 (('Abruzzo', 17), (48.3, 18, 91)),
 (('Abruzzo', 18), (19.6, 7, 32)),
 (('Abruzzo', 19), (15.3, 4, 25)),
 (('Abruzzo', 20), (11.9, 4, 30)),
 (('Abruzzo', 21), (5.7, 1, 8)),
 (('Abruzzo', 22), (2.6, 0, 7)),
 (('Abruzzo', 23), (3.0, 1, 7)),
 (('Abruzzo', 24), (2.0, 0, 4)),
 (('Abruzzo', 25), (1.0, 0, 2)),
 (('Abruzzo', 26), (0.7, 0, 1)),
 (('Abruzzo', 27), (3.3, 0, 7)),
 (('Abruzzo', 28), (2.7, 0, 8)),
 (('Abruzzo', 29), (1.1, 0, 3)),
 (('Abruzzo', 30), (3.7, 0, 10)),
 (('Abruzzo', 31), (4.3, 1, 9)),
 (('Abruzzo', 32), (16.1, 3, 39)),
 (('Abruzzo', 33), (9.0, 2, 16)),
 (('Abruzzo', 34), (12.7, 4, 24)),
 (('Abruzzo', 35), (18.0, 4, 34

In [11]:
query.coalesce(1).saveAsTextFile("output_covid2")
sc.stop()

In [12]:
!ls

dpc-covid19-ita-regioni.csv  output_covid2  sample_data


In [13]:
!cat output_covid2/part-00000

(('Abruzzo', 1), (257.6, 121, 400))
(('Abruzzo', 2), (223.3, 120, 314))
(('Abruzzo', 9), (0.7, 0, 3))
(('Abruzzo', 10), (1.7, 0, 6))
(('Abruzzo', 11), (17.1, 0, 46))
(('Abruzzo', 12), (64.3, 34, 122))
(('Abruzzo', 13), (100.9, 26, 160))
(('Abruzzo', 14), (58.6, 35, 75))
(('Abruzzo', 15), (65.3, 18, 106))
(('Abruzzo', 16), (51.6, 29, 97))
(('Abruzzo', 17), (48.3, 18, 91))
(('Abruzzo', 18), (19.6, 7, 32))
(('Abruzzo', 19), (15.3, 4, 25))
(('Abruzzo', 20), (11.9, 4, 30))
(('Abruzzo', 21), (5.7, 1, 8))
(('Abruzzo', 22), (2.6, 0, 7))
(('Abruzzo', 23), (3.0, 1, 7))
(('Abruzzo', 24), (2.0, 0, 4))
(('Abruzzo', 25), (1.0, 0, 2))
(('Abruzzo', 26), (0.7, 0, 1))
(('Abruzzo', 27), (3.3, 0, 7))
(('Abruzzo', 28), (2.7, 0, 8))
(('Abruzzo', 29), (1.1, 0, 3))
(('Abruzzo', 30), (3.7, 0, 10))
(('Abruzzo', 31), (4.3, 1, 9))
(('Abruzzo', 32), (16.1, 3, 39))
(('Abruzzo', 33), (9.0, 2, 16))
(('Abruzzo', 34), (12.7, 4, 24))
(('Abruzzo', 35), (18.0, 4, 34))
(('Abruzzo', 36), (17.3, 4, 26))
(('Abruzzo', 37), (18

In [14]:
!rm -r output_covid2