### Initialize the Dask Dashboard Client

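Sets up a local cluster of 4 worker processes (2 threads each, with an 8 GB memory limit per worker) that handles the computations. Displaying the client shows a summary of the cluster and a link to the dashboard, where the details of each computation can be followed.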

In [1]:
from dask.distributed import Client, progress
# Start a local cluster: 4 worker processes with 2 threads each, and every
# worker capped at 8GB of memory.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='8GB',
)
client  # last expression of the cell: renders the cluster summary

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  8,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:45527,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  8
Started:  Just now,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:44204,Total threads: 2
Dashboard: http://127.0.0.1:37626/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:35645,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx

0,1
Comm: tcp://127.0.0.1:46793,Total threads: 2
Dashboard: http://127.0.0.1:35027/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34677,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1

0,1
Comm: tcp://127.0.0.1:34412,Total threads: 2
Dashboard: http://127.0.0.1:39527/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34295,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl

0,1
Comm: tcp://127.0.0.1:33427,Total threads: 2
Dashboard: http://127.0.0.1:38505/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:37557,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut


### Reading multiple sources of data into multiple dataframes

Reads three CSV files, each into its own Dask dataframe, using the `dd.read_csv()` function.

In [3]:
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# One dataframe per daily COVID-19 case-count file (by country), starting with
# the 01/01/2021 report. 'Active' is forced to float64 so the CSVs are read
# properly.
# NOTE(review): presumably needed because 'Active' has missing values - confirm.
df, df2, df3 = (
    dd.read_csv(csv_path, dtype={'Active': 'float64'})
    for csv_path in ('data/010121.csv', 'data/020121.csv', 'data/030121.csv')
)

### Beginning of the Dataframes

The head() function displays the beginning of the dataframe.

In [4]:
# Preview the first five rows of the dataframe read from data/010121.csv.
df.head(n=5)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,Afghanistan,02/01/2021 5:22,33.93911,67.709953,51526,2191,41727,0.0,Afghanistan,0.0,4.252222
1,,Albania,02/01/2021 5:22,41.1533,20.1683,58316,1181,33634,23501.0,Albania,2026.409062,2.025173
2,,Algeria,02/01/2021 5:22,28.0339,1.6596,99897,2762,67395,29740.0,Algeria,227.809861,2.764848
3,,Andorra,02/01/2021 5:22,42.5063,1.5218,8117,84,7463,570.0,Andorra,10505.40348,1.034865
4,,Angola,02/01/2021 5:22,-11.2027,17.8739,17568,405,11146,6017.0,Angola,53.452981,2.305328


In [5]:
# Preview the first five rows of the dataframe read from data/020121.csv.
df2.head(n=5)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,Afghanistan,02/02/2021 5:22,33.93911,67.709953,55059,2404,47723,4932.0,Afghanistan,141.436801,4.366225
1,,Albania,02/02/2021 5:22,41.1533,20.1683,78992,1393,47922,29677.0,Albania,2744.874557,1.76347
2,,Algeria,02/02/2021 5:22,28.0339,1.6596,107578,2894,73530,31154.0,Algeria,245.325978,2.690141
3,,Andorra,02/02/2021 5:22,42.5063,1.5218,9972,101,9206,665.0,Andorra,12906.2318,1.012836
4,,Angola,02/02/2021 5:22,-11.2027,17.8739,19829,466,18180,1183.0,Angola,60.332375,2.350093


In [6]:
# Preview the first five rows of the dataframe read from data/030121.csv.
df3.head(n=5)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,Afghanistan,02/03/2021 5:23,33.93911,67.709953,55733,2444,49344,3945.0,Afghanistan,143.168187,4.385194
1,,Albania,02/03/2021 5:23,41.1533,20.1683,107931,1816,70413,35702.0,Albania,3750.469108,1.682556
2,,Algeria,02/03/2021 5:23,28.0339,1.6596,113255,2987,78234,32034.0,Algeria,258.272078,2.637411
3,,Andorra,02/03/2021 5:23,42.5063,1.5218,10889,110,10475,304.0,Andorra,14093.05636,1.010194
4,,Angola,02/03/2021 5:23,-11.2027,17.8739,20854,508,19400,946.0,Angola,63.451074,2.435984


In [7]:
%%time

# Calculating the mean
print("Mean of confirmed global COVID-19 cases reported 01/01/2021:  " + str(df.Confirmed.mean().compute()))
print("Mean of reported global COVID-19 deaths reported 02/01/2021:  " + str(df2.Deaths.mean().compute()))
print("Mean of confirmed active global COVID-19 cases reported 03/01/2021:  " + str(df3.Active.mean().compute()))

Mean of confirmed global COVID-19 cases reported 01/01/2021:  21119.139307228917
Mean of reported global COVID-19 deaths reported 02/01/2021:  579.2197140707299
Mean of confirmed active global COVID-19 cases reported 03/01/2021:  11862.095859473024
CPU times: user 60.5 ms, sys: 4.96 ms, total: 65.4 ms
Wall time: 187 ms


In [8]:
# Re-display the client summary (cluster status, workers, dashboard link)
# after the mean computations.
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  8,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:45527,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  8
Started:  4 minutes ago,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:44204,Total threads: 2
Dashboard: http://127.0.0.1:37626/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:35645,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx

0,1
Comm: tcp://127.0.0.1:46793,Total threads: 2
Dashboard: http://127.0.0.1:35027/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34677,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1

0,1
Comm: tcp://127.0.0.1:34412,Total threads: 2
Dashboard: http://127.0.0.1:39527/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34295,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl

0,1
Comm: tcp://127.0.0.1:33427,Total threads: 2
Dashboard: http://127.0.0.1:38505/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:37557,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut


### Merging the data

Use the `merge()` function to combine all of the dataframes into one. As shown below, several dataframes can be merged in a single expression by chaining `merge()` calls with a period, as many times as necessary.

In [1]:
%%time

result = df.merge(df2).merge(df3)

NameError: name 'df' is not defined

In [27]:
# Re-display the client summary (cluster status, workers, dashboard link)
# after the merge step.
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  8,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:45527,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  8
Started:  1 hour ago,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:44204,Total threads: 2
Dashboard: http://127.0.0.1:37626/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:35645,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx

0,1
Comm: tcp://127.0.0.1:46793,Total threads: 2
Dashboard: http://127.0.0.1:35027/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34677,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1

0,1
Comm: tcp://127.0.0.1:34412,Total threads: 2
Dashboard: http://127.0.0.1:39527/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34295,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl

0,1
Comm: tcp://127.0.0.1:33427,Total threads: 2
Dashboard: http://127.0.0.1:38505/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:37557,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut


In [19]:
# Preview the first five rows of the merged dataframe.
result.head(n=5)

Unnamed: 0,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,Diamond Princess,Canada,21/12/2020 13:27,,,0,1,0,,"Diamond Princess, Canada",,
1,Grand Princess,Canada,21/12/2020 13:27,,,13,0,13,0.0,"Grand Princess, Canada",,0.0
2,Alabama,US,21/12/2020 13:27,,,0,0,0,0.0,"Out of AL, Alabama, US",,
3,Alabama,US,21/12/2020 13:27,,,0,0,0,0.0,"Unassigned, Alabama, US",,
4,Diamond Princess,US,04/08/2020 2:27,,,49,0,0,49.0,"Diamond Princess, US",,0.0


The `sum()` function calculates the total of each numeric column, and the rows are grouped by the columns passed to the `groupby()` function. The resulting dataframe has one row per group.

In [26]:
%%time
result.groupby(['Province_State', 'Country_Region']).sum().reset_index().compute()

CPU times: user 44.5 ms, sys: 119 µs, total: 44.7 ms
Wall time: 122 ms


Unnamed: 0,Province_State,Country_Region,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio
0,Alabama,US,0.0,0.0,0,0,0,0.0,0.0,0.0
1,Diamond Princess,Canada,0.0,0.0,0,1,0,0.0,0.0,0.0
2,Diamond Princess,US,0.0,0.0,49,0,0,49.0,0.0,0.0
3,Grand Princess,Canada,0.0,0.0,13,0,13,0.0,0.0,0.0
4,Grand Princess,US,0.0,0.0,103,3,0,100.0,0.0,2.912621
5,Hawaii,US,0.0,0.0,0,0,0,0.0,0.0,0.0
6,Maine,US,0.0,0.0,0,0,0,0.0,0.0,0.0
7,Montana,US,0.0,0.0,0,0,0,0.0,0.0,0.0
8,Virginia,US,0.0,0.0,0,0,0,0.0,0.0,0.0


In [28]:
# Re-display the client summary (cluster status, workers, dashboard link)
# after the grouped aggregation.
client

0,1
Connection method: Cluster object,Cluster type: LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads:  8,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:45527,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads:  8
Started:  1 hour ago,Total memory:  29.80 GiB

0,1
Comm: tcp://127.0.0.1:44204,Total threads: 2
Dashboard: http://127.0.0.1:37626/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:35645,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-2ri9v6yx

0,1
Comm: tcp://127.0.0.1:46793,Total threads: 2
Dashboard: http://127.0.0.1:35027/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34677,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-p0uc2oh1

0,1
Comm: tcp://127.0.0.1:34412,Total threads: 2
Dashboard: http://127.0.0.1:39527/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34295,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-poic2svl

0,1
Comm: tcp://127.0.0.1:33427,Total threads: 2
Dashboard: http://127.0.0.1:38505/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:37557,
Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut,Local directory: /home/asd/stha/pyslurm_decorators/test/dask-worker-space/worker-z6fdhmut
