# The whole dataset with Dask

Let us use the JupyterLab Dask extension to create a cluster and watch its metrics.

In [29]:
from dask.distributed import Client

# Connect to the scheduler of an already-running Dask cluster.
# NOTE(review): the scheduler address is hard-coded; presumably it has to be
# updated to match whatever cluster is currently running -- confirm before re-running.
client = Client("tcp://127.0.0.1:33405")
# Bare expression so the notebook renders the client summary
# (scheduler address, dashboard link, workers, cores, memory).
client

0,1
Client  Scheduler: tcp://127.0.0.1:33405  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 16.56 GB


In [30]:
import dask.dataframe as dd

In [36]:
# Lazily read every vote file matching the glob, keeping only the two columns
# needed for the tally and storing both as categoricals.
data = dd.read_csv(
    "data/votes_*.csv",
    usecols=["region", "vote"],
    dtype={"region": "category", "vote": "category"},
)

In [37]:
# Display the lazy Dask DataFrame: only the column names, dtypes and the
# number of partitions are shown, not the rows themselves.
data

Unnamed: 0_level_0,region,vote
npartitions=216,Unnamed: 1_level_1,Unnamed: 2_level_1
,category[unknown],category[unknown]
,...,...
...,...,...
,...,...
,...,...


In [38]:
# Total number of rows across all partitions; `.compute()` triggers the
# actual evaluation of the lazy row count.
data.shape[0].compute()

144000000

In [39]:
# Constant helper column: counting it per (region, vote) group later yields
# the number of votes in that group.
data["result"] = 1

In [40]:
# Display the structure again to confirm the new integer `result` column.
data

Unnamed: 0_level_0,region,vote,result
npartitions=216,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,category[unknown],category[unknown],int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [41]:
# Number of votes for every (region, vote) pair, with the group keys turned
# back into regular columns. This only describes the computation; nothing is
# evaluated until `.compute()` is called.
count_per_region = data.groupby(["region", "vote"])["result"] \
    .agg("count") \
    .reset_index()

In [42]:
# Still lazy: the display shows the structure of the aggregated result
# (columns, dtypes, partitions), not the counts themselves.
count_per_region

Unnamed: 0_level_0,region,vote,result
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,category[unknown],category[unknown],int64
,...,...,...


In [43]:
# Evaluate the lazy aggregation and keep the materialized result under a
# separate name so the lazy definition stays available.
count_per_region_computed = count_per_region.compute()

In [44]:
# Peek at the first rows of the materialized counts.
count_per_region_computed.head()

Unnamed: 0,region,vote,result
0,ai,blue,146208
1,ai,red,128472
2,ai,yellow,2664
3,au,blue,1341408
4,au,red,1614480


In [51]:
def get_winner(count_per_region: pd.DataFrame,
               delegates_per_region: "pd.DataFrame | None" = None) -> pd.Series:
    """Compute the total number of delegates won by each candidate.

    Within every region the vote option with the highest count wins, and the
    winner is credited with all of that region's delegates. When several
    options tie for the highest count, the one listed first for that region in
    ``count_per_region`` wins (``idxmax`` semantics). Regions that do not
    appear in the delegates table are dropped by the inner merge.

    Parameters
    ----------
    count_per_region : pd.DataFrame
        One row per (region, vote) pair with the columns ``region``, ``vote``
        and ``result`` (the number of votes for that pair).
    delegates_per_region : pd.DataFrame, optional
        Table with the columns ``region`` and ``delegates``. When omitted it
        is read from ``data/region_delegates.csv``, as before.

    Returns
    -------
    pd.Series
        Delegates summed per winner, indexed by ``winner``, named
        ``delegates`` and sorted in descending order.
    """
    if delegates_per_region is None:
        delegates_per_region = pd.read_csv("data/region_delegates.csv")

    # observed=True: with a categorical ``region`` column, unused categories
    # would otherwise produce empty groups on which ``idxmax`` raises.
    winner_per_region = pd.DataFrame(
        [
            {"region": region,
             "winner": region_counts.set_index("vote")["result"].idxmax()}
            for region, region_counts
            in count_per_region.groupby("region", observed=True)
        ],
        # Explicit columns keep the merge below working for empty input.
        columns=["region", "winner"],
    )

    winner_region_delegates = pd.merge(winner_per_region, delegates_per_region, on="region")
    return (winner_region_delegates
            .groupby("winner")
            .delegates.sum()
            .sort_values(ascending=False))

In [52]:
# Total delegates per winning vote option, largest first.
get_winner(count_per_region_computed)

winner
blue    297
red     241
Name: delegates, dtype: int64