# The whole dataset with Dask

Let us use the JupyterLab Dask extension to create a cluster and watch its metrics.

In [None]:
# Your cluster here: create or connect to your Dask cluster in this cell before running the cells below

In [None]:
import pandas as pd
import dask.dataframe as dd

In [None]:
# Load only the two columns used below, both as categoricals, from every
# file matching the glob pattern.
vote_columns = ["region", "vote"]
data = dd.read_csv(
    "data/votes_*.csv",
    usecols=vote_columns,
    dtype=dict.fromkeys(vote_columns, "category"),
)

In [None]:
# Display the object returned by dd.read_csv (no .compute() call in this cell).
data

In [None]:
# Shape of the Dask DataFrame. The row count (first element) still needs
# .compute(), as done in the next cell.
data.shape

In [None]:
# Trigger the computation of the row count (first element of the shape).
data.shape[0].compute()

# EVERY VOTE COUNTS

In [None]:
# Constant helper column: counting it per (region, vote) group gives the
# number of votes in that group.
data = data.assign(result=1)

In [None]:
# Group by region and vote, count the helper column in each group, then turn
# the group keys back into ordinary columns.
votes_by_region_and_choice = data.groupby(["region", "vote"])
count_per_region = (
    votes_by_region_and_choice["result"]
    .agg("count")
    .reset_index()
)

Notice that the code is the same as before. The only change is the `data` object, which is now a Dask DataFrame.

In [None]:
# Display the grouped result before .compute() is called on it (next cell).
count_per_region

In [None]:
# Run the aggregation and keep the computed result; get_winner below consumes
# it as a pandas DataFrame.
count_per_region_computed = count_per_region.compute()

In [None]:
# Peek at the first rows of the computed counts.
count_per_region_computed.head()

This is a __crucial point__: the data has now been aggregated enough to fit in the memory of a __single machine__.

In [None]:
def get_winner(count_per_region: pd.DataFrame,
               delegates_path: str = "data/region_delegates.csv") -> pd.Series:
    """Total the delegates won by each vote option.

    For every region, the vote option with the highest ``result`` count is
    taken as the winner (on a tie, ``idxmax`` keeps the first maximum it
    encounters). The winners are joined with the per-region delegate table
    and the delegates are summed per winner.

    Parameters
    ----------
    count_per_region : pd.DataFrame
        One row per (region, vote) pair with the columns ``region``,
        ``vote`` and ``result`` (the vote count).
    delegates_path : str, optional
        CSV file with at least the columns ``region`` and ``delegates``.
        Defaults to the path that used to be hard-coded in this function.

    Returns
    -------
    pd.Series
        Delegates per winner, indexed by winner, sorted in descending order.
    """
    # observed=True skips categories of a categorical ``region`` column that
    # have no rows; idxmax() raises on such an empty group.
    results = [
        {"region": region,
         "winner": votes.set_index("vote")["result"].idxmax()}
        for region, votes in count_per_region.groupby("region", observed=True)
    ]

    # Explicit columns keep the merge key present even when there are no rows.
    winner_per_region = pd.DataFrame(results, columns=["region", "winner"])
    delegates_per_region = pd.read_csv(delegates_path)
    winner_region_delegates = pd.merge(winner_per_region, delegates_per_region, on="region")
    return (winner_region_delegates
            .groupby("winner")
            .delegates.sum()
            .sort_values(ascending=False))

In [None]:
# Delegates summed per winner, largest total first.
get_winner(count_per_region_computed)