In [None]:
import pandas as pd

# Find out what may be parallelized by looking at a single file

The dataset has files named `votes_{k}.csv` with `k` starting at 0 and going up to 60.

Let us read a single file. We will use it to extract schema information.

In [None]:
# First pass: load one shard with pandas' default type inference so the
# columns and their inferred dtypes can be inspected in the next cells.
small_data = pd.read_csv("data/votes_0.csv")

In [None]:
# Print the frame summary (columns, dtypes, non-null counts, memory usage).
small_data.info()

In [None]:
# Preview the first rows to see what the raw values look like.
small_data.head()

In [None]:
# Summary statistics for the columns pandas selects by default.
small_data.describe()

# Reload data

After we've learned a bit, we can be smart about how we load data.   

In [None]:
# Second pass over the same shard, this time with explicit types:
# ``timestamp`` is parsed as datetimes and the two label columns
# (``region`` and ``vote``) are read as categoricals.
small_data = pd.read_csv(
    "data/votes_0.csv",
    dtype=dict.fromkeys(("region", "vote"), "category"),
    parse_dates=["timestamp"],
)

In [None]:
# Re-check the summary to see the dtypes and memory usage after the typed
# reload.
small_data.info()

# Let's count the votes

At least for this small file; working out what needs to be done on a small sample will be useful later, when we work at scale.

- Figure out the number of votes per candidate per region

In [None]:
# Constant helper column: every row is one vote. The aggregation in the
# next cell counts this column per group.
small_data["result"] = 1

In [None]:
# Tally votes for every (region, candidate) pair, returned as a flat frame
# with columns ``region``, ``vote`` and ``result`` (the count).
#
# Both grouping keys are loaded as categoricals, so ``observed`` is passed
# explicitly: recent pandas versions warn when it is left implicit and are
# changing the default. ``observed=False`` keeps a row (with count 0) for
# category combinations that have no votes.
count_per_region = (
    small_data
    .groupby(["region", "vote"], observed=False)
    .result.count()
    .reset_index()
)

In [None]:
# Preview the per-region, per-candidate counts.
count_per_region.head()

- Figure out the candidate who won in each region

In [None]:
# Pick the winner of each region: the candidate with the highest count.
#
# ``observed=True`` is passed explicitly because ``region`` may be a
# categorical column here: leaving it implicit is deprecated in recent
# pandas, and only regions that actually have rows should reach ``idxmax``.
results = [
    {
        "region": region,
        # Index by candidate so idxmax returns the candidate label; on a
        # tie the first row holding the maximum wins.
        "winner": df.set_index("vote").result.idxmax(),
    }
    for region, df in count_per_region.groupby("region", observed=True)
]

# One row per region with columns ``region`` and ``winner``.
winner_per_region = pd.DataFrame(results)

In [None]:
# Preview the winning candidate for the first few regions.
winner_per_region.head()

- After putting together the list of candidates who won in each region, find out the delegates per candidate per region

In [None]:
# Load the region/delegates lookup table. The cells below join it on
# ``region`` and sum its ``delegates`` column.
delegates_per_region = pd.read_csv("data/region_delegates.csv")

In [None]:
# Preview the delegates lookup table.
delegates_per_region.head()

In [None]:
# Attach the delegates table to the per-region winners. The join key is
# ``region`` and the join type is inner (pandas' default, spelled out here),
# so only regions present in both frames are kept.
winner_region_delegates = winner_per_region.merge(
    delegates_per_region,
    how="inner",
    on="region",
)

In [None]:
# Preview the joined winners/delegates frame.
winner_region_delegates.head()

- Aggregate in order to find the total number of delegates. The candidate with the most delegates wins.

In [None]:
# Total delegates collected by each winning candidate, largest first; the
# top row is the overall winner.
(
    winner_region_delegates
    .groupby("winner")["delegates"]
    .sum()
    .sort_values(ascending=False)
)