# Pandas Advanced Operations

We're going to cover groupby, aggregate, transform, and apply.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy data set: one row per individual, listed family by family.
# Each record is (family, name, weight in pounds).
simple_example = pd.DataFrame(
    [
        ("Tang", "Kendrick", 130),
        ("Tang", "Terence", 150),
        ("Not Tang", "Tonia", 135),
        ("Not Tang", "Lawrence", 175),
        ("Not Tang", "Ying", 125),
        ("Carroll", "Alice", 100),
        ("Carroll", "The Frumious Bandersnatch", 5000),
        ("Carroll", "The Jabberwock", 99999),
    ],
    columns=["family", "name", "weight"],
)
simple_example

## A weird question
How much does each family weigh?

Solution using what we already know:

In [None]:
# Brute force: visit every distinct family, pull out its rows with a
# boolean mask, and add up their weights.
families = simple_example["family"].unique()
for family in families:
    subset = simple_example[simple_example["family"] == family]
    summed_weight = subset["weight"].sum()
    print(
        "The {family} Family weighs {weight} pounds. WOW!".format(
            weight=summed_weight,
            family=family,
        )
    )

### O frabjous day! Callooh! Callay!

How can we do this in a more effective way using the great
tools Pandas provides?

Answer: **groupby**

Solution using pandas **groupby**

In [None]:
# Iterating over a groupby yields (key, sub-frame) pairs: the key is the
# family name and the sub-frame holds that family's rows. Name the loop
# variables accordingly so they match the next cell.
for family, group in simple_example.groupby("family"):
    print(family)
    print(group)

In [None]:
# Each sub-frame produced by groupby already contains exactly one family's
# rows, so the manual filtering step disappears.
for family, group in simple_example.groupby("family"):
    summed_weight = group["weight"].sum()
    print(
        "The {family} Family weighs {weight} pounds. WOW!".format(
            weight=summed_weight,
            family=family,
        )
    )

Using groupby in an even better way:

In [None]:
# Let pandas do the summing: the result is a Series of total weights
# indexed by family name, which we walk as (label, value) pairs.
weights = simple_example.groupby("family")["weight"].sum()
for family, summed_weight in weights.items():
    print(
        "The {family} Family weighs {weight} pounds. WOW!".format(
            weight=summed_weight,
            family=family,
        )
    )

## A more realistic question:
In 2010, how many male deaths were there in countries larger than the United States (in population)?

### A solution without groupby
I'm going to go through this really fast, so please bear with me. If you want me to go slower, just yell at me.

In [None]:
def male_deaths_in_big_locations():
    """
    For 2010, return a dictionary mapping location_id to total
    number of male deaths for locations with populations greater than
    the population of the united states.

    Populations are compared using the 2010 male rows only, summed per
    location.

    Things you need to know:
    * males is sex_id 1.
    * united states is location_id 102.
    * death numbers = mort_rate * population

    The data set is provided within the function.

    Return:
        dict: location_id -> summed mort_count (mort_rate * population)
        for that location. The united states itself is excluded because
        the comparison is strictly "greater than".

    """
    data = pd.read_csv("../data.csv")

    # .copy() gives an independent frame, so adding the mort_count column
    # below does not write into a slice of `data` (which would raise a
    # SettingWithCopyWarning).
    male_data_2010 = data.query(
        "year_id == 2010 and sex_id == 1").copy()
    male_data_2010["mort_count"] = (
        male_data_2010["mort_rate"]
        * male_data_2010["population"]
        )
    us_pop = get_pop(male_data_2010, 102)

    # Deliberately one lookup per location: this is the "without groupby"
    # version of the solution.
    result = {}
    for location_id in male_data_2010["location_id"].unique():
        if get_pop(male_data_2010, location_id) > us_pop:
            result[location_id] = get_mort_count(
                male_data_2010, location_id)
    return result
    
    
def get_pop(male_data_2010, location_id):
    """Return the summed ``population`` of the rows for one location.

    Args:
        male_data_2010: DataFrame with at least ``location_id`` and
            ``population`` columns.
        location_id: value matched against the ``location_id`` column.

    Return:
        Sum of ``population`` over the matching rows (0 if none match).
    """
    rows = male_data_2010.query("location_id == @location_id")
    # Pick the column before reducing: only "population" is summed, instead
    # of summing every column of the frame and throwing the rest away.
    return rows["population"].sum()

    
def get_mort_count(male_data_2010, location_id):
    """Return the summed ``mort_count`` of the rows for one location.

    Args:
        male_data_2010: DataFrame with at least ``location_id`` and
            ``mort_count`` columns.
        location_id: value matched against the ``location_id`` column.

    Return:
        Sum of ``mort_count`` over the matching rows (0 if none match).
    """
    rows = male_data_2010.query("location_id == @location_id")
    # Pick the column before reducing: only "mort_count" is summed, instead
    # of summing every column of the frame and throwing the rest away.
    return rows["mort_count"].sum()

In [None]:
def test_male_deaths_in_big_locations():
    """Check male_deaths_in_big_locations() against the expected totals."""
    res = male_deaths_in_big_locations()

    assert len(res) == 2, "not the right size."
    # The totals are sums of floats, so their last digits depend on the
    # order of summation. Compare with a tight relative tolerance instead of
    # bit-for-bit equality.
    assert np.isclose(res[6], 5484711.4546094909, rtol=1e-9, atol=0)
    assert np.isclose(res[163], 5231218.5870856401, rtol=1e-9, atol=0)
    assert 102 not in res, "I said bigger than america, not bigger than or equal to."

test_male_deaths_in_big_locations()

### A solution with groupby
I want to go through this really slowly, so if I'm going too fast please tell me.

In [None]:
data = pd.read_csv("../data.csv")
# death numbers = mort_rate * population, computed once for every row.
data["mort_count"] = (
        data["mort_rate"] * data["population"])

# The filtered rows inherit mort_count from `data`, so it is not recomputed
# here. Assigning a column into the result of query() would only repeat the
# same arithmetic and trigger a SettingWithCopyWarning.
male_data_2010 = data.query(
    "year_id == 2010 and sex_id == 1")
male_data_2010.head()

In [None]:
# Per-location totals of population and death counts for the 2010 male rows.
pops_and_mort_count = (
    male_data_2010
    .groupby("location_id")[["population", "mort_count"]]
    .sum()
)
pops_and_mort_count.head()

In [None]:
# Row label 102 is the United States; read its summed population directly.
us_pop = pops_and_mort_count.loc[102, "population"]

In [None]:
# Keep only the locations whose summed population is strictly greater than
# us_pop; "@us_pop" refers to the Python variable of that name.
big_mort_counts = pops_and_mort_count.query("population > @us_pop")

In [None]:
# Turn the mort_count column into a plain dict keyed by the frame's index.
big_mort_counts["mort_count"].to_dict()

In [None]:
def another_male_deaths_in_big_locations():
    """
    For 2010, return a dictionary mapping location_id to total
    number of male deaths for locations with populations greater than
    the population of the united states.

    Populations are compared using the 2010 male rows only, summed per
    location.

    Things you need to know:
    * males is sex_id 1.
    * united states is location_id 102.
    * death numbers = mort_rate * population

    The data set is provided within the function.

    Return:
        dict: location_id -> summed mort_count (mort_rate * population)
        for that location. The united states itself is excluded because
        the comparison is strictly "greater than".

    """
    data = pd.read_csv("../data.csv")

    # assign() returns a new frame, so the filtered result of query() is
    # never written to in place (no SettingWithCopyWarning).
    male_data_2010 = data.query(
        "year_id == 2010 and sex_id == 1"
    ).assign(
        mort_count=lambda frame: frame["mort_rate"] * frame["population"]
    )

    # One row per location with its total population and total deaths.
    pops_and_mort_count = male_data_2010.groupby(
            "location_id"
        )[
            ["population", "mort_count"]
        ].sum()
    us_pop = pops_and_mort_count.loc[102, "population"]
    big_mort_counts = pops_and_mort_count.query("population > @us_pop")
    return big_mort_counts["mort_count"].to_dict()

In [None]:
def test_another_male_deaths_in_big_locations():
    """Check another_male_deaths_in_big_locations() against the expected totals."""
    res = another_male_deaths_in_big_locations()

    assert len(res) == 2, "not the right size."
    # The totals are sums of floats, so their last digits depend on the
    # order of summation. Compare with a tight relative tolerance instead of
    # bit-for-bit equality.
    assert np.isclose(res[6], 5484711.4546094909, rtol=1e-9, atol=0)
    assert np.isclose(res[163], 5231218.5870856401, rtol=1e-9, atol=0)
    assert 102 not in res, "I said bigger than america, not bigger than or equal to."

test_another_male_deaths_in_big_locations()

## More group by

**question**: what does the result represent?

In [None]:
# Group on three keys at once and preview the first few rows of the result.
(
    data
    .groupby(["sex_id", "location_id", "year_id"])[["mort_count"]]
    .sum()
    .head()
)

**question**
For each location and year, compute the average number of deaths per sex.

In [None]:
# Step 1: average mort_count over the rows of each (year, location,
#         age group) cell -- presumably one row per sex; verify against
#         the data.
# Step 2: add those averages up within each (year, location).
# mort_count is selected before aggregating so that only the displayed
# column is averaged and summed, rather than every column of `data`.
data.groupby(
        ["year_id", "location_id", "age_group_id"]
    )[
        ["mort_count"]
    ].mean(
    ).groupby(
        ["year_id", "location_id"]
    ).sum(
    ).head()

## Even more groupby
groupby can be used in more flexible ways, too.

### list
Pass in a list like [1,1,0,0,1,0,0,0,1] to group 
a dataframe into two groups (a 0 group and a 1 group).

### function
Pass in a partitioning function: it is called on each index label,
and the rows are grouped by the value ``function(label)``.

### probably a few other ways
Read the docs. Here's a good place to start https://pandas.pydata.org/pandas-docs/stable/groupby.html

# Aggregation
Another way to do things like `data.sum` or `data.mean`.

In [None]:
# Same result as data.sum(), requested by name.
data.agg("sum")

In [None]:
# Several aggregations at once: pass their names as a list.
data.agg(["sum", "mean"])

# Transform
Transform allows you to apply a 1-to-1 function
on all of the values in a dataframe.

In [None]:
# Apply np.sqrt to the values of data and preview the first rows.
data.transform(np.sqrt).head()

In [None]:
def make_negative(x):
    """Return the negation of ``x``."""
    return -x

# A user-defined function can be handed to transform as well.
data.transform(make_negative).head()

# Apply
Apply is like agg, but with functions you can pass in,
as opposed to strings.

This has a weird side effect, so you should avoid this
if you can.

**side effect**: In the current implementation apply calls func twice on the first column/row to decide whether it can take a fast or slow code path. This can lead to unexpected behavior if func has side-effects, as they will take effect twice for the first column/row.

In [None]:
# Same call shape as the transform example above, but through apply.
data.apply(np.sqrt).head()  # it knows this is a 1-to-1 function

In [None]:
# axis=0: np.sum is handed one column at a time.
data.apply(np.sum, axis=0)  # it knows this is a "reduce" operation.

In [None]:
# This is going to take a long, long time.
# Does anyone know why? (Hint: compare axis=1 here with axis=0 above.)

data.apply(np.sum, axis=1).head()  # it knows this is a "reduce" operation.

# SECRETS

In [None]:
# population formatting
# `index` has to be defined at cell level: the only other definitions in
# this notebook are locals inside big_aggregate / test_big_aggregate, so a
# fresh kernel would raise NameError here. The same key columns are used so
# that 07_pop.csv can be re-indexed by big_aggregate.
index = ["location_id", "age_group_id", "sex_id", "year_id"]
data = pd.read_csv("../data.csv")
pop_data = data[index + ["population"]]
pop_data.to_csv("07_pop.csv", index=False)
pop_data.head()

# the comm, inj, and ncd csvs were made willy nilly by kendrick.

In [None]:
def big_aggregate():
    """Compute all-cause, global, all-age, both-sex death counts per draw.

    Three files hold the log mortality rates for the _ncd, _comm and _inj
    causes. Per the exercise statement these files contain the data for:
        * region-level locations
        * non-aggregate age groups
        * male and females
        * gbd years

    The data provided contains draws. A fourth file holds population.

    The rates are exponentiated, added across the three cause files,
    multiplied by population to get death counts, and summed per year.

    Return:
        pd.DataFrame: one row per year_id and one column per draw
        (draw_0 ... draw_999) containing the big aggregate.
    """

    # FILE NAMES! Don't change these.
    pop_file = "07_pop.csv"
    injury_file = "07_inj.csv"
    communicable_file = "07_comm.csv"
    ncd_file = "07_ncd.csv"

    index = ["location_id", "age_group_id", "sex_id", "year_id"]
    draw_cols = ["draw_{}".format(i) for i in range(1000)]

    # Read all four inputs up front.
    raw_inj = pd.read_csv(injury_file)
    raw_comm = pd.read_csv(communicable_file)
    raw_ncd = pd.read_csv(ncd_file)
    raw_pop = pd.read_csv(pop_file)

    def to_rates(frame):
        # Key the rows by the demographic columns, then undo the log.
        return frame.set_index(index).transform(np.exp)

    # Sum of the three cause files, aligned on the shared index.
    all_cause = to_rates(raw_comm) + to_rates(raw_ncd) + to_rates(raw_inj)

    # Attach population and scale every draw by it: rate * population.
    with_pop = all_cause.join(raw_pop.set_index(index))
    with_pop[draw_cols] = with_pop[draw_cols].mul(
        with_pop["population"], axis="index")

    # Collapse location, age group and sex: one row per year remains.
    return with_pop.groupby("year_id")[draw_cols].sum()

In [None]:
def test_big_aggregate():
    """Compare big_aggregate() with totals rebuilt from the all-cause file."""
    res = big_aggregate()

    index = ["location_id", "age_group_id", "sex_id", "year_id"]
    draw_cols = ["draw_{}".format(i) for i in range(1000)]

    # Reference computation: population joined onto the exponentiated
    # contents of 07_all.csv, with every draw scaled by population.
    pop = pd.read_csv("07_pop.csv").set_index(index)
    _all = pd.read_csv("07_all.csv").set_index(index).transform(np.exp)
    big_data = _all.join(pop)
    scaled_draws = big_data[draw_cols].multiply(
        big_data["population"], axis="index")
    big_data[draw_cols] = scaled_draws

    # Sum per year without the population column, and save the reference
    # table to disk.
    without_pop = big_data.drop("population", axis=1)
    expected = without_pop.groupby("year_id").sum()
    expected.to_csv("07_expected.csv")

    assert np.isclose(res, expected).all().all()

test_big_aggregate()