# Pandas DataFrames are Pandas Series on steroids

Pandas DataFrames are like Pandas Series on steroids.

In [None]:
import pandas as pd
import numpy as np

# Constructing DataFrames from pd.Series

In [None]:
# Each series maps a person's name (the index label) to one measurement.
# Note that the three series list the same people in different orders.
heights = pd.Series({"Chris": 56, "Kendrick": 66, "Yao": 90, "Buddy": 22})

weights = pd.Series({"Yao": 310, "Kendrick": 134, "Buddy": 62, "Chris": 134})

ages = pd.Series({"Yao": 37, "Kendrick": 25, "Chris": 64, "Buddy": 77})

In [None]:
# Display the series: index labels on the left, values on the right.
ages

In [None]:
# A list of series is read row-by-row: each series becomes one ROW and the
# index labels (the names) become the columns.
pd.DataFrame([weights, heights, ages])
# This looks weird

In [None]:
# Same row-per-series layout, but now every row gets an explicit label.
pd.DataFrame(
    [weights, heights, ages],
    index=["weight", "height", "age"])
# This still looks weird: people are the columns, measurements are the rows.

In [None]:
# This is a plain Python dictionary (not a pandas object).
# The literal below builds the same mapping as
# dict(one=1, two=2, three=3, FOUR=4).
example_dictionary = {
    "one": 1,
    "two": 2,
    "three": 3,
    "FOUR": 4,
}
example_dictionary

In [None]:
# A dictionary of series is read column-by-column: each key becomes a column
# name, and the series are lined up on their index labels (the names).
dataframe = pd.DataFrame({
    "weight": weights,
    "height": heights,
    "age": ages,
})
# This looks better
dataframe.head()

In [None]:
# The row labels (here, the names) are stored in the index.
dataframe.index

In [None]:
# Resetting the index!
# reset_index returns a new frame in which the old index has become an
# ordinary column; `dataframe` itself is left unchanged.

dataframe.reset_index()

In [None]:
# drop=True throws the old index away instead of keeping it as a column.
dataframe.reset_index(drop=True)

In [None]:
# Rename columns (with a chain of operations)
# reset_index puts the old, unnamed index in a column called "index";
# rename then gives that column a meaningful name.
dataframe.reset_index().rename(columns={"index": "name"})

In [None]:
# Sorting!
# ascending=False puts the largest weight first.
dataframe.sort_values("weight", ascending=False)

# How about sorting on several columns (ties in the first are broken by the next)?

In [None]:
# Sort by weight first; rows with equal weight are then ordered by height.
# ascending=False applies to both columns.
dataframe.sort_values(["weight", "height"], ascending=False)

# A More Realistic Scenario

## Reading a CSV:

In [None]:
# The path is relative to the current working directory.
all_cause_mort = pd.read_csv("../data.csv")

## Initial exploration:

In [None]:
# head() shows the first five rows by default.
all_cause_mort.head()

## Selecting Data (columns)

In [None]:
# Indexing with a single column name gives back that one column.
all_cause_mort["mort_rate"].head()  # returns a series

In [None]:
# Indexing with a LIST of column names (even a list of one) keeps the table shape.
all_cause_mort[["mort_rate"]].head()  # returns a dataframe

In [None]:
# Without the inner list, Python passes the tuple ("location_id", "population")
# as ONE column label. No such column exists, so pandas raises a KeyError.
all_cause_mort["location_id", "population"]  # This is broken

In [None]:
# returns a dataframe
# The outer brackets index the frame; the inner brackets are a list of
# the column names to keep.
all_cause_mort[["location_id", "population"]].head() 

### DO NOT DO THIS
You can access a column of a dataframe as a series by simply using
dataframe.column_name, but this is a **bad practice**. 

Why: a common column name is ``mean``, but dataframe.mean is a method that computes means, so dataframe.mean gives you that method instead of your column.

In [None]:
# Attribute access only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute.
all_cause_mort.location_id.head()  # returns a series, but don't do this.

## Selecting Data (rows)
For now, we'll just show you a simple way.

In [None]:
# Python joins adjacent string literals into one string, so the query below
# is a single expression written over two lines. Note the trailing space
# inside the first literal: without it the two pieces would run together.
all_cause_mort.query(
    "age_group_id == 2 and location_id == 6 "
    "and sex_id == 2 and year_id > 2005"
    ).head()

# This returns a dataframe object!

In [None]:
# Demo of string line continuation...
# No commas: the adjacent literals are joined into ONE string before print
# is called, so nothing is inserted between the pieces.
print("hello"
     "world"
     "    "
     "yeah \n"
     "cool\tbeans\n\n")

# Commas: five separate arguments, which print separates with single spaces.
print("hello",
     "world",
     "    ",
     "yeah \n",
     "cool\tbeans\n\n")

# Extra parentheses: ONE argument, a tuple, so print shows the tuple itself
# (quotes, commas and escape sequences included).
print(("hello",
     "world",
     "    ",
     "yeah \n",
     "cool\tbeans\n\n"))

In [None]:
# Querying with a defined variable
# Inside a query string, @name refers to the Python variable called `name`.
best_location_id = 102
my_favorite_years = range(1995, 2000)  # 1995 through 1999; 2000 is excluded
all_cause_mort.query(
    "location_id == @best_location_id "
    "and year_id in @my_favorite_years"
    ).head()

## Adding new data (columns)

In [None]:
# New column: element-wise product of the population and mort_rate columns.
all_cause_mort["mort_count"] = (
    all_cause_mort["population"] * all_cause_mort["mort_rate"])
# Largest single-row value first, then the total over every row.
print(
    all_cause_mort["mort_count"].max(),
    all_cause_mort["mort_count"].sum(),
    sep="\n")

# SOLUTIONS

In [None]:
def high_death_rates_in_small_locations(data=None):
    """
    Return the top 10% in mortality rate of the lowest
    10% in population for females in year 2016 and age
    group 10.

    Args:
        data: optional DataFrame with the columns year_id, age_group_id,
            sex_id, population and mort_rate. When omitted, the data
            set is read from "../data.csv".

    Return:
        list of the selected mort_rate values, sorted ascending.
    """
    if data is None:
        data = pd.read_csv("../data.csv")

    data_2016 = data.query(
        "year_id == 2016 and age_group_id == 10 "
        "and sex_id == 2")

    # Take each quantile on the one column of interest rather than on the
    # whole frame: it is cheaper and does not depend on every other column
    # being numeric.
    pop_cutoff = data_2016["population"].quantile(.10)
    small_loc_data = data_2016.query("population <= @pop_cutoff")

    # "Top 10%" means at or above the 90th percentile. (Keeping rows at or
    # below the 10th percentile would select the LOWEST rates instead.)
    mort_cutoff = small_loc_data["mort_rate"].quantile(.90)
    high_mort_data = small_loc_data.query("mort_rate >= @mort_cutoff")

    return sorted(high_mort_data["mort_rate"])

In [None]:
# Run the solution; the result is a sorted list of mortality rates.
high_death_rates_in_small_locations()

In [None]:
# Exercise
def male_deaths_in_big_locations():
    """
    For 2010, return a dictionary mapping location_id to total 
    number of male deaths for locations with populations greater than
    the population of the united states.
    
    Things you need to know:
    * males is sex_id 1.
    * united states is location_id 102.
    * death numbers = mort_rate * population
    
    The data set is provided within the function.

    Return:
        dict mapping location_id to that location's total death count
        (the sum of mort_rate * population over its rows).
    
    """
    data = pd.read_csv("../data.csv")

    # .copy() makes the filtered frame independent of `data`, so adding a
    # column to it below does not trigger pandas' SettingWithCopyWarning.
    male_data_2010 = data.query(
        "year_id == 2010 and sex_id == 1").copy()
    male_data_2010["mort_count"] = (
        male_data_2010["mort_rate"]
        * male_data_2010["population"]
        )
    us_pop = get_pop(male_data_2010, 102)

    result = {}
    for location_id in male_data_2010["location_id"].unique():
        # Strictly greater: the united states itself must not be included.
        if get_pop(male_data_2010, location_id) > us_pop:
            result[location_id] = get_mort_count(
                male_data_2010, location_id)
    return result
    
    
def get_pop(male_data_2010, location_id):
    """
    Return the total population over all rows of one location.

    Args:
        male_data_2010: DataFrame with location_id and population columns.
        location_id: the location whose rows are added up.

    Return:
        sum of the population column for that location (0 when no row
        matches).
    """
    # Pick the column BEFORE summing: summing the whole frame first would
    # add up every other column too, and fails when one cannot be summed.
    location_rows = male_data_2010.query("location_id == @location_id")
    return location_rows["population"].sum()

    
def get_mort_count(male_data_2010, location_id):
    """
    Return the total death count over all rows of one location.

    Args:
        male_data_2010: DataFrame with location_id and mort_count columns.
        location_id: the location whose rows are added up.

    Return:
        sum of the mort_count column for that location (0 when no row
        matches).
    """
    # Pick the column BEFORE summing: summing the whole frame first would
    # add up every other column too, and fails when one cannot be summed.
    location_rows = male_data_2010.query("location_id == @location_id")
    return location_rows["mort_count"].sum()

In [None]:
def test_male_deaths_in_big_locations():
    """Check the exercise answer against known values for the data set."""
    res = male_deaths_in_big_locations()

    assert len(res) == 2, "not the right size."
    # The death counts are floating-point sums, so compare with a tight
    # relative tolerance instead of requiring every last bit to match.
    assert np.isclose(res[6], 5484711.4546094909, rtol=1e-9, atol=0)
    assert np.isclose(res[163], 5231218.5870856401, rtol=1e-9, atol=0)
    assert 102 not in res, "I said bigger than america, not bigger than or equal to."

test_male_deaths_in_big_locations()