# Setting an index

In [None]:
import pandas as pd
import numpy as np  # <-- this is me being lazy.

In [None]:
# Load the example dataset; the path is relative to where the notebook runs.
df = pd.read_csv("../data.csv")
# Preview the first few rows to see which columns are available.
df.head()

In [None]:
# Index on location_id alone. set_index returns a new DataFrame, so `df`
# keeps its original index for the later cells.
df_with_useless_index = df.set_index(["location_id"])
df_with_useless_index.head()

# Question: why is this not a useful index?

In [None]:
# Columns that become the levels of the MultiIndex, in this order.
index = ["location_id", "age_group_id", "sex_id", "year_id"]
df_with_useful_index = df.set_index(index)
df_with_useful_index.head()

# Accessing data via the index

Using .loc and .iloc will return ``views`` of the data. More on this later.

In [None]:
# Label-based slice: rows whose index labels run from 6 through 10, endpoint included.
df.loc[6:10]

In [None]:
# Position-based slice: rows at positions 6 through 9; the end position is excluded.
df.iloc[6:10]

In [None]:
# Same label slice, restricted to one column selected by name.
df.loc[6:10, "mort_rate"]

In [None]:
# Positional rows 6 through 9, column at position 4.
# NOTE(review): presumably position 4 is mort_rate -- depends on the CSV column order; confirm.
df.iloc[6:10, 4]

In [None]:
# iloc selects by position only, so it does not look at the MultiIndex labels.
df_with_useful_index.iloc[6:10]

In [None]:
# Deliberate failure for the demo: label slicing on the MultiIndex as built.
# NOTE(review): presumably fails because the index has not been sorted -- the
# next cell calls sort_index() before repeating the same slice.
df_with_useful_index.loc[6:10]  # this won't work.

In [None]:
# Build the same MultiIndex, then sort it before slicing by label.
df_with_actually_useful_index = df.set_index(index).sort_index()
df_with_actually_useful_index.loc[6:10]

# Question: what was returned by this loc[] operation?

In [None]:
# Demo some various index options.
# One selector per index level, in the order the levels were set:
# location_id 6, age_group_id 10 through 13, every sex_id, year_id 2005.
df_with_actually_useful_index.loc[6, 10:13, :, 2005]

In [None]:
# You can select a column, but only if
# there are exactly _two_ arguments (row selector, then column name).
df_with_actually_useful_index.loc[6, "population"].head()

# df_with_useful_index.loc[6, 10, "population"]  # doesn't work

In [None]:
# solution: use pd.IndexSlice.
# IndexSlice bundles the index-level selectors (location_id 6, age_group_id 10)
# into a single row argument, so "population" is read as the column.
df_with_actually_useful_index.loc[
    pd.IndexSlice[6, 10],
    "population"
    ]

# Kaloo Kalay!

## Boolean indexing

### Boolean data

In [None]:
great_value = 2
# Comparing the whole DataFrame to a scalar gives a DataFrame of booleans, one per cell.
df == great_value

In [None]:
# Comparing a single column gives a boolean Series, one entry per row.
df["age_group_id"] == 2

In [None]:
# There are lots of built in functions
# for Series that return boolean values.
# Example: isin -- True wherever the value matches any entry in the list.
df["age_group_id"].isin([2, 3]) 

### Indexing with the boolean series

In [None]:
# NOTE: deliberately broken for the demo. `&` binds tighter than `==`, so
# Python reads the assignment below as
#     (df["age_group_id"].isin([5, 10]) & df["year_id"]) == 2000
# instead of combining the two conditions. The intended form parenthesizes
# the comparison:
#     df["age_group_id"].isin([5, 10]) & (df["year_id"] == 2000)
boolean_index = df["age_group_id"].isin([5, 10]) & df["year_id"] == 2000
df.loc[boolean_index]  # this won't work. order of operations! Throw some parentheses on it.

In [None]:
# Getting a specific column:
# rows where boolean_index (from the previous cell) is True, "mort_rate" only.
df.loc[boolean_index, "mort_rate"] 

## Views and Copies?

In [None]:
# Copy:
# Pull the matching rows out of df into a separate variable.
result = df.query("location_id == 6 and age_group_id == 5 and year_id > 2010")
print(result)

# Scale mort_rate on `result` (not on df) -- this assignment is the one the
# demo expects to trigger the warning mentioned below.
result["mort_rate"] = result["mort_rate"] * 100
print(result)

# We even get a warning that tells us how to update the original dataframe.
# Re-run the same query against df to compare it with the modified `result`.
print(df.query("location_id == 6 and age_group_id == 5 and year_id > 2010"))

In [None]:
# View:
df2 = df.copy()  # this is just so this cell is idempotent.

# Boolean mask with the same three conditions as the query in the Copy cell;
# each comparison is parenthesized so `&` combines whole conditions.
the_index = (
    (df2["location_id"] == 6) & (df2["age_group_id"] == 5) 
    & (df2["year_id"] > 2010))
print(df2.loc[the_index])

# Assign through .loc on df2 itself, so the masked rows of df2 are updated.
df2.loc[the_index, "mort_rate"] = df2.loc[the_index, "mort_rate"] * 100

print(df2.loc[the_index])

# STOP!

## Demoing some python function things (if we have time!)

I can't explain all of this in the time we have, so I'm just going to show some things
that are surprising.

When variables are passed in to a function, usually only one of two things happens
when the variable is modified within the function:

1. the original variable is updated
2. the original variable is not updated

```
def add_one(var):
    var = var + 1
a = 10
add_one(a)
print(a)  # In other languages, this is 11 for (1), or 10 for (2).
```

Python is weird.

In [None]:
def add_one(var):
    """Add one to ``var`` inside the function and print the result.

    Nothing is returned; the assignment only rebinds the local name.
    """
    var = var + 1  # rebinds the local name `var`; the caller's variable is not reassigned
    print("In function: {}".format(var))
a = 10
add_one(a)
print("Out function: {}".format(a))  # `a` was never reassigned out here, so it is still 10

In [None]:
def append_one(list_var):
    """Print ``list_var`` with a 1 on the end, leaving the caller's list alone.

    Nothing is returned.
    """
    list_var = list_var + [1]  # `+` builds a new list and rebinds the local name to it
    print("In function: {}".format(list_var))
b = [10, 11, 12]
append_one(b)
print("Out function: {}".format(b))  # `b` is unchanged: [10, 11, 12]

In [None]:
def append_one(list_var):
    """Append 1 to ``list_var`` in place and print it.

    Nothing is returned; the list object passed in is itself modified.
    """
    list_var.append(1)  # mutates the same list object the caller passed in
    print("In function: {}".format(list_var))
b = [10, 11, 12]
append_one(b)
print("Out function: {}".format(b))  # `b` now ends with the appended 1

## Why am I telling you this?

The exercise has to do with updating the variable in a way
that the original variable is modified. We're doing this on
purpose because sometimes, you want to do that.

Keep in mind that sometimes (maybe more often than not),
you **don't** want to modify the original variable. You'll
need to be careful with python, especially when writing
functions involving objects from pandas, numpy, or whatever
else you may be using.

# STOP AGAIN!

In [None]:
def update_no_return(data):
    """Overwrite ``value`` with 100 on every location_id 6 row, in place.

    This exercise is a little more academic than practical: nothing is
    returned on purpose. The DataFrame that was passed in is itself
    modified, so the change is visible outside of the function.

    Params:
        data (pandas.DataFrame): some data with a ``location_id`` column
            and a ``value`` column. Any other columns are left alone.
    """
    # Boolean mask of the rows to touch; assigning through .loc on `data`
    # writes into the caller's frame rather than into a temporary.
    is_location_six = data["location_id"] == 6
    data.loc[is_location_six, "value"] = 100

In [None]:
def test_update_no_return():
    """Check that ``update_no_return`` edits the caller's frame in place.

    Every location_id 6 row must end up with value 100, and no other row
    may have been set to 100.
    """
    frame = pd.DataFrame({
        "location_id": [6, 100, 6, 6, 100, 1],
        "value": [10, 12, 9913, 120312, 8932471.12, 1],
        "other_junk": True,
    })
    update_no_return(frame)

    # Split the rows with one mask instead of two separate queries.
    at_six = frame["location_id"] == 6
    assert (frame.loc[at_six, "value"] == 100).all(), "didn't update original values"
    assert not (frame.loc[~at_six, "value"] == 100).any(), "updated too many values"


test_update_no_return()