<h1>Hierarchical Indexing</h1>

In [3]:
# Hierarchical Indexing incorporates multiple index levels within a single index. 

In [4]:
# In this way, higher dimensional data can be compactly represented within the familiar one-dimensional
# Series and two-dimensional DataFrame objects. 

In [5]:
import numpy as np
import pandas as pd

<h3>A Multiply Indexed Series</h3>

In [6]:
# Let's consider how we might represent two-dimensional data within a one-dimensional Series.
# For concreteness, we will consider a series of data where each point has a character and numerical key:

<h4>The bad way</h4>

In [7]:
# Naive approach: use plain Python tuples of (state, year) as the index keys.
index = [
    ("California", 2000), ("California", 2010),
    ("New York", 2000), ("New York", 2010),
    ("Texas", 2000), ("Texas", 2010),
]
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [8]:
# Even with tuple keys we can index or slice the series directly: here the slice runs from one
# (state, year) key to another, and both endpoints appear in the result.
pop[("California", 2010):("Texas", 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [9]:
# Selecting all values from 2010 is clumsier: we have to loop over the index ourselves
# and keep only the keys whose second element (the year) is 2010.
pop[[key for key in pop.index if key[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

<h4>The better way: Pandas MultiIndex</h4>

In [10]:
# Build a pandas MultiIndex from the list of (state, year) tuples defined earlier.
# Note: this rebinds `index` from the plain list of tuples to the resulting MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [11]:
# If we reindex our series with this MultiIndex, we see the hierarchical representation of the data:
# each first-level label is displayed once, with its second-level labels listed beneath it.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [12]:
# With the MultiIndex in place, ordinary pandas slicing notation selects every entry whose
# second index level equals 2010; the result is indexed by the remaining (first) level.
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

<h4>MultiIndex as extra dimension</h4>

In [14]:
# The unstack() method converts a multiply indexed Series into a conventionally indexed DataFrame:
# here the first index level becomes the rows and the second level becomes the columns.
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [15]:
# The stack() method provides the opposite operation: it folds the columns back into an
# inner index level, giving a multiply indexed Series again.
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [16]:
# Multi-indexing can represent data of three or more dimensions in a Series or DataFrame:
# each extra level in a MultiIndex represents an extra dimension of data. Taking advantage of
# this property gives us much more flexibility in the types of data we can represent.
# Here an "under18" column is added alongside the totals, one value per row of `pop`.
pop_df = pd.DataFrame(
    {
        "total": pop,
        "under18": [
            9267089, 9284094,
            4687374, 4318033,
            5906301, 6879014,
        ],
    }
)
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [17]:
# Fraction of people under 18, computed row by row and then unstacked so that
# the years become the columns of the resulting table.
f_u18 = pop_df["under18"] / pop_df["total"]
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


<h3>Methods of MultiIndex Creation</h3>

In [18]:
# The most straightforward way to construct a multiply indexed Series or DataFrame is to pass
# a list of two or more index arrays to the constructor; pandas builds the MultiIndex from them.
# A seeded generator supplies the values so that the frame is identical on every
# Restart & Run All (an unseeded np.random.rand call gives different numbers each run).
df = pd.DataFrame(
    np.random.default_rng(0).random((4, 2)),
    index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
    columns=["data1", "data2"],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.239228,0.720963
a,2,0.935106,0.142795
b,1,0.202023,0.615089
b,2,0.38777,0.40782


In [19]:
# The work of creating the MultiIndex happens in the background here as well:
# when a dictionary keyed by tuples is passed, pandas recognises the tuples
# and uses a MultiIndex by default.
data = {
    ("California", 2000): 33871648,
    ("California", 2010): 37253956,
    ("Texas", 2000): 20851820,
    ("Texas", 2010): 25145561,
    ("New York", 2000): 18976457,
    ("New York", 2010): 19378102,
}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

<h4>Explicit MultiIndex Constructors</h4>

In [20]:
# pd.MultiIndex also provides explicit class-method constructors.
# from_arrays builds the MultiIndex from a simple list of arrays, one array per level,
# each giving the index values within that level:
pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [22]:
# from_tuples builds the MultiIndex from a list of tuples, each tuple giving
# the multiple index values of a single point:
pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [23]:
# from_product builds the MultiIndex from the Cartesian product of single indexes:
pd.MultiIndex.from_product([["a", "b"], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [26]:
# We can also construct the MultiIndex directly from its internal encoding by passing `levels`
# and `codes` (the argument that older pandas releases called `labels`).
# Tip: while exploring interactively, `pd.MultiIndex?` displays the constructor's documentation;
# the stray `??` source dump has been removed so the notebook runs cleanly from top to bottom.

In [27]:
# `levels` lists the distinct values of each level; `codes` gives, for every entry, the position
# of its value within the corresponding level (0 -> "a" or 1, 1 -> "b" or 2).
pd.MultiIndex(
    levels=[["a", "b"], [1, 2]],
    codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

<h4>MultiIndex level Names</h4>

In [28]:
# The levels of a MultiIndex can be given convenient names, either by passing the `names`
# argument to any of the MultiIndex constructors or, as done here, by setting the `names`
# attribute of the index after the fact.
pop.index.names = ["state", "year"]
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

<h4>MultiIndex for columns</h4>

In [29]:
# In a DataFrame the rows and columns are symmetric: just as the rows can have multiple
# levels of indices, the columns can have multiple levels as well.
# Rows: every (year, visit) combination. Columns: every (subject, type) combination.
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=["year", "visit"],
)
columns = pd.MultiIndex.from_product(
    [["Bob", "Guido", "Sue"], ["HR", "Temp"]],
    names=["subject", "type"],
)

In [30]:
# Mock some data: a 4x6 array of uniform random values rounded to one decimal place.
# A seeded generator is used so the mock values are identical on every Restart & Run All.
data = np.round(np.random.default_rng(0).random((4, 6)), 1)
# Scale every other column (positions 0, 2, 4) by 10 so those columns span a wider range.
data[:, ::2] *= 10
# Shift every value up by 37.
data += 37

In [31]:
# Create the DataFrame, using the MultiIndex `index` for the row labels
# and the MultiIndex `columns` for the column labels.
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,43.0,37.0,39.0,37.7,47.0,37.6
2013,2,39.0,37.3,46.0,37.1,46.0,37.7
2014,1,38.0,37.7,47.0,37.3,47.0,37.6
2014,2,46.0,37.9,38.0,37.8,44.0,37.0


In [32]:
# Indexing by a top-level column label returns a DataFrame holding just the columns under that label.
health_data["Guido"]

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,39.0,37.7
2013,2,46.0,37.1
2014,1,47.0,37.3
2014,2,38.0,37.8


In [33]:
# For complicated records containing multiple labeled measurements across multiple times for many subjects,
# making use of hierarchical rows and columns can be extremely convenient.