<h1>Hierarchical Indexing</h1>

In [4]:
# Hierarchical Indexing incorporates multiple index levels within a single index. 

In [5]:
# In this way, higher dimensional data can be compactly represented within the familiar one-dimensional
# Series and two-dimensional DataFrame objects. 

In [6]:
import numpy as np
import pandas as pd

<h3>A Multiply Indexed Series</h3>

In [7]:
# How we might represent two-dimensional data within one-dimensional Series. For concreteness, we will consider 
# a series of data where each point has a character and numerical key:

<h4>The bad way</h4>

In [8]:
# Using Python tuples as keys
index = [("California",2000),("California", 2010),("New York", 2000),("New York", 2010), ("Texas", 2000)
        ,("Texas", 2010)]
populations=[33871648, 37253956,18976457, 19378102,20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [9]:
# With this indexing scheme, we can straightforwardly index or slice the series based on this multiple index
pop[("California", 2010):("Texas",2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [10]:
# If you need to select all values from 2010, then we will need to perform looping
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

<h4>The better way : Pandas MultiIndex</h4>

In [11]:
# We can create a multi-index from the tuples as follows:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [12]:
# If we reindex our series with this MultiIndex,we see hierarchical representation of the data
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [13]:
# Now using the Pandas slicing notation:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

<h4>MultiIndex as extra dimension</h4>

In [14]:
# The unstack() method will convert a multiply indexed series into a conventionally indexed DataFrame:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [15]:
# The stack() method provides the opposite operation
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [16]:
# Multi-indexing can be used to represent data of three or more dimensions in a Series or DataFrame.
# Each extra level in a multi-index represents an extra dimension of data, taking advantage of this porperty gives
# us much more flexibility in the types of data we can represent
pop_df = pd.DataFrame({"total":pop, 
                       "under18":[
                           9267089, 9284094,4687374, 4318033,5906301, 6879014
                       ] 
                      })
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [17]:
# Fraction of people under 18 by year:
f_u18 = pop_df["under18"]/pop_df["total"]
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


<h3>Methods of MultiIndex Creation</h3>

In [18]:
# The most straightforward way to construct a multiply indexed Series or DataFrame is to simply pass
# a list of two or more index arrays to the constructor.

df = pd.DataFrame(np.random.rand(4,2), index=[["a","a","b","b"],
                                              [1,2,1,2]
                                             ],
                 columns=["data1", "data2"])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.128378,0.128268
a,2,0.781445,0.840837
b,1,0.804423,0.349637
b,2,0.412618,0.229288


In [19]:
# The work of creating the multiindex is done in the background. 
# If you pass a dictionary with appropriate tuples as keys, Pandas will automatically recognize this and 
# use a MultiIndex by default

data = {('California',2000):33871648,
        ('California',2010):37253956,
        ('Texas',2000):20851820,
        ('Texas',2010):25145561,
        ('New York',2000):18976457,
        ('New York',2010):19378102,
       }
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

<h4>Explicit MultiIndex Constructors</h4>

In [20]:
# We can leverage class method constructors available in pd.MultiIndex. 

# We can construct the MultiIndex from a simple list of arrays, giving the index values within each level:
pd.MultiIndex.from_arrays([["a","a","b","b"],[1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [21]:
# We can construct it from a list of tuples, giving the multiple index values of each point:
pd.MultiIndex.from_tuples([("a",1),("a",2),("b",1),("b",2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [22]:
# We can construct from cartesian product of single indexes:
pd.MultiIndex.from_product([["a","b"],[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [23]:
# We can construct the MultiIndex directly using its internal encoding by passing levels and labels
pd.MultiIndex??

In [24]:
pd.MultiIndex(levels=[["a","b"],[1,2]],
             codes=[[0,0,1,1],[0,1,0,1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

<h4>MultiIndex level Names</h4>

In [25]:
# We can accomplish passing convenient names to the levels of MultiIndex by passing the names argument to any of 
# the names argument to any of the MultiIndex constructors or by setting the names attribute of the index 
# after the fact
pop.index.names=["state","year"]
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

<h4>MultiIndex for columns</h4>

In [26]:
# In a DataFrame, the rows and columns are completely symmetric and just as the rows can have multiple
# levels of indices, the columns can have multiple levels as well.
index = pd.MultiIndex.from_product([[2013,2014],[1,2]],names=["year","visit"])
columns = pd.MultiIndex.from_product([["Bob","Guido","Sue"],["HR","Temp"]], names=["subject","type"])

In [27]:
# Mock Some Data
data = np.round(np.random.rand(4,6),1)
data[:,::2] *= 10
data += 37

In [28]:
# Create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,37.1,37.0,37.0,38.0,37.4
2013,2,43.0,37.5,43.0,37.6,42.0,37.5
2014,1,44.0,37.7,45.0,37.4,38.0,37.9
2014,2,41.0,37.1,40.0,37.5,37.0,37.0


In [29]:
health_data["Guido"]

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,37.0,37.0
2013,2,43.0,37.6
2014,1,45.0,37.4
2014,2,40.0,37.5


In [30]:
# For complicated records containing multiple labeled measurements across multiple times for many subjects 
# making use of hierarchical rows and columns can be extremely convenient.

<h3>Indexing and Slicing a Multi Index</h3>

In [31]:
# Indexing and Slicing on a MultiIndex is designed to be intuitive and it helps if you think about the indices 
# as added dimensions. 

<h4>Multiply Indexed Series</h4>

In [32]:
# The multiply indexed Series of State Populations
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [33]:
# We can access single elements by indexing multiple terms:
pop["California",2000]

33871648

In [34]:
# The multiIndex also supports partial indexing or indexing just one of the levels of index. 
# The result is another series with lower level of indices maintained:
pop["California"]

year
2000    33871648
2010    37253956
dtype: int64

In [35]:
# As long as MultiIndex is sorted, partial slicing is available. 
pop.loc["California":"New York"]

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [36]:
# With sorted indices, we can perform partial indexing on lower levels by passing an empty slice in the first
# index:
pop[:,2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [37]:
# Selection based on boolean masks:
pop[pop > 22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [38]:
# Seletion based on fancy indexing also works:
pop[["California", "Texas"]]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

<h4>Multiply Indexed DataFrames</h4>

In [39]:
# A multiply indexed dataframe behaves in a similar manner. 
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,37.1,37.0,37.0,38.0,37.4
2013,2,43.0,37.5,43.0,37.6,42.0,37.5
2014,1,44.0,37.7,45.0,37.4,38.0,37.9
2014,2,41.0,37.1,40.0,37.5,37.0,37.0


In [40]:
# Columns are primary in DataFrame and the syntax used for multiply indexed Series applies to the columns. 
health_data["Guido","HR"]

year  visit
2013  1        37.0
      2        43.0
2014  1        45.0
      2        40.0
Name: (Guido, HR), dtype: float64

In [41]:
# loc, iloc and ix indexers can also be used:
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,40.0,37.1
2013,2,43.0,37.5


In [42]:
# Each individual index in loc or iloc can be passed a tuple of multiple indices
health_data.loc[:,("Bob","HR")]

year  visit
2013  1        40.0
      2        43.0
2014  1        44.0
      2        41.0
Name: (Bob, HR), dtype: float64

In [45]:
# Working with slices within these index tuples is not convenient. Trying to create a slice within a tuple will lead 
# to a syntax error.
health_data.loc[(:, 1), (:, 'HR')]

SyntaxError: invalid syntax (4146078592.py, line 3)

In [46]:
# Index Slice object can be used for the purpose of slicing in this context
idx = pd.IndexSlice
health_data.loc[idx[:,1],idx[:,"HR"]]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,40.0,37.0,38.0
2014,1,44.0,45.0,38.0


<h3>Regarding Multi-Indices</h3>

In [47]:
# One of the keys to working with multiply indexed data is knowing how to effectively transform data. 

<h4>Sorted and Unsorted Indices</h4>

In [48]:
# Many of the MultiIndex slicing operations will fail if the index is not sorted. 
# We will start by creating some simple multiply indexed data where the indices are not lexicographically sorted

index = pd.MultiIndex.from_product([["a","c","b"],[1,2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ["char", "int"]
data

char  int
a     1      0.956027
      2      0.498441
c     1      0.254155
      2      0.352900
b     1      0.624220
      2      0.451785
dtype: float64

In [49]:
# If we try to take a partial slice of the index, it will result in an error:
try:
    data["a":"b"]
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [50]:
# Pandas provide a number of convenience routines to perform the type of sorting. 
data = data.sort_index()
data

char  int
a     1      0.956027
      2      0.498441
b     1      0.624220
      2      0.451785
c     1      0.254155
      2      0.352900
dtype: float64

In [51]:
# With data sorted, partial slicing will work as below:
data["a":"b"]

char  int
a     1      0.956027
      2      0.498441
b     1      0.624220
      2      0.451785
dtype: float64

<h4>Stacking and Unstacking Indices</h4>

In [52]:
# It is possible to convert a dataset from a stacked multi-index to a simple two-dimensional representation
# optionally specifying the level to use:

pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [53]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [54]:
# Use stack on pop to original series

pop.unstack().stack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

<h4>Index Setting and Resetting</h4>

In [55]:
# Another way to rearrange hierarchical data is to turn the index labels into columns; this can be 
# accomplished with the reset_index method.

pop_flat = pop.reset_index(name="population")
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [56]:
# The raw input data looks like this and it is useful to build a MultiIndex from the column values. This can
# be done with the set_index method of the DataFrame, which returns a multiply indexed DataFrame:

pop_flat.set_index(["state","year"])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


<h3>Data Aggregations on Multi-Indices</h3>

In [57]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,37.1,37.0,37.0,38.0,37.4
2013,2,43.0,37.5,43.0,37.6,42.0,37.5
2014,1,44.0,37.7,45.0,37.4,38.0,37.9
2014,2,41.0,37.1,40.0,37.5,37.0,37.0


In [58]:
# Mean measurements in two visits each year

data_mean = health_data.mean(level="year")
data_mean

  data_mean = health_data.mean(level="year")


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,41.5,37.3,40.0,37.3,40.0,37.45
2014,42.5,37.4,42.5,37.45,37.5,37.45


In [59]:
# By further makig use of the axis keyword, we can take the mean among levels on the columns as well:
data_mean.mean(axis=1, level="type")

  data_mean.mean(axis=1, level="type")


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,40.5,37.35
2014,40.833333,37.433333
