In [1]:
import numpy as np
import pandas as pd

## Hierarchical Indexing

In [2]:
 data = pd.Series(np.random.uniform(size=9),
            index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
             [1, 2, 3, 1, 3, 1, 2, 2, 3]])

In [3]:
# Display the Series; each outer-level label is printed once per group of
# inner-level labels.
data

a  1    0.075062
   2    0.797236
   3    0.145763
b  1    0.033807
   3    0.989248
c  1    0.823762
   2    0.885363
d  2    0.067119
   3    0.651357
dtype: float64

What you’re seeing is a prettified view of a Series with a MultiIndex as its index. The
“gaps” in the index display mean “use the label directly above”:

In [4]:
# Inspect the underlying MultiIndex: one (outer, inner) tuple per row.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [5]:
# Partial indexing: select every row whose outer-level label is "a".
# The result is indexed by the remaining inner level only.
data.loc["a"]

1    0.075062
2    0.797236
3    0.145763
dtype: float64

In [6]:
# Label-based slice over the outer level; both endpoints ("b" and "c") are included.
data.loc["b":"c"]

b  1    0.033807
   3    0.989248
c  1    0.823762
   2    0.885363
dtype: float64

In [9]:
# Select an explicit list of outer-level labels.
data.loc[["b", "c"]]

b  1    0.033807
   3    0.989248
c  1    0.823762
   2    0.885363
dtype: float64

Selection is even possible from an “inner” level. Here I select all of the values whose
label in the second index level is 2:

In [10]:
# Take every outer label (`:`) and keep only rows whose inner-level label is 2.
data.loc[:, 2]

a    0.797236
c    0.885363
d    0.067119
dtype: float64

Hierarchical indexing plays an important role in reshaping data and in group-based
operations like forming a pivot table. For example, you can rearrange this data into a
DataFrame using its unstack method:

In [12]:
# Show the Series again in its long (stacked) form before reshaping it.
data

a  1    0.075062
   2    0.797236
   3    0.145763
b  1    0.033807
   3    0.989248
c  1    0.823762
   2    0.885363
d  2    0.067119
   3    0.651357
dtype: float64

In [11]:
# Pivot the innermost index level (level=-1, the default) into columns.
# (outer, inner) pairs that do not exist in `data` appear as missing values.
data.unstack(level=-1)

Unnamed: 0,1,2,3
a,0.075062,0.797236,0.145763
b,0.033807,,0.989248
c,0.823762,0.885363,
d,,0.067119,0.651357


The inverse operation of unstack is stack:

In [13]:
# Round trip: pivot the inner level out to columns, then fold the columns
# back into the innermost index level.
data.unstack(level=-1).stack(level=-1)

a  1    0.075062
   2    0.797236
   3    0.145763
b  1    0.033807
   3    0.989248
c  1    0.823762
   2    0.885363
d  2    0.067119
   3    0.651357
dtype: float64

In [14]:
# A DataFrame can carry a hierarchical index on both axes.
# Rows: outer letter key x inner integer key.
# Columns: state (outer) x color (inner).
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=pd.MultiIndex.from_arrays(
        [["a", "a", "b", "b"], [1, 2, 1, 2]]
    ),
    columns=pd.MultiIndex.from_arrays(
        [["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]]
    ),
)

In [15]:
# Display the frame; at this point neither axis has level names.
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
# Name the levels on both axes so later cells can refer to them by name
# instead of by position. These assignments set the names on `frame`'s
# existing index objects; `frame` itself is not rebound.
frame.index.names = ["key1", "key2"]
frame.columns.names = ["state", "color"]

In [17]:
# Display the frame again; the level names now appear on both axes.
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


You can see how many levels an index has by accessing its nlevels attribute:

In [18]:
# Number of levels in the row index (key1 and key2).
frame.index.nlevels

2

In [19]:
# Partial column indexing: select the "Ohio" group from the outer column
# level, leaving only the inner "color" level on the columns.
frame["Ohio"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


## Reordering and Sorting Levels

The `swaplevel` method takes two level numbers or names and returns a new object
with the levels interchanged (but the data is otherwise unaltered):


In [20]:
# Exchange the two row-index levels by name; returns a new object and the
# row order of the data is unchanged.
frame.swaplevel(i="key1", j="key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


By default, `sort_index` sorts the data lexicographically using all the index levels, but
you can choose to sort by only a single level or a subset of levels by passing the
`level` argument.

In [29]:
# Sort rows by the second row-index level (position 1, named "key2").
frame.sort_index(level="key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [31]:
# Swap the row levels so "key2" becomes the outer level, then sort on that
# new outer level.
frame.swaplevel("key1", "key2").sort_index(level="key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## Summary Statistics by Level


In [32]:
# Aggregate rows that share the same "key2" label, summing across "key1".
frame.groupby(level="key2").sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [34]:
# Aggregate by the "color" column level. Transposing first turns the column
# levels into row levels so they can be grouped; the result stays transposed
# (colors as rows, key1/key2 as columns).
frame.transpose().groupby(level="color").sum()

key1,a,a,b,b
key2,1,2,1,2
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Green,2,8,14,20
Red,1,4,7,10


## Indexing with a DataFrame’s Columns


In [35]:
# Flat example frame: "c" and "d" are ordinary columns here and are promoted
# to index levels in the following cells. Note that this rebinds `frame`.
frame = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": [*range(3), *range(4)],
    }
)

In [36]:
# Display the flat frame with its default integer row index.
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


DataFrame’s `set_index` method will create a new DataFrame using one or more of
its columns as the index:

In [37]:
# Promote columns "c" and "d" to a two-level row index. A new DataFrame is
# returned and `frame` is left unchanged.
frame2 = frame.set_index(keys=["c", "d"])

In [38]:
# Display the re-indexed frame; "c" and "d" now form the row index and no
# longer appear among the columns.
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [44]:
# Same as above, but drop=False keeps "c" and "d" as regular columns in
# addition to using them as index levels.
frame.set_index(keys=["c", "d"], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [45]:
# Inverse of set_index: move the index levels back into ordinary columns
# (drop=False is the default) and restore a default integer index.
frame2.reset_index(drop=False)

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
