# Hierarchical Indexing


### Multiply Indexed Series

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Naive approach: use plain Python tuples as the index labels.
# Every (state, year) pair becomes one opaque key, so pandas sees no
# relationship between two years of the same state.

index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,   # California 2000, 2010
    18976457, 19378102,   # New York   2000, 2010
    20851820, 25145561,   # Texas      2000, 2010
]
population = pd.Series(populations, index=index)
population



(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [7]:
# With tuple keys, picking out one year means inspecting every label in a
# Python-level loop -- workable, but slow and clumsy on large data.

population[list(filter(lambda label: label[1] == 2010, population.index))]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

### THERE IS A BETTER WAY, The Pandas MultiIndex

In [15]:
# Convert the list of (state, year) tuples into a two-level MultiIndex.
# Note that this rebinds `index` from the plain list to the MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [16]:
# Reindex the tuple-keyed Series with the MultiIndex; the hierarchically
# indexed result is rebound to `population`.
population = population.reindex(index)
population

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [17]:
# Now that the Series has a MultiIndex, data can be selected by state, by year, or by both

In [36]:
# An empty slice on the first level plus a year label selects that year
# for every state; the result is indexed by state alone.
population[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [26]:
# A label slice on the first level returns every row from 'California'
# through 'New York', both endpoints included.
population['California':'New York']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [40]:
# First-level label slice: both endpoint states appear in the result.
population['California':'New York']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [42]:
# One label per level pins down a single scalar value.
population['California',2000]

33871648

In [44]:
# unstack() pivots the inner index level (the years) into columns, turning
# the MultiIndexed Series into a DataFrame with one row per state.

population_df = population.unstack()
population_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [48]:
# stack() is the inverse: the year columns are folded back into an inner
# index level, giving a MultiIndexed Series again.

population_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [51]:
# Extra columns can share the same hierarchical row index, which makes
# per-(state, year) arithmetic between columns straightforward.

pop_df = population.to_frame(name='total').assign(
    under18=[9267089, 9284094,
             4687374, 4318033,
             5906301, 6879014],
)
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [52]:
# Stacking the DataFrame moves its column labels into a new innermost
# index level, giving a Series with three index levels.
pop_df.stack()

California  2000  total      33871648
                  under18     9267089
            2010  total      37253956
                  under18     9284094
New York    2000  total      18976457
                  under18     4687374
            2010  total      19378102
                  under18     4318033
Texas       2000  total      20851820
                  under18     5906301
            2010  total      25145561
                  under18     6879014
dtype: int64

In [54]:
# Selecting a column returns a Series that keeps the two-level row index.
pop_df['under18']

California  2000    9267089
            2010    9284094
New York    2000    4687374
            2010    4318033
Texas       2000    5906301
            2010    6879014
Name: under18, dtype: int64

In [55]:
# The 'total' column likewise comes back as a MultiIndexed Series.
pop_df['total']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
Name: total, dtype: int64

In [58]:
# Element-wise division aligns on the shared row index, giving the
# fraction of each population that is under 18.
frac_u18 = pop_df['under18'].div(pop_df['total'])
frac_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [60]:
# Pivot the year level into columns to compare the fractions side by side.
frac_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Methods of MultiIndex Creation

#### Method No.1
    Pass a list of two or more index arrays to the constructor

In [64]:
# A list of parallel label arrays passed as `index` is read as one array
# per level, so the frame below gets a two-level MultiIndex.
# A seeded generator keeps the random values identical on every run.
Index = [list('aabb'), [1, 2, 1, 2]]
Columns = ['data1', 'data2']
df = pd.DataFrame(np.random.default_rng(0).random((4, 2)),
                  index=Index, columns=Columns)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.968166,0.292436
a,2,0.00054,0.666432
b,1,0.954487,0.76163
b,2,0.791939,0.840004


#### Method No.2
    
    Pass a Dictionary with tuples for keys

In [65]:
# A dict keyed by (state, year) tuples: pandas turns the tuple keys into a
# MultiIndex when the dict is passed to the Series constructor.
data = {
    (state, year): count
    for state, by_year in (
        ('California', {2000: 33871648, 2010: 37253956}),
        ('Texas', {2000: 20851820, 2010: 25145561}),
        ('New York', {2000: 18976457, 2010: 19378102}),
    )
    for year, count in by_year.items()
}

pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

### Explicit MultiIndex constructors

#### Method No.3

   Using the `pd.MultiIndex.from_*` class methods: `from_arrays`, `from_tuples` and `from_product`

In [66]:
# from_arrays: one array of labels per level, all of equal length.
pd.MultiIndex.from_arrays([list('aabb'), [1,2,1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [67]:
# from_tuples: one tuple per entry, holding that entry's label at each level.
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [68]:
# from_product: the Cartesian product of the per-level label lists.
pd.MultiIndex.from_product([list('ab'),[1,2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [73]:
# The MultiIndex constructor can also be called directly with
#   levels: the unique labels available at each level, and
#   codes:  for each level, the position in `levels` of every entry's label.
#
#   entry   codes (level 0, level 1)   resulting labels
#   -----   ------------------------   ----------------
#     0            (0, 0)                 ('a', 1)
#     1            (0, 1)                 ('a', 2)
#     2            (1, 0)                 ('b', 1)
#     3            (1, 1)                 ('b', 2)

TestMultiIndex = pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                               codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

print(TestMultiIndex)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


### MultiIndex Level Names

You can name the levels of the MultiIndex by assigning a list of names to its `names` attribute, e.g. `population.index.names = ['State', 'Year']`.

In [74]:
# Current state of the Series: the two index levels have no names yet.
population

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [78]:
# Assigning to index.names labels the levels in place; the names then
# appear as a header above the index in the output.
population.index.names = ['State', 'Year']
population

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex for Columns


Creating a MultiIndex for columns works the same way as for the index, except that you pass <br>
the MultiIndex object (for example one built with `pd.MultiIndex.from_product`) to the `columns` keyword of the <br>
`pd.DataFrame` constructor.

In [82]:


# Hierarchical row index (year x visit) and column index (patient x vital).
Index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
Columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['Patient', 'Vitals'])

# Mock some data from a seeded generator so every run produces the same
# table and the cells that use health_data further down stay reproducible.
data = np.round(np.random.default_rng(0).standard_normal((4, 6)), 1)
data[:, ::2] *= 10   # columns 0, 2, 4 are the HR columns: widen their spread
data += 37           # shift every column so it is centred on 37

# Assemble the DataFrame with both hierarchical axes.
health_data = pd.DataFrame(data, index=Index, columns=Columns)
health_data



Unnamed: 0_level_0,Patient,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Vitals,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,36.8,33.0,36.7,44.0,37.3
2013,2,51.0,38.0,27.0,37.6,22.0,35.9
2014,1,44.0,36.7,31.0,38.1,19.0,37.6
2014,2,38.0,38.4,36.0,37.3,35.0,35.7


In [83]:
# A top-level column label returns that patient's sub-frame (HR and Temp).
health_data['Guido']

Unnamed: 0_level_0,Vitals,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,33.0,36.7
2013,2,27.0,37.6
2014,1,31.0,38.1
2014,2,36.0,37.3


### Indexing and Slicing a MultiIndexed Pandas Object

In [99]:
# Recap: the population Series, indexed by (State, Year).
population


State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [101]:
# Passing one label per level (state, year) selects a single scalar value.
population['Texas', 2010]

25145561

In [103]:
# Indexing with only the first-level label returns all of that state's
# rows, indexed by the remaining Year level.
population['Texas']

Year
2000    20851820
2010    25145561
dtype: int64

In [108]:
# The same selections written with the explicit label-based .loc indexer:
# first one year across all states, then a slice of states.
print(population.loc[:,2010], '\n')
print(population.loc['California':'New York'])

State
California    37253956
New York      19378102
Texas         25145561
dtype: int64 

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64


In [113]:
# An empty slice on the first level combined with a label on the second
# level selects that year for every state.

population[:,2000]

State
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [114]:
# A boolean mask keeps only the entries that satisfy the condition.

population[population > 30000000]

State       Year
California  2000    33871648
            2010    37253956
dtype: int64

In [115]:
# Sorting by value reorders the rows; each row keeps its (State, Year) label.
population.sort_values()

State       Year
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
California  2000    33871648
            2010    37253956
dtype: int64

In [120]:
# Fancy indexing: a list of first-level labels returns those states in
# the order listed.

population[['Texas','California']]

State       Year
Texas       2000    20851820
            2010    25145561
California  2000    33871648
            2010    37253956
dtype: int64

### Multiply Indexed DataFrames

In [121]:
# Recap: the health_data frame, with hierarchical rows and columns.
health_data

Unnamed: 0_level_0,Patient,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Vitals,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,36.8,33.0,36.7,44.0,37.3
2013,2,51.0,38.0,27.0,37.6,22.0,35.9
2014,1,44.0,36.7,31.0,38.1,19.0,37.6
2014,2,38.0,38.4,36.0,37.3,35.0,35.7


In [122]:
# A (Patient, Vitals) pair of column labels selects one column as a Series.
health_data['Guido', 'HR']

year  visit
2013  1        33.0
      2        27.0
2014  1        31.0
      2        36.0
Name: (Guido, HR), dtype: float64

In [129]:
# Both column levels given: the result is Guido's heart-rate column.
health_data['Guido','HR']

year  visit
2013  1        33.0
      2        27.0
2014  1        31.0
      2        36.0
Name: (Guido, HR), dtype: float64

In [133]:
# Position-based selection: the first two rows, all columns.
health_data.iloc[:2]

Unnamed: 0_level_0,Patient,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Vitals,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,36.8,33.0,36.7,44.0,37.3
2013,2,51.0,38.0,27.0,37.6,22.0,35.9


In [134]:
# Position-based selection on both axes: first two rows, first two columns.
health_data.iloc[:2,:2]

Unnamed: 0_level_0,Patient,Bob,Bob
Unnamed: 0_level_1,Vitals,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,49.0,36.8
2013,2,51.0,38.0


In [153]:
# Label slice on the top column level: every column from 'Guido' through 'Sue'.
health_data.loc[:, 'Guido':'Sue']

Unnamed: 0_level_0,Patient,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Vitals,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2013,1,33.0,36.7,44.0,37.3
2013,2,27.0,37.6,22.0,35.9
2014,1,31.0,38.1,19.0,37.6
2014,2,36.0,37.3,35.0,35.7


In [180]:
# pd.IndexSlice makes per-level slicing inside .loc much more convenient:
# idx[a, b] builds the tuple of labels/slices, one entry per level.

idx = pd.IndexSlice
# visit 2 of every year, all columns
print(health_data.loc[idx[:,2],:], '\n\n')
# visit 2 of every year, HR of every patient
print(health_data.loc[idx[:,2],idx[:,'HR']], '\n\n')
# all visits of 2013, HR of every patient
print(health_data.loc[idx[2013],idx[:,'HR']], '\n\n')
# the single row (2013, visit 2), HR for patients 'Guido' through 'Sue'
print(health_data.loc[idx[2013,2],idx['Guido':'Sue','HR']], '\n\n')
# all visits of 2013, HR for patients 'Guido' through 'Sue'
print(health_data.loc[idx[2013],idx['Guido':'Sue','HR']], '\n\n')
# all visits of 2013, every vital for patients 'Guido' through 'Sue'
print(health_data.loc[idx[2013],idx['Guido':'Sue']], '\n\n')
# every row, every vital for patients 'Guido' through 'Sue'
print(health_data.loc[:,idx['Guido':'Sue']], '\n\n')

Patient      Bob       Guido         Sue      
Vitals        HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 2      51.0  38.0  27.0  37.6  22.0  35.9
2014 2      38.0  38.4  36.0  37.3  35.0  35.7 


Patient      Bob Guido   Sue
Vitals        HR    HR    HR
year visit                  
2013 2      51.0  27.0  22.0
2014 2      38.0  36.0  35.0 


Patient   Bob Guido   Sue
Vitals     HR    HR    HR
visit                    
1        49.0  33.0  44.0
2        51.0  27.0  22.0 


Patient  Vitals
Guido    HR        27.0
Sue      HR        22.0
Name: (2013, 2), dtype: float64 


Patient Guido   Sue
Vitals     HR    HR
visit              
1        33.0  44.0
2        27.0  22.0 


Patient Guido         Sue      
Vitals     HR  Temp    HR  Temp
visit                          
1        33.0  36.7  44.0  37.3
2        27.0  37.6  22.0  35.9 


Patient    Guido         Sue      
Vitals        HR  Temp    HR  Temp
year visit                        
2013 1     

### Rearranging  with Multi-Indices

In [181]:
# Label slicing on a MultiIndex requires the index to be sorted.
# The product below deliberately lists 'c' before 'b', so the first level
# is out of order. A seeded generator keeps the values reproducible.

index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.default_rng(0).random(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.485109
      2      0.230489
c     1      0.629612
      2      0.385249
b     1      0.244373
      2      0.023099
dtype: float64

In [184]:
# Slicing the unsorted index raises; report the exception's class and
# message instead of letting the traceback halt the notebook.
try:
    data['a':'b']
except KeyError as err:
    print(type(err), err, sep='\n')

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [187]:
# sort_index() returns a copy ordered by the index levels, after which the
# same label slice succeeds.
data_sorted = data.sort_index()

print(data_sorted, '\n\n')
print(data_sorted['a':'b'])

char  int
a     1      0.485109
      2      0.230489
b     1      0.244373
      2      0.023099
c     1      0.629612
      2      0.385249
dtype: float64 


char  int
a     1      0.485109
      2      0.230489
b     1      0.244373
      2      0.023099
dtype: float64


### Stacking and Unstacking indices

In [193]:
population.unstack() # with no argument the innermost level (here Year, level 1) becomes the columns

Year,2000,2010
State,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [194]:
# level=0 moves the outer State level into the columns, leaving Year as the row index.
population.unstack(level=0)

State,California,New York,Texas
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [195]:
# level=1 moves the inner Year level into the columns, leaving State as the row index.
population.unstack(level=1)

Year,2000,2010
State,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [200]:
# Swap the order of the Series' index levels: unstack(level=0) moves State
# into the columns, then stack() folds it back in as the inner level.

population.unstack(level=0).stack() 

Year  State     
2000  California    33871648
      New York      18976457
      Texas         20851820
2010  California    37253956
      New York      19378102
      Texas         25145561
dtype: int64

### Index Setting and Resetting

In [207]:
# reset_index() moves the index levels into ordinary columns, so the
# Series becomes a DataFrame with a fresh integer index starting at 0.
#
# Named levels (here State and Year) keep their names as column headers.
# NOTE(review): unnamed levels should instead receive default headers such
# as level_0, level_1 -- confirm against the Series.reset_index docs.
#
# The values column of an unnamed Series is headed 0 unless the `name`
# keyword supplies a header, as in the last call below.
print(population, '\n\n')
print(population.reset_index(), '\n\n')
print(population.reset_index(name='population'))

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64 


        State  Year         0
0  California  2000  33871648
1  California  2010  37253956
2    New York  2000  18976457
3    New York  2010  19378102
4       Texas  2000  20851820
5       Texas  2010  25145561 


        State  Year  population
0  California  2000    33871648
1  California  2010    37253956
2    New York  2000    18976457
3    New York  2010    19378102
4       Texas  2000    20851820
5       Texas  2010    25145561


In [214]:
# set_index() is the reverse operation: it builds the index from the listed
# columns, stacking the levels in whatever order is requested.
# Sorting by 'Year' first groups equal Year labels together, so the
# (Year, State) result prints as a clean hierarchy.
# NOTE(review): set_index itself does not appear to require sorted keys;
# sorting matters for the display and for later label slicing -- confirm.

pop_flat = population.reset_index(name='population')

print(pop_flat, '\n\n')
print(pop_flat.set_index(['State','Year']), '\n\n')
pop_flat_byYear = pop_flat.sort_values('Year')
print(pop_flat_byYear.set_index(['Year','State']))

        State  Year  population
0  California  2000    33871648
1  California  2010    37253956
2    New York  2000    18976457
3    New York  2010    19378102
4       Texas  2000    20851820
5       Texas  2010    25145561 


                 population
State      Year            
California 2000    33871648
           2010    37253956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561 


                 population
Year State                 
2000 California    33871648
     New York      18976457
     Texas         20851820
2010 California    37253956
     New York      19378102
     Texas         25145561


### Data Aggregation and Multi-Indices

In [220]:
# With no arguments the aggregation collapses the rows, producing one mean
# per (Patient, Vitals) column.

health_data.mean()

Patient  Vitals
Bob      HR        45.500
         Temp      37.475
Guido    HR        31.750
         Temp      37.425
Sue      HR        30.000
         Temp      36.625
dtype: float64

In [222]:
# Average within each year, keeping one row per year.
#
# The `level=` keyword of DataFrame.mean was deprecated in pandas 1.3 and
# removed in 2.0; grouping on the index level is the supported spelling
# and produces the same table.
#
# groupby works along the row index by default, so `level` must name a
# row-index level here ('year' or 'visit'); a column level raises an error.

health_data.groupby(level='year').mean()

Patient,Bob,Bob,Guido,Guido,Sue,Sue
Vitals,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,50.0,37.4,30.0,37.15,33.0,36.6
2014,41.0,37.55,33.5,37.7,27.0,36.65


In [223]:
# Average within each visit number. groupby(level=...).mean() replaces
# the removed mean(level=...) spelling (gone since pandas 2.0).
health_data.groupby(level='visit').mean()

Patient,Bob,Bob,Guido,Guido,Sue,Sue
Vitals,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,46.5,36.75,32.0,37.4,31.5,37.45
2,44.5,38.2,31.5,37.45,28.5,35.8


In [230]:
# Aggregating across a *column* level: average each patient's vitals
# within every (year, visit) row.
#
# Where pandas accepts an axis keyword, 0 / 'index' / 'rows' refers to the
# row index and 1 / 'columns' refers to the columns.
#
# mean(axis='columns', level=...) no longer exists in pandas >= 2.0 and
# groupby(axis=1) is deprecated as well, so transpose, group on what is
# then a row-index level, and transpose back.

health_data.T.groupby(level='Patient').mean().T

Unnamed: 0_level_0,Patient,Bob,Guido,Sue
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1,42.9,34.85,40.65
2013,2,44.5,32.3,28.95
2014,1,40.35,34.55,28.3
2014,2,38.2,36.65,35.35


In [231]:
# Average each vital sign across patients for every (year, visit) row.
# Transpose + groupby replaces the removed mean(axis='columns', level=...)
# spelling (gone since pandas 2.0).
health_data.T.groupby(level='Vitals').mean().T

Unnamed: 0_level_0,Vitals,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,42.0,36.933333
2013,2,33.333333,37.166667
2014,1,31.333333,37.466667
2014,2,36.333333,37.133333


In [232]:
# Row-wise aggregation by visit number, written with groupby because
# mean(axis='rows', level=...) was removed in pandas 2.0. Grouping acts
# on the row index by default, which matches the original axis='rows'.
health_data.groupby(level='visit').mean()

Patient,Bob,Bob,Guido,Guido,Sue,Sue
Vitals,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,46.5,36.75,32.0,37.4,31.5,37.45
2,44.5,38.2,31.5,37.45,28.5,35.8
