In [1]:
import numpy as np
import pandas as pd

In [2]:
index = [('California', 2000), ('California', 2010),
        ('New York', 2000), ('New York', 2010),
        ('Texas', 2000), ('Texas', 2010)]

population = [33871648, 37253956,
             18976457, 19378102,
             20851820, 25145561]

pop = pd.Series(population, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [3]:
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [4]:
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [5]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [6]:
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [7]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [8]:
pop['Texas', 2000]

20851820

In [9]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [10]:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [11]:
pop_df.unstack()

2000  California    33871648
      New York      18976457
      Texas         20851820
2010  California    37253956
      New York      19378102
      Texas         25145561
dtype: int64

In [12]:
pop_df = pd.DataFrame({'total':pop,
                      'under18': [9267089, 9284094,
                                 4687374, 4318033,
                                 5906301, 6878014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6878014


In [13]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273528
dtype: float64

In [14]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273528


In [15]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1,2,1,2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.853407,0.196507
a,2,0.686038,0.999934
b,1,0.559816,0.743529
b,2,0.924437,0.885995


In [16]:
data = {('California', 2000): 33871648,
       ('California', 2010): 37253956,
       ('Texas', 2000): 20851820,
       ('Texas', 2010): 25145561,
       ('New York', 2000): 18976457,
       ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [17]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [18]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [19]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [22]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]], labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

TypeError: __new__() got an unexpected keyword argument 'labels'

In [23]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [24]:
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [25]:
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                  names=['year', 'visit'])
print(index)
print("=====================================")
columns = pd.MultiIndex.from_product([['Bob', 'JY', 'Sue'], ['HR', 'Temp']],
                                    names=['subject', 'type'])

print(columns)
print("=====================================")
data = np.round(np.random.randn(4, 6), 1)
print(data)
print("=====================================")
data[:, ::2] *= 10
print(data)
print("=====================================")
data += 37
print(data)
print("=====================================")

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])
MultiIndex([('Bob',   'HR'),
            ('Bob', 'Temp'),
            ( 'JY',   'HR'),
            ( 'JY', 'Temp'),
            ('Sue',   'HR'),
            ('Sue', 'Temp')],
           names=['subject', 'type'])
[[ 0.  -0.2 -1.1  0.1  0.7  1.8]
 [ 1.4 -1.6 -2.5  0.4  1.6  0.8]
 [-0.9  0.9 -1.   0.1  1.4 -0.8]
 [ 0.1  0.5  1.5 -0.7 -0.8 -1.1]]
[[  0.   -0.2 -11.    0.1   7.    1.8]
 [ 14.   -1.6 -25.    0.4  16.    0.8]
 [ -9.    0.9 -10.    0.1  14.   -0.8]
 [  1.    0.5  15.   -0.7  -8.   -1.1]]
[[37.  36.8 26.  37.1 44.  38.8]
 [51.  35.4 12.  37.4 53.  37.8]
 [28.  37.9 27.  37.1 51.  36.2]
 [38.  37.5 52.  36.3 29.  35.9]]


Unnamed: 0_level_0,subject,Bob,Bob,JY,JY,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,36.8,26.0,37.1,44.0,38.8
2013,2,51.0,35.4,12.0,37.4,53.0,37.8
2014,1,28.0,37.9,27.0,37.1,51.0,36.2
2014,2,38.0,37.5,52.0,36.3,29.0,35.9


In [26]:
pd.set_option('colheader_justify', 'left')
health_data

Unnamed: 0_level_0,subject,Bob,Bob,JY,JY,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,36.8,26.0,37.1,44.0,38.8
2013,2,51.0,35.4,12.0,37.4,53.0,37.8
2014,1,28.0,37.9,27.0,37.1,51.0,36.2
2014,2,38.0,37.5,52.0,36.3,29.0,35.9


In [27]:
health_data['JY']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,26.0,37.1
2013,2,12.0,37.4
2014,1,27.0,37.1
2014,2,52.0,36.3


In [28]:
health_data['HR']

KeyError: 'HR'

In [30]:
health_data[['HR']]

KeyError: "['HR'] not in index"

In [33]:
health_data['JY', 'HR'].unstack()

visit,1,2
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,26.0,12.0
2014,27.0,52.0


In [39]:
health_data[:][0]

KeyError: 0

In [40]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [41]:
pop['California', 2000]

33871648

In [42]:
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [43]:
pop[2020]

IndexError: index 2020 is out of bounds for axis 0 with size 6

In [44]:
pop[:,2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [45]:
pop.loc['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [46]:
pop[pop > 20000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [47]:
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [48]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,JY,JY,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,36.8,26.0,37.1,44.0,38.8
2013,2,51.0,35.4,12.0,37.4,53.0,37.8
2014,1,28.0,37.9,27.0,37.1,51.0,36.2
2014,2,38.0,37.5,52.0,36.3,29.0,35.9


In [49]:
health_data['JY', 'HR']

year  visit
2013  1        26.0
      2        12.0
2014  1        27.0
      2        52.0
Name: (JY, HR), dtype: float64

In [51]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,37.0,36.8
2013,2,51.0,35.4


In [52]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        37.0
      2        51.0
2014  1        28.0
      2        38.0
Name: (Bob, HR), dtype: float64

In [53]:
health_data.loc[(:, 1), (:, 'HR')]

SyntaxError: invalid syntax (<ipython-input-53-fb34fa30ac09>, line 1)

In [54]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,JY,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,37.0,26.0,44.0
2014,1,28.0,27.0,51.0


In [55]:
health_data.loc[:, idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,JY,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,37.0,26.0,44.0
2013,2,51.0,12.0,53.0
2014,1,28.0,27.0,51.0
2014,2,38.0,52.0,29.0


In [56]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.466938
      2      0.808952
c     1      0.536370
      2      0.440749
b     1      0.770058
      2      0.181746
dtype: float64

In [57]:
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [58]:
data = data.sort_index()
data

char  int
a     1      0.466938
      2      0.808952
b     1      0.770058
      2      0.181746
c     1      0.536370
      2      0.440749
dtype: float64

In [59]:
data['a':'b']

char  int
a     1      0.466938
      2      0.808952
b     1      0.770058
      2      0.181746
dtype: float64

In [60]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [61]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [62]:
pop.unstack().stack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [63]:
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [64]:
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [65]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,JY,JY,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.0,36.8,26.0,37.1,44.0,38.8
2013,2,51.0,35.4,12.0,37.4,53.0,37.8
2014,1,28.0,37.9,27.0,37.1,51.0,36.2
2014,2,38.0,37.5,52.0,36.3,29.0,35.9


In [67]:
data_mean = health_data.mean(level='year')
data_mean

subject,Bob,Bob,JY,JY,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,44.0,36.1,19.0,37.25,48.5,38.3
2014,33.0,37.7,39.5,36.7,40.0,36.05


In [69]:
data_mean.mean(axis=1, level='type')

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,37.166667,37.216667
2014,37.5,36.816667
