In [1]:
import pandas as pd
import numpy as np

In [2]:
# Keep the raw frame around as `dfo`; later cells pull plain columns from it.
dfo = pd.read_csv('flights2.csv.gz', low_memory=False)
# Move destination and airline into a two-level row index, sorted by label.
df = dfo.set_index(['DESTINATION_AIRPORT', 'AIRLINE']).sort_index()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
DESTINATION_AIRPORT,AIRLINE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ABE,DL,2015,1,4,7,2036,N979AT,ATL,2101,2101.0,0.0,...,2252.0,1.0,0,0,,,,,,
ABE,DL,2015,1,5,1,1411,N989AT,ATL,2034,2032.0,-2.0,...,2227.0,2.0,0,0,,,,,,
ABE,DL,2015,1,6,2,1411,N979AT,ATL,2034,2032.0,-2.0,...,2221.0,-4.0,0,0,,,,,,
ABE,DL,2015,1,7,3,1411,N958AT,ATL,2034,2031.0,-3.0,...,2226.0,1.0,0,0,,,,,,
ABE,DL,2015,1,8,4,1411,N994AT,ATL,2034,2030.0,-4.0,...,2223.0,-2.0,0,0,,,,,,


## Constructing MultiIndexes

### Using Set Index

In [3]:
# Inspect the two-level (DESTINATION_AIRPORT, AIRLINE) row index that set_index produced.
df.index

MultiIndex([('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ('ABE', 'DL'),
            ...
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO'),
            ('YUM', 'OO')],
           names=['DESTINATION_AIRPORT', 'AIRLINE'], length=469968)

### From numpy arrays

In [4]:
# Pull the two label columns out of the raw frame as plain NumPy arrays.
airlines, dests = (
    dfo[col].to_numpy() for col in ('AIRLINE', 'DESTINATION_AIRPORT')
)

display(airlines, dests)

array(['AS', 'AA', 'US', ..., 'F9', 'F9', 'UA'], dtype=object)

array(['SEA', 'PBI', 'CLT', ..., 'TPA', 'ATL', 'DEN'], dtype=object)

In [5]:
# Build a MultiIndex from parallel 1-D arrays, one array per level.
# Note the level order here is (airline, destination) -- the reverse of df.index --
# and the levels are unnamed because no `names=` is passed.
pd.MultiIndex.from_arrays([airlines, dests])

MultiIndex([('AS', 'SEA'),
            ('AA', 'PBI'),
            ('US', 'CLT'),
            ('AA', 'MIA'),
            ('AS', 'ANC'),
            ('DL', 'MSP'),
            ('NK', 'MSP'),
            ('US', 'CLT'),
            ('AA', 'DFW'),
            ('DL', 'ATL'),
            ...
            ('UA', 'SFO'),
            ('B6', 'JFK'),
            ('US', 'MSP'),
            ('B6', 'MCO'),
            ('B6', 'PSE'),
            ('B6', 'BQN'),
            ('DL', 'DTW'),
            ('F9', 'TPA'),
            ('F9', 'ATL'),
            ('UA', 'DEN')],
           length=469968)

### From a list of tuples

In [6]:
# First ten (destination, airline) pairs of the raw frame, as a list of plain tuples.
tups = list(map(tuple, dfo[['DESTINATION_AIRPORT', 'AIRLINE']].iloc[:10].to_numpy()))
tups

[('SEA', 'AS'),
 ('PBI', 'AA'),
 ('CLT', 'US'),
 ('MIA', 'AA'),
 ('ANC', 'AS'),
 ('MSP', 'DL'),
 ('MSP', 'NK'),
 ('CLT', 'US'),
 ('DFW', 'AA'),
 ('ATL', 'DL')]

In [7]:
# Each tuple becomes one index entry; the position inside the tuple decides the level.
pd.MultiIndex.from_tuples(tups)

MultiIndex([('SEA', 'AS'),
            ('PBI', 'AA'),
            ('CLT', 'US'),
            ('MIA', 'AA'),
            ('ANC', 'AS'),
            ('MSP', 'DL'),
            ('MSP', 'NK'),
            ('CLT', 'US'),
            ('DFW', 'AA'),
            ('ATL', 'DL')],
           )

### Cross product

In [8]:
# Cartesian product of the three iterables: 3 * 2 * 2 = 12 entries, with the
# last iterable varying fastest.
pd.MultiIndex.from_product([[1, 2, 3], [2000, 2001], ['Sem 1', 'Sem 2']])

MultiIndex([(1, 2000, 'Sem 1'),
            (1, 2000, 'Sem 2'),
            (1, 2001, 'Sem 1'),
            (1, 2001, 'Sem 2'),
            (2, 2000, 'Sem 1'),
            (2, 2000, 'Sem 2'),
            (2, 2001, 'Sem 1'),
            (2, 2001, 'Sem 2'),
            (3, 2000, 'Sem 1'),
            (3, 2000, 'Sem 2'),
            (3, 2001, 'Sem 1'),
            (3, 2001, 'Sem 2')],
           )

### Naming your index

In [9]:
# Same cross product as the previous cell, with the level names supplied at
# construction time instead of being assigned afterwards.
index = pd.MultiIndex.from_product(
    [[1, 2, 3], [2000, 2001], ['Sem 1', 'Sem 2']],
    names=['Student', 'Year', 'Sem'],
)
index

MultiIndex([(1, 2000, 'Sem 1'),
            (1, 2000, 'Sem 2'),
            (1, 2001, 'Sem 1'),
            (1, 2001, 'Sem 2'),
            (2, 2000, 'Sem 1'),
            (2, 2000, 'Sem 2'),
            (2, 2001, 'Sem 1'),
            (2, 2001, 'Sem 2'),
            (3, 2000, 'Sem 1'),
            (3, 2000, 'Sem 2'),
            (3, 2001, 'Sem 1'),
            (3, 2001, 'Sem 2')],
           names=['Student', 'Year', 'Sem'])

### MultiIndex columns

In [10]:
# One inner list per row; values line up with the four (person, measurement) columns below.
data = [
    [1.1, 40, 0.9, 30],
    [1.2, 45, 1.05, 40],
    [1.3, 65, 1.3, 55],
    [1.5, 80, 1.6, 75],
]
# Two-level column labels: person on the outer level, measurement on the inner one.
columns = pd.MultiIndex.from_product([['Jack', 'Jill'], ['Height', 'Age']])
# One timestamp per row, used as the row index.
rows = pd.to_datetime(['2000-01-01', '2001-01-01', '2002-01-01', '2003-01-01'])
# Build the frame with the MultiIndex columns first, then attach the datetime row labels.
df2 = pd.DataFrame(data, columns=columns)
df2 = df2.set_index(rows)
df2

Unnamed: 0_level_0,Jack,Jack,Jill,Jill
Unnamed: 0_level_1,Height,Age,Height,Age
2000-01-01,1.1,40,0.9,30
2001-01-01,1.2,45,1.05,40
2002-01-01,1.3,65,1.3,55
2003-01-01,1.5,80,1.6,75


In [11]:
# Selecting an outer-level column label returns the sub-frame beneath it, with that level dropped.
df2['Jack']

Unnamed: 0,Height,Age
2000-01-01,1.1,40
2001-01-01,1.2,45
2002-01-01,1.3,65
2003-01-01,1.5,80


In [12]:
# A full (outer, inner) tuple pins down a single column, so the result is a Series.
df2.loc[:, ('Jack', 'Height')]

2000-01-01    1.1
2001-01-01    1.2
2002-01-01    1.3
2003-01-01    1.5
Name: (Jack, Height), dtype: float64

In [13]:
# Plain [] indexing with the full column tuple also returns that single column as a Series.
df2[('Jack', 'Height')]

2000-01-01    1.1
2001-01-01    1.2
2002-01-01    1.3
2003-01-01    1.5
Name: (Jack, Height), dtype: float64

In [14]:
# pd.IndexSlice lets slice syntax (`:`) be written inside a MultiIndex key.
idx = pd.IndexSlice
# Every label on the outer column level, only 'Height' on the inner one.
df2.loc[:, idx[:, 'Height']]

Unnamed: 0_level_0,Jack,Jill
Unnamed: 0_level_1,Height,Height
2000-01-01,1.1,0.9
2001-01-01,1.2,1.05
2002-01-01,1.3,1.3
2003-01-01,1.5,1.6


In [15]:
# slice(None) is the explicit spelling of `:` for a level inside a tuple key.
df2.loc[:, (slice(None), 'Height')]

Unnamed: 0_level_0,Jack,Jill
Unnamed: 0_level_1,Height,Height
2000-01-01,1.1,0.9
2001-01-01,1.2,1.05
2002-01-01,1.3,1.3
2003-01-01,1.5,1.6


In [16]:
# Cross-section on the columns (axis=1): pick 'Height' from level 1.
# The selected level is dropped from the result, leaving only the outer labels.
df2.xs('Height', level=1, axis=1)

Unnamed: 0,Jack,Jill
2000-01-01,1.1,0.9
2001-01-01,1.2,1.05
2002-01-01,1.3,1.3
2003-01-01,1.5,1.6


In [17]:
# Name the two column levels so they can be referred to by label instead of position.
# This assigns on df2's existing columns index, so df2 itself changes from this cell onward.
df2.columns.names = ['name', 'info']
df2.columns

MultiIndex([('Jack', 'Height'),
            ('Jack',    'Age'),
            ('Jill', 'Height'),
            ('Jill',    'Age')],
           names=['name', 'info'])

In [18]:
# Cross-section of the 'Height' columns, choosing the level by its name ('info')
# rather than by its integer position.
df2.xs('Height', level='info', axis=1)

name,Jack,Jill
2000-01-01,1.1,0.9
2001-01-01,1.2,1.05
2002-01-01,1.3,1.3
2003-01-01,1.5,1.6


In [19]:
# Builds a boolean mask over the 'info' level and feeds it to iloc. It works,
# but the .loc / IndexSlice forms express the same selection more directly.
df2.iloc[:, df2.columns.get_level_values('info') == 'Height']  # Don't do this. It's silly.

name,Jack,Jill
info,Height,Height
2000-01-01,1.1,0.9
2001-01-01,1.2,1.05
2002-01-01,1.3,1.3
2003-01-01,1.5,1.6
