# Data Wrangling: Join, Combine and Reshape

## Hierarcihcal Indexing

Hierarchical indexing enables multiple index levels on axis. Anther way to look at it is as working with higher dim data in lower dim form. 

In [1]:
import pandas as pd
import numpy as np

In [9]:
data = pd.Series(np.random.uniform(size=9),
                 index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

a  1    0.731638
   2    0.919470
   3    0.254580
b  1    0.105696
   3    0.771312
c  1    0.946652
   2    0.644269
d  2    0.753497
   3    0.321431
dtype: float64

In [10]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [11]:
data["b"]

1    0.105696
3    0.771312
dtype: float64

In [12]:
data["b":"c"]

b  1    0.105696
   3    0.771312
c  1    0.946652
   2    0.644269
dtype: float64

In [13]:
data[["b", "d"]]

b  1    0.105696
   3    0.771312
d  2    0.753497
   3    0.321431
dtype: float64

In [14]:
# Selection is even possible from am inner level
data.loc[:, 2]

a    0.919470
c    0.644269
d    0.753497
dtype: float64

In [15]:
# For example data can be transformed to dataframe using unstack
data.unstack()

Unnamed: 0,1,2,3
a,0.731638,0.91947,0.25458
b,0.105696,,0.771312
c,0.946652,0.644269,
d,,0.753497,0.321431


In [16]:
# The inverse operation of unstack i stack
data.unstack().stack()

a  1    0.731638
   2    0.919470
   3    0.254580
b  1    0.105696
   3    0.771312
c  1    0.946652
   2    0.644269
d  2    0.753497
   3    0.321431
dtype: float64

In [17]:
# With dataframe either axis can hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4,3)), 
                     index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
                     columns=[["Ohio", "Ohio", "Colorado"],
                              ["Green", "Red", "Green"]])

frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [19]:
# The hierarchical levels can have names.
frame.index.names = ["key1", "key2"]
frame.columns.names = ["state", "color"]
frame
# Names superseeds name attribute which can be used in single level indexes

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [20]:
# Number of levels is returned by
frame.index.nlevels

2

In [21]:
# Same selection logic can be applied to columns as for indexes
frame["Ohio"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [23]:
# A multiindex can be created by itself and then reused
pd.MultiIndex.from_arrays([["Ohio", "Ohio", "Colorado"],
                         ["Green", "Red", "Green"]],
                         names=["state", "color"])

MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])

## Reorderong and Sorting Levels

Sometimes order of levels need to be rearanged and sometimes ordering on a specific level is needed. The `swaplevel` method takes two level numbers or names and returns new object with levels interchanged

In [26]:
frame.swaplevel("key1", "key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


`sort_index` by default sorts on all levels lexiographically but it also acceps parameter level to sort only by one leve

In [27]:
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [28]:
frame.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## Summary Statistics by Level

In [30]:
frame.groupby(level="key2").sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [34]:
frame.groupby(level="color", axis=1).sum()

  frame.groupby(level="color", axis=1).sum()


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [36]:
frame.T.groupby(level="color").sum()

key1,a,a,b,b
key2,1,2,1,2
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Green,2,8,14,20
Red,1,4,7,10


## Indexing with a DataFrame's columns

In [37]:
# It is not unusual to move one or two columns of dataframe as the row index. Sometimes and index is moved to be a dataframe column.
frame = pd.DataFrame({"a": range(7), "b": range(7, 0, -1),
                      "c": ["one", "one", "one", "two", "two",
                             "two", "two"],
                      "d": [0, 1, 2, 0, 1, 2, 3]})

frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3
