# 7.2 재형성과 피벗

* 표 형식의 데이터를 재배치하는 다양한 기본 연산이 존재하는데, 이런 연산을 재형성(reshaping) 또는 피벗 연산이라고 한다.

## 7.2.1 계층적 색인으로 재형성하기
* stack: 데이터의 칼럼을 로우로 피벗 또는 회전시킨다.
* unstack: 로우를 칼럼으로 피벗시킨다.

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# 2x3 frame holding 0..5; the row axis is named 'state' and the column axis
# 'number' so that stack()/unstack() can later refer to the levels by name.
data = DataFrame(
    data=np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)

In [3]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


* stack 메서드를 사용하면 칼럼이 로우로 피벗되어 이 경우에는 Series 객체를 반환한다.

In [4]:
# Pivot the 'number' column labels into an inner row level: the 2x3 frame
# becomes a 6-element Series indexed by (state, number).
result = data.stack()

In [5]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

* unstack 메서드를 사용하면 계층적 색인을 가진 Series로부터 DataFrame을 얻을 수 있다.

In [6]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


* stack, unstack 모두 가장 안쪽에 있는 것부터 끄집어낸다.
* 레벨 이름이나 숫자를 전달해서 끄집어낼 단계를 지정할 수 있다.

In [7]:
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [8]:
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


* 해당 레벨에 있는 모든 값이 하위 그룹마다 빠짐없이 들어 있지 않은 경우, unstack을 하게 되면 누락된 데이터가 생길 수 있다.

In [9]:
# First piece: labels a-d.
s1 = Series(data=[0, 1, 2, 3], index=list('abcd'))

In [10]:
# Second piece: labels c-e, overlapping the first piece only on c and d.
s2 = Series(data=[4, 5, 6], index=list('cde'))

In [11]:
# Glue the two Series end to end; each entry of `keys` tags one piece and
# becomes the outer level of the resulting two-level index.
data2 = pd.concat(objs=[s1, s2], keys=['one', 'two'])

In [12]:
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [13]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


* stack 메서드는 누락된 데이터를 자동으로 걸러낸다.

In [14]:
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [15]:
# dropna=False keeps the NaN combinations (one/e, two/a, two/b) that the
# default stack() call filters out.
# NOTE(review): newer pandas releases rework stack() and its dropna argument
# (future_stack) — confirm against the installed version.
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

* DataFrame을 unstack 할 때, unstack() 레벨은 결과에서 가장 낮은 단계가 된다.

In [16]:
# Two-column frame sharing result's (state, number) row index; the column
# axis is named 'side' so it can be addressed by name when stacking.
df = DataFrame(
    data={'left': result, 'right': result + 5},
    columns=pd.Index(['left', 'right'], name='side'),
)

In [17]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [18]:
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [19]:
# Move 'state' from the rows into the columns, then move 'side' from the
# columns down into the rows: the result is indexed by (number, side) with
# one column per state.
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


## 7.2.2 피버팅으로 데이터 나열 방식 바꾸기

In [20]:
data = pd.read_csv('../data/macrodata.csv')

In [21]:
# Quarterly periods built from data.year / data.quarter, labelled 'date'.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
# Rebind `data` to a frame holding only realgdp/infl/unemp (column axis named
# 'item'), indexed by the last day of each quarter.
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))
# Wide -> long: stack() moves 'item' into the rows, reset_index() turns both
# index levels into ordinary columns, and the unnamed value column 0 is
# renamed to 'value'.
ldata = data.stack().reset_index().rename(columns={0: 'value'})

In [22]:
ldata[:10]

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34
5,1959-06-30,unemp,5.1
6,1959-09-30,realgdp,2775.488
7,1959-09-30,infl,2.74
8,1959-09-30,unemp,5.3
9,1959-12-31,realgdp,2785.204


In [23]:
data = pd.read_csv('../data/macrodata.csv')

In [24]:
# One quarterly period per row, assembled from data.year and data.quarter;
# the resulting index is named 'date'.
# NOTE(review): recent pandas deprecates passing year=/quarter= fields to the
# PeriodIndex constructor (PeriodIndex.from_fields is the suggested
# replacement) — confirm for the installed version.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')

In [25]:
periods

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')

In [26]:
# Rebind `data` to a frame holding only realgdp/infl/unemp (column axis named
# 'item'); to_timestamp('D', 'end') labels each row with the last day of its
# quarter.
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))

In [27]:
data.head()

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,2710.349,0.0,5.8
1959-06-30,2778.801,2.34,5.1
1959-09-30,2775.488,2.74,5.3
1959-12-31,2785.204,0.27,5.6
1960-03-31,2847.699,2.31,5.2


In [28]:
# Wide -> long in three steps: stack() moves 'item' into the rows,
# reset_index() turns 'date' and 'item' into ordinary columns, and the unnamed
# value column 0 is renamed to 'value'.
ldata = data.stack().reset_index().rename(columns={0: 'value'})

In [29]:
ldata.head(10)

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34
5,1959-06-30,unemp,5.1
6,1959-09-30,realgdp,2775.488
7,1959-09-30,infl,2.74
8,1959-09-30,unemp,5.3
9,1959-12-31,realgdp,2785.204


In [30]:
data.stack().head()

date        item   
1959-03-31  realgdp    2710.349
            infl          0.000
            unemp         5.800
1959-06-30  realgdp    2778.801
            infl          2.340
dtype: float64

In [31]:
data.stack().reset_index().head()

Unnamed: 0,date,item,0
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34


* pivot 메서드의 처음 두 인자는 로우와 칼럼 색인으로 사용될 칼럼 이름이고 마지막 value는 DataFrame에 채워 넣을 값을 담고 있는 칼럼이다.

In [32]:
# Long -> wide: one row per 'date', one column per 'item', cells filled from
# 'value'. The arguments are passed by keyword because DataFrame.pivot's
# parameters are keyword-only since pandas 2.0 (a positional call raises
# TypeError); the keyword form also works on older releases.
pivoted = ldata.pivot(index='date', columns='item', values='value')

In [33]:
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.0,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2


* 한 번에 2개의 칼럼을 변형
* 마지막 인자를 생략해서 계층적 칼럼을 가지는 DataFrame을 얻을 수 있다.

In [34]:
# Add a second value column of standard-normal noise so that two columns can
# be pivoted at once. No seed is set in the visible cells, so the numbers
# differ from run to run.
ldata['value2'] = np.random.randn(len(ldata))

In [35]:
ldata[:10]

Unnamed: 0,date,item,value,value2
0,1959-03-31,realgdp,2710.349,-0.114851
1,1959-03-31,infl,0.0,-0.451423
2,1959-03-31,unemp,5.8,0.66515
3,1959-06-30,realgdp,2778.801,-1.146776
4,1959-06-30,infl,2.34,-0.891261
5,1959-06-30,unemp,5.1,0.311278
6,1959-09-30,realgdp,2775.488,1.634578
7,1959-09-30,infl,2.74,0.133153
8,1959-09-30,unemp,5.3,1.228262
9,1959-12-31,realgdp,2785.204,0.213844


In [36]:
# With `values` omitted, every remaining column ('value' and 'value2') is
# pivoted, giving hierarchical columns of (value column, item). Keywords are
# used because DataFrame.pivot's parameters are keyword-only since pandas 2.0
# (a positional call raises TypeError); the keyword form also works on older
# releases.
pivoted = ldata.pivot(index='date', columns='item')

In [37]:
pivoted[:5]

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,-0.451423,-0.114851,0.66515
1959-06-30,2.34,2778.801,5.1,-0.891261,-1.146776,0.311278
1959-09-30,2.74,2775.488,5.3,0.133153,1.634578,1.228262
1959-12-31,0.27,2785.204,5.6,1.360587,0.213844,-1.670057
1960-03-31,2.31,2847.699,5.2,-0.553967,-0.479792,-0.551893


In [38]:
pivoted['value'][:5]

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.0,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2


* pivot 메서드는 단지 set_index를 사용해서 계층적 색인을 만들고 unstack 메서드를 이용해서 형태를 변경하는 단축키 같은 메서드다.

In [39]:
# The same reshape as pivot, done by hand: index the rows by (date, item),
# then swing the 'item' level up into the columns.
unstacked = (
    ldata
    .set_index(keys=['date', 'item'])
    .unstack(level='item')
)

In [40]:
unstacked.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,-0.451423,-0.114851,0.66515
1959-06-30,2.34,2778.801,5.1,-0.891261,-1.146776,0.311278
1959-09-30,2.74,2775.488,5.3,0.133153,1.634578,1.228262
1959-12-31,0.27,2785.204,5.6,1.360587,0.213844,-1.670057
1960-03-31,2.31,2847.699,5.2,-0.553967,-0.479792,-0.551893
