## 8장. 데이터 준비하기 : 조인, 병합, 변형

In [1]:
import numpy as np
import pandas as pd

In [2]:
## Hierarchical (multi-level) index: nine random values labelled by an
## outer letter level and an inner integer level.
data = pd.Series(
    np.random.rand(9),
    index=pd.MultiIndex.from_arrays(
        [list('aaabbccdd'), [1, 2, 3, 1, 3, 1, 2, 2, 3]]
    ),
)
data

a  1    0.655976
   2    0.006462
   3    0.175910
b  1    0.561766
   3    0.784544
c  1    0.807393
   2    0.623753
d  2    0.521831
   3    0.236988
dtype: float64

In [3]:
# unstack: pivot the innermost index level (the integers) into columns,
# turning the multi-indexed Series into a DataFrame. Label combinations
# that do not exist in `data` show up as NaN.
data_01 = data.unstack(level=-1)
data_01

Unnamed: 0,1,2,3
a,0.655976,0.006462,0.17591
b,0.561766,,0.784544
c,0.807393,0.623753,
d,,0.521831,0.236988


In [4]:
# stack: the inverse of unstack -- fold the columns back into the innermost
# index level. In the output shown here the NaN cells are gone again, so the
# result matches the original Series.
data_02 = data_01.stack(level=-1)
data_02

a  1    0.655976
   2    0.006462
   3    0.175910
b  1    0.561766
   3    0.784544
c  1    0.807393
   2    0.623753
d  2    0.521831
   3    0.236988
dtype: float64

In [5]:
## Hierarchical labels on both axes: a two-level row index and a
## two-level column index.
d1 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=pd.MultiIndex.from_arrays([list('aabb'), [1, 2, 7, 9]]),
    columns=pd.MultiIndex.from_arrays(
        [['LA', 'NY', 'NY'], ['Red', 'Blue', 'Red']]
    ),
)
d1

Unnamed: 0_level_0,Unnamed: 1_level_0,LA,NY,NY
Unnamed: 0_level_1,Unnamed: 1_level_1,Red,Blue,Red
a,1,0,1,2
a,2,3,4,5
b,7,6,7,8
b,9,9,10,11


In [6]:
## Sample frames for merge (join) examples.
## df_1 repeats keys (many rows per key); df_2 has exactly one row per key.
df_1 = pd.DataFrame({'key': list('bbacaab'), 'data1': range(7)})

df_2 = pd.DataFrame({'key': list('abc'), 'data2': range(3)})

In [7]:
# Show both input frames, one after the other.
print(df_1, df_2, sep='\n')

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
  key  data2
0   a      0
1   b      1
2   c      2


In [8]:
# Many-to-one join on 'key'. how='inner' is the default: only keys that
# appear in both frames are kept.
df_3 = df_1.merge(df_2, how='inner', on='key')
df_3

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0
6,c,3,2


###### pd.merge(data1, data2, on='key', how='xxx')
###### (1) inner : 기본값 — 양쪽 모두에 존재하는 키만 사용 (교집합)
###### (2) left / right : 좌측/우측 테이블의 키를 모두 유지 (반대쪽에 없는 값은 NaN)
###### (3) outer : 양쪽의 키를 모두 사용 (합집합)

In [9]:
# Load the two example tables used for the left/outer merge demos below.
# Both have a 'key' column with repeated values, so merging them is a
# many-to-many join. Paths are relative to the current working directory.
df1 = pd.read_csv('examples/chp08_df1.csv')
df2 = pd.read_csv('examples/chp08_df2.csv')

In [10]:
# Show both loaded frames, one after the other.
print(df1, df2, sep='\n')

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
  key  data2
0   a      0
1   b      1
2   a      2
3   b      3
4   d      4


In [11]:
# Left join: every row of df1 is kept. Keys repeated on both sides produce
# one output row per matching pair; a df1 key with no match in df2 gets NaN
# in the df2 columns.
df3 = df1.merge(df2, how='left', on='key')
df3

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [12]:
# Outer join: uses the union of the keys from both frames; values missing
# on either side become NaN.
df4 = df1.merge(df2, how='outer', on='key')
df4

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,0.0,3.0
2,b,1.0,1.0
3,b,1.0,3.0
4,b,5.0,1.0
5,b,5.0,3.0
6,a,2.0,0.0
7,a,2.0,2.0
8,a,4.0,0.0
9,a,4.0,2.0


In [13]:
## Filling in missing values with combine_first.
## Load two frames whose rows/columns overlap only partially and which
## both contain NaN values.
dff1 = pd.read_csv('examples/chp08_df1-1.csv')
dff2 = pd.read_csv('examples/chp08_df2-1.csv')

In [14]:
# Show both frames so the NaN positions can be compared.
print(dff1, dff2, sep='\n')

     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0


In [15]:
## combine_first keeps dff1's values wherever they are present and takes the
## value from dff2 only where dff1 is missing (NaN or label absent). The
## result covers the union of both frames' rows and columns.
dff3 = dff1.combine_first(other=dff2)
dff3

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


In [2]:
## Pivoting with stack / unstack.
## NOTE: this rebinds `data`, which earlier in the notebook held the
## multi-indexed Series.
data = pd.read_csv('examples/macrodata.csv')
# Print a summary of the columns, their non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 14 columns):
year        203 non-null float64
quarter     203 non-null float64
realgdp     203 non-null float64
realcons    203 non-null float64
realinv     203 non-null float64
realgovt    203 non-null float64
realdpi     203 non-null float64
cpi         203 non-null float64
m1          203 non-null float64
tbilrate    203 non-null float64
unemp       203 non-null float64
pop         203 non-null float64
infl        203 non-null float64
realint     203 non-null float64
dtypes: float64(14)
memory usage: 22.3 KB


In [3]:
# Preview the first five rows of the macro data.
data.head(n=5)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [4]:
## Reshape the wide macro table into long format (one row per date/item).
# Build the quarterly periods first: the year/quarter columns are dropped by
# the reindex below, so they must be read before `data` is rebound.
periods = pd.PeriodIndex(year=data['year'], quarter=data['quarter'], name='date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
# Keep only the three series of interest; the columns axis is named 'item'.
data = data.reindex(columns=columns)
# Index each row by the timestamp at the end of its quarter.
data.index = periods.to_timestamp(freq='D', how='end')
# stack -> Series indexed by (date, item); reset_index -> flat columns;
# the unnamed value column (0) is renamed to 'value'.
ldata = (
    data.stack()
    .reset_index()
    .rename(columns={0: 'value'})
)

In [5]:
# Preview the first five rows of the long-format table.
ldata.head(n=5)

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.0
2,1959-03-31 23:59:59.999999999,unemp,5.8
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.34


In [8]:
## pivot(index=<row labels>, columns=<column labels>, values=<cell contents>)
## The arguments are passed by keyword: recent pandas releases (2.0+) accept
## them only as keywords, and the keyword form also works on older versions.
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2


In [10]:
## Un-pivot (wide -> long) with pd.melt.
## Load a small wide table: one 'key' column plus value columns a, b, c.
df = pd.read_csv('examples/chp08_unpivot.csv')
df

Unnamed: 0,key,a,b,c
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [11]:
## Un-pivot using 'key' as the identifier column: every other column is
## folded into a (variable, value) pair, one row per original cell.
unp = df.melt(id_vars=['key'])
unp

Unnamed: 0,key,variable,value
0,foo,a,1
1,bar,a,2
2,baz,a,3
3,foo,b,4
4,bar,b,5
5,baz,b,6
6,foo,c,7
7,bar,c,8
8,baz,c,9


In [13]:
## Pivot the long table back to wide form: 'key' becomes the row index,
## 'variable' supplies the column labels and 'value' fills the cells.
## The arguments are passed by keyword: recent pandas releases (2.0+) accept
## them only as keywords, and the keyword form also works on older versions.
pivt = unp.pivot(index='key', columns='variable', values='value')
pivt

variable,a,b,c
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7
