In [1]:
import pandas as pd
import numpy as np

# GroupBy 메카닉

In [2]:
df = pd.DataFrame({'key1': ['a','a','b','b','a'],
                  'key2': ['one','two','one','two','one'],
                  'data1': np.random.randn(5),
                  'data2': np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.772222,0.8245
1,a,two,-0.685153,-0.965524
2,b,one,-0.607463,0.621705
3,b,two,-0.342635,1.595988
4,a,one,-0.532102,0.798709


In [3]:
# 위 데이터를 key1으로 묶고 각 그룹에서 data1의 평균 구하기

grouped = df['data1'].groupby(df['key1'])    #groupby 메서드를 호출하고 key1 컬럼을 넘김
grouped       #key1 컬럼에 있는 유일한 값으로 색인되는 Series객체가 생성됨, 생성된 객체의 색인은 key1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002253240E348>

In [4]:
grouped.mean()

key1
a   -0.663159
b   -0.475049
Name: data1, dtype: float64

In [5]:
means = df['data1'].groupby([df['key1'],df['key2']]).mean()  # 여러개의 배열을 '리스트' 넘겨 계층적 색인 삼음
means

key1  key2
a     one    -0.652162
      two    -0.685153
b     one    -0.607463
      two    -0.342635
Name: data1, dtype: float64

In [6]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.652162,-0.685153
b,-0.607463,-0.342635


In [7]:
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()    
# 길이만 같다면 어떤 배열이라도 상관 없음, 이 경우는 key1, key2가 states,years로 바뀐 채 평균 구하는 연산 진행된 것

California  2005   -0.685153
            2006   -0.607463
Ohio        2005   -0.557428
            2006   -0.532102
Name: data1, dtype: float64

In [8]:
# DataFrame에 대하여

df.groupby('key1').mean()    
# 출력물을 보면 컬럼에 key2가 빠져있다. 숫자데이터가 아니므로 '성가신 컬럼' 취급해 자동으로 결과에서 제외한 것

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.663159,0.219228
b,-0.475049,1.108846


In [9]:
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.652162,0.811605
a,two,-0.685153,-0.965524
b,one,-0.607463,0.621705
b,two,-0.342635,1.595988


In [10]:
#그룹의 크기를 담고 있는 Series 반환하는 size 메서드
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

그룹간 순회

In [11]:
for name, group in df.groupby('key1'): #이터레이션을 지원, 그룹 이름과 그에 따른 데이터 묶음을 튜플로 반환
    print(name)   # a와 b
    print('\n')
    print(group)  #아래 DataFrame들(type을 검사해보면 DataFrame임)
    print('\n')

a


  key1 key2     data1     data2
0    a  one -0.772222  0.824500
1    a  two -0.685153 -0.965524
4    a  one -0.532102  0.798709


b


  key1 key2     data1     data2
2    b  one -0.607463  0.621705
3    b  two -0.342635  1.595988




In [12]:
# 기준을 두 개로 주어서 묶는 것도 마찬가지 
for (k1,k2), group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.772222  0.824500
4    a  one -0.532102  0.798709
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.685153 -0.965524
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.607463  0.621705
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.342635  1.595988


In [13]:
pieces = dict(list(df.groupby('key1')))      # 그룹별 데이터를 사전형으로 바꿔서 사용하기
pieces['b']                                  # key를 a,b로, value를 DataFrame으로 갖는 딕셔너리

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.607463,0.621705
3,b,two,-0.342635,1.595988


In [14]:
# 기본적으로 axis = 0에 대하여 그룹을 만들지만 다른 축으로 하는 것도 가능

df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [15]:
# 다음은 df.dtypes를 기준으로 삼아 그룹을 묶음

grouped = df.groupby(df.dtypes, axis=1)
for dtype,group in grouped:
    print(dtype)
    print('\n')
    print(group)
    print('\n')

float64


      data1     data2
0 -0.772222  0.824500
1 -0.685153 -0.965524
2 -0.607463  0.621705
3 -0.342635  1.595988
4 -0.532102  0.798709


object


  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one




컬럼이나 컬럼의 일부만 선택하기

In [16]:
df.groupby(['key1','key2'])[['data2']].mean()      # data2 컬럼의 자료를 key1, key2를 기준으로 분류

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.811605
a,two,-0.965524
b,one,0.621705
b,two,1.595988


In [17]:
# GroupBy 객체를 컬럼 이름의 '리스트나 배열'로 색인하면 DataFrameGroupBy 객체가 되고
# '단일 값'으로 하나의 컬럼 이름만 넘겨 색인하면 SeriesGroupBy 객체가 된다.
df.groupby(['key1','key2'])['data2'].mean() 

key1  key2
a     one     0.811605
      two    -0.965524
b     one     0.621705
      two     1.595988
Name: data2, dtype: float64

사전과 Series에서 그룹핑하기

In [18]:
people = pd.DataFrame(np.random.randn(5,5),
                     columns=['a','b','c','d','e'],index=['Joe','Steve','Wes','Jim','Travis'])
people.iloc[2:3,[1,2]] = np.nan   # NaN 값을 추가합시다
people

Unnamed: 0,a,b,c,d,e
Joe,-0.840207,0.077488,0.865513,0.460271,0.471456
Steve,0.33709,-0.536188,0.883699,0.420053,-0.744443
Wes,-2.072589,,,0.593197,1.779559
Jim,0.458349,-1.442659,1.467385,-0.438542,-0.879397
Travis,0.247845,1.207355,-0.629301,0.228919,-0.893715


In [19]:
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}

by_column = people.groupby(mapping,axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,1.325784,-0.291263
Steve,1.303752,-0.943542
Wes,0.593197,-0.29303
Jim,1.028843,-1.863708
Travis,-0.400382,0.561484


In [20]:
map_series = pd.Series(mapping)
people.groupby(map_series, axis=1).sum()    #딕셔너리 대신 Series에 대해서도 같은 기능을 수행할 수 있다

Unnamed: 0,blue,red
Joe,1.325784,-0.291263
Steve,1.303752,-0.943542
Wes,0.593197,-0.29303
Jim,1.028843,-1.863708
Travis,-0.400382,0.561484


함수로 그룹핑

In [21]:
people.groupby(len).sum()    # len은 사람 이름에 대한 길이를 기준으로 분류함

Unnamed: 0,a,b,c,d,e
3,-2.454447,-1.365171,2.332898,0.614926,1.371618
5,0.33709,-0.536188,0.883699,0.420053,-0.744443
6,0.247845,1.207355,-0.629301,0.228919,-0.893715


In [22]:
# 내부적으로는 모두 배열로 변환되므로 함수를 배열, 사전, Series와 섞어 쓰더라도 전혀 문제되지 않는다

key_list=['one','one','one','two','two']     # len이 3인 사람 중 Joe, Wes는 one, Jimd은 two
people.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-2.072589,0.077488,0.865513,0.460271,0.471456
3,two,0.458349,-1.442659,1.467385,-0.438542,-0.879397
5,one,0.33709,-0.536188,0.883699,0.420053,-0.744443
6,two,0.247845,1.207355,-0.629301,0.228919,-0.893715


색인단계로 그룹핑하기

In [23]:
# 계층적으로 색인된 데이터는 축 색인의 단계 중 하나를 사용해서 편리하게 집계할 수 있는 기능을 제공한다.

columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
#['US','US','US','JP','JP'] 는 컬럼 색인의 첫 번째 단계(level)이며 이름은 cty
#[1,3,5,1,3] 는 컬럼 색인의 두 번째 단계(level)이며 이름은 tenor

hier_df = pd.DataFrame(np.random.randn(4,5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.852137,-0.665252,0.357801,-0.603953,-0.605784
1,-0.687421,-0.18556,1.458691,1.172315,1.182538
2,-0.849576,0.320269,-0.979541,-0.644603,-0.276314
3,0.326249,1.417131,0.00178,-0.34838,-0.34809


In [24]:
# 색인 중 하나를 사용하는 기능을 사용하고자 한다면 level 예약어를 사용해서 레벨 번호나 이름을 넘기면 된다.
hier_df.groupby(level='tenor',axis=1).count()

tenor,1,3,5
0,2,2,1
1,2,2,1
2,2,2,1
3,2,2,1


# 데이터 집계

In [25]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.772222,0.8245
1,a,two,-0.685153,-0.965524
2,b,one,-0.607463,0.621705
3,b,two,-0.342635,1.595988
4,a,one,-0.532102,0.798709


In [26]:
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)    # 변위치 계산

key1
a   -0.562712
b   -0.369117
Name: data1, dtype: float64

In [27]:
# 자신만의 데이터 집계 함수 사용하기 --> agg 혹은 aggregate 메서드에 해당 함수를 넘김

def peakpeak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peakpeak)

# 단, 사용자 정의 함수는 일반적으로 정의된 함수에 비해 매우 느리게 동작한다

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.24012,1.790025
b,0.264829,0.974283


컬럼에 여러 가지 함수 적용하기

In [16]:
tips = pd.read_csv('tips.csv')
tips['tip_pct'] = tips['tip']/tips['total_bill']  # 새 컬럼 추가
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [29]:
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')       #이미 통용되는 기술 통계 함수는 함수 이름을 문자열로 넘기면 된다.

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [30]:
grouped_pct.agg(['mean','std',peakpeak])  # 함수목록을 리스트로 넘기면 DataFrame을 얻을 수 있다.
                                          # 사용자 정의 함수는 ''를 붙이지 않는다.

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peakpeak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [31]:
# grouped_pct = grouped['tip_pct']
grouped_pct.agg([('평균','mean'),('표준편차',np.std)])       # 함수 이름이 아닌 다른 이름으로 컬럼 이름을 지정하는 방법

Unnamed: 0_level_0,Unnamed: 1_level_0,평균,표준편차
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [32]:
# A DataFrame groupby can apply a different function to each column, or the
# same list of functions to every selected column.

functions=[('개수','count'),'mean','max']
# Select several columns with a list (double brackets). The tuple form
# grouped['tip_pct','total_bill'] is deprecated since pandas 1.0 and is
# rejected by pandas 2.0 and later.
result = grouped[['tip_pct','total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,개수,mean,max,개수,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [33]:
# 해당 DataFrame은 계층적 컬럼을 가지고 있음. 
# 각 컬럼을 따로 계산한 다음 concat 메서드를 이용해 keys인자로 컬럼이름을 넘겨서 이어붙인 것과 동일
# pd.concat([result['tip_pct'],result['total_bill']],axis=1)
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,개수,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [34]:
# 컬럼마다 다른 함수를 사용하고 싶다면 딕셔너리를 이용

grouped.agg({'tip':np.max,'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [35]:
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
# 단 하나의 컬럼에라도 여러 개의 함수가 적용되었다면 DataFrame은 계층적 컬럼을 가지게 된다.

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


In [36]:
tips.groupby(['day','smoker']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


In [37]:
# as_index=False를 넘겨 색인되지 않도록 할 수 있다.
tips.groupby(['day','smoker'],as_index=False).mean()

# 혹은 색인된 결과에 reset_index 메서드를 호출해 같은 결과를 얻을 수 있다.
# tips.groupby(['day','smoker']).mean().reset_index()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


# Apply : 일반적인 분리 - 적용 - 병합

In [38]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` holding the largest values in ``column``.

    The rows come back sorted ascending by ``column``, so the largest value
    is the last row.

    Parameters
    ----------
    df : DataFrame to pick rows from.
    n : number of rows to keep (default 5).
    column : label of the column to rank by (default 'tip_pct').
    """
    # tail(n) rather than [-n:]: a slice starting at -0 returns every row when
    # n is 0, whereas tail(0) returns an empty frame. Other n behave the same.
    return df.sort_values(by=column).tail(n)
# 특정 컬럼(기본값은 tip_pct)에서 가장 큰 값을 가지는 로우를 선택하는 함수

top(tips,n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [39]:
tips.groupby('smoker').apply(top)

# 흡연 여부를 No와 Yes로 분류한 상태로 top을 적용
# --> 나누어진 DataFrame의 각 부분에 모두 적용 
# --> pd.concat으로 합쳐진 후 이름 붙은 꼴

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [40]:
# apply 메서드로 넘길 함수가 추가적인 인자를 받는다면 함수 이름 뒤에 붙여서 넘겨주면 된다.
tips.groupby(['smoker','day']).apply(top,n=2,column='total_bill')
# 두 개의 분류 기준값 조합마다 total_bill이 가장 큰 로우가 두 개(n=2)씩 출력됨. 최대값의 기준 컬럼은 total_bill

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,91,22.49,3.5,No,Fri,Dinner,2,0.155625
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,59,48.27,6.73,No,Sat,Dinner,4,0.139424
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,112,38.07,4.0,No,Sun,Dinner,3,0.10507
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,85,34.83,5.17,No,Thur,Lunch,4,0.148435
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,90,28.97,3.0,Yes,Fri,Dinner,2,0.103555
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775


그룹 색인 생략하기

In [41]:
# group_keys=False 옵션을 통해 색인 생략 가능

tips.groupby('smoker',group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


변위치 분석과 버킷 분석

In [42]:
frame = pd.DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
quartiles = pd.cut(frame.data1,4)
# cut(~,4)는 값의 범위(최솟값~최댓값)를 같은 길이의 4개 구간으로 나눔 (처음값과 끝값이 중요)
#   --> list1, list2(list1 보다 1만큼 작은 리스트)를 옵션으로 주는 것도 가능
# qcut(~,4)는 표본 분위수에 맞춰, 구간마다 원소들의 개수가 같아지도록 4개 구간으로 나눔
# precision =2 하면 소수점 2자리까지로 제한

quartiles[:10]

0    (-2.239, -0.423]
1    (-4.062, -2.239]
2     (-0.423, 1.394]
3     (-0.423, 1.394]
4    (-2.239, -0.423]
5     (-0.423, 1.394]
6     (-0.423, 1.394]
7    (-2.239, -0.423]
8    (-2.239, -0.423]
9     (-0.423, 1.394]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-4.062, -2.239] < (-2.239, -0.423] < (-0.423, 1.394] < (1.394, 3.21]]

In [43]:
# data2 컬럼에 대한 몇 가지 통계 계산

def get_starts(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    summary = {}
    summary['min'] = group.min()
    summary['max'] = group.max()
    summary['count'] = group.count()
    summary['mean'] = group.mean()
    return summary

frame.data2.groupby(quartiles).apply(get_starts).unstack()
# 인덱스 명은 위에서 data1으로 설정되어있음 (컬럼마다의 계산값은 data2를 나타내지만 분류기준은 data1임)

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-4.062, -2.239]",-1.597025,1.514966,15.0,-0.007502
"(-2.239, -0.423]",-3.024237,2.741577,331.0,0.09225
"(-0.423, 1.394]",-2.973792,3.117275,571.0,0.035157
"(1.394, 3.21]",-2.626456,3.323755,83.0,0.05542


In [44]:
# 위는 등간격 버킷, 아래는 표본 변위치에 기반한 버킷

grouping = pd.qcut(frame.data1,10,labels=False)
grouping   # 각 색인에 해당하는 값들이 몇 번째 구간에 속해있나만 나타냄

0      1
1      0
2      7
3      4
4      2
      ..
995    5
996    8
997    4
998    2
999    6
Name: data1, Length: 1000, dtype: int64

In [45]:
frame.data2.groupby(grouping).apply(get_starts).unstack()
# 마찬가지로 각 data1 의 구간에 따른 data2 값들의 (함수가 적용된 형태로) 반환

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-2.783508,2.062414,100.0,0.150376
1,-3.024237,2.185899,100.0,-0.088328
2,-2.050592,2.741577,100.0,0.219492
3,-2.620827,2.263307,100.0,0.019146
4,-2.135401,2.744904,100.0,0.100864
5,-1.869094,3.117275,100.0,0.124102
6,-2.829223,2.504961,100.0,-0.123898
7,-2.319567,2.380799,100.0,0.049702
8,-2.973792,2.062946,100.0,0.035847
9,-2.626456,3.323755,100.0,0.063662


예제 : 그룹에 다른 값으로 결측치 채우기

In [46]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1   -0.925038
2         NaN
3    0.142385
4         NaN
5    0.262740
dtype: float64

In [47]:
s.fillna(s.mean())

0   -0.173304
1   -0.925038
2   -0.173304
3    0.142385
4   -0.173304
5    0.262740
dtype: float64

In [48]:
# 그룹별로 채워넣고 싶은 값이 다르다면 데이터를 그룹으로 나누고 apply함수를 사용해서 각 그룹에 대한 fillna를 적용하면 된다.

states=['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
group_key = ['East']*4 + ['West']*4
data = pd.Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']] = np.nan
data

Ohio          1.455284
New York      0.153257
Vermont            NaN
Florida       0.010949
Oregon       -1.639126
Nevada             NaN
California    0.031165
Idaho              NaN
dtype: float64

In [49]:
data.groupby(group_key).mean()

East    0.53983
West   -0.80398
dtype: float64

In [50]:
def fill_mean(g):
    """Return ``g`` with its missing values replaced by the mean of ``g``."""
    return g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
# 결측치가 해당 그룹(East/West)의 평균값으로 채워진다

Ohio          1.455284
New York      0.153257
Vermont       0.539830
Florida       0.010949
Oregon       -1.639126
Nevada       -0.803980
California    0.031165
Idaho        -0.803980
dtype: float64

In [51]:
fill_values = {'East':0.5,'West':-1}

def fill_func(g):
    """Fill missing values in ``g`` with the preset value stored under ``g.name``.

    Inside groupby.apply each group carries a ``name`` attribute; it is used
    here as the key into ``fill_values``.
    """
    return g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

Ohio          1.455284
New York      0.153257
Vermont       0.500000
Florida       0.010949
Oregon       -1.639126
Nevada       -1.000000
California    0.031165
Idaho        -1.000000
dtype: float64

예제 : 랜덤 표본과 순열

In [52]:
# 하트, 스페이드, 클럽, 다이아몬드
suits = list('HSCD')
base_names = ['A', *range(2, 11), 'J', 'K', 'Q']
# Values for one suit: ace is 1, number cards are their face value, and the
# three face cards are 10 each; the same run repeats for every suit.
one_suit_values = [*range(1, 11), 10, 10, 10]
card_val = one_suit_values * 4
# Card labels are "<name><suit>", generated suit by suit.
cards = [str(name) + suit for suit in suits for name in base_names]

deck = pd.Series(card_val, index=cards)
deck[:15]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
dtype: int64

In [53]:
# 다섯 장의 카드 뽑기

def draw(deck,n=5,random_state=None):
    """Draw ``n`` entries at random from ``deck``.

    Parameters
    ----------
    deck : pandas object to sample from (a Series of cards in this notebook).
    n : number of entries to draw (default 5).
    random_state : optional seed or random generator forwarded to ``sample``.
        Pass a fixed value to make the draw reproducible; ``None`` (the
        default) keeps the original unseeded behaviour.
    """
    return deck.sample(n, random_state=random_state)

draw(deck)

3H     3
JS    10
9S     9
2C     2
7C     7
dtype: int64

In [54]:
# 각 세트(하트,스페이드,클럽,다이아몬드)별로 2장의 카드를 무작위로 뽑기

def get_suit(card):
    """Return the suit of ``card``: the last character of its label."""
    return card[-1]
deck.groupby(get_suit).apply(draw,n=2)    # apply는 각각 분류된 DataFrame마다 함수를 적용하는 것
# (p395참고) groupby의 괄호안에는 함수가 들어가도 상관없음. 내부적으로는 배열로 변환되서 계산 

C  QC     10
   3C      3
D  4D      4
   2D      2
H  JH     10
   10H    10
S  3S      3
   9S      9
dtype: int64

예제 : 그룹 가중 평균과 상관관계

In [55]:
df = pd.DataFrame({'category': ['a','a','a','a','b','b','b','b'],
                  'data': np.random.randn(8),
                  'weights': np.random.rand(8)})    # weights는 가중평균
df

Unnamed: 0,category,data,weights
0,a,1.033959,0.718825
1,a,-0.035451,0.127723
2,a,-1.040863,0.870262
3,a,-0.667862,0.512442
4,b,1.996377,0.105387
5,b,-0.016034,0.597051
6,b,-2.392828,0.950718
7,b,-0.028319,0.826966


In [63]:
grouped = df.groupby('category')

# category별 그룹 가중평균

def get_wavg(g):
    """Return the mean of ``g['data']`` weighted by ``g['weights']``."""
    # np.average(values, weights=w) weights each value by the matching entry of w
    return np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)

category
a   -0.228488
b   -0.845726
dtype: float64

In [68]:
close_px = pd.read_csv('stock_px_2.csv',parse_dates=True,index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB


In [69]:
close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [72]:
# 퍼센트 변화율로 일일 수익률을 계산하여 연간 SPX 지수와의 상관관계를 살펴보도록 합시다.

# 우선 SPX 컬럼과 다른 컬럼과의 상관관계 계산
def spx_corr(x):
    """Correlate every column of ``x`` with its 'SPX' column."""
    return x.corrwith(x['SPX'])
# pct_change 함수를 이용해 close_px의 퍼센트 변화율을 계산
rets = close_px.pct_change().dropna()
# 각 datetime에서 연도속성만을 반환하는 함수를 이용해 연도별 퍼센트 변화율 구한다.
def get_year(x):
    """Return the ``year`` attribute of ``x`` (used as the groupby key per index label)."""
    return x.year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [73]:
# 두 컬럼간의 상관관계 계산(다음은 애플과 마이크로소프트)

by_year.apply(lambda g:g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

예제 : 그룹상의 선형회귀

In [74]:
import statsmodels.api as sm
def regress(data,yvar,xvars):
    """Fit an OLS regression of ``data[yvar]`` on ``data[xvars]`` plus a constant.

    Parameters
    ----------
    data : DataFrame holding both the response and the regressor columns.
    yvar : label of the response column.
    xvars : list of regressor column labels.

    Returns
    -------
    ``result.params`` of the fitted model: the coefficients for the ``xvars``
    columns and for the added 'intercept' column.
    """
    Y = data[yvar]
    # Take an explicit copy before adding the constant column: assigning into
    # the bare selection data[xvars] can raise SettingWithCopyWarning.
    X = data[xvars].copy()
    X['intercept'] = 1
    result = sm.OLS(Y,X).fit()
    return result.params

In [75]:
by_year.apply(regress,'AAPL',['SPX'])
# SPX 수익률에 대한 애플(AAPL) 주식의 연간 선형회귀

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


# 피벗테이블과 교차일람표

In [77]:
tips.pivot_table(index=['day','smoker'])
# tips의 일곱 개 컬럼(tip_pct 포함) 중 두 개를 인덱스로 사용

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [78]:
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker')
# 나타낼 값, 기준이 될 인덱스, 컬럼을 지정

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [81]:
# margins=True를 넘겨서 부분합을 포함하도록 확장 가능 --> All컬럼과 All로우가 추가되어 단일 줄 내에서의 그룹 통계 얻음

tips.pivot_table(['tip_pct'],index=['time','smoker'],columns='day',aggfunc=len, margins=True)
# 다른 집계함수를 사용하기 위해서는 aggfunc로 넘기면 되는데 (DataFrame안에 값으로 들어가있는 것들)
# 예를 들어 'count'나 len함수는 그룹 크기의 교차일람표(총 개수나 빈도)를 반환한다.

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Dinner,No,3.0,45.0,57.0,1.0,106.0
Dinner,Yes,9.0,42.0,19.0,,70.0
Lunch,No,1.0,,,44.0,45.0
Lunch,Yes,6.0,,,17.0,23.0
All,,19.0,87.0,76.0,62.0,244.0


In [84]:
# 만일 어떤 조합이 비어있다면 fill_value를 넘길 수 있음

tips.pivot_table(['tip_pct'],index=['time','size','smoker'],columns='day',aggfunc='mean', fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tip_pct,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Dinner,1,No,0.0,0.137931,0.0,0.0
Dinner,1,Yes,0.0,0.325733,0.0,0.0
Dinner,2,No,0.139622,0.162705,0.168859,0.159744
Dinner,2,Yes,0.171297,0.148668,0.207893,0.0
Dinner,3,No,0.0,0.154661,0.152663,0.0
Dinner,3,Yes,0.0,0.144995,0.15266,0.0
Dinner,4,No,0.0,0.150096,0.148143,0.0
Dinner,4,Yes,0.11775,0.124515,0.19337,0.0
Dinner,5,No,0.0,0.0,0.206928,0.0
Dinner,5,Yes,0.0,0.106572,0.06566,0.0


교차일람표

In [14]:
# Build the frame column by column so that each column keeps its own dtype.
# Routing the lists through np.array(d).T coerces every value to a string,
# including the integer sample ids.
d=[list(range(1,11)),['USA','Japan','USA','Japan','Japan','Japan','USA','USA','Japan','USA'],
   ['Right','Left','Right','Right','Left','Right','Right','Left','Right','Right']]
data=pd.DataFrame(dict(zip(['Sample','Nationality','Handedness'],d)))
data

Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right
1,2,Japan,Left
2,3,USA,Right
3,4,Japan,Right
4,5,Japan,Left
5,6,Japan,Right
6,7,USA,Right
7,8,USA,Left
8,9,Japan,Right
9,10,USA,Right


In [15]:
# 요약

pd.crosstab(data.Nationality,data.Handedness,margins=True)

Handedness,Left,Right,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [17]:
# crosstab함수의 처음 두 인자는 배열이나 Series 혹은 배열의 리스트가 될 수 있다.

pd.crosstab([tips.time, tips.day],tips.smoker,margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
