# Data Aggregation and Group Operations

## GroupBy mechanics

In [2]:
import pandas as pd
import numpy as np

In [12]:
df = pd.DataFrame({
    'data1' : np.random.randn(5) * 50 + 20,
    'data2' : np.random.randn(5) * 20 + 5,
    'key1' : ['one', 'two', 'one', 'two', 'one'],
    'key2' : list('aabba')})

df

Unnamed: 0,data1,data2,key1,key2
0,9.285499,14.19693,one,a
1,-33.947354,5.010908,two,a
2,18.338313,-42.904177,one,b
3,99.520381,-7.893047,two,b
4,61.090149,4.646001,one,a


In [15]:
gb = df.groupby('key1')

gb.sum()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
one,88.713961,-24.061246
two,65.573026,-2.88214


In [16]:
gb.mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
one,29.57132,-8.020415
two,32.786513,-1.44107


In [17]:
gb.std()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
one,27.668859,30.585335
two,94.37594,9.124474


In [19]:
df.groupby('key2').mean()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
a,12.142764,7.95128
b,58.929347,-25.398612


In [20]:
df.groupby('key2').std()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
a,47.583135,5.411968
b,57.40439,24.756608


In [24]:
df

Unnamed: 0,data1,data2,key1,key2
0,9.285499,14.19693,one,a
1,-33.947354,5.010908,two,a
2,18.338313,-42.904177,one,b
3,99.520381,-7.893047,two,b
4,61.090149,4.646001,one,a


In [23]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,35.187824,9.421466
one,b,18.338313,-42.904177
two,a,-33.947354,5.010908
two,b,99.520381,-7.893047


In [25]:
df.groupby(['key1', 'key2']).size()

key1  key2
one   a       2
      b       1
two   a       1
      b       1
dtype: int64

### Iterating over groups

In [29]:
for name, group in df.groupby('key1'):
    
    print (name, type(group))

one <class 'pandas.core.frame.DataFrame'>
two <class 'pandas.core.frame.DataFrame'>


In [31]:
list(gb)

[('one',        data1      data2 key1 key2
  0   9.285499  14.196930  one    a
  2  18.338313 -42.904177  one    b
  4  61.090149   4.646001  one    a), ('two',        data1     data2 key1 key2
  1 -33.947354  5.010908  two    a
  3  99.520381 -7.893047  two    b)]

In [33]:
dict(list(gb))['one']

Unnamed: 0,data1,data2,key1,key2
0,9.285499,14.19693,one,a
2,18.338313,-42.904177,one,b
4,61.090149,4.646001,one,a


### Selecting a column or subset of columns

In [37]:
df.groupby('key1')['data1'].mean()

key1
one    29.571320
two    32.786513
Name: data1, dtype: float64

In [40]:
df.groupby(['key1','key2'])['data1'].mean()

key1  key2
one   a       35.187824
      b       18.338313
two   a      -33.947354
      b       99.520381
Name: data1, dtype: float64

## Data aggregation

In [43]:
df.groupby('key1')['data1'].quantile(.9)

key1
one    52.539782
two    86.173607
Name: data1, dtype: float64

In [45]:
def peak_to_peak(series):
    return series.max() - series.min()

peak_to_peak(df['data1'])

133.46773496850795

In [46]:
df.groupby('key1')['data1'].agg(peak_to_peak)

key1
one     51.804650
two    133.467735
Name: data1, dtype: float64

In [47]:
!wget https://github.com/wesm/pydata-book/raw/1st-edition/ch08/tips.csv

--2018-05-25 18:39:38--  https://github.com/wesm/pydata-book/raw/1st-edition/ch08/tips.csv
Resolving github.com (github.com)... 192.30.253.113, 192.30.253.112
Connecting to github.com (github.com)|192.30.253.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv [following]
--2018-05-25 18:39:39--  https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.132.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.132.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943 (7,8K) [text/plain]
Saving to: ‘tips.csv’


2018-05-25 18:39:39 (36,0 MB/s) - ‘tips.csv’ saved [7943/7943]



In [50]:
tips = pd.read_csv('tips.csv')
tips.shape


(244, 7)

In [51]:
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [53]:
tips.groupby('sex')['tip'].mean()

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

In [54]:
tips.groupby('sex').mean()

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,18.056897,2.833448,2.45977
Male,20.744076,3.089618,2.630573


In [55]:
tips.groupby('sex').mean()['tip']

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

### Column-wise and multiple function application

In [58]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [60]:
tips.groupby(['sex'])['tip_pct'].agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,mean,std,peak_to_peak
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,0.166491,0.053632,0.360233
Male,0.157651,0.064778,0.674707


In [62]:
stats = tips.groupby(['sex'])['tip_pct'].agg(['mean', 'std'])
stats

Unnamed: 0_level_0,mean,std
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.166491,0.053632
Male,0.157651,0.064778


In [67]:
annotated = tips.merge(stats, left_on='sex', right_index=True)

annotated.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std
43,9.68,1.32,Male,No,Sun,Dinner,2,0.136364,0.157651,0.064778
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,0.157651,0.064778
202,13.0,2.0,Female,Yes,Thur,Lunch,2,0.153846,0.166491,0.053632
26,13.37,2.0,Male,No,Sat,Dinner,2,0.149589,0.157651,0.064778
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672,0.166491,0.053632


In [74]:
annotated['z-score'] = (annotated['tip_pct'] - annotated['mean']) / annotated['std']
annotated.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std,z-score
27,12.69,2.0,Male,No,Sat,Dinner,2,0.157604,0.157651,0.064778,-0.000712
23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288,0.157651,0.064778,0.534714
194,16.58,4.0,Male,Yes,Thur,Lunch,2,0.241255,0.157651,0.064778,1.290626
29,19.65,3.0,Female,No,Sat,Dinner,2,0.152672,0.166491,0.053632,-0.257664
133,12.26,2.0,Female,No,Thur,Lunch,2,0.163132,0.166491,0.053632,-0.062623


In [76]:
tips.groupby(['sex'])['tip_pct'].agg([('media', 'mean'), ('desviacion', 'std')])

Unnamed: 0_level_0,media,desviacion
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.166491,0.053632
Male,0.157651,0.064778


In [78]:
functions_to_use_to_aggregate = {
    'tip' : ['mean', 'std', np.sum],
    'total_bill': 'sum'}
functions_to_use_to_aggregate

{'tip': ['mean', 'std', <function numpy.core.fromnumeric.sum>],
 'total_bill': 'sum'}

In [81]:
multiple_aggregations = tips.groupby('smoker').agg(functions_to_use_to_aggregate)
multiple_aggregations

Unnamed: 0_level_0,tip,tip,tip,total_bill
Unnamed: 0_level_1,mean,std,sum,sum
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
No,2.991854,1.37719,451.77,2897.43
Yes,3.00871,1.401468,279.81,1930.34


In [82]:
multiple_aggregations.columns

MultiIndex(levels=[['tip', 'total_bill'], ['mean', 'std', 'sum']],
           labels=[[0, 0, 0, 1], [0, 1, 2, 2]])

In [85]:
multiple_aggregations['tip']['mean']

smoker
No     2.991854
Yes    3.008710
Name: mean, dtype: float64

In [86]:
multiple_aggregations[('tip','mean')]

smoker
No     2.991854
Yes    3.008710
Name: (tip, mean), dtype: float64

### Apply: General split-apply-combine

In [95]:
def top(df, n=2, column='tip'):
    
    return df.sort_values(by='tip')[-n:]
    
top(tips, n=10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
47,32.4,6.0,Male,No,Sun,Dinner,4,0.185185
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
214,28.17,6.5,Female,Yes,Sat,Dinner,3,0.230742
141,34.3,6.7,Male,No,Thur,Lunch,6,0.195335
59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


In [99]:
gb = tips.groupby('smoker')
gb.apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


In [101]:
lambda df: top(df, n=5)

<function __main__.<lambda>>

In [100]:
gb = tips.groupby('smoker')
gb.apply(lambda df: top(df, n=5))

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,47,32.4,6.0,Male,No,Sun,Dinner,4,0.185185
No,141,34.3,6.7,Male,No,Thur,Lunch,6,0.195335
No,59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
No,23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Yes,211,25.89,5.16,Male,Yes,Sat,Dinner,4,0.199305
Yes,181,23.33,5.65,Male,Yes,Sun,Dinner,2,0.242177
Yes,214,28.17,6.5,Female,Yes,Sat,Dinner,3,0.230742
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


In [104]:
gb.apply(top, n=5, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,day,sex,size,time,tip,tip_pct,total_bill
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,47,Sun,Male,4,Dinner,6.0,0.185185,32.4
No,141,Thur,Male,6,Lunch,6.7,0.195335,34.3
No,59,Sat,Male,4,Dinner,6.73,0.139424,48.27
No,23,Sat,Male,4,Dinner,7.58,0.192288,39.42
No,212,Sat,Male,4,Dinner,9.0,0.18622,48.33
Yes,211,Sat,Male,4,Dinner,5.16,0.199305,25.89
Yes,181,Sun,Male,2,Dinner,5.65,0.242177,23.33
Yes,214,Sat,Female,3,Dinner,6.5,0.230742,28.17
Yes,183,Sun,Male,4,Dinner,6.5,0.280535,23.17
Yes,170,Sat,Male,3,Dinner,10.0,0.196812,50.81


#### Suppressing the group keys

In [107]:
tips.groupby('smoker', group_keys=True).apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


In [108]:
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


### Quantile and bucket analysis

In [113]:
pd.cut(tips['total_bill'], 5).head()

0    (12.618, 22.166]
1     (3.022, 12.618]
2    (12.618, 22.166]
3    (22.166, 31.714]
4    (22.166, 31.714]
Name: total_bill, dtype: category
Categories (5, interval[float64]): [(3.022, 12.618] < (12.618, 22.166] < (22.166, 31.714] < (31.714, 41.262] < (41.262, 50.81]]

In [118]:
tips['total_bill'].head()

0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64

In [119]:
pd.cut(tips['total_bill'], range(0,30,5)).head()

0    (15, 20]
1    (10, 15]
2    (20, 25]
3    (20, 25]
4    (20, 25]
Name: total_bill, dtype: category
Categories (5, interval[int64]): [(0, 5] < (5, 10] < (10, 15] < (15, 20] < (20, 25]]

In [123]:
pd.qcut(tips['total_bill'], 10).head()

0    (16.222, 17.795]
1      (3.069, 10.34]
2    (19.818, 22.508]
3    (22.508, 26.098]
4    (22.508, 26.098]
Name: total_bill, dtype: category
Categories (10, interval[float64]): [(3.069, 10.34] < (10.34, 12.636] < (12.636, 14.249] < (14.249, 16.222] ... (19.818, 22.508] < (22.508, 26.098] < (26.098, 32.235] < (32.235, 50.81]]

In [125]:
quantile_series = pd.qcut(tips['total_bill'], 10)

tips.groupby(quantile_series).size()

total_bill
(3.069, 10.34]      26
(10.34, 12.636]     23
(12.636, 14.249]    24
(14.249, 16.222]    25
(16.222, 17.795]    24
(17.795, 19.818]    24
(19.818, 22.508]    25
(22.508, 26.098]    24
(26.098, 32.235]    24
(32.235, 50.81]     25
dtype: int64

In [126]:
tips.groupby(quantile_series).mean()

Unnamed: 0_level_0,total_bill,tip,size,tip_pct
total_bill,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(3.069, 10.34]",8.828462,1.844615,1.923077,0.215923
"(10.34, 12.636]",11.61087,1.886522,2.0,0.162514
"(12.636, 14.249]",13.330417,2.238333,2.0,0.167743
"(14.249, 16.222]",15.3324,2.4156,2.16,0.158184
"(16.222, 17.795]",16.88,2.95375,2.416667,0.175093
"(17.795, 19.818]",18.572917,2.95375,2.583333,0.159171
"(19.818, 22.508]",20.9712,3.3956,2.6,0.161685
"(22.508, 26.098]",24.206667,3.715833,2.958333,0.153092
"(26.098, 32.235]",28.842917,3.7075,3.458333,0.128327
"(32.235, 50.81]",39.184,4.8516,3.6,0.123403


### Example: Filling missing values with group-specific values

In [127]:
provinces = ['M', 'Va', 'So', 'O', 'Ac', 'S']

groups = ['C', 'C', 'C', 'N', 'N', 'N']

df = pd.DataFrame(np.random.randn(6) * 1000000, index=provinces)
df

Unnamed: 0,0
M,-323841.8
Va,1319858.0
So,2209741.0
O,332036.1
Ac,-743948.4
S,598630.0


In [129]:
df.groupby(groups).sum()

Unnamed: 0,0
C,3205758.0
N,186717.7


In [135]:
df[0][2:5] = np.nan
df

Unnamed: 0,0
M,-323841.8
Va,1319858.0
So,
O,
Ac,
S,598630.0


In [138]:
means = df.groupby(groups).mean()
means

Unnamed: 0,0
C,498008.226158
N,598629.989605


In [142]:
df.groupby(groups).apply(lambda df: df.fillna(df.mean()))

Unnamed: 0,Unnamed: 1,0
C,M,-323841.8
C,Va,1319858.0
C,So,498008.2
N,O,598630.0
N,Ac,598630.0
N,S,598630.0


## Pivot tables and Cross-tabulation

In [145]:
pivoted = tips.pivot_table(index=['sex','smoker'])
type(pivoted)

pandas.core.frame.DataFrame

In [146]:
pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,2.592593,2.773519,0.156921,18.105185
Female,Yes,2.242424,2.931515,0.18215,17.977879
Male,No,2.71134,3.113402,0.160669,19.791237
Male,Yes,2.5,3.051167,0.152771,22.2845


In [147]:
pivoted = tips.pivot_table(['tip', 'size'],index=['sex','smoker'])
pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,2.592593,2.773519
Female,Yes,2.242424,2.931515
Male,No,2.71134,3.113402
Male,Yes,2.5,3.051167
