```
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
df.groupby(['key1', 'key2'])[['data2']]
# The three expressions above are syntactic sugar for, respectively:

df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
df[['data2']].groupby([df['key1'], df['key2']])
```

In [2]:
# Imports first, then display / reproducibility settings for the session.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Remember the row-display limit in effect before capping it at 20 rows.
PREVIOUS_MAX_ROWS = pd.get_option('display.max_rows')
pd.set_option('display.max_rows', 20)

# Fixed seed so the random draws in later cells are repeatable.
np.random.seed(12345)

# Default figure size, and compact NumPy printing (4 decimals, no
# scientific notation for small values).
plt.rcParams['figure.figsize'] = (10, 6)
np.set_printoptions(precision=4, suppress=True)

In [35]:
# Toy frame: five named rows by five columns ('a'..'e') of random normal draws.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
Joe,0.124121,0.302614,0.523772,0.00094,1.34381
Steve,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
Wes,0.560145,-1.265934,0.119827,-1.063512,0.332883
Jim,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703
Travis,0.28635,0.377984,-0.753887,0.331286,1.349742


In [36]:
# Row labels (the names); the groupby cells further down pass `len` as a key,
# which is evaluated on these labels.
people.index

Index(['Joe', 'Steve', 'Wes', 'Jim', 'Travis'], dtype='object')

In [37]:
# Distinct name lengths, in order of first appearance, as a NumPy array.
# The lengths are wrapped in an ndarray first because passing a plain list to
# pd.unique is deprecated in recent pandas releases.
# NOTE: this holds one entry per distinct length, not one per row, so it is for
# inspection only -- the groupby cells below pass `len` itself as the key.
index_key = pd.unique(np.array([len(name) for name in people.index]))
# index_key = list(index_key)  # uncomment to get a plain Python list instead

In [38]:
set(map(len, people.index))  # distinct label lengths as a Python set

{3, 5, 6}

In [39]:
# Values of the first column as a plain Python list.
people.iloc[:, 0].tolist()

[0.12412127567340774,
 -0.7135439850963832,
 0.5601452930280342,
 -2.3594188073836815,
 0.2863497470141551]

In [40]:
# One grouping label per row of `people`, in row order.
key_list = [
    'one',  # Joe
    'one',  # Steve
    'one',  # Wes
    'two',  # Jim
    'two',  # Travis
]

In [41]:
# Show the distinct label lengths next to the per-row labels.
[index_key, key_list]

[array([3, 5, 6]), ['one', 'one', 'one', 'two', 'two']]

In [65]:
# Two grouping keys: the length of each row label (3, 5, 3, 3, 6) and the
# matching entry of key_list; average every column within each pair.
people.groupby(by=[len, key_list]).mean()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.342133,-0.48166,0.3218,-0.531286,0.838346
3,two,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703
5,one,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
6,two,0.28635,0.377984,-0.753887,0.331286,1.349742


In [66]:
# Same two keys (label lengths 3, 5, 3, 3, 6 plus key_list), but take the
# column-wise minimum within each group.
people.groupby(by=[len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.124121,-1.265934,0.119827,-1.063512,0.332883
3,two,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703
5,one,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
6,two,0.28635,0.377984,-0.753887,0.331286,1.349742


In [67]:
# Alternative labelings tried in later cells:
#   ['two', 'two', 'two', 'one', 'two']
#   ['one', 'two', 'two', 'two', 'two']
# Duplicated labels ('one' appears three times) are treated as a single group.
key_list2 = [
    'one',    # Joe
    'one',    # Steve
    'two',    # Wes
    'one',    # Jim
    'three',  # Travis
]

In [68]:
# Distinct label lengths next to the new per-row labels.
[index_key, key_list2]  # per-row label lengths: 3, 5, 3, 3, 6

[array([3, 5, 6]), ['one', 'one', 'two', 'one', 'three']]

In [69]:
# Group by label length (3, 5, 3, 3, 6) together with key_list2 and take the
# column-wise minimum of each group.
people.groupby(by=[len, key_list2]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703
3,two,0.560145,-1.265934,0.119827,-1.063512,0.332883
5,one,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
6,three,0.28635,0.377984,-0.753887,0.331286,1.349742


In [70]:
# Relabel the rows: only the first row is 'one', the rest are 'two'.
key_list2 = [
    'one',  # Joe
    'two',  # Steve
    'two',  # Wes
    'two',  # Jim
    'two',  # Travis
]
people.groupby(by=[len, key_list2]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.124121,0.302614,0.523772,0.00094,1.34381
3,two,-2.359419,-1.265934,-1.541996,-1.063512,-1.30703
5,two,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
6,two,0.28635,0.377984,-0.753887,0.331286,1.349742


In [71]:
# Every row gets the same label, so only the label length separates groups.
key_list2 = ['two'] * 5
people.groupby(by=[len, key_list2]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,two,-2.359419,-1.265934,-1.541996,-1.063512,-1.30703
5,two,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
6,two,0.28635,0.377984,-0.753887,0.331286,1.349742


In [72]:
# Relabel the rows: only the fourth row is 'one', the rest are 'two'.
key_list2 = [
    'two',  # Joe
    'two',  # Steve
    'two',  # Wes
    'one',  # Jim
    'two',  # Travis
]
people.groupby(by=[len, key_list2]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703
3,two,0.124121,-1.265934,0.119827,-1.063512,0.332883
5,two,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757
6,two,0.28635,0.377984,-0.753887,0.331286,1.349742


In [73]:
# Small integer frame used for the quantile / mean examples; written out
# row by row, with the column labels supplied separately.
df = pd.DataFrame(
    [
        [1, 3, 2, 4],
        [5, 2, 2, 3],
        [3, 4, 7, 6],
        [4, 3, 3, 12],
        [2, 4, 4, 7],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,1,3,2,4
1,5,2,2,3
2,3,4,7,6
3,4,3,3,12
4,2,4,4,7


In [74]:
# 20th percentile of each column (axis=0 computes down the rows).
df.quantile(q=0.2, axis=0)

A    1.8
B    2.8
C    2.0
D    3.8
Name: 0.2, dtype: float64

In [76]:
# Arithmetic mean of column A.
df.loc[:, 'A'].mean()

3.0

In [79]:
# Median of column A, expressed as the 0.5 quantile.
df.loc[:, 'A'].quantile(q=0.5)

3.0

In [80]:
# Load the tips CSV, report its (rows, columns) shape, and preview the
# first five rows.
tips = pd.read_csv(filepath_or_buffer='examples/tips.csv')
print(tips.shape)
tips.head(n=5)

(244, 6)


Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [88]:
# Add the tip as a fraction of the total bill, then display the frame.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.50,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.139780
4,24.59,3.61,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,No,Sat,Dinner,2,0.098204


In [86]:
def top(df, n=5, column='tip_pct'):
    """Return the last ``n`` rows of ``df`` after sorting by ``column``.

    The frame is sorted in ascending order of ``column``, so the rows with
    the largest values are kept, largest last, with their original index
    labels.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep. ``n=0`` yields an empty frame.
    column : label, default 'tip_pct'
        Column to sort by.

    Returns
    -------
    DataFrame
        At most ``n`` rows of ``df``.
    """
    # tail(n) instead of [-n:]: the slice [-0:] would return every row for n=0.
    return df.sort_values(by=column).tail(n)

In [87]:
# Apply `top` to each (smoker, day) group and stack the per-group results.
# NOTE(review): recent pandas releases warn about apply operating on the
# grouping columns -- confirm behaviour against the pandas version in use.
tips.groupby(by=['smoker', 'day']).apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,99,12.46,1.50,No,Fri,Dinner,2,0.120385
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Fri,91,22.49,3.50,No,Fri,Dinner,2,0.155625
No,Fri,223,15.98,3.00,No,Fri,Lunch,3,0.187735
No,Sat,228,13.28,2.72,No,Sat,Dinner,2,0.204819
...,...,...,...,...,...,...,...,...,...
Yes,Thur,204,20.53,4.00,Yes,Thur,Lunch,4,0.194837
Yes,Thur,205,16.47,3.23,Yes,Thur,Lunch,3,0.196114
Yes,Thur,191,19.81,4.19,Yes,Thur,Lunch,2,0.211509
Yes,Thur,200,18.71,4.00,Yes,Thur,Lunch,3,0.213789
