<b> This notebook demonstrates the grouping and aggregation that can be applied to Series and DataFrame.</b>
<ul>
<li> NOTE: When we group a DataFrame we do not get another DataFrame; we get an object of type DataFrameGroupBy. </li>
<li> When we access a single column of this grouped object, the type returned is SeriesGroupBy.</li>
</ul>

<i>Both of these types support many built-in aggregation (reduction) functions and provide a way to apply our own.</i>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's example "tips" dataset; every cell below works on this frame.
tips = sns.load_dataset("tips")

In [3]:
# Preview the first five rows to see the available columns.
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Group By and Aggregate functions.
# groupby() on its own returns a DataFrameGroupBy object, not a DataFrame.
tips.groupby(by='sex')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000000009C25F28>

In [6]:
# Column-wise count of non-NaN values for each group.
tips.groupby(by='sex').count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,157,157,157,157,157,157
Female,87,87,87,87,87,87


In [7]:
# Selecting a column first restricts the count to that column for each group.
tips.groupby('sex')['total_bill'].agg('count')

sex
Male      157
Female     87
Name: total_bill, dtype: int64

In [9]:
# Largest tip within each group.
tips.groupby('sex')['tip'].agg('max')

sex
Male      10.0
Female     6.5
Name: tip, dtype: float64

In [10]:
# Total of all tips within each group.
tips.groupby('sex')['tip'].agg('sum')

sex
Male      485.07
Female    246.51
Name: tip, dtype: float64

In [11]:
# Pairwise correlation between the numeric columns within each group.
# The numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops non-numeric columns silently, so corr() on the full grouped frame
# fails on columns such as smoker/day/time.
tips.groupby('sex')[['total_bill', 'tip', 'size']].corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,size,1.0,0.45519,0.56881
Male,tip,0.45519,1.0,0.669753
Male,total_bill,0.56881,0.669753,1.0
Female,size,1.0,0.566127,0.651807
Female,tip,0.566127,1.0,0.682999
Female,total_bill,0.651807,0.682999,1.0


In [12]:
# Common summary statistics (count, mean, std, quartiles, min, max) for each group.
tips.groupby(by='sex').describe()

Unnamed: 0_level_0,size,size,size,size,size,size,size,size,tip,tip,tip,tip,tip,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Male,157.0,2.630573,0.955997,1.0,2.0,2.0,3.0,6.0,157.0,3.089618,...,3.76,10.0,157.0,20.744076,9.246469,7.25,14.0,18.35,24.71,50.81
Female,87.0,2.45977,0.937644,1.0,2.0,2.0,3.0,6.0,87.0,2.833448,...,3.5,6.5,87.0,18.056897,8.009209,3.07,12.75,16.4,21.52,44.3


In [15]:
# Cumulative sum of tips computed separately within each group; the result
# keeps one value per original row rather than one value per group.
tips.head(10).groupby('sex')['tip'].cumsum()

0     1.01
1     1.66
2     5.16
3     8.47
4     4.62
5    13.18
6    15.18
7    18.30
8    20.26
9    23.49
Name: tip, dtype: float64

In [16]:
# Apply several built-in aggregations at once by passing a list of function
# names to agg(); each selected column gets one result column per function.
# NOTE: pandas >= 2.0 no longer drops non-numeric columns automatically, so
# the numeric columns are selected explicitly. Without the selection the
# result would include (or fail on) the sex/smoker/day/time columns.
numeric_cols = tips.select_dtypes(include='number').columns.tolist()
tips[:10].groupby('sex')[numeric_cols].agg(['min', 'max'])

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,size,size
Unnamed: 0_level_1,min,max,min,max,min,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Male,8.77,26.88,1.66,4.71,2,4
Female,16.99,24.59,1.01,3.61,2,4


In [17]:
# Pick the columns to aggregate manually; the same list of aggregate
# functions is still run on every selected column.
(
    tips[:10]
    .groupby('sex')[['total_bill', 'tip']]
    .agg(['min', 'max'])
)

Unnamed: 0_level_0,total_bill,total_bill,tip,tip
Unnamed: 0_level_1,min,max,min,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Male,8.77,26.88,1.66,4.71
Female,16.99,24.59,1.01,3.61


In [18]:
# Different aggregate functions on different columns
tips[:10].groupby('sex').agg({
    'total_bill': ['min', 'max'],  # a list gives one result column per function
    'tip': 'sum',                  # a single name gives a single result column
})

Unnamed: 0_level_0,total_bill,total_bill,tip
Unnamed: 0_level_1,min,max,sum
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Male,8.77,26.88,23.49
Female,16.99,24.59,4.62


In [30]:
# aggregate() is the long-form spelling of agg(); the result is the same as
# the agg() call with the same dictionary.
tips[:10].groupby('sex').aggregate({
    'total_bill': ['min', 'max'],
    'tip': 'sum',
})

Unnamed: 0_level_0,total_bill,total_bill,tip
Unnamed: 0_level_1,min,max,sum
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Male,8.77,26.88,23.49
Female,16.99,24.59,4.62


In [1]:
# tips[:10].groupby('sex')[['total_bill', 'tip']].agg(lambda x: x * 2)
# The call above is left commented out because it does not work: agg() needs a function that
# reduces each group to a single value, whereas this lambda returns one value per row.

In [25]:
# apply() runs a single function once per group.
# The columns are selected with a list (double brackets): the tuple form
# ['total_bill', 'tip'] was deprecated and is rejected by pandas >= 2.0.
# An explicit lambda is used instead of np.min so the per-column reduction
# does not depend on pandas swapping the NumPy function for its own groupby
# min; .min() directly on the selection is the idiomatic shortcut.
tips[:10].groupby('sex')[['total_bill', 'tip']].apply(lambda group: group.min())

Unnamed: 0_level_0,total_bill,tip
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,8.77,1.66
Female,16.99,1.01


In [27]:
# While agg()/aggregate() can only perform aggregating (reducing) operations,
# apply() can perform any operation, including element-wise ones like this.
# - Columns are selected with a list: tuple selection fails on pandas >= 2.0.
# - group_keys=False keeps the original row labels in the result; the
#   pandas >= 2.0 default would prepend the group key as an extra index level.
tips[:10].groupby('sex', group_keys=False)[['total_bill', 'tip']].apply(lambda x: x * 2)

Unnamed: 0,total_bill,tip
0,33.98,2.02
1,20.68,3.32
2,42.02,7.0
3,47.36,6.62
4,49.18,7.22
5,50.58,9.42
6,17.54,4.0
7,53.76,6.24
8,30.08,3.92
9,29.56,6.46


In [29]:
# NOTE: for an element-wise function, grouping makes no difference; the same
# values come back from a plain DataFrame.apply without any groupby.
tips.head(10)[['total_bill', 'tip']].apply(lambda col: col * 2)

Unnamed: 0,total_bill,tip
0,33.98,2.02
1,20.68,3.32
2,42.02,7.0
3,47.36,6.62
4,49.18,7.22
5,50.58,9.42
6,17.54,4.0
7,53.76,6.24
8,30.08,3.92
9,29.56,6.46
