# Aggregation and Grouping - Another Example
The VanderPlas book/notebook uses the planets data from Seaborn. I'm going to work through another example using the tips data (I've been a waiter at two restaurants, so this one is particularly interesting!).

In [None]:
# Setup: import seaborn and load its 'tips' example dataset into `tips`.
import seaborn as sns

tips = sns.load_dataset('tips')
# Display the object's type together with its shape as one tuple.
(type(tips), tips.shape)

In [None]:
# Peek at the first five rows -- looks like a dataset of restaurant tips!
tips.head(n=5)

In [None]:
# Express each tip as a percentage of its total bill and store the result
# in a new 'percent' column on the same DataFrame.
tip_fraction = tips['tip'].div(tips['total_bill'])
tips['percent'] = tip_fraction.mul(100)
# Show the first five rows again, now including the new column.
tips.head(n=5)

## Aggregate Functions

In [None]:
# Aggregate functions on the tip column, shown as a (mean, median, total) tuple.
tip_amounts = tips['tip']
(tip_amounts.mean(), tip_amounts.median(), tip_amounts.sum())

In [None]:
# Total number of people: add up the 'size' column.
# (Use bracket access here; `tips.size` is a DataFrame attribute, not this column.)
party_sizes = tips['size']
party_sizes.sum()

In [None]:
# Summary statistics for the whole dataset in one table.
overall_stats = tips.describe()
overall_stats

In [None]:
# wow -- who left the biggest tip percentage (roughly 71%)?
# Select on the column maximum rather than a hand-typed cutoff, so the cell
# does not depend on a number read off the describe() output above.
top_percent = tips['percent'].max()
tips[tips['percent'] == top_percent]
# looks like a male smoker who ate dinner on a Sunday with one other person.

In [None]:
# who had the most expensive meal (roughly $51)?
# Match the column maximum instead of a hard-coded threshold, so the cell
# keeps finding the top bill even if the data changes.
largest_bill = tips['total_bill'].max()
tips[tips['total_bill'] == largest_bill]

In [None]:
# Distinct values found in the 'day' column, collected into a set.
set(tips['day'].tolist())

In [None]:
# Best day to work?  One approach: build a boolean mask for a single day
# and use it to filter the rows.
is_thursday = tips['day'] == 'Thur'
tips[is_thursday]

In [None]:
# Pick out the tip column for the Thursday rows only, then total it.
thursday_tips = tips.loc[tips['day'] == 'Thur', 'tip']
thursday_tips.sum()
# ... and then the same again for Fri, Sat, Sun ...

## Group By

In [None]:
# Use groupby to grab them all at once ...  what days do I want to work?
# NOTE: 'day' appears to be loaded as a categorical column (check tips.dtypes).
# Pass `observed` explicitly: recent pandas warns when it is omitted for
# categorical keys, and the default has changed between versions.
tips.groupby('day', observed=True)[['tip']].sum()
# Why double brackets?  Selecting with a list of column names keeps the
# result a DataFrame.
# https://stackoverflow.com/questions/33417991/pandas-why-are-double-brackets-needed-to-select-column-after-boolean-indexing

In [None]:
# so, what is the object created by groupby()?
# Displaying it shows the GroupBy object itself; nothing is computed until an
# aggregation is applied.
# observed=True is stated explicitly because 'day' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('day', observed=True)[['tip']]

In [None]:
# Add the total bill alongside the tip: per-day sums of both columns.
# observed=True is stated explicitly because 'day' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('day', observed=True)[['total_bill', 'tip']].sum()

In [None]:
# Look at some averages by day: mean bill, mean tip, and mean tip percentage.
# observed=True is stated explicitly because 'day' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('day', observed=True)[['total_bill', 'tip', 'percent']].mean()

In [None]:
# Aggregate - several summary stats at once: aggregate() takes a list of
# function names and returns one result column per (input column, stat) pair.
# observed=True is stated explicitly because 'day' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('day', observed=True)[['total_bill', 'tip']].aggregate(['sum', 'mean', 'median'])

In [None]:
# Median values by time (lunch/dinner).
# observed=True is stated explicitly because 'time' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('time', observed=True)[['total_bill', 'tip', 'percent']].median()

In [None]:
# Female/male?  Mean bill, tip, and tip percentage split by the 'sex' column.
# observed=True is stated explicitly because 'sex' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('sex', observed=True)[['total_bill', 'tip', 'percent']].mean()

In [None]:
# Smoker/non-smoker?  Mean, median, and standard deviation of bill and tip.
# observed=True is stated explicitly because 'smoker' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('smoker', observed=True)[['total_bill', 'tip']].aggregate(['mean','median', 'std'])

In [None]:
# Multiple levels -- grouping by a list of keys gives one row per
# (day, sex) combination, indexed by both keys.
# observed=True limits the output to combinations that occur in the data and
# avoids the warning recent pandas raises when `observed` is omitted for
# categorical keys.  NOTE(review): every day/sex pair is presumably present
# in tips, so the table should be unchanged -- confirm.
tips.groupby(['day', 'sex'], observed=True)[['total_bill', 'tip', 'percent']].median()

In [None]:
# Mean and median of bill, tip, and tip percentage for each (day, smoker) pair.
# observed=True limits the output to combinations that occur in the data and
# avoids the warning recent pandas raises when `observed` is omitted for
# categorical keys.  NOTE(review): every day/smoker pair is presumably present
# in tips, so the table should be unchanged -- confirm.
tips.groupby(['day', 'smoker'], observed=True)[['total_bill', 'tip', 'percent']].aggregate(['mean', 'median'])

In [None]:
# Three grouping levels: medians for each (day, sex, smoker) combination.
# observed=True limits the output to combinations that occur in the data and
# avoids the warning recent pandas raises when `observed` is omitted for
# categorical keys.  NOTE(review): every day/sex/smoker combination is
# presumably present in tips, so the table should be unchanged -- confirm.
tips.groupby(['day', 'sex', 'smoker'], observed=True)[['total_bill', 'tip', 'percent']].median()

## Dispatch Methods

In [None]:
# Use the describe() method that's defined for DataFrame and Series objects --
# essentially "passed through" the GroupBy and applied to each group.
# observed=True is stated explicitly because 'day' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
tips.groupby('day', observed=True)[['total_bill', 'tip']].describe()

## Group By Iterator -- Iterate through the resulting groups

In [None]:
# Loop through the day groups: iterating a GroupBy yields (key, sub-frame)
# pairs.  For each day print the label, the number of rows, and the summed
# total bill.
# observed=True is stated explicitly because 'day' appears to be categorical
# (check tips.dtypes) and recent pandas warns when `observed` is omitted.
for day, day_rows in tips.groupby('day', observed=True):
    bill_total = day_rows['total_bill'].sum()
    print("{:5s}  {:3d}  ${:.2f}".format(day, len(day_rows), bill_total))