### Libraries / Data

Import the NumPy and pandas libraries.

In [None]:
import numpy as np
import pandas as pd

Configure the pandas display options that control the output format.

In [None]:
pd.options.display.max_rows = 10

Load the data.

In [None]:
tips = pd.read_csv("../data/tips.csv")
tips.head()

In [None]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [None]:
tips.head()

### The GroupBy mechanism

<img src = '../images/split_apply_combine.png' style='width: 600px;'/>

In [None]:
# Small demo frame: three labels in 'x', two numeric observations per label.
df = pd.DataFrame(dict(x=list('aabbcc'), y=[2, 4, 0, 5, 5, 10]))
df

In [None]:
# Group by the scalar key 'x' rather than the one-element list ['x'].
# With a length-1 list, pandas 2.x yields 1-tuple keys such as ('a',) when
# iterating and expects get_group(('b',)); a scalar key keeps plain labels
# ('a', 'b', 'c') on every pandas version.
groups = df.groupby('x')
groups

Get the number of groups that were formed

In [None]:
groups.ngroups

Get information on the number of elements in each group

In [None]:
groups.size()

What do the groups represent?

In [None]:
groups.groups

Get specific group data

In [None]:
groups.get_group('b')

Extract the row at position 1 of each group (positions are 0-based, so this is the second row; use `nth(0)` for the first row)

In [None]:
groups.nth([1])

group traversal:

In [None]:
# Iterating a GroupBy yields (group label, sub-frame) pairs; print the label
# on one line followed by the rows that belong to it.
for label, frame in groups:
    print(label, frame, sep='\n')

average computation

In [None]:
groups.y.mean()

### Grouping type

#### by columns:

In [None]:
tips.head()

In [None]:
tips.groupby(['day','time']).tip.mean().unstack()

#### by index level

create a copy of the data and re-index it

In [None]:
# Work on an independent copy of `tips`, with 'day' and 'time' moved from
# columns into a two-level row index.
copy_tips = tips.copy().set_index(['day', 'time'])
copy_tips

You can group by one or more index levels by passing the level names (or level numbers) to the `level` argument

In [None]:
# Sum per 'time' index level. numeric_only=True restricts the result to the
# numeric columns: since pandas 2.0 text columns such as 'smoker' are no
# longer dropped automatically from groupby reductions.
copy_tips.groupby(level=['time']).sum(numeric_only=True)

In [None]:
# Mean per ('day', 'time') index pair. numeric_only=True is required on
# pandas >= 2.0, where mean() raises TypeError if text columns such as
# 'smoker' are included.
copy_tips.groupby(level=['day', 'time']).mean(numeric_only=True)

#### using a function

In [None]:
copy_tips.head()

In [None]:
copy_tips.groupby(len, level='day').count()

In [None]:
tips.day.value_counts()

#### using an array

random group label array:

In [None]:
# Fix the global NumPy seed so the random labels are reproducible, then draw
# one label per row of `tips` with probabilities 0.4 / 0.6.
np.random.seed(123)
rnd_array = np.random.choice(
    ['first_group', 'second_group'],
    size=len(tips),
    p=[0.4, 0.6],
)
rnd_array[:5]

Pass the array of group labels to groupby

In [None]:
tips.groupby(rnd_array).count()

In [None]:
# Observed share of rows labelled 'first_group' (expected to be near the
# requested probability 0.4). Computed from the array itself so the check
# stays correct if the seed, the probabilities or the data change.
float((rnd_array == 'first_group').mean())

#### bonus

data type:

In [None]:
tips.dtypes

In [None]:
# Group the COLUMNS of `tips` by their dtype. groupby(axis=1) is deprecated
# since pandas 2.1, and the recommended replacement is to group the
# transposed frame: after .T the former columns are rows, and `tips.dtypes`
# (indexed by column name) aligns with them as the grouping key.
# Note: each group is therefore a transposed slice of `tips`.
group_tips = tips.T.groupby(tips.dtypes)

In [None]:
group_tips.size()

### Group aggregation

create object groupby:

In [None]:
group_tips = tips.groupby(['day', 'smoker'])['tip_pct']

method equivalence check:

In [None]:
group_tips.agg == group_tips.aggregate

own function:

In [None]:
def peak_to_peak(arr):
    """Return the spread of `arr`: its maximum minus its minimum."""
    highest, lowest = arr.max(), arr.min()
    return highest - lowest

group_tips.agg([peak_to_peak])

combination:

In [None]:
group_tips.agg([peak_to_peak, 'mean'])

specify column names:

In [None]:
group_tips.agg([('delta_max_min', peak_to_peak), ('mean_value','mean')])

separate sets of aggregation functions for each column:

In [None]:
# Select several columns from a GroupBy with a LIST of names. The bare
# tuple form ['tip_pct', 'total_bill'] was deprecated in pandas 1.0 and
# raises an error from pandas 2.0 onwards.
group_tips = tips.groupby(['day', 'smoker'])[['tip_pct', 'total_bill']]

In [None]:
# Per-column aggregation: 'tip_pct' gets two named results (max_value,
# min_value) and 'total_bill' gets its sum. The string alias 'max' is used
# instead of np.max: pandas >= 2.1 warns that NumPy/builtin reducers passed
# to agg will stop being mapped to the optimized groupby method.
group_tips.agg({'tip_pct': [('max_value', 'max'),
                            ('min_value', 'min')],
                'total_bill': 'sum'})

### Group transformation

####  transform method

<img src = '../images/transform.png' style='width: 900px;'/>

In [None]:
# Demo frame for transform: a label column and a numeric column.
df = pd.DataFrame(dict(Col1=list('ABCCBBA'), Col2=[1, 2, 3, 4, 2, 5, 3]))
df

In [None]:
# Broadcast each Col1 group's Col2 total back onto the group's rows.
# Col2 is selected explicitly so transform returns a Series: transforming
# the whole frame returns a DataFrame, and once Col3 exists (e.g. when the
# cell is re-run) that DataFrame has two columns and the assignment fails.
# The 'sum' alias replaces the builtin sum, which pandas >= 2.1 warns about.
df['Col3'] = df.groupby('Col1')['Col2'].transform('sum')
df.sort_values('Col1')

object groupby:

In [None]:
group_tips = tips.groupby(['smoker'])['total_bill']

normalization (standardization within each group):

In [None]:
def norm(x):
    """Standardize `x`: subtract its mean, then divide by its standard deviation.

    `x` must provide `mean()` and `std()` and support arithmetic with their
    results (for example a pandas Series holding one group's values).
    """
    return (x - x.mean()) / x.std()
group_tips.transform(norm)

In [None]:
tips_copy = tips.copy()
tips_copy['total_bill_norm'] = group_tips.transform(norm)
tips_copy.head()

In [None]:
tips_copy.groupby('smoker').total_bill_norm.agg(['mean', 'std'])

#### method apply

function:

In [None]:
def top(df, n=5, column='tip_pct'):
    """Return the last `n` rows of `df` after sorting ascending by `column`.

    Parameters:
        df: DataFrame to select rows from.
        n: number of rows to return (default 5).
        column: name of the column to sort by (default 'tip_pct').

    Returns:
        A DataFrame with the selected rows, still in ascending `column`
        order, so the row sorted last comes last.
    """
    # tail(n) matches the former [-n:] slice for every non-zero n, but
    # returns an empty frame for n == 0, where [-0:] returned every row.
    return df.sort_values(by=column).tail(n)
top(tips, n=6)

Apply the function to each group with the apply method

In [None]:
tips.groupby('smoker').apply(top)

### Group exclusion

create data for our examples

In [None]:
# Demo frame for filter: group A has two values, B has one, and C has three
# entries of which one is missing.
df = pd.DataFrame(dict(Label=['A', 'A', 'B', 'C', 'C', 'C'],
                       Values=[1, 2, 3, 4, np.nan, 8]))
df

Remove groups that have fewer than two non-missing values

In [None]:
def f(x):
    """Keep a group only if it has more than one non-missing entry in `Values`."""
    # count() ignores missing entries, so NaN rows do not help a group pass.
    return x.Values.count() > 1
df.groupby('Label').filter(f)

Remove groups that contain missing values

In [None]:
def f(x):
    """Keep a group only if none of its `Values` entries is missing."""
    return x.Values.notna().all()
df.groupby('Label').filter(f)

### Pivot table

group means (the default aggregation)

In [None]:
# Mean of the numeric columns per (day, smoker). The value columns are
# listed explicitly: with the default (all remaining columns) the mean is
# also attempted on text columns such as 'time', which raises TypeError on
# pandas >= 2.0.
tips.pivot_table(index=['day', 'smoker'],
                 values=['size', 'tip', 'tip_pct', 'total_bill'])

row and column output

In [None]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker')

Include subtotals (margins):

In [None]:
tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                 columns='smoker', margins=True)

Specify the aggregation function:

In [None]:
# Largest tip_pct per (time, smoker) row and day column, with 'All' margins.
# The string alias 'max' replaces the builtin max: pandas >= 2.1 warns that
# builtin reducers will stop being mapped to the optimized groupby method.
tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',
                 aggfunc='max', margins=True)