# 7 Group Operations

The "group by" process:
* Splitting the data into groups
* Applying a function to each group
    - Aggregation
    - Transformation
    - Filtration
* Combining the results

In [8]:
import pandas as pd
import numpy as np

## Splitting the Data into Groups

Grouping needs a mapping of labels to group names:
* For DataFrame, a string indicating a column name to be grouped by.
* A list of the same length as the selected axis.
* A dict providing a mapping from labels to group names.
* A function to be called on each of the axis labels.

In [4]:
# Group the rows by the values of a single column ('key1').
df = pd.DataFrame(dict(
    key1=['foo', 'bar', 'foo', 'bar'],
    key2=['one', 'two', 'one', 'one'],
    data1=[1, 2, 3, 4],
    data2=[10, 20, 30, 40],
))
print(df)

# Iterating over the GroupBy object yields (group key, sub-frame) pairs.
grouped = df.groupby('key1')
for name, frame in grouped:
    print(name, frame, sep='\n')

   data1  data2 key1 key2
0      1     10  foo  one
1      2     20  bar  two
2      3     30  foo  one
3      4     40  bar  one
bar
   data1  data2 key1 key2
1      2     20  bar  two
3      4     40  bar  one
foo
   data1  data2 key1 key2
0      1     10  foo  one
2      3     30  foo  one


In [5]:
# Group the rows by two columns at once ('key1' and 'key2').
df = pd.DataFrame(dict(
    key1=['foo', 'bar', 'foo', 'bar'],
    key2=['one', 'two', 'one', 'one'],
    data1=[1, 2, 3, 4],
    data2=[10, 20, 30, 40],
))
print(df)

# With two grouping columns every group key is a (key1, key2) tuple.
grouped = df.groupby(['key1', 'key2'])
for names, frame in grouped:
    print(*names)
    print(frame)

   data1  data2 key1 key2
0      1     10  foo  one
1      2     20  bar  two
2      3     30  foo  one
3      4     40  bar  one
bar one
   data1  data2 key1 key2
3      4     40  bar  one
bar two
   data1  data2 key1 key2
1      2     20  bar  two
foo one
   data1  data2 key1 key2
0      1     10  foo  one
2      3     30  foo  one


In [6]:
# Group the rows by a list that has one entry per row of the frame.
df = pd.DataFrame(dict(
    key1=['foo', 'bar', 'foo', 'bar'],
    key2=['one', 'two', 'one', 'one'],
    data1=[1, 2, 3, 4],
    data2=[10, 20, 30, 40],
))
print(df)

# The i-th entry is the group name assigned to the i-th row.
alist = ['group1', 'group2'] * 2
grouped = df.groupby(alist)
for name, frame in grouped:
    print(name, frame, sep='\n')

   data1  data2 key1 key2
0      1     10  foo  one
1      2     20  bar  two
2      3     30  foo  one
3      4     40  bar  one
group1
   data1  data2 key1 key2
0      1     10  foo  one
2      3     30  foo  one
group2
   data1  data2 key1 key2
1      2     20  bar  two
3      4     40  bar  one


In [7]:
# Group the rows through a dict that maps index labels to group names.
df = pd.DataFrame(dict(
    key1=['foo', 'bar', 'foo', 'bar'],
    key2=['one', 'two', 'one', 'one'],
    data1=[1, 2, 3, 4],
    data2=[10, 20, 30, 40],
))
print(df)

# Keys are the row labels of df; values are the group each row belongs to.
adict = {
    0: 'group1',
    1: 'group2',
    2: 'group1',
    3: 'group2',
}
grouped = df.groupby(adict)
for name, frame in grouped:
    print(name, frame, sep='\n')

   data1  data2 key1 key2
0      1     10  foo  one
1      2     20  bar  two
2      3     30  foo  one
3      4     40  bar  one
group1
   data1  data2 key1 key2
0      1     10  foo  one
2      3     30  foo  one
group2
   data1  data2 key1 key2
1      2     20  bar  two
3      4     40  bar  one


In [9]:
# Splitting by a function that is called on each of the axis labels.
# The function's return values are used as the group names.
df = pd.DataFrame({'key1': ['foo', 'bar', 'foo', 'bar'],
                   'key2': ['one', 'two', 'one', 'one'],
                   'data1': [1, 2, 3, 4],
                   'data2': [10, 20, 30, 40]})
print(df)


def afunc(x):
    """Return the remainder of index label ``x`` divided by 2 (0 = even, 1 = odd)."""
    return np.remainder(x, 2)


# Rows whose index label is even land in group 0, odd labels in group 1.
grouped = df.groupby(afunc)
for key, group in grouped:
    print(key)
    print(group)

   data1  data2 key1 key2
0      1     10  foo  one
1      2     20  bar  two
2      3     30  foo  one
3      4     40  bar  one
0
   data1  data2 key1 key2
0      1     10  foo  one
2      3     30  foo  one
1
   data1  data2 key1 key2
1      2     20  bar  two
3      4     40  bar  one


In [10]:
# Pick out one particular group from a GroupBy object.
df = pd.DataFrame(dict(
    key1=['foo', 'bar', 'foo', 'bar'],
    key2=['one', 'two', 'one', 'one'],
    data1=[1, 2, 3, 4],
    data2=[10, 20, 30, 40],
))
print(df)

# First list every group so the available keys are visible.
grouped = df.groupby(['key1', 'key2'])
for names, frame in grouped:
    print(*names)
    print(frame)

# get_group takes the group key; with two grouping columns that is a tuple.
print(grouped.get_group(('bar', 'one')))

   data1  data2 key1 key2
0      1     10  foo  one
1      2     20  bar  two
2      3     30  foo  one
3      4     40  bar  one
bar one
   data1  data2 key1 key2
3      4     40  bar  one
bar two
   data1  data2 key1 key2
1      2     20  bar  two
foo one
   data1  data2 key1 key2
0      1     10  foo  one
2      3     30  foo  one
   data1  data2 key1 key2
3      4     40  bar  one
