# Grouping and Aggregating Data


In [1]:
# import pandas and numpy
import numpy as np
import pandas as pd
 
# Tune how pandas renders frames in this notebook:
# plain-text (non-HTML) reprs, capped at 10 columns and 10 rows
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = 10
pd.options.display.max_rows = 10

# inline graphics: IPython magic that renders matplotlib figures in the cell output
%matplotlib inline

## The split, apply, and combine (SAC) pattern
Many data analysis problems utilize a pattern of processing data, known as
split-apply-combine. In this pattern, three steps are taken to analyze data:
1. A data set is split into smaller pieces
2. Each of these pieces is operated upon independently
3. All of the results are combined back together and presented as a single unit

Splitting in pandas is performed using the .groupby() method, which will
divide the data based on the values present in the specified index labels and columns.
Once the data is split into groups, one or more of the following three broad classes of
operations is applied:
* Aggregation: This calculates a summary statistic, such as group means or counts of the items in each group
* Transformation: This performs group- or item-specific calculations and returns a set of like-indexed results
* Filtration: This removes entire groups of data based on a group level computation

The combine stage of the pattern is performed automatically by pandas.
A paper on the SAC pattern using R: http://www.jstatsoft.org/v40/i01/paper


## Split

In [4]:
# load the sensors data from a CSV file (path is relative to the working directory)
sensors = pd.read_csv("data/sensors.csv")
# display the frame; the row/column caps come from the display options set earlier
sensors

    interval       sensor axis  reading
0          0        accel    Z      0.0
1          0        accel    Y      0.5
2          0        accel    X      1.0
3          1        accel    Z      0.1
4          1        accel    Y      0.4
..       ...          ...  ...      ...
19         2  orientation    Y      0.3
20         2  orientation    X      0.2
21         3  orientation    Z      0.0
22         3  orientation    Y      0.4
23         3  orientation    X      0.3

[24 rows x 4 columns]

### Grouping by a single column's values

In [5]:
# group this data by the values of the sensor column / variable
# .groupby() on a DataFrame returns a DataFrameGroupBy object
grouped = sensors.groupby('sensor')
# show the repr of the groupby object itself
grouped


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000028689EC4E48>

Notice:
* The result is a DataFrameGroupBy object (a SeriesGroupBy when grouping a Series).
* The actual process of grouping is a lazy process in pandas, and at this point, the grouping has not actually been performed.
* This object represents a description of the grouping to be performed. This allows pandas to first validate that the grouping description provided to it is valid.

In [6]:
# get the number of groups that this grouping will create
# (one per distinct value of the grouping column)
grouped.ngroups

2

In [7]:
# what are the groups that were found?
# .groups maps each group name to the index labels of the rows in that group
grouped.groups

{'accel': Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64'),
 'orientation': Int64Index([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], dtype='int64')}

### Accessing the results of grouping

In [8]:
# helper that dumps every group of a grouping to stdout
def print_groups(groupobject):
    """Print each group's name followed by that group's contents.

    groupobject: an iterable yielding (name, group) pairs, such as a
    pandas GroupBy object.
    """
    for key, members in groupobject:
        # a single print call writes the key and the group on separate lines
        print(key, members, sep="\n")

In [9]:
# examine the content of the groups we created
# a group has been created for each distinct value in the sensor column and is named with that value.
print_groups(grouped)

accel
    interval sensor axis  reading
0          0  accel    Z      0.0
1          0  accel    Y      0.5
2          0  accel    X      1.0
3          1  accel    Z      0.1
4          1  accel    Y      0.4
..       ...    ...  ...      ...
7          2  accel    Y      0.3
8          2  accel    X      0.8
9          3  accel    Z      0.3
10         3  accel    Y      0.2
11         3  accel    X      0.7

[12 rows x 4 columns]
orientation
    interval       sensor axis  reading
12         0  orientation    Z      0.0
13         0  orientation    Y      0.1
14         0  orientation    X      0.0
15         1  orientation    Z      0.0
16         1  orientation    Y      0.2
..       ...          ...  ...      ...
19         2  orientation    Y      0.3
20         2  orientation    X      0.2
21         3  orientation    Z      0.0
22         3  orientation    Y      0.4
23         3  orientation    X      0.3

[12 rows x 4 columns]


In [10]:
# number of rows in each group, returned as a Series indexed by group name
grouped.size()

sensor
accel          12
orientation    12
dtype: int64

In [11]:
# per-group count of values in every non-grouping column
# (pandas' count excludes missing values)
grouped.count()

             interval  axis  reading
sensor                              
accel              12    12       12
orientation        12    12       12

In [12]:
# get the data in one specific group, selected by its group name
grouped.get_group('accel')

    interval sensor axis  reading
0          0  accel    Z      0.0
1          0  accel    Y      0.5
2          0  accel    X      1.0
3          1  accel    Z      0.1
4          1  accel    Y      0.4
..       ...    ...  ...      ...
7          2  accel    Y      0.3
8          2  accel    X      0.8
9          3  accel    Z      0.3
10         3  accel    Y      0.2
11         3  accel    X      0.7

[12 rows x 4 columns]

In [13]:
# get the first three items in each group
# the rows come back in one frame and keep their original index labels
grouped.head(3)

    interval       sensor axis  reading
0          0        accel    Z      0.0
1          0        accel    Y      0.5
2          0        accel    X      1.0
12         0  orientation    Z      0.0
13         0  orientation    Y      0.1
14         0  orientation    X      0.0

In [14]:
# get the first item in each group
# NOTE(review): pandas 2.0 changed .nth() to act as a filter that returns the
# selected rows with their original index rather than one row per group label;
# confirm the displayed output against the installed pandas version
grouped.nth(0)

             interval axis  reading
sensor                             
accel               0    Z      0.0
orientation         0    Z      0.0

In [15]:
# group by both sensor and axis values
# each group is named by a (sensor, axis) tuple
mcg = sensors.groupby(['sensor', 'axis'])
print_groups(mcg)

('accel', 'X')
    interval sensor axis  reading
2          0  accel    X      1.0
5          1  accel    X      0.9
8          2  accel    X      0.8
11         3  accel    X      0.7
('accel', 'Y')
    interval sensor axis  reading
1          0  accel    Y      0.5
4          1  accel    Y      0.4
7          2  accel    Y      0.3
10         3  accel    Y      0.2
('accel', 'Z')
   interval sensor axis  reading
0         0  accel    Z      0.0
3         1  accel    Z      0.1
6         2  accel    Z      0.2
9         3  accel    Z      0.3
('orientation', 'X')
    interval       sensor axis  reading
14         0  orientation    X      0.0
17         1  orientation    X      0.1
20         2  orientation    X      0.2
23         3  orientation    X      0.3
('orientation', 'Y')
    interval       sensor axis  reading
13         0  orientation    Y      0.1
16         1  orientation    Y      0.2
19         2  orientation    Y      0.3
22         3  orientation    Y      0.4
('orient

In [16]:
# get descriptive statistics (count, mean, std, min, quartiles, max)
# of each numeric column for each (sensor, axis) group
mcg.describe()

                 interval                            ... reading               \
                    count mean       std  min   25%  ...     min    25%   50%   
sensor      axis                                     ...                        
accel       X         4.0  1.5  1.290994  0.0  0.75  ...     0.7  0.775  0.85   
            Y         4.0  1.5  1.290994  0.0  0.75  ...     0.2  0.275  0.35   
            Z         4.0  1.5  1.290994  0.0  0.75  ...     0.0  0.075  0.15   
orientation X         4.0  1.5  1.290994  0.0  0.75  ...     0.0  0.075  0.15   
            Y         4.0  1.5  1.290994  0.0  0.75  ...     0.1  0.175  0.25   
            Z         4.0  1.5  1.290994  0.0  0.75  ...     0.0  0.000  0.00   

                              
                    75%  max  
sensor      axis              
accel       X     0.925  1.0  
            Y     0.425  0.5  
            Z     0.225  0.3  
orientation X     0.225  0.3  
            Y     0.325  0.4  
            Z     0.00

### Grouping using index levels

In [17]:
# build a separate frame indexed by (sensor, axis) from a copy of the data,
# leaving the sensors frame itself unchanged
mi = sensors.copy().set_index(['sensor', 'axis'])
mi

                  interval  reading
sensor      axis                   
accel       Z            0      0.0
            Y            0      0.5
            X            0      1.0
            Z            1      0.1
            Y            1      0.4
...                    ...      ...
orientation Y            2      0.3
            X            2      0.2
            Z            3      0.0
            Y            3      0.4
            X            3      0.3

[24 rows x 2 columns]

In [18]:
# group by the first level of the index (level 0 of the row MultiIndex)
mig_l1 = mi.groupby(level=0)
print_groups(mig_l1)


accel
             interval  reading
sensor axis                   
accel  Z            0      0.0
       Y            0      0.5
       X            0      1.0
       Z            1      0.1
       Y            1      0.4
...               ...      ...
       Y            2      0.3
       X            2      0.8
       Z            3      0.3
       Y            3      0.2
       X            3      0.7

[12 rows x 2 columns]
orientation
                  interval  reading
sensor      axis                   
orientation Z            0      0.0
            Y            0      0.1
            X            0      0.0
            Z            1      0.0
            Y            1      0.2
...                    ...      ...
            Y            2      0.3
            X            2      0.2
            Z            3      0.0
            Y            3      0.4
            X            3      0.3

[12 rows x 2 columns]


In [20]:
# group by multiple levels of the index, referring to the levels by name
mig_l12 = mi.groupby(level=['sensor', 'axis'])
print_groups(mig_l12)


('accel', 'X')
             interval  reading
sensor axis                   
accel  X            0      1.0
       X            1      0.9
       X            2      0.8
       X            3      0.7
('accel', 'Y')
             interval  reading
sensor axis                   
accel  Y            0      0.5
       Y            1      0.4
       Y            2      0.3
       Y            3      0.2
('accel', 'Z')
             interval  reading
sensor axis                   
accel  Z            0      0.0
       Z            1      0.1
       Z            2      0.2
       Z            3      0.3
('orientation', 'X')
                  interval  reading
sensor      axis                   
orientation X            0      0.0
            X            1      0.1
            X            2      0.2
            X            3      0.3
('orientation', 'Y')
                  interval  reading
sensor      axis                   
orientation Y            0      0.1
            Y            1     

## Apply

### Applying aggregation functions to groups
Aggregation is performed using the .agg() method
of the GroupBy object. The parameter of .agg() is a function (or the name of a
built-in aggregation) that is applied to each group. In the case of DataFrame,
the function will be applied to each column.

In [22]:
# calculate the mean for each sensor/axis
# the aggregation is passed by name: pandas >= 2.1 emits a FutureWarning when a
# NumPy callable such as np.mean is handed to .agg()
mig_l12.agg('mean')

                  interval  reading
sensor      axis                   
accel       X          1.5     0.85
            Y          1.5     0.35
            Z          1.5     0.15
orientation X          1.5     0.15
            Y          1.5     0.25
            Z          1.5     0.00

In [25]:
# do not create an index matching the original object:
# with as_index=False the group keys are returned as regular columns
# the aggregation is passed by name because pandas >= 2.1 warns on np.mean
sensors.groupby(['sensor', 'axis'],
                as_index=False).agg('mean')

        sensor axis  interval  reading
0        accel    X       1.5     0.85
1        accel    Y       1.5     0.35
2        accel    Z       1.5     0.15
3  orientation    X       1.5     0.15
4  orientation    Y       1.5     0.25
5  orientation    Z       1.5     0.00

In [27]:
# Many aggregation functions are built directly into the GroupBy object,
# so they can be called as methods instead of going through .agg()
mig_l12.mean()

                  interval  reading
sensor      axis                   
accel       X          1.5     0.85
            Y          1.5     0.35
            Z          1.5     0.15
orientation X          1.5     0.15
            Y          1.5     0.25
            Z          1.5     0.00

In [28]:
# apply multiple aggregation functions at once
# names are used instead of np.sum / np.std, which pandas >= 2.1 warns about;
# 'std' is pandas' sample standard deviation (ddof=1)
mig_l12.agg(['sum', 'std'])

                 interval           reading          
                      sum       std     sum       std
sensor      axis                                     
accel       X           6  1.290994     3.4  0.129099
            Y           6  1.290994     1.4  0.129099
            Z           6  1.290994     0.6  0.129099
orientation X           6  1.290994     0.6  0.129099
            Y           6  1.290994     1.0  0.129099
            Z           6  1.290994     0.0  0.000000

In [29]:
# apply a different function to each column:
# the number of rows for interval, the mean for reading
# 'mean' is passed by name because pandas >= 2.1 warns on np.mean
mig_l12.agg({'interval': len,
             'reading': 'mean'})

                  interval  reading
sensor      axis                   
accel       X            4     0.85
            Y            4     0.35
            Z            4     0.15
orientation X            4     0.15
            Y            4     0.25
            Z            4     0.00

In [30]:
# calculate the mean of the reading column only
# selecting a single column from the grouping yields a Series result
mig_l12['reading'].mean()

sensor       axis
accel        X       0.85
             Y       0.35
             Z       0.15
orientation  X       0.15
             Y       0.25
             Z       0.00
Name: reading, dtype: float64