# Aggregation and Grouping

In [1]:
import numpy as np
import pandas as pd

class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string of Python source*. The string is
    evaluated with ``eval`` at render time and shown together with its own
    source text, so several expressions can be compared side by side.

    Objects that provide ``_repr_html_`` are rendered with it; anything else
    (scalars, objects whose ``_repr_html_`` returns ``None``) falls back to
    an HTML-escaped ``repr`` inside a ``<pre>`` element instead of raising
    ``AttributeError``.

    NOTE: inside a notebook this name shadows ``IPython.display.display``.
    SECURITY: the arguments are passed to ``eval``; only use trusted,
    author-written expressions, never text coming from outside the notebook.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Expression strings; they are evaluated lazily when rendering.
        self.args = args

    def _repr_html_(self):
        """Return one floated HTML panel per expression, joined by newlines."""
        # Local import keeps this cell independent of the notebook's import cell.
        from html import escape

        panels = []
        for a in self.args:
            obj = eval(a)  # trusted notebook expressions only (see class docstring)
            render = getattr(obj, '_repr_html_', None)
            # Classes expose an unbound ``_repr_html_``; only call it on instances.
            if callable(render) and not isinstance(obj, type):
                body = render()
            else:
                body = None
            if body is None:
                # No rich representation available: show the plain repr safely.
                body = '<pre>{0}</pre>'.format(escape(repr(obj)))
            # Escape the label so expressions containing '<' or '&' stay valid HTML.
            panels.append(self.template.format(escape(a, quote=False), body))
        return '\n'.join(panels)

    def __repr__(self):
        """Return the plain-text form: each expression followed by its repr."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [3]:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [5]:
# Preview the first five rows of the planets data loaded above

planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


### Simple Aggregation

In [6]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [7]:
ser.sum()

2.811925491708157

In [8]:
ser.mean()

0.5623850983416314

In [9]:
df = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [10]:
df.mean()

A    0.477888
B    0.443420
dtype: float64

In [11]:
df.mean(axis='columns')

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [15]:
# In addition to the previously mentioned aggregation methods, there is a
# '.describe' method that outputs the common aggregates for each column

planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [16]:
"""
THE AGGREGATIONS FOR DATAFRAMES

count()           Total number of items
first(), last()   First and last item
mean(), median()  Mean and median
min(), max()      Minimum and maximum
std(), var()      Standard deviation and variance
mad()             Mean absolute deviation (deprecated in pandas 1.5, removed in 2.0)
prod()            Product of all items
sum()             Sum of all items

""";

### GroupBy: Split, Apply, Combine

In [20]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'], 'data': range(6)}, columns=['key','data'])
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [23]:
df.groupby('key').mean()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1.5
B,2.5
C,3.5


In [26]:
df.groupby('key').describe()

Unnamed: 0_level_0,data,data,data,data,data,data,data,data
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,2.0,1.5,2.12132,0.0,0.75,1.5,2.25,3.0
B,2.0,2.5,2.12132,1.0,1.75,2.5,3.25,4.0
C,2.0,3.5,2.12132,2.0,2.75,3.5,4.25,5.0


## The GroupBy object

Allows you to perform aggregate, filter, transform, and apply functions

### Column Indexing

In [28]:
planets.groupby('method')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc0685b9fd0>

In [32]:
planets.groupby('method')['orbital_period']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc087551690>

In [34]:
planets['method'].unique()

array(['Radial Velocity', 'Imaging', 'Eclipse Timing Variations',
       'Transit', 'Astrometry', 'Transit Timing Variations',
       'Orbital Brightness Modulation', 'Microlensing', 'Pulsar Timing',
       'Pulsation Timing Variations'], dtype=object)

In [36]:
planets.groupby('method')['orbital_period'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,631.18,544.217663,246.36,438.77,631.18,823.59,1016.0
Eclipse Timing Variations,9.0,4751.644444,2499.130945,1916.25,2900.0,4343.5,5767.0,10220.0
Imaging,12.0,118247.7375,213978.177277,4639.15,8343.9,27500.0,94250.0,730000.0
Microlensing,7.0,3153.571429,1113.166333,1825.0,2375.0,3300.0,3550.0,5100.0
Orbital Brightness Modulation,3.0,0.709307,0.725493,0.240104,0.291496,0.342887,0.943908,1.544929
Pulsar Timing,5.0,7343.021201,16313.265573,0.090706,25.262,66.5419,98.2114,36525.0
Pulsation Timing Variations,1.0,1170.0,,1170.0,1170.0,1170.0,1170.0,1170.0
Radial Velocity,553.0,823.35468,1454.92621,0.73654,38.021,360.2,982.0,17337.5
Transit,397.0,21.102073,46.185893,0.355,3.16063,5.714932,16.1457,331.60059
Transit Timing Variations,3.0,79.7835,71.599884,22.3395,39.67525,57.011,108.5055,160.0


In [41]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


### Iteration

GroupBy objects support direct iteration over the groups

In [61]:
# To iterate manually you can use a for loop like this one.
#
# The same can be done with the '.apply()' method, but a loop gives finer control.

# Each iteration yields the group key and the sub-DataFrame holding that group's rows.
row_format = "{0:30s} Shape:  {1}\n"
for method_name, method_frame in planets.groupby('method'):
    print(row_format.format(method_name, method_frame.shape))


Astrometry                     Shape:  (2, 6)

Eclipse Timing Variations      Shape:  (9, 6)

Imaging                        Shape:  (38, 6)

Microlensing                   Shape:  (23, 6)

Orbital Brightness Modulation  Shape:  (3, 6)

Pulsar Timing                  Shape:  (5, 6)

Pulsation Timing Variations    Shape:  (1, 6)

Radial Velocity                Shape:  (553, 6)

Transit                        Shape:  (397, 6)

Transit Timing Variations      Shape:  (4, 6)



In [90]:
# Any method not directly implemented by the GroupBy object is instead
# called on each group individually, and is applied to every column
# unless a specific column is selected.

planetsBYmethod = planets.groupby('method')
print(type(planetsBYmethod['orbital_period']))
print("APPLIED TO FULL GROUPBY OBJECT:\n", planetsBYmethod.describe())
print("\n\nAPPLIED TO THE YEAR COLUMN:\n", planetsBYmethod['year'].describe())


<class 'pandas.core.groupby.generic.SeriesGroupBy'>
APPLIED TO FULL GROUPBY OBJECT:
                               number                                           \
                               count      mean       std  min  25%  50%   75%   
method                                                                          
Astrometry                       2.0  1.000000  0.000000  1.0  1.0  1.0  1.00   
Eclipse Timing Variations        9.0  1.666667  0.500000  1.0  1.0  2.0  2.00   
Imaging                         38.0  1.315789  0.933035  1.0  1.0  1.0  1.00   
Microlensing                    23.0  1.173913  0.387553  1.0  1.0  1.0  1.00   
Orbital Brightness Modulation    3.0  1.666667  0.577350  1.0  1.5  2.0  2.00   
Pulsar Timing                    5.0  2.200000  1.095445  1.0  1.0  3.0  3.00   
Pulsation Timing Variations      1.0  1.000000       NaN  1.0  1.0  1.0  1.00   
Radial Velocity                553.0  1.721519  1.157141  1.0  1.0  1.0  2.00   
Transit                 

## .aggregate( ), .filter( ), .transform( ), .apply( )

In [91]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [97]:
# Pass a list of aggregations to the aggregate method. String aliases are
# preferred: passing callables such as np.median or the builtin max is
# deprecated in recent pandas releases and emits a FutureWarning, while the
# aliases produce the same column labels ('median', 'max').

df.groupby('key').aggregate(['min', 'median', 'max', 'std'])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,median,max,std,min,median,max,std
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,0,1.5,3,2.12132,3,4.0,5,1.414214
B,1,2.5,4,2.12132,0,3.5,7,4.949747
C,2,3.5,5,2.12132,3,6.0,9,4.242641


In [99]:
# Or you can pass a dictionary of the aggregations you want done on each column

df.groupby('key').aggregate({'data1': ['min', 'max'], 'data2': ['std', 'mean']})

Unnamed: 0_level_0,data1,data1,data2,data2
Unnamed: 0_level_1,min,max,std,mean
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,0,3,1.414214,4.0
B,1,4,4.949747,3.5
C,2,5,4.242641,6.0


In [122]:
def filter_func(x):
    """Return True when group ``x`` has fewer than 50 non-null 'method' values."""
    non_null_methods = x['method'].count()
    return non_null_methods < 50

# filter_func keeps the groups with FEWER than 50 rows, so the result holds
# the rows of the rarely used detection methods.
methodsUNDER50 = planetsBYmethod.filter(filter_func)

# The original name claimed the opposite of what the filter keeps; it is
# retained only as an alias so any later cell that refers to it still works.
methodsOVER50 = methodsUNDER50

methodsUNDER50

Unnamed: 0,method,number,orbital_period,mass,distance,year
29,Imaging,1,,,45.52,2005
30,Imaging,1,,,165.00,2007
31,Imaging,1,,,140.00,2004
32,Eclipse Timing Variations,1,10220.0,6.05,,2009
33,Imaging,1,,,,2008
...,...,...,...,...,...,...
950,Imaging,1,,,,2010
957,Imaging,1,,,,2008
958,Pulsation Timing Variations,1,1170.0,,,2007
1027,Imaging,1,,,19.20,2011


### Transformation

In [166]:
# Generate the experimental DataFrame

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'], 
                   'B' : [1,2,3,4,5,6],
                   'C' : [2.0, 5., 8., 1., 2., 9.] })
df

Unnamed: 0,A,B,C
0,foo,1,2.0
1,bar,2,5.0
2,foo,3,8.0
3,bar,4,1.0
4,foo,5,2.0
5,bar,6,9.0


In [167]:
# Create the experimental group

grouped = df.groupby('A')

In [168]:
# Check the mean of column 'C'
grouped['C'].mean()

A
bar    5.0
foo    4.0
Name: C, dtype: float64

In [169]:
# Center column 'B' on its group mean

grouped['B'].transform(lambda x: x-x.mean())

0   -2
1   -2
2    0
3    0
4    2
5    2
Name: B, dtype: int64

In [170]:
# Check the values in the DataFrame: notice that they weren't changed by
# the .transform method. It returns a new object computed from the groups
# rather than modifying the existing DataFrame in place.

df

Unnamed: 0,A,B,C
0,foo,1,2.0
1,bar,2,5.0
2,foo,3,8.0
3,bar,4,1.0
4,foo,5,2.0
5,bar,6,9.0


In [171]:
# Square column 'B'

df['B'] = df['B']**2

In [172]:

df

Unnamed: 0,A,B,C
0,foo,1,2.0
1,bar,4,5.0
2,foo,9,8.0
3,bar,16,1.0
4,foo,25,2.0
5,bar,36,9.0


In [173]:
# Center every non-key column (the squared 'B' and 'C') on its group mean

grouped.transform(lambda x: x-x.mean())

Unnamed: 0,B,C
0,-10.666667,-2.0
1,-14.666667,0.0
2,-2.666667,4.0
3,-2.666667,-4.0
4,13.333333,-2.0
5,17.333333,4.0


In [174]:
# Check the sum of column 'C'

grouped['C'].sum()

A
bar    15.0
foo    12.0
Name: C, dtype: float64

In [175]:
df

Unnamed: 0,A,B,C
0,foo,1,2.0
1,bar,4,5.0
2,foo,9,8.0
3,bar,16,1.0
4,foo,25,2.0
5,bar,36,9.0


In [156]:


def divBYc(x):
    """Return a copy of ``x`` whose 'B' column is divided element-wise by 'C'.

    The frame passed in is left untouched. Functions handed to
    ``groupby(...).apply`` should not mutate the group they receive (pandas
    documents this as unsupported), so the result is built with ``assign``
    instead of an in-place ``/=`` on the group.

    Parameters
    ----------
    x : DataFrame with columns 'B' and 'C'.

    Returns
    -------
    DataFrame with the same index and columns as ``x`` and 'B' replaced
    by ``B / C``.
    """
    return x.assign(B=x['B'] / x['C'])
dfBbyC = grouped.apply(divBYc)

In [158]:
display('df', 'dfBbyC')

Unnamed: 0,A,B,C
0,foo,1,2.0
1,bar,4,5.0
2,foo,9,8.0
3,bar,16,1.0
4,foo,25,2.0
5,bar,36,9.0

Unnamed: 0,A,B,C
0,foo,0.5,2.0
1,bar,0.8,5.0
2,foo,1.125,8.0
3,bar,16.0,1.0
4,foo,12.5,2.0
5,bar,4.0,9.0


## Specifying the Split Key

In [191]:
# So far the grouping key has been specified by passing the name of a column in the DataFrame.

df.groupby('A').sum()

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,56,15.0
foo,35,12.0


In [192]:
# You can also pass a list of values with the same length as the number of rows.
#
# Each value in the list will be the value that the DataFrame is grouped on.

L = ['a._____this','a._____this','b._______is','b._______is','c. arbitrary','c. arbitrary']
# Select the numeric columns explicitly: from pandas 2.0 on, .std() no longer
# silently drops the string column 'A' and raises on it instead.
display('df', 'df[["B", "C"]].groupby(L).std()')

Unnamed: 0,A,B,C
0,foo,1,2.0
1,bar,4,5.0
2,foo,9,8.0
3,bar,16,1.0
4,foo,25,2.0
5,bar,36,9.0

Unnamed: 0,B,C
a._____this,2.12132,2.12132
b._______is,4.949747,4.949747
c. arbitrary,7.778175,4.949747


In [195]:
# Grouping by a column name is equivalent to passing that column itself (here df["A"]) to groupby.

display('df', 'df.groupby("A").sum()', 'df.groupby(df["A"]).sum()')

Unnamed: 0,A,B,C
0,foo,1,2.0
1,bar,4,5.0
2,foo,9,8.0
3,bar,16,1.0
4,foo,25,2.0
5,bar,36,9.0

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,56,15.0
foo,35,12.0

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,56,15.0
foo,35,12.0


### Dictionary or Series Mapping Index to Group

In [200]:
# If your DataFrame has indices you can specify a mapping dictionary that
# gives each index value a corresponding grouping index.

df2 = df.set_index('A')

mapping = {'foo': 'FOOFOO', 'bar': 'BBARRR'}
display('df2', 'df2.groupby(mapping).sum()')

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
foo,1,2.0
bar,4,5.0
foo,9,8.0
bar,16,1.0
foo,25,2.0
bar,36,9.0

Unnamed: 0,B,C
BBARRR,56,15.0
FOOFOO,35,12.0


In [202]:
# Not all of the indices of the DataFrame need to appear in the dictionary.
# 
# Missing indices will simply be omitted from the groupings.

mapping = {'foo': 'FOOFOO', 'bo': 'BBARRR'}
display('df2', 'df2.groupby(mapping).sum()')

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
foo,1,2.0
bar,4,5.0
foo,9,8.0
bar,16,1.0
foo,25,2.0
bar,36,9.0

Unnamed: 0,B,C
FOOFOO,35,12.0


### Any Python Function

Less commonly needed, but any Python function that takes an index value<br>
and returns a group label can be used to compute the grouping values

In [203]:
display('df2', 'df2.groupby(str.upper).mean()' )

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
foo,1,2.0
bar,4,5.0
foo,9,8.0
bar,16,1.0
foo,25,2.0
bar,36,9.0

Unnamed: 0,B,C
BAR,18.666667,5.0
FOO,11.666667,4.0


## Let's do the Planets Example!

In [219]:
# Label every discovery year with its decade (e.g. 2006 -> '2000s') and name
# the resulting Series so it can be used as a grouping key.
decade = (((planets['year'] // 10) * 10).astype(str) + 's').rename('decade')
print(planets.groupby(['method',decade])['number'].sum())
display("planets.groupby(['method',decade])['number'].sum().unstack()",
        "planets.groupby(['method',decade])['number'].sum().unstack().fillna(0)")


method                         decade
Astrometry                     2010s       2
Eclipse Timing Variations      2000s       5
                               2010s      10
Imaging                        2000s      29
                               2010s      21
Microlensing                   2000s      12
                               2010s      15
Orbital Brightness Modulation  2010s       5
Pulsar Timing                  1990s       9
                               2000s       1
                               2010s       1
Pulsation Timing Variations    2000s       1
Radial Velocity                1980s       1
                               1990s      52
                               2000s     475
                               2010s     424
Transit                        2000s      64
                               2010s     712
Transit Timing Variations      2010s       9
Name: number, dtype: int64


decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,,,,2.0
Eclipse Timing Variations,,,5.0,10.0
Imaging,,,29.0,21.0
Microlensing,,,12.0,15.0
Orbital Brightness Modulation,,,,5.0
Pulsar Timing,,9.0,1.0,1.0
Pulsation Timing Variations,,,1.0,
Radial Velocity,1.0,52.0,475.0,424.0
Transit,,,64.0,712.0
Transit Timing Variations,,,,9.0

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0
