In [45]:
import numpy as np
import pandas as pd

class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

# Planets Data

In [46]:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [47]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


# Simple Aggregation in Pandas

In [48]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [49]:
ser.sum()

np.float64(2.811925491708157)

In [50]:
ser.mean()

np.float64(0.5623850983416314)

In [51]:
df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [52]:
df.mean()
# when not specified, means each column

A    0.477888
B    0.443420
dtype: float64

In [53]:
df.mean(axis = 'columns')
# means each row (strangely enough)

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [54]:
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [55]:
planets.iloc[7:10].describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,3.0,3.0,2.0,3.0,3.0
mean,1.333333,748.2,6.145,56.433333,2004.666667
std,0.57735,273.738251,5.876057,30.342865,7.571878
min,1.0,452.8,1.99,21.41,1996.0
25%,1.0,625.65,4.0675,47.255,2002.0
50%,1.0,798.5,6.145,73.1,2008.0
75%,1.5,895.9,8.2225,73.945,2009.0
max,2.0,993.3,10.3,74.79,2010.0


In [56]:
# Aggregation         Description
# count()             total number of items
# first(), last()     first and last item
# mean(), median()    mean and median
# min(), max          minimum and maximum
# std(), var()        standard deviation and variance
# mad()               mean absolute deviation
# prod()              product of all items
# sum()               sum of all items

# all aggregation methods of 
# DataFrame and Series objects

# GroupBy: Split, Apply, Combine

## Split, apply, combine

In [57]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data' : range(6)}, columns = ['key', 'data'])
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [58]:
df.groupby('key')
# notice that the return is not a set of data frames, 
# but a DataFrameGroupBy object. you can think of it 
# as a special view of the dataframe, which is poised
# to dig into the groups but does nothing until the
# aggregation is applied. 

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6c7c12b430>

In [59]:
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


## The GroupBy object

### Column indexing

In [60]:
planets.groupby('method')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f6c7c12bd90>

In [61]:
planets.groupby('method')['orbital_period']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f6c7c129e70>

In [62]:
planets.groupby('method')['orbital_period'].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

### Iteration over groups

In [63]:
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


### Dispatch methods

In [64]:
planets.groupby('method')['year'].describe().unstack()
# didnt work as the book said it would
# oh nevermind it kind of did yes

       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

## Aggregate, filter, transform, apply

In [None]:
# for the purpose of the following subsections,
# we'll use this data frame:

rng = np.random.RandomState(0)
df = pd.DataFrame({'key' : ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
# columns get the name even though you dont include
# that last part so it must have an other utility
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


### Aggregation

In [67]:
# the aggregate method allows us to take multiple
# object types and compute all the aggregates at once
df.groupby('key').aggregate(['min', np.median, max])

  df.groupby('key').aggregate(['min', np.median, max])
  df.groupby('key').aggregate(['min', np.median, max])


Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [None]:
df.groupby('key')

KeyError: 'Column not found: A'