In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
planets = sns.load_dataset('planets') # Fetched online by load_dataset, so this cell needs network access.
planets.shape  # (rows, columns)

(1035, 6)

In [5]:
# Column labels of the planets frame.
planets.columns

Index(['method', 'number', 'orbital_period', 'mass', 'distance', 'year'], dtype='object')

In [6]:
# Preview the first five rows.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


## For a DataFrame, `sum`, `mean`, etc. are calculated for each column by default

In [7]:
# Seed a dedicated generator so the random demo frame (and every number
# derived from it below) is identical on each Restart & Run All.
rng = np.random.RandomState(42)
df = pd.DataFrame({'A': rng.rand(5), 'B': rng.rand(5)})
df

Unnamed: 0,A,B
0,0.803553,0.147392
1,0.791597,0.726461
2,0.945554,0.281477
3,0.934171,0.166317
4,0.924644,0.202168


In [8]:
# With no arguments, sum() returns one total per column.
df.sum()

A    4.399519
B    1.523815
dtype: float64

In [9]:
# Likewise, mean() returns one average per column.
df.mean()

A    0.879904
B    0.304763
dtype: float64

## We can aggregate for each row instead by passing `axis=1`; `axis=0` (the default) aggregates each column

### Note:

<blockquote>A DataFrame object has two axes: “axis 0” and “axis 1”. “axis 0” represents rows and “axis 1” represents columns.</blockquote>

## Note: The axis 0 or 1 decides the direction things will be computed.

## 0 is Downward direction (as rows ascend downwards from top to bottom)
## 1 is left to right (as columns advance)

In [10]:
df.sum(axis=0) # Same as axis='index': sums down the rows, giving one total per column.

A    4.399519
B    1.523815
dtype: float64

In [11]:
# There is a convenient 'describe' method that reports the common summary
# statistics (count, mean, std, min, quartiles, max) for the DataFrame's
# numeric columns in one call. Rows with missing values are dropped first.

planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [12]:
# Cross-check the 'mean' row reported by describe().
# numeric_only=True skips the string 'method' column; without it pandas >= 2.0
# raises TypeError instead of silently dropping non-numeric columns.
planets.dropna().mean(axis=0, numeric_only=True)

number               1.734940
orbital_period     835.778671
mass                 2.509320
distance            52.068213
year              2007.377510
dtype: float64

Table of available aggregation methods

## **Aggregation - Description**
- `count()`: Total number of items
- `first()`, `last()`: First and last item
- `mean()`, `median()`: Mean and median
- `min()`, `max()`: Minimum and maximum
- `std()`, `var()`: Standard deviation and variance
- `mad()`: Mean absolute deviation (deprecated in pandas 1.5 and removed in pandas 2.0)
- `prod()`: Product of all items
- `sum()`: Sum of all items


# GroupBy: Split, Apply, Combine

In [13]:
# Six rows: the keys A, B, C repeated twice, paired with the integers 0..5.
df = pd.DataFrame({
    'key': list('ABC') * 2,
    'data': range(6),
})
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [14]:
# Build a GroupBy object keyed on the 'key' column; aggregations and other
# per-group computations are then called on this object.

gr_df = df.groupby('key')

In [15]:
# Sum the 'data' values within each key group; the keys become the index.
gr_df.sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [16]:
# Group the complete rows (those without any NaN) by the 'year' column,
# then show the per-year counts of every other column for the first five years.
planets_year_gr = planets.dropna().groupby('year')
planets_year_gr.count().head()

Unnamed: 0_level_0,method,number,orbital_period,mass,distance
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1989,1,1,1,1,1
1995,1,1,1,1,1
1996,4,4,4,4,4
1997,1,1,1,1,1
1998,5,5,5,5,5


In [17]:
# Rank the years by how many records they have, largest first, and show the top five.
# The sort key is the per-year *count* of 'distance' entries, not the distance values.
# Sorting descending directly replaces the ascending sort followed by a [::-1] reversal.

planets_year_gr.count().sort_values('distance', ascending=False).head()

Unnamed: 0_level_0,method,number,orbital_period,mass,distance
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011,90,90,90,90,90
2009,70,70,70,70,70
2008,43,43,43,43,43
2010,41,41,41,41,41
2005,34,34,34,34,34


In [18]:
# We can also aggregate just the column we are interested in.
# Selecting the column *before* calling mean() averages only 'orbital_period';
# averaging every column first does unnecessary work and raises TypeError on
# pandas >= 2.0 because the string 'method' column cannot be averaged.

orbital_period_series = planets_year_gr['orbital_period'].mean()
orbital_period_series

year
1989      83.888000
1995       4.230785
1996     275.145383
1997      39.845000
1998     114.310661
1999     591.477789
2000     572.027671
2001     792.019727
2002    1003.144581
2003     789.289313
2004     787.419733
2005     620.300633
2006     785.564764
2007     478.827107
2008     741.466230
2009    1381.749559
2010     819.593669
2011     603.793312
2012    1965.938509
2013     357.501831
2014    3639.500000
Name: orbital_period, dtype: float64

### As you can see, what we get is a Series.

In [19]:
# Confirm the aggregated single column is a Series, not a DataFrame.
type(orbital_period_series)

pandas.core.series.Series

In [20]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs.
for year, year_frame in planets_year_gr:
    frame_shape = year_frame.shape
    print('{0:10} shape={1}'.format(year, frame_shape))

# Note: the row count in each shape matches what count() reported per year.
# For example, shape=(90, 6) means 90 records/rows with 6 values each.

      1989 shape=(1, 6)
      1995 shape=(1, 6)
      1996 shape=(4, 6)
      1997 shape=(1, 6)
      1998 shape=(5, 6)
      1999 shape=(14, 6)
      2000 shape=(14, 6)
      2001 shape=(11, 6)
      2002 shape=(31, 6)
      2003 shape=(22, 6)
      2004 shape=(15, 6)
      2005 shape=(34, 6)
      2006 shape=(20, 6)
      2007 shape=(27, 6)
      2008 shape=(43, 6)
      2009 shape=(70, 6)
      2010 shape=(41, 6)
      2011 shape=(90, 6)
      2012 shape=(23, 6)
      2013 shape=(29, 6)
      2014 shape=(2, 6)


## Dispatch Methods.

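### Methods that the GroupBy object does not implement itself are dispatched to each group, and the per-group results are combined automatically. E.g.: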
- describe

In [21]:
# Number of non-null 'number' entries for each value of 'method'.
planets.groupby('method')['number'].count()

method
Astrometry                         2
Eclipse Timing Variations          9
Imaging                           38
Microlensing                      23
Orbital Brightness Modulation      3
Pulsar Timing                      5
Pulsation Timing Variations        1
Radial Velocity                  553
Transit                          397
Transit Timing Variations          4
Name: number, dtype: int64

In [22]:
# describe() runs per group: summary statistics of 'year' for each method.
planets.groupby('method')['year'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


In [23]:
planets_year_gr['distance'].describe()

# Note: std is NaN wherever count is 1, because a sample standard deviation
# cannot be computed from a single observation.

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1989,1.0,40.57,,40.57,40.57,40.57,40.57,40.57
1995,1.0,15.36,,15.36,15.36,15.36,15.36,15.36
1996,4.0,13.92,1.288746,12.53,13.235,13.775,14.46,15.6
1997,1.0,17.43,,17.43,17.43,17.43,17.43,17.43
1998,5.0,26.302,16.855839,4.7,19.72,21.29,37.88,47.92
1999,14.0,29.795,14.260006,10.91,17.9025,29.35,38.58,59.03
2000,14.0,28.34,12.821017,3.22,20.4425,32.865,36.95,42.43
2001,11.0,37.212727,18.877052,14.08,23.035,34.6,43.74,77.82
2002,31.0,44.396129,23.294079,12.53,32.285,37.44,49.145,121.36
2003,22.0,44.505909,30.735342,11.11,28.22,36.42,50.07,133.16


## Aggregate, filter, transform, apply

#### We have more methods available with the group by object.

In [24]:
# Small deterministic frame used by the aggregation examples that follow.

rng = np.random.RandomState(0)  # fixed seed keeps 'data2' reproducible
df = pd.DataFrame(
    {
        'key': list('ABC') * 2,
        'data1': range(6),
        'data2': rng.randint(0, 10, 6),
    },
    columns=['key', 'data1', 'data2'],
)
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


### Aggregation

In [25]:
# Several aggregations can be requested at once by passing a list.
# String aliases are used for all three: passing np.median or the builtin max
# is deprecated since pandas 2.1 (FutureWarning), while the string names select
# the same GroupBy implementations and produce the same column labels.

df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [26]:
# A dict maps each column name to the aggregation applied to that column.

per_column_ops = {'data1': 'min', 'data2': 'max'}
df.groupby('key').agg(per_column_ops)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [35]:
# Mask the per-key sums: entries that are not greater than 5 become NaN,
# and those NaN values are then replaced with 0.
df_sum = df.groupby('key').sum()
above_five = df_sum > 5
df_sum[above_five].fillna(0)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.0,8
B,0.0,7
C,7.0,12


## Transformation

#### We can generate new values using some calculation.


In [36]:
# Centre every column within its key group by subtracting the group mean;
# transform() returns a result with the same index as the input frame.
df.groupby('key').transform(lambda col: col - col.mean())

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


## Apply

#### The `apply()` method calls an arbitrary function on each group's DataFrame and combines the returned results.

In [39]:
def multiply_by(x):
    """Return a copy of ``x`` with ``data1`` replaced by ``data2 * 100 / pi``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (for example one group from ``groupby``) containing a
        ``data2`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column order as ``x``; the
        input frame is left unmodified.
    """
    # assign() builds a new frame rather than writing into the object that
    # groupby.apply passes in; mutating that object in place is discouraged
    # by pandas and can trigger SettingWithCopyWarning.
    return x.assign(data1=(x['data2'] * 100) / np.pi)
    
# Call multiply_by on each key group's sub-frame and combine the returned frames.
df.groupby('key').apply(multiply_by)

Unnamed: 0,key,data1,data2
0,A,159.154943,5
1,B,0.0,0
2,C,95.492966,3
3,A,95.492966,3
4,B,222.81692,7
5,C,286.478898,9
