In [2]:
import pandas as pd
# Zipped vehicles CSV hosted in the mattharrison/datasets GitHub repository.
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
# low_memory=False parses the file in a single pass instead of chunk by chunk,
# which avoids columns ending up with mixed inferred dtypes.
df = pd.read_csv(url, low_memory=False)
# Keep the 'city08' and 'highway08' columns as standalone Series.
city_mpg, highway_mpg = df['city08'], df['highway08']

## Aggregate Methods
+ Aggregate methods collapse the values of a series down to a scalar. Aggregations are the numbers that your boss wants reported.
+ Aggregations allow you to take detailed data and collapse it to a single value

In [3]:
# Collapse the whole series into one scalar: its arithmetic mean
city_mpg.mean()

18.369045304297103

In [4]:
# Aggregate properties: attributes (no call parentheses) that also reduce
# the series to a single value. is_unique reports whether no value repeats.
city_mpg.is_unique

False

In [5]:
# Another aggregate property: True only when the values never decrease
# from one row to the next
city_mpg.is_monotonic_increasing

False

In [8]:
# Quantile with a single level returns a scalar (here the 80th percentile)
city_mpg.quantile(.8)

21.0

In [9]:
# Quantile with a list of levels returns a Series indexed by those levels
city_mpg.quantile([.1, .8, .99])

0.10    13.0
0.80    21.0
0.99    40.0
Name: city08, dtype: float64

## Count and Mean of an Attribute

In [10]:
# Count the cars with mileage > 20: .gt(20) yields booleans,
# and summing booleans counts the True values
(city_mpg
.gt(20)
.sum())

10272

In [11]:
# Percentage of values > 20: .mul(100) turns the booleans into 0/100,
# so their mean is the share of True values expressed as a percent
(city_mpg
.gt(20)
.mul(100)
.mean())

24.965973167412017

## .agg and Aggregation Strings

In [12]:
# The .agg method performs aggregations; given one aggregation name as a
# string it returns a scalar, the same value as calling city_mpg.mean().
# Depending on how it is called, it can also transform the data in other ways.
city_mpg.agg('mean')

18.369045304297103

In [13]:
# Where .agg shines is in the ability to perform multiple aggregations
import numpy as np
def second_to_last(s):
    """Return the element one position before the end of ``s``.

    The lookup is positional (``.iloc``), so the index labels of ``s`` do
    not matter. Raises ``IndexError`` when ``s`` has fewer than two elements.
    """
    penultimate = s.iloc[-2]
    return penultimate

In [14]:
# Several aggregations at once: each list entry becomes one row of the result.
# 'var' and 'max' are passed as strings rather than np.var / the builtin max:
# pandas currently replaces those callables with Series.var / Series.max but
# warns that it will stop doing so, after which np.var (ddof=0) would no
# longer produce the sample variance shown in the output.
# A custom function is labelled by its own name (second_to_last).
city_mpg.agg(['mean', 'var', 'max', second_to_last])

mean               18.369045
var                62.503036
max               150.000000
second_to_last     18.000000
Name: city08, dtype: float64