## Aggregation on dogs.csv

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read csv and store is as DataFrame
dogs = pd.read_csv('./data/dogs.csv')
dogs

Unnamed: 0,Name,Breed,Color,Height(cm),Weight(kg),Date of Birth
0,Bella,Labrador,Brown,56,25,2013-07-01
1,Charlie,Poodle,Black,43,23,2016-09-16
2,Lucy,Chow,Brown,46,22,2014-08-25
3,Cooper,Schnauzer,Gray,49,17,2011-12-11
4,Max,Labrador,Black,59,29,2017-01-20
5,Stella,Chihuahua,Tan,18,2,2015-04-20
6,Bernie,St. Bernard,White,77,74,2018-02-2


In [5]:
# Find number of rows & columns
dogs.shape

(7, 6)

In [4]:
# Find information about columns
dogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            7 non-null      object
 1   Breed           7 non-null      object
 2    Color          7 non-null      object
 3    Height(cm)     7 non-null      int64 
 4    Weight(kg)     7 non-null      int64 
 5    Date of Birth  7 non-null      object
dtypes: int64(2), object(4)
memory usage: 464.0+ bytes


In [6]:
# Statistical summary
dogs.describe()

Unnamed: 0,Height(cm),Weight(kg)
count,7.0,7.0
mean,49.714286,27.428571
std,17.960274,22.292429
min,18.0,2.0
25%,44.5,19.5
50%,49.0,23.0
75%,57.5,27.0
max,77.0,74.0


In [10]:
# Remove whitespaces in columns
dogs.columns = dogs.columns.str.strip()

In [12]:
# Find mean on column 'Height'
dogs['Height(cm)'].mean()

49.714285714285715

In [14]:
dogs['Height(cm)']

0    56
1    43
2    46
3    49
4    59
5    18
6    77
Name: Height(cm), dtype: int64

In [15]:
# Find min
dogs['Height(cm)'].min()

18

In [16]:
# Find max
dogs['Height(cm)'].max()

77

In [17]:
# Find variance
dogs['Height(cm)'].var()

322.5714285714286

In [18]:
# Find standard deviation
dogs['Height(cm)'].std()

17.960273621841864

In [19]:
# Find sum
dogs['Height(cm)'].sum()

348

In [20]:
# Calculate 30% percentile of "Height(cm)"
dogs['Height(cm)'].agg(lambda x: x.quantile(.3))

45.4

In [21]:
dogs

Unnamed: 0,Name,Breed,Color,Height(cm),Weight(kg),Date of Birth
0,Bella,Labrador,Brown,56,25,2013-07-01
1,Charlie,Poodle,Black,43,23,2016-09-16
2,Lucy,Chow,Brown,46,22,2014-08-25
3,Cooper,Schnauzer,Gray,49,17,2011-12-11
4,Max,Labrador,Black,59,29,2017-01-20
5,Stella,Chihuahua,Tan,18,2,2015-04-20
6,Bernie,St. Bernard,White,77,74,2018-02-2


In [23]:
# Calculate 30% percentile of "Height(cm)" & "Weight(kg)"
dogs[['Height(cm)', 'Weight(kg)']].agg(lambda x: x.quantile(0.3))

Height(cm)    45.4
Weight(kg)    21.0
dtype: float64

In [24]:
# Find 30% & 40% percentile of column 'Height'
def pct30(col): return col.quantile(0.3)
def pct40(col): return col.quantile(0.4)

dogs['Height(cm)'].agg([pct30, pct40])

pct30    45.4
pct40    47.2
Name: Height(cm), dtype: float64

In [26]:
# Find cumulative product of "Height"
dogs['Height(cm)'].cumprod()

0              56
1            2408
2          110768
3         5427632
4       320230288
5      5764145184
6    443839179168
Name: Height(cm), dtype: int64

In [27]:
dogs

Unnamed: 0,Name,Breed,Color,Height(cm),Weight(kg),Date of Birth
0,Bella,Labrador,Brown,56,25,2013-07-01
1,Charlie,Poodle,Black,43,23,2016-09-16
2,Lucy,Chow,Brown,46,22,2014-08-25
3,Cooper,Schnauzer,Gray,49,17,2011-12-11
4,Max,Labrador,Black,59,29,2017-01-20
5,Stella,Chihuahua,Tan,18,2,2015-04-20
6,Bernie,St. Bernard,White,77,74,2018-02-2


In [30]:
# Find the average 'Weight' of dogs with 'Color'
dogs.groupby('Color')['Weight(kg)'].mean()

Color
 Black    26.0
 Brown    23.5
 Gray     17.0
 Tan       2.0
 White    74.0
Name: Weight(kg), dtype: float64

In [39]:
# Caculate total number, min & max weight with different dogs color
dogs.groupby('Color')['Weight(kg)'].agg(['count', 'min', 'max'])

Unnamed: 0_level_0,count,min,max
Color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Black,2,23,29
Brown,2,22,25
Gray,1,17,17
Tan,1,2,2
White,1,74,74
