In [1]:
# The first step in exploring any data set is often to compute various summary statistics.
# NumPy has fast built-in aggregation functions for working on arrays.

In [5]:
import numpy as np
# Random generator created without a seed, so the values drawn below differ from run to run
rng = np.random.default_rng()

# computing the sum of all values in an array
# x holds 100 random floats; the same array is summed two ways and each is timed
x = rng.random(100)
# Python's builtin sum
%timeit sum(x)
# NumPy's aggregation function, which is the quicker one in the timings recorded for this cell
%timeit np.sum(x) # numpy version is faster



5.5 µs ± 92.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
1.64 µs ± 29.5 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [9]:
# Minimum and maximum work the same way as sum, here on a larger sample
x = rng.random(size=100_000)

# Python's builtins give the right answer ...
(min(x), max(x))
# ... but the NumPy functions operate faster
(np.min(x), np.max(x))

# The aggregations are also available as methods on the array object itself.
# Only this final expression is echoed as the cell's output.
(x.min(), x.max(), x.sum())

(1.2472657524353892e-05, 0.9999876872761552, 50090.194747878915)

In [16]:
# Aggregations can be taken along a single axis of a 2-D array.
# x is a 3x4 array of random integers drawn from 0 (inclusive) to 10 (exclusive).
x = rng.integers(low=0, high=10, size=(3, 4))

# axis=0 collapses the rows: one sum per column (4 values)
x.sum(axis=0)
# axis=1 collapses the columns: one sum per row (3 values); this is the cell's output
x.sum(axis=1)

array([13, 28, 28])

In [17]:
# Other aggregation functions available in NumPy.
# The second column lists the NaN-safe counterpart of each function, where one exists.
# Note: this is a bare string literal used as a reference table; because it is the
# last expression in the cell, its repr is echoed as the cell's output.
''' 
Name           Nan Safe         Description
np.sum         np.nansum        Compute sum of elements
np.prod        np.nanprod       Compute product of elements
np.mean        np.nanmean       Compute mean of elements
np.std         np.nanstd        Compute standard deviation
np.var         np.nanvar        Compute variance
np.min         np.nanmin        Find minimum value
np.max         np.nanmax        Find maximum value
np.argmin      np.nanargmin     Find index of minimum value
np.argmax      np.nanargmax     Find index of maximum value
np.median      np.nanmedian     Compute median of elements
np.percentile  np.nanpercentile Compute rank-based statistics of elements
np.any         N/A              Evaluate whether any elements are true
np.all         N/A              Evaluate whether all elements are true
'''

' \nName           Nan Safe         Description\nnp.sum         np.nansum        Compute sum of elements\nnp.prod        np.nanprod       Compute product of elements\nnp.mean        np.nanmean       Compute mean of elements\nnp.std         np.nanstd        Compute standard deviation\nnp.var         np.nanvar        Compute variance\nnp.min         np.nanmin        Find minimum value\nnp.max         np.nanmax        Find maximum value\nnp.argmin      np.nanargmin     Find index of minimum value\nnp.argmax      np.nanargmax     Find index of maximum value\nnp.median      np.nanmedian     Compute median of elements\nnp.percentile  np.nanpercentile Compute rank-based statistics of elements\nnp.any         N/A              Evaluate whether any elements are true\nnp.all         N/A              Evaluate whether all elements are true\n'

In [22]:
# Example: what is the average height of the US presidents?
# Load the heights table from a CSV file into a DataFrame; the relative path is
# resolved against the current working directory, so run this from the folder containing "Data/".
import pandas as pd
data = pd.read_csv("Data/president_heights.csv")

In [25]:
# Copy the "height(cm)" column out of the DataFrame into a standalone NumPy array,
# then echo the array as the cell's output.
heights = np.array(data.loc[:, "height(cm)"])
heights

array([189, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173, 175,
       178, 183, 193, 178, 173, 174, 183, 183, 168, 170, 178, 182, 180,
       183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177, 185, 188,
       188, 182, 185, 191, 182])

In [26]:
# Summary statistics of the heights array (centimetres, per the source column name).
# np.std uses its default ddof=0, i.e. the population standard deviation.
print("Mean height:", np.mean(heights))
print("Standard deviation:", np.std(heights))
print("Minimum height:", np.min(heights))
print("Maximum height:", np.max(heights))

# Quartiles of the distribution
print("25th percentile:", np.percentile(heights, q=25))
print("Median:", np.median(heights))
print("75th percentile:", np.percentile(heights, q=75))


Mean height: 180.04545454545453
Standard deviation: 6.983599441335736
Minimum height: 163
Maximum height: 193
25th percentile: 174.75
Median: 182.0
75th percentile: 183.5
