In [24]:
## Central tendencies
import math
import statistics
from collections import Counter

import numpy as np

In [25]:
def mean(x):
    """Arithmetic average of x: the total of its elements over how many there are."""
    total = sum(x)
    count = len(x)
    return total / count

In [26]:
# Sanity check: the hand-written mean next to the standard library's.
mean([1,2,3]), statistics.mean([1,2,3])

(2.0, 2)

In [27]:
def median(v):
    """Return the central value of v once sorted.

    For an odd number of elements this is the single middle element; for an
    even number it is the average of the two elements straddling the middle.
    """
    count = len(v)
    ordered = sorted(v)
    half, is_odd = divmod(count, 2)

    if is_odd:
        # Exactly one element sits in the middle.
        return ordered[half]

    # Two elements share the middle; average them.
    return (ordered[half - 1] + ordered[half]) / 2

In [28]:
# Sanity check: compare against the standard library's median (not its mean).
median([1,2,3]), statistics.median([1,2,3])

(2, 2)

In [29]:
# A quantile is the value below which a given fraction of the data lies.
def quantile(x, p):
    """Return the value found at fraction p of the way through the sorted data.

    Args:
        x: a non-empty collection of mutually comparable values.
        p: the fraction of the data, a number in [0, 1] (e.g. 0.25 for the
           25th percentile) -- not a 0-100 percentage.

    Returns:
        The element at index int(p * len(x)) of sorted(x). The index is
        clamped to the last position so that p == 1 yields the maximum.

    Raises:
        ValueError: if p lies outside [0, 1].
        IndexError: if x is empty.
    """
    if not 0 <= p <= 1:
        raise ValueError("p must be between 0 and 1 inclusive")
    ordered = sorted(x)
    # int(p * n) equals n when p == 1, which is one past the end; clamp it.
    p_index = min(int(p * len(ordered)), len(ordered) - 1)
    return ordered[p_index]

In [30]:
# Most common values.
def mode(x):
    """Return every most-frequent value in x as a list.

    The list holds more than one element when several values tie for the
    highest count, in the order Counter first saw them. Empty input has no
    mode, so an empty list is returned.
    """
    counts = Counter(x)
    if not counts:
        # max() of an empty sequence would raise; no data means no mode.
        return []
    max_count = max(counts.values())
    return [value for value, count in counts.items()
            if count == max_count]

In [31]:
# Sanity check against the standard library. Note: for data with no unique mode,
# such as [1,2,3], statistics.mode raises StatisticsError on Python < 3.8; from
# 3.8 onwards it returns the first mode encountered instead.
mode([1,2,3,1]), statistics.mode([1,2,3,1])

([1], 1)

In [40]:
# Dispersion refers to measures of how spread out our data is. 
# Range is the difference between the largest and smallest elements.
def data_range(x):
    """Return the gap between the largest and the smallest element of x."""
    largest = max(x)
    smallest = min(x)
    return largest - smallest

In [41]:
def sum_of_squares(v):
    """Return the total of each element of v multiplied by itself."""
    squares = (element * element for element in v)
    return sum(squares)

In [45]:
# Cross-check against NumPy: square element-wise, then sum.
sum_of_squares([1,2,3]), sum(np.square([1,2,3]))

(14, 14)

In [46]:
def de_mean(x):
    """Shift every element of x by the mean of x, so the returned list averages to zero."""
    center = mean(x)
    shifted = []
    for value in x:
        shifted.append(value - center)
    return shifted

In [47]:
def variance(x):
    """Sum of squared deviations from the mean, divided by len(x) - 1.

    x must contain at least two elements, otherwise the divisor is zero.
    """
    count = len(x)
    centered = de_mean(x)
    squared_total = sum_of_squares(centered)
    return squared_total / (count - 1)

In [111]:
# Sample data: X holds the integers -5..4; y holds the same values in
# descending order, so the two move in exactly opposite directions.
X = list(range(-5, 5))
y = sorted(X, reverse=True)

# Cross-check the hand-written variance against np.cov on the same 1-D data.
variance(X), np.cov(X)

(9.166666666666666, array(9.16666667))

In [112]:
def standard_deviation(x):
    """Return the square root of variance(x)."""
    spread = variance(x)
    return math.sqrt(spread)

In [113]:
# Cross-check against the standard library's statistics.stdev.
standard_deviation(X), statistics.stdev(X)

(3.0276503540974917, 3.0276503540974917)

In [114]:
# Difference between the 75th-percentile value and the 25th-percentile value.
def interquatile_range(x):
    """Return quantile(x, 0.75) - quantile(x, 0.25).

    The function name keeps its original misspelling so existing callers keep
    working; the correctly spelled alias `interquartile_range` is bound to
    the same function right after this definition.
    """
    upper = quantile(x, 0.75)
    lower = quantile(x, 0.25)
    return upper - lower


# Correctly spelled name for the same function.
interquartile_range = interquatile_range

In [115]:
def dot(v, w):
    """Return the sum of the pairwise products of v and w.

    Elements are paired with zip, so pairing stops at the shorter input.
    """
    products = (left * right for left, right in zip(v, w))
    return sum(products)

In [116]:
## Correlation

# Variance measures how a single variable deviates from its mean, whereas
# covariance measures how two variables vary in tandem from their means.
def covariance(x, y):
    """Dot product of the mean-centred x and mean-centred y, divided by len(x) - 1."""
    n = len(x)
    x_centered = de_mean(x)
    y_centered = de_mean(y)
    return dot(x_centered, y_centered) / (n - 1)

In [128]:
# Cross-check against entry [0, 1] of the matrix returned by np.cov(X, y).
covariance(X, y), np.cov(X, y)[0, 1]

(-9.166666666666666, -9.166666666666666)

In [118]:
# -1 perfect anti-correlation
# 1 perfect correlation
def correlation(x, y):
    """Covariance of x and y scaled by both standard deviations.

    Returns 0 when either standard deviation is not strictly positive,
    i.e. when one of the inputs shows no variation.
    """
    stdev_x = standard_deviation(x)
    stdev_y = standard_deviation(y)
    if not (stdev_x > 0 and stdev_y > 0):
        return 0 # If no variation, the correlation is zero.
    return covariance(x, y) / stdev_x / stdev_y

In [127]:
# Cross-check against NumPy, using the lowercase `y` defined with the sample
# data (no uppercase `Y` is defined anywhere in these cells).
correlation(X, y), np.corrcoef(X, y)[0, 1]

(-0.9999999999999999, -0.9999999999999999)