# Statistics

In [92]:
import numpy as np
import pandas as pd
import statistics
import random
import scipy

## Measures of Central Tendency (Mean, Median and Mode)

### Mean - O(n)

In [93]:
a = np.array([3, 6, 2,8,5])
np.mean(a)

4.8

In [94]:
a = np.array([[1,2,3], [4,5,6], [7,8,9]])
np.mean(a)

5.0

In [95]:
np.mean(a, axis=0) #column-wise

array([4., 5., 6.])

In [96]:
np.mean(a, axis=1)

array([2., 5., 8.])

In [97]:
df = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])
df.mean() #df.mean(axis=0)

0    4.0
1    5.0
2    6.0
dtype: float64

In [98]:
df.mean(axis=1)

0    2.0
1    5.0
2    8.0
dtype: float64

### Median - O(n log n) via sorting
Value in the middle of the ascending ordered array:
- Odd: Element in the middle
- Even: Average of the two elements in the middle

#### Numpy

In [99]:
a = np.array([[1,5,9,4],[3,8,4,7], [4,9,3,8]])

In [100]:
np.median(a)

4.5

In [101]:
np.median(a, axis=0)

array([3., 8., 4., 7.])

In [102]:
np.median(a, axis=1)

array([4.5, 5.5, 6. ])

#### Pandas

In [103]:
df = pd.DataFrame([[1,5,9,4],[3,8,4,7], [4,9,3,8]])

In [104]:
df.median() #=axis=0

0    3.0
1    8.0
2    4.0
3    7.0
dtype: float64

In [105]:
df.median(axis=1)

0    4.5
1    5.5
2    6.0
dtype: float64

### Mode
The most frequently occurring value in a data set.

In [106]:
a = [3, 5, 7, 3, 8, 150, 4, 7,  1, 7]

In [107]:
statistics.mode(a)

7

In [108]:
a = [1, 2, 2, 1, 3, 3, 6, 8, 6]

In [109]:
statistics.multimode(a)

[1, 2, 3, 6]

In [110]:
a = np.random.randint(0, 10, (10, 10))
print(a)

[[2 5 9 4 4 6 3 3 7 5]
 [0 2 3 2 4 9 8 0 3 6]
 [6 5 3 1 0 0 5 9 7 6]
 [3 6 1 7 4 6 0 2 4 7]
 [2 3 5 2 5 4 7 7 5 3]
 [8 0 8 8 9 9 2 9 0 4]
 [8 6 3 9 6 6 2 9 0 9]
 [8 3 0 4 7 3 0 7 9 8]
 [8 2 5 9 4 4 7 3 7 7]
 [5 9 9 4 0 8 7 1 9 5]]


In [111]:
mr0 = scipy.stats.mode(a, axis=0)
print(mr0.mode)
print(mr0.count)

[8 2 3 4 4 6 7 9 7 5]
[4 2 3 3 4 3 3 3 3 2]


mr1 = scipy.stats.mode(a, axis=1)
print(mr1.mode)
print(mr1.count)

## Measures of Dispersion

### Standard Deviation
The standard deviation is a measure of the amount of variation of a random variable about its mean. A low standard deviation indicates that the values tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the values are spread out over a wider range. The standard deviation is commonly used to determine what constitutes an outlier and what does not. [Wikipedia]

- **Population Standard Deviation** is calculated from every item in the population (the sum of squared deviations is divided by N).
- **Sample Standard Deviation** is calculated from a sample that is randomly selected from a population (the sum of squared deviations is divided by N-1).

#### Python std lib

In [112]:
a = [9, 4, 5, 2, 2, 6, 6, 8, 0, 8]
statistics.pstdev(a) # Population std dev

2.8284271247461903

In [113]:
statistics.stdev(a) # Samples std dev

2.9814239699997196

#### Numpy

In [114]:
a = np.random.randint(0, 100, (10, 10))

In [115]:
np.std(a, axis=0) # Population std dev for each column ddof=0

array([27.13963891, 26.39337038, 28.54978108, 18.24938355, 28.50263146,
       33.44861731, 31.45727261, 28.46067462, 19.81514572, 29.3318939 ])

In [116]:
np.std(a, axis=0, ddof=1)# Sample std dev (divided by N-ddof) for each column

array([28.60769127, 27.82105518, 30.09411164, 19.23653931, 30.04441157,
       35.25793843, 33.15887681, 30.00018518, 20.88699755, 30.91853094])

#### Pandas

In [117]:
s = pd.Series([1,2,3,4,5])

In [118]:
s.std() # ddof=1 -> Sample std dev

1.5811388300841898

In [119]:
s.std(ddof=0) # Population std dev

1.4142135623730951

In [120]:
df = pd.DataFrame(np.random.randint(0, 100, (10, 10)))

In [121]:
df.std(axis=0)# ddof=1 -> Sample std dev

0    30.089681
1    34.098061
2    30.111644
3    27.855979
4    26.416325
5    30.565231
6    31.714876
7    35.415157
8    22.528007
9    30.776073
dtype: float64

In [122]:
df.std(axis=0, ddof=0) # Population std dev

0    28.545578
1    32.348261
2    28.566414
3    26.426502
4    25.060726
5    28.996724
6    30.087373
7    33.597768
8    21.371944
9    29.196746
dtype: float64

### Variance
Variance is the expected value of the squared deviation of a random variable from its mean.

#### Python std lib

In [123]:
a = np.array([1,2,3,4,5])
statistics.pvariance(a) # population variance

2

In [124]:
# Sample variance (divides by N-1).
# NOTE: the result shown below (2) is truncated. `a` is an integer NumPy array,
# and `statistics` converts its result back to the element type of the input.
# The true sample variance is 2.5, as np.var(a, ddof=1) shows further down;
# pass a plain list or float data to get 2.5 from `statistics` as well.
statistics.variance(a)

2

#### Numpy

In [125]:
np.var(a) # Population variance

2.0

In [126]:
np.var(a, ddof=1) #Samples variance (Bessel's correction)

2.5

#### Pandas

In [127]:
s=pd.Series(a)

In [128]:
s.var() # Samples variance  -> ddof=1

2.5

In [129]:
s.var(ddof=0) # population variance

2.0

In [130]:
b = np.random.randint(0,10,(10,10))

In [131]:
df = pd.DataFrame(b)

In [132]:
df.var(axis=0) # ddof=1 so Samples variance

0     8.322222
1     4.044444
2    10.266667
3    10.488889
4    10.177778
5     8.277778
6     9.822222
7     8.488889
8    11.955556
9     9.122222
dtype: float64

In [133]:
df.var(axis=0, ddof=0) # population variance

0     7.49
1     3.64
2     9.24
3     9.44
4     9.16
5     7.45
6     8.84
7     7.64
8    10.76
9     8.21
dtype: float64

## Probability

In [134]:
def head_tail(n):
    """Simulate ``n`` coin flips and return the fraction that came up heads.

    Each flip is drawn with ``random.randint(0, 1)``, where 1 counts as a head.
    As ``n`` grows the returned ratio tends toward 0.5 (law of large numbers).

    Args:
        n: Number of flips to simulate; must be a positive integer.

    Returns:
        The proportion of heads as a float in the range [0.0, 1.0].

    Raises:
        ValueError: If ``n`` is zero or negative. Without this check, zero
            would raise a bare ZeroDivisionError and a negative count would
            silently return -0.0.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")
    # One randint call per flip, so seeded runs draw the same random stream
    # as an explicit accumulation loop would.
    heads = sum(random.randint(0, 1) for _ in range(n))
    return heads / n

In [139]:
# Law of large numbers
print(head_tail(1))
print(head_tail(10))
print(head_tail(100))
print(head_tail(1000))
print(head_tail(10000))
print(head_tail(100000))
print(head_tail(1000000))

1.0
0.5
0.5
0.51
0.4993
0.49946
0.50095


### Probability Density Function