In [20]:
import numpy as np
import pandas as pd
import string
import calendar
import re
import math
import sympy
import matplotlib.pyplot as plt

from fractions import Fraction
from scipy import stats
from scipy import special


print(f"numpy version is {np.__version__}")
print(f"pandas version is {pd.__version__}")

numpy version is 1.20.3
pandas version is 1.3.1


### Mean of a normal RV

#### Known variance

In [138]:
mu_0 = 37.5
alpha = 0.05
sigma = math.sqrt(9.375)

# data = pd.Series(np.fromstring('72 68.1 69.2 72.8 71.2 72.2 70.8 74 66 70.3 70.4 76 72.5 74 71.8 69.6 75.6 70.6 76.2 77', dtype = float, sep = ' '))
# xBar = data.mean()
# n = data.shape[0]

xBar = 41.5
n = 1

# Two-sided
testStat = math.sqrt(n) * abs(xBar - mu_0) / sigma
pVal = 2*(1 - stats.norm.cdf(testStat))

# One-sided-lower
# testStat = math.sqrt(n) * (xBar - mu_0) / sigma
# pVal = stats.norm.cdf(testStat)

# One-sided-upper
# testStat = math.sqrt(n) * (xBar - mu_0) / sigma
# pVal = (1 - stats.norm.cdf(testStat))

beta = stats.norm.cdf((mu_0  - xBar) * math.sqrt(n) / sigma + stats.norm.ppf(1 - alpha/2))

outputs = {'Test Statistic': testStat, 'p value': 100*pVal, 'Alpha': 100*alpha, 'Sample Mean': xBar, 'Power function': 1 - beta}
pd.Series(outputs)

Test Statistic     1.306395
p value           19.141843
Alpha              5.000000
Sample Mean       41.500000
Power function     0.256695
dtype: float64

#### unknown variance

In [46]:
mu_0 = 100
alpha = 0.05


data = pd.Series(np.fromstring('96,98,105,92,111,114,99,103,95,101,106,97', dtype = float, sep = ','))
xBar = data.mean()
n = data.shape[0]
sample_std = data.std()

# xBar = 200
# n = 64
# sample_std = 35

# Two-sided
# testStat = math.sqrt(n) * abs(xBar - mu_0) / sample_std
# pVal = 2*(1 - stats.t.cdf(testStat, n-1))

# One-sided-lower
testStat = math.sqrt(n) * (xBar - mu_0) / sample_std
pVal = stats.t.cdf(testStat, n-1)

# One-sided-upper
# testStat = math.sqrt(n) * (xBar - mu_0) / sample_std
# pVal = (1 - stats.t.cdf(testStat, n-1))

beta = stats.t.cdf((mu_0  - xBar) * math.sqrt(n) / sample_std + stats.t.ppf(1 - alpha/2, n-1), n-1)

outputs = {'Test Statistic': testStat, 'p value': 100*pVal, 'Alpha': 100*alpha, 'Sample Mean': xBar, 'Sample Std': sample_std, 'Power function': 1 - beta}
pd.Series(outputs)

Test Statistic      0.740659
p value            76.279314
Alpha               5.000000
Sample Mean       101.416667
Sample Std          6.625822
Power function      0.086083
dtype: float64

### Equality of Means of two normal RVs

#### variances known

In [144]:
# data1 = pd.Series(np.fromstring('122,114,130,165,144,133,139,142,150', dtype = float, sep = ','))
# data2 = pd.Series(np.fromstring('108,125,122,140,132,120,137,128,138', dtype = float, sep = ','))
# xBar, yBar = data1.mean(), data2.mean()
# n, m = data1.shape[0], data2.shape[0]

xBar, yBar = 39/72, 44/84
n, m = 72, 84

pooled_p = (n*xBar + m*yBar) / (n+m)


s1, s2 = 10, 5
alpha = 0.05


std_comp = math.sqrt(pooled_p * (1 - pooled_p) * (1/n + 1/m))
# std_comp = math.sqrt(s1**2 / n + s2**2 / m)

# Two-sided
testStat = abs(xBar - yBar) / std_comp
pVal = 2*(1 - stats.norm.cdf(testStat))

# One-sided-lower
# testStat = (xBar - yBar) / std_comp
# pVal = stats.norm.cdf(testStat)

# One-sided-upper
# testStat = (xBar - yBar) / std_comp
# pVal = (1 - stats.norm.cdf(testStat))

outputs = {'Test Statistic': testStat, 'p value': 100*pVal, 'Alpha': 100*alpha, 'Sample Mean Diff': xBar - yBar, 'Sample Comp Std': std_comp}
pd.Series(outputs)


Test Statistic       0.222833
p value             82.366541
Alpha                5.000000
Sample Mean Diff     0.017857
Sample Comp Std      0.080137
dtype: float64

In [148]:
(xBar - yBar) / std_comp

0.22283309232919832

#### Variances unknown but equal

In [81]:
# data1 = pd.Series(np.fromstring('32.2,27.4,28.6,32.4,40.5,26.2,29.4,25.8,36.6,30.3,28.5,32.0', dtype = float, sep = ','))
# data2 = pd.Series(np.fromstring('30.5,28.4,40.2,37.6,36.5,38.8,34.7,29.5,29.7,37.2,41.5,37.0', dtype = float, sep = ','))
# xBar, yBar = data1.mean(), data2.mean()
# n, m = data1.shape[0], data2.shape[0]
# vx, vy = data1.var(), data2.var()

xBar, yBar = 6.8, 7.2
n, m = 53, 44
vx, vy = 5.2, 4.9

alpha = 0.01
std_pooled = math.sqrt(((n-1)*vx + (m-1)*vy)/ (n+m-2))

# Two-sided
testStat = abs(xBar - yBar) / (std_pooled * math.sqrt(1/n + 1/m))
pVal = 2*(1 - stats.t.cdf(testStat, n+m-2))

# One-sided-lower
# testStat = (xBar - yBar) / (std_pooled * math.sqrt(1/n + 1/m))
# pVal = stats.t.cdf(testStat, n+m-2)

# One-sided-upper
# testStat = (xBar - yBar) / (std_pooled * math.sqrt(1/n + 1/m))
# pVal = (1 - stats.t.cdf(testStat, n+m-2))

outputs = {'Test Statistic': testStat, 'p value': 100*pVal, 'Alpha': 100*alpha, 'Sample Mean Diff': xBar - yBar, 'Sample Comp Std': std_comp}
pd.Series(outputs)


Test Statistic       0.871530
p value             38.566185
Alpha                1.000000
Sample Mean Diff    -0.400000
Sample Comp Std      3.726780
dtype: float64

#### paired t-test

In [98]:
data1 = pd.Series(np.fromstring('74 86 98 102 78 84 79 70', dtype = float, sep = ' '))
data2 = pd.Series(np.fromstring('70 85 90 110 71 80 69 74', dtype = float, sep = ' '))
data3 = data1 - data2
wBar = data3.mean()
n = data3.shape[0]
std_w = data3.std()

# xBar, yBar = 0.015, 0.006
# wBar = xBar - yBar
# n = 33
# vx, vy = 0.004**2, 0.006**2
# std_w = math.sqrt(vx + vy)

alpha = 0.05


# Two-sided
testStat = abs(wBar) / (std_w / math.sqrt(n))
pVal = 2*(1 - stats.t.cdf(testStat, n-1))

# One-sided-lower
# testStat = (xBar - yBar) / (std_w / math.sqrt(n))
# pVal = stats.t.cdf(testStat, n-1)

# One-sided-upper
# testStat = (xBar - yBar) / (std_w / math.sqrt(n))
# pVal = (1 - stats.t.cdf(testStat, n-1))

outputs = {'Test Statistic': testStat, 'p value': 100*pVal, 'Alpha': 100*alpha, 'Sample Mean Diff': wBar, 'Sample Comp Std': std_comp}
pd.Series(outputs)

Test Statistic       1.262974
p value             24.703941
Alpha                5.000000
Sample Mean Diff     2.750000
Sample Comp Std      3.726780
dtype: float64

### Variance tests

#### Variance and mean unknown

In [5]:
alpha = 0.01
sigma_0 = 0.4

data = pd.Series(np.fromstring('5.728 5.731 5.722 5.719 5.727 5.724 5.718 5.726 5.723 5.722', dtype = float, sep = ' '))
n = data.shape[0]
sample_std = data.std()

# sample_std = 0.08
# n = 50

# Two-sided
# testStat = (n-1) * sample_std ** 2 / sigma_0 ** 2
# pVal = 2*min(1 - stats.chi2.cdf(testStat, n-1), stats.chi2.cdf(testStat, n-1))

# One-sided-lower
testStat = (n-1) * sample_std ** 2 / sigma_0 ** 2
pVal = stats.chi2.cdf(testStat, n-1)

# One-sided-upper
# testStat = (n-1) * sample_std ** 2 / sigma_0 ** 2
# pVal = 1 - stats.chi2.cdf(testStat, n-1)

# beta = stats.norm.cdf((mu_0  - xBar) * math.sqrt(n) / sigma + stats.norm.ppf(1 - alpha/2))

outputs = {'Test Statistic': testStat, 'p value': 100*pVal, 'Alpha': 100*alpha, 'Sample Std': sample_std}
pd.Series(outputs)

Test Statistic    9.250000e-04
p value           1.879238e-15
Alpha             1.000000e+00
Sample Std        4.055175e-03
dtype: float64

#### Equality of variances of two normal populations

In [13]:
alpha = 0.05

# data1 = pd.Series(np.fromstring('10.62,10.58,10.33,10.72,10.44,10.74', dtype = float, sep = ','))
# data2 = pd.Series(np.fromstring('10.50,10.52,10.58,10.62,10.55,10.51,10.53', dtype = float, sep = ','))
# n, m = data1.shape[0], data2.shape[0]
# sample_std_a, sample_std_b = data1.std(), data2.std()

sample_std_a, sample_std_b = math.sqrt((91-75*0.948**2)/74), math.sqrt((82-75*0.652**2)/74)
# sample_std_a, sample_std_b = 0.08
n, m = 75, 75

# Two-sided
# testStat = sample_std_a ** 2 / sample_std_b ** 2
# pVal = 2*min(1 - stats.f.cdf(testStat, n-1, m-1), stats.f.cdf(testStat, n-1, m-1))

# One-sided-lowerat
testStat = sample_std_a ** 2 / sample_std_b ** 2
pVal = stats.f.cdf(testStat, n-1, m-1)

# One-sided-upper
# testStat = sample_std_a ** 2 / sample_std_b ** 2
# pVal = 1 - stats.f.cdf(testStat, n-1, m-1)

# beta = stats.norm.cdf((mu_0  - xBar) * math.sqrt(n) / sigma + stats.norm.ppf(1 - alpha/2))

outputs = {'Test Statistic': testStat, 'p value': 100*pVal, 'Alpha': 100*alpha, 'Sample Std A': sample_std_a, 'Sample Std B': sample_std_b}
pd.Series(outputs)

Test Statistic    0.470840
p value           0.070646
Alpha             5.000000
Sample Std A      0.564696
Sample Std B      0.822958
dtype: float64

### Bernoulli parameter tests

#### Equality of two bernoulli parameters

In [152]:
N, M = 286, 310
X, Y = 252, 270

alpha = 0.05

# Two-sided
testStat = stats.hypergeom.pmf(X, N+M, N, X+Y)
pSum = 0
for k in range(X+Y+1):
    if stats.hypergeom.pmf(k, N+M, N, X+Y) <= stats.hypergeom.pmf(X, N+M, N, X+Y):
        pSum += stats.hypergeom.pmf(k, N+M, N, X+Y)

outputs = {'Test Statistic': testStat, 'p value': 100*pSum, 'Alpha': 100*alpha, 'pBar': 100*X/N, 'qBar': 100*Y/M}
pd.Series(outputs)

Test Statistic     0.092321
p value           80.380621
Alpha              5.000000
pBar              88.111888
qBar              87.096774
dtype: float64

#### Single Bernoulli parameter test

In [140]:
p_0 = 0.5
X = 72
N = 156
alpha = 0.05

# Two-sided
testStat = stats.binom.pmf(X, N, p_0)
pSum = 0
for k in range(N+1):
    if stats.binom.pmf(k, N, p_0) <= stats.binom.pmf(X, N, p_0):
        pSum += stats.binom.pmf(k, N, p_0)
        
# One-sided upper
# testStat = stats.binom.pmf(X, N, p_0)
# pSum = 1 - stats.binom.cdf(X-1, N, p_0)
        
# One-sided lower
# testStat = stats.binom.pmf(X, N, p_0)
# pSum = stats.binom.cdf(X, N, p_0)

outputs = {'Test Statistic': testStat, 'p value': 100*pSum, 'Alpha': 100*alpha, 'pBar': 100*X/N}
pd.Series(outputs)

Test Statistic     0.040302
p value           37.853702
Alpha              5.000000
pBar              46.153846
dtype: float64

### Poisson parameter test

#### Single Poisson paramter test

In [161]:
# data = pd.Series(np.fromstring('46, 62, 60, 58, 47, 50, 59, 49', dtype = float, sep = ', '))
# xBar = data.mean()

xBar = 27

x_0 = 6.7


alpha = 0.05

# Two-sided
testStat = stats.poisson.pmf(round(xBar), x_0)
p = 2*min(stats.poisson.cdf(xBar, x_0), 1 - stats.poisson.cdf(xBar, x_0))
        
# One-sided upper
# testStat = stats.binom.pmf(X, N, p_0)
# pSum = 1 - stats.binom.cdf(X-1, N, p_0)
        
# One-sided lower
# testStat = stats.binom.pmf(X, N, p_0)
# pSum = stats.binom.cdf(X, N, p_0)

outputs = {'Test Statistic': testStat, 'p value': 100*p, 'Alpha': 100*alpha, 'xBar': xBar}
pd.Series(outputs)

Test Statistic    2.276480e-09
p value           1.412710e-07
Alpha             5.000000e+00
xBar              2.700000e+01
dtype: float64

#### comparing Poisson parameters

In [170]:
data1 = pd.Series(np.fromstring('24,32,29,33,40,28,34,36', dtype = float, sep = ','))
data2 = pd.Series(np.fromstring('42,36,41', dtype = float, sep = ','))

n, m = data1.shape[0], data2.shape[0]
c = 1
xBar = 27

x_0 = 6.7

p_pooled = (n/m) / (c + n/m)
alpha = 0.05

# Two-sided
testStat = stats.binom.pmf(n, n+m, p_pooled)
p = 2*min(stats.binom.pmf(n, n+m, p_pooled), 1 - stats.binom.pmf(n, n+m, p_pooled))
        
# One-sided upper
# testStat = stats.binom.pmf(X, N, p_0)
# pSum = 1 - stats.binom.cdf(X-1, N, p_0)
        
# One-sided lower
# testStat = stats.binom.pmf(X, N, p_0)
# pSum = stats.binom.cdf(X, N, p_0)

outputs = {'Test Statistic': testStat, 'p value': 100*p, 'Alpha': 100*alpha, 'xBar': xBar}
pd.Series(outputs)

Test Statistic     0.261968
p value           52.393579
Alpha              5.000000
xBar              27.000000
dtype: float64

In [171]:
p_pooled

0.7272727272727273