In [2]:
import numpy as np
import pandas as pd
import string
import calendar
import re
import math
import sympy
import matplotlib.pyplot as plt
import pprint
import itertools

from fractions import Fraction
from scipy import stats
from scipy import special


print(f"numpy version is {np.__version__}")
print(f"pandas version is {pd.__version__}")


numpy version is 1.20.3
pandas version is 1.3.2


### WIlcoxon signed rank test - recursive p-value

In [25]:
def wilcoxon_pval(k, i):
    if k == 1:
        if i < 0:
            return 0
        elif i == 0:
            return 1/2
        elif i >= 1:
            return 1
    elif k > 1:
        if i < 0:
            return 0
        else:
            return 0.5 * (wilcoxon_pval(k-1, i-k) + wilcoxon_pval(k-1, i))
        
def wilcoxon_ranked_sum_pval(n, m, k):
    if (n < 0) or (m < 0):
        return 0
    else:
        if (n == 1 and m == 0):
            if k <= 0:
                return 0
            else:
                return 1
        elif (n == 0 and m == 1):
            if k < 0:
                return 0
            else:
                return 1
        else:
            return (n/(n+m)) * wilcoxon_ranked_sum_pval(n-1, m, k-n-m) + (m/(n+m)) * wilcoxon_ranked_sum_pval(n, m-1, k)
        
def runs_test_discrete_CDF(n, m, k):
    all_PMFS = np.array([])
    for i in range(1, k+1):
        if i % 2 == 0:
            j               = max(int(i/2), 0)
            even_PMF        = 2 * math.comb(m-1, j-1) * math.comb(n-1, j-1) / math.comb(m+n, n)
            all_PMFS        = np.hstack((all_PMFS, even_PMF))
        else:
            j               = max(int((i-1)/2), 0)
            odd_PMF         = (math.comb(m-1, max(0, j-1)) * math.comb(n-1, j) + math.comb(m-1, j) * math.comb(n-1, max(0, j-1)))  / math.comb(m+n, n)
            all_PMFS        = np.hstack((all_PMFS, odd_PMF))
            
    return all_PMFS.sum()

### Sign test

In [65]:
sig         = 5
tailed      = 0

# manual data entry
# n           = 200
# mu          = 120

# paired data as rows
# rawSign     = np.loadtxt('sign_test.txt', delimiter=' ')
# n           = rawSign.shape[1]
# mu          = np.count_nonzero((rawSign[0] - rawSign[1]) <= 0)

# paired data as columns
rawSign     = pd.read_csv('sign_test.txt', delimiter=' ', header = None)
n           = rawSign.shape[0]
mu          = np.count_nonzero((rawSign.iloc[:, 0] - rawSign.iloc[:, 1]) <= 0)

# unpaired data
# rawSign     = np.loadtxt('sign_test.txt', delimiter=',').flatten()
# n           = rawSign.shape[0]
# mu          = np.count_nonzero(rawSign<= 124.4)

# note the mu-1 in the argument since this is a discrete distribution

if tailed == 0:         #two-tailed
    pVal        = 2 * min(stats.binom.cdf(mu, n, 1/2), 1 - stats.binom.cdf(mu-1, n, 1/2))
elif tailed == -1:      # reject if greater
    pVal        = stats.binom.cdf(mu, n, 1/2)
else:                   # reject if smaller
    pVal        = 1 - stats.binom.cdf(mu-1, n, 1/2)

hypo        = 'rejected' if pVal < (0.01 * sig) else 'accepted'

#### output as table

In [66]:
outDict     ={'Observations ($n$)'                : '{:d}'.format(n),
            'Test statistic ($\\nu$)'                : '{:d}'.format(mu),
            '$p$ value \%'                  : '{:.2f}'.format(100 * pVal),
            'Significance ($\\alpha$) \%'   : '{:.2f}'.format(sig),
            'null hypothesis ($H_0$)'       : hypo
            }
dfb = pd.DataFrame.from_dict(outDict, orient='index', columns = ['Value'])

s4          = dfb.to_latex(escape=False, column_format='@{}lr@{}')
s4          = s4.replace('{} &     Value', '\multicolumn{2}{c}{\\texttt{Sign Test}}')

print('\\begin{table}[H]')
print('\\centering')
print(s4)
print('\\end{table}')
print('\\bigskip')

\begin{table}[H]
\centering
\begin{tabular}{@{}lr@{}}
\toprule
\multicolumn{2}{c}{\texttt{Sign Test}} \\
\midrule
Observations ($n$)         &         6 \\
Test statistic ($\nu$)     &         2 \\
$p$ value \%               &     68.75 \\
Significance ($\alpha$) \% &      5.00 \\
null hypothesis ($H_0$)    &  accepted \\
\bottomrule
\end{tabular}

\end{table}
\bigskip


### Wilcoxon signed rank test

In [67]:
sig         = 5

# one-column 
# rawData     = np.loadtxt('wilcoxon_signed_rank.txt', delimiter= ' ')
# n           = rawData.shape[0]
# m_0         = 0
# Ij          = (rawData[np.argsort(abs(rawData - m_0))] < m_0).astype(int)


# paired data as columns
rawSign    = pd.read_csv('wilcoxon_signed_rank.txt', delimiter=' ', header = None)
n           = rawSign.shape[0]
m_0         = 0
rawData     = (rawSign.iloc[:, 0] - rawSign.iloc[:, 1]).to_numpy()

Ij          = (rawData[np.argsort(abs(rawData - m_0))] < m_0).astype(int)
t           = np.multiply(Ij, np.arange(1, n + 1)).sum()
TS          = min(t, -t + 0.5 * (n + 1) * n)
pVal        = 2 * wilcoxon_pval(n, TS)
# note the mu-1 in the argument since this is a discrete distribution

# if tailed == 0:         #two-tailed
#     pVal        = 2 * min(stats.binom.cdf(mu, n, 1/2), 1 - stats.binom.cdf(mu-1, n, 1/2))
# elif tailed == -1:      # reject if greater
#     pVal        = stats.binom.cdf(mu, n, 1/2)
# else:                   # reject if smaller
#     pVal        = 1 - stats.binom.cdf(mu-1, n, 1/2)

hypo        = 'rejected' if pVal < (0.01 * sig) else 'accepted'


#### output contingency table and summary

In [68]:
outDict     ={'Observations ($n$)'          : '{:d}'.format(n),
            'Test statistic '               : '{:d}'.format(int(TS)),
            '$p$ value \%'                  : '{:.2f}'.format(100 * pVal),
            'Significance ($\\alpha$) \%'   : '{:.2f}'.format(sig),
            'null hypothesis ($H_0$)'       : hypo
            }
dfb = pd.DataFrame.from_dict(outDict, orient='index', columns = ['Value'])

s4          = dfb.to_latex(escape=False, column_format='@{}lr@{}')
s4          = s4.replace('{} &     Value', '\multicolumn{2}{c}{\\texttt{Wilcoxon Signed Rank Test}}')

print('\\begin{table}[H]')
print('\\centering')
print(s4)
print('\\end{table}')
print('\\bigskip')

\begin{table}[H]
\centering
\begin{tabular}{@{}lr@{}}
\toprule
\multicolumn{2}{c}{\texttt{Wilcoxon Signed Rank Test}} \\
\midrule
Observations ($n$)         &         6 \\
Test statistic             &         5 \\
$p$ value \%               &     31.25 \\
Significance ($\alpha$) \% &      5.00 \\
null hypothesis ($H_0$)    &  accepted \\
\bottomrule
\end{tabular}

\end{table}
\bigskip


### Wilcoxon rank sum test

In [82]:
sig         = 5
rawData     = pd.read_table('wilcoxon_ranked_sum.txt', sep = ',', header = None, )
n           = rawData.shape[0]

X           = rawData.loc[0].dropna().to_numpy()
Y           = rawData.loc[1].dropna().to_numpy()
n,m         = X.shape[0], Y.shape[0]
A           = np.sort(np.hstack((X, Y)))
ranks       = np.array([1]) + np.where(np.in1d(A, X))

TS          = ranks.sum()
# pVal        = 2 * min(wilcoxon_ranked_sum_pval(n, m, TS), 1 - wilcoxon_ranked_sum_pval(n, m, TS - 1))
pVal        = 100.00

# approximate p value using normal distribution
mu              = 0.5 * n * (n+m+1)
var             = (n*m*(n+m+1)) / 12
runs_prob       = stats.norm(loc = mu, scale = math.sqrt(var))
pVal_approx     = 2 * min(runs_prob.cdf(TS), 1 - runs_prob.cdf(TS))

hypo        = 'rejected' if pVal < (0.01 * sig) else 'accepted'

#### output contingency table and summary

In [83]:
outDict     ={'Observations ($n$)'          : '{:d}'.format(n),
              'Comparisons ($m$)'           : '{:d}'.format(m),
            'Test statistic '               : '{:d}'.format(TS),
            '$p$ value \%'                  : '{:.2f}'.format(100 * pVal),
            '$p$ value approx \%'           : '{:.2f}'.format(100 * pVal_approx),
            'Significance ($\\alpha$) \%'   : '{:.2f}'.format(sig),
            'null hypothesis ($H_0$)'       : hypo
            }
dfb = pd.DataFrame.from_dict(outDict, orient='index', columns = ['Value'])

s4          = dfb.to_latex(escape=False, column_format='@{}lr@{}')
s4          = s4.replace('{} &     Value', '\multicolumn{2}{c}{\\texttt{Wilcoxon Rank Sum Test}}')

print('\\begin{table}[H]')
print('\\centering')
print(s4)
print('\\end{table}')
print('\\bigskip')

\begin{table}[H]
\centering
\begin{tabular}{@{}lr@{}}
\toprule
\multicolumn{2}{c}{\texttt{Wilcoxon Rank Sum Test}} \\
\midrule
Observations ($n$)         &        23 \\
Comparisons ($m$)          &         9 \\
Test statistic             &       369 \\
$p$ value \%               &  10000.00 \\
$p$ value approx \%        &     65.99 \\
Significance ($\alpha$) \% &      5.00 \\
null hypothesis ($H_0$)    &  accepted \\
\bottomrule
\end{tabular}

\end{table}
\bigskip


### Kruskal-Wallis test

In [84]:
sig         = 5

rawDataKW       = pd.read_table('Kruskal_Wallis.txt', sep = ',', index_col = 0, header = None)
rawDataKW       = rawDataKW.transpose()

all_Vals        = np.array([])
all_n           = np.array([])
ranks           = np.array([])
for i in rawDataKW.iteritems():
    all_Vals        = np.hstack((all_Vals, i[1].dropna().to_numpy()))
    all_n           = np.hstack((all_n, i[1].dropna().to_numpy().shape[0]))
    
all_Vals_sorted = np.sort(all_Vals)

for j in rawDataKW.iteritems():
    curRank     = (np.array([1]) + np.where(np.in1d(all_Vals_sorted, j[1].dropna().to_numpy()))).sum()
    ranks       = np.hstack((ranks, curRank))

N               = int(all_n.sum())

TS              = (np.divide(np.multiply(ranks, ranks), all_n)).sum()
TS_chisq        = (12 / (N * (N+1))) * TS - 3 * (N+1)
pVal            = 1 - stats.chi2.cdf(TS_chisq, rawDataKW.columns.shape[0] - 1)
hypo        = 'rejected' if pVal < (0.01 * sig) else 'accepted'


#### output summary

In [85]:
outDict     ={'Observations ($N$)'          : '{:d}'.format(N),
            'Populations ($k$)'             : '{:d}'.format(rawDataKW.columns.shape[0]),
            'Test statistic '               : '{:.2f}'.format(TS_chisq),
            '$p$ value \%'                  : '{:.2f}'.format(100 * pVal),
            'Significance ($\\alpha$) \%'   : '{:.2f}'.format(sig),
            'null hypothesis ($H_0$)'       : hypo
            }
dfb = pd.DataFrame.from_dict(outDict, orient='index', columns = ['Value'])

s4          = dfb.to_latex(escape=False, column_format='@{}lr@{}')
s4          = s4.replace('{} &     Value', '\multicolumn{2}{c}{\\texttt{Wilcoxon Rank Sum Test}}')

print('\\begin{table}[H]')
print('\\centering')
print(s4)
print('\\end{table}')
print('\\bigskip')

\begin{table}[H]
\centering
\begin{tabular}{@{}lr@{}}
\toprule
\multicolumn{2}{c}{\texttt{Wilcoxon Rank Sum Test}} \\
\midrule
Observations ($N$)         &        30 \\
Populations ($k$)          &         3 \\
Test statistic             &    110.25 \\
$p$ value \%               &      0.00 \\
Significance ($\alpha$) \% &      5.00 \\
null hypothesis ($H_0$)    &  rejected \\
\bottomrule
\end{tabular}

\end{table}
\bigskip


### runs test for non-randomness

In [113]:
sig             = 5

# given actual raw data
runData         = np.loadtxt('runs_test.txt', delimiter=' ', dtype='int')
sample_med      = np.median(runData)
binarized       = (runData == 1).astype(int).astype(str)
raw_runs        = ''.join(binarized)


# given failure locations only
# fLoc            = np.loadtxt('runs_test.txt', delimiter=',', dtype='int')
# nTotal          = 50
# binarized       = np.ones((50))
# binarized[fLoc] = 0
# binarized       = binarized.astype(int).astype(str)
# raw_runs        = ''.join(binarized)


grouped         = [list(g) for k, g in itertools.groupby(raw_runs)]

num_runs        = len(grouped)
m,n             = np.count_nonzero(binarized == '0'), np.count_nonzero(binarized == '1')
# manual data entry
# num_runs, m, n  = 70, 60, 60

# exact p value
pVal            = 2 * min(runs_test_discrete_CDF(m,n,num_runs), 1 - runs_test_discrete_CDF(m,n,num_runs - 1))

# approximate p value using normal distribution
mu              = 1 + (2*m*n) / (m+n)
var             = (2*m*n * (2*m*n - (m+n))) / ((m+n)*(m+n)*(m+n-1))
runs_prob       = stats.norm(loc = mu, scale = math.sqrt(var))
pVal_approx     = 2 * min(runs_prob.cdf(num_runs), 1 - runs_prob.cdf(num_runs))

hypo        = 'rejected' if pVal < (0.01 * sig) else 'accepted'

#### output tables

In [114]:
outDict     ={'Observations ($N$)'          : '{:d}'.format(m+n),
            'Runs ($k$)'                    : '{:d}'.format(num_runs),
            'Failures ($m$) '               : '{:d}'.format(m),
            'Successes ($n$) '              : '{:d}'.format(n),
            '$p$ value \%'                  : '{:.2f}'.format(100 * pVal),
            '$p$ value approx \%'           : '{:.2f}'.format(100 * pVal_approx),
            'Significance ($\\alpha$) \%'   : '{:.2f}'.format(sig),
            'null hypothesis ($H_0$)'       : hypo
            }
dfb = pd.DataFrame.from_dict(outDict, orient='index', columns = ['Value'])

s4          = dfb.to_latex(escape=False, column_format='@{}lr@{}')
s4          = s4.replace('{} &     Value', '\multicolumn{2}{c}{\\texttt{Runs test for randomness}}')

print('\\begin{table}[H]')
print('\\centering')
print(s4)
print('\\end{table}')
print('\\bigskip')

\begin{table}[H]
\centering
\begin{tabular}{@{}lr@{}}
\toprule
\multicolumn{2}{c}{\texttt{Runs test for randomness}} \\
\midrule
Observations ($N$)         &        50 \\
Runs ($k$)                 &        36 \\
Failures ($m$)             &        32 \\
Successes ($n$)            &        18 \\
$p$ value \%               &      0.01 \\
$p$ value approx \%        &      0.02 \\
Significance ($\alpha$) \% &      5.00 \\
null hypothesis ($H_0$)    &  rejected \\
\bottomrule
\end{tabular}

\end{table}
\bigskip


In [269]:
(rawSign[0] - rawSign[1]) <= 0


array([False, False, False, False,  True,  True, False, False,  True])

In [6]:
n

array([[ 1. , 24.2, 23.5],
       [ 2. , 30.4, 29.6],
       [ 3. , 32.7, 32.3],
       [ 4. , 19.8, 17.6],
       [ 5. , 25. , 25.3],
       [ 6. , 24.9, 25.4],
       [ 7. , 22.2, 20.6],
       [ 8. , 21.5, 20.7]])