In [1]:
import numpy as np
import pandas as pd
import string
import calendar
import re
import math
import sympy
import matplotlib.pyplot as plt
import pprint

from fractions import Fraction
from scipy import stats
from scipy import special


print(f"numpy version is {np.__version__}")
print(f"pandas version is {pd.__version__}")


numpy version is 1.20.3
pandas version is 1.3.2


In [75]:
def getDiscreteProbsExpon(input_ranges, meanIn):
    """Return the exponential-distribution probability of each labelled interval.

    Parameters
    ----------
    input_ranges : iterable of str
        Interval labels in one of three forms: ``'<b'`` (everything below
        ``b``), ``'>a'`` (everything above ``a``) or ``'a-b'`` (between ``a``
        and ``b``). Whitespace around the label or its numbers is ignored.
    meanIn : float
        Mean of the exponential distribution; passed to SciPy as ``scale``.

    Returns
    -------
    numpy.ndarray
        One probability per label, in the order the labels were given.

    Raises
    ------
    ValueError
        If a label does not match one of the three forms or a bound is not
        a valid number.
    """
    prob_out = []
    for label in input_ranges:
        text = label.strip()
        if text.startswith('<'):
            high = float(text[1:])
            prob_out.append(stats.expon.cdf(high, scale=meanIn))
        elif text.startswith('>'):
            low = float(text[1:])
            # The survival function is 1 - cdf computed directly, so a small
            # upper-tail probability is not lost to cancellation.
            prob_out.append(stats.expon.sf(low, scale=meanIn))
        else:
            bounds = text.split('-')
            if len(bounds) != 2:
                raise ValueError(
                    "range label must look like '<b', '>a' or 'a-b', got {!r}".format(label)
                )
            low, high = (float(bound) for bound in bounds)
            prob_out.append(
                stats.expon.cdf(high, scale=meanIn) - stats.expon.cdf(low, scale=meanIn)
            )
    return np.array(prob_out)

### Goodness of fit test with all parameters specified

In [80]:
# Chi-square goodness-of-fit test against a fully specified distribution.
# Inputs: observed counts X, hypothesised category probabilities prob and
# the significance level sig (in percent).
# Results: testStat, pVal and the decision string hypo.

# manual data entry

prob        = np.array([0.4, 0.3, 0.2, 0.1])
names       = ['Top', 'High', 'Medium', 'Low']
X           = np.array([234, 117, 81, 68])

# read data from text file
# rawData     = np.loadtxt('GOF_test_specified.txt', dtype = ('str', 'str'), delimiter= ' ')
# names       = np.squeeze(rawData[:, 0]).astype(str)
# X           = np.squeeze(rawData[:, 1]).astype(int)



# probabilities of the assumed distribution function
# (uncomment exactly one alternative to replace the manual `prob` above)

# Uniform
# prob        = (np.ones(X.shape[0]) * 1/6).round(4)

# Poisson
# poisMean = 4.2
# prob = stats.poisson.pmf(names, poisMean)

# Exponential
# exponMean   = 50
# prob        = getDiscreteProbsExpon(names, exponMean)


sig         = 5                 # significance level alpha, in percent
n           = X.sum()           # total number of observations
k           = prob.shape[0]     # number of categories

# Pearson statistic from its definition, sum((O - E)^2 / E). The shortcut
# sum(O^2 / E) - n is only equivalent when prob sums to exactly 1.
expected    = n * prob
testStat    = (np.square(X - expected) / expected).sum()

# Upper-tail probability with k - 1 degrees of freedom. The survival function
# avoids the cancellation in 1 - cdf when the p-value is very small.
pVal        = stats.chi2.sf(testStat, k - 1)
hypo        = 'rejected' if pVal < 0.01 * sig else 'accepted'



#### Output as table

In [81]:
# Summary of the test, one row per quantity. Keys are LaTeX fragments, so raw
# strings keep the backslashes literal without relying on invalid escapes.
outDict     = {'Test Statistic'             : '{:.2e}'.format(testStat),
               r'$p$ value \%'               : '{:.2f}'.format(100 * pVal),
               r'Significance ($\alpha$) \%' : '{:.2f}'.format(sig),
               'null hypothesis ($H_0$)'    : hypo,
               'minimum $n p_i$'            : '{:.0f}'.format(min(np.multiply(prob, n)))
               }
dfb = pd.DataFrame.from_dict(outDict, orient='index', columns = ['Value'])

# Observed counts and hypothesised probabilities, one row per category.
df                      = pd.DataFrame(np.vstack((X.astype(str), prob.round(4))).T, columns= ['$X_i$', '$p_i$'], index=names)

# Append the totals row with pd.concat; DataFrame.append is deprecated and
# no longer exists in pandas 2.0.
total_row               = pd.DataFrame({'$X_i$' : [n], '$p_i$' : [1]}, index=['Total'])
df                      = pd.concat([df, total_row])

In [82]:
# Render both tables as LaTeX and patch the generated text. The replace
# targets ('{} &', '{} &     Value') depend on the exact header that
# DataFrame.to_latex produces -- NOTE(review): confirm after any pandas upgrade.
s3          = df.to_latex(escape=False, column_format='@{}lrr@{}')
s3          = s3.replace('Total', '\\midrule\nTotal')       # rule above the totals row
s3          = s3.replace('{} &', 'Grade &')                 # label the index column
s4          = dfb.to_latex(escape=False, column_format='@{}lr@{}')
s4          = s4.replace('{} &     Value', r'\multicolumn{2}{c}{\texttt{Goodness of Fit Test}}')

# Print the two tabulars side by side, each in its own minipage of one table float.
latex_lines = (
    r'\begin{table}[H]',
    r'\centering',
    r'\begin{minipage}{0.4\textwidth}',
    r'\centering',
    s3,
    r'\end{minipage}',
    r'\begin{minipage}{0.4\textwidth}',
    r'\centering',
    s4,
    r'\end{minipage}',
    r'\end{table}',
    r'\bigskip',
)
for latex_line in latex_lines:
    print(latex_line)


\begin{table}[H]
\centering
\begin{minipage}{0.4\textwidth}
\centering
\begin{tabular}{@{}lrr@{}}
\toprule
Grade & $X_i$ & $p_i$ \\
\midrule
Top    &   234 &   0.4 \\
High   &   117 &   0.3 \\
Medium &    81 &   0.2 \\
Low    &    68 &   0.1 \\
\midrule
Total  &   500 &     1 \\
\bottomrule
\end{tabular}

\end{minipage}
\begin{minipage}{0.4\textwidth}
\centering
\begin{tabular}{@{}lr@{}}
\toprule
\multicolumn{2}{c}{\texttt{Goodness of Fit Test}} \\
\midrule
Test Statistic             &  2.31e+01 \\
$p$ value \%               &      0.00 \\
Significance ($\alpha$) \% &      5.00 \\
null hypothesis ($H_0$)    &  rejected \\
minimum $n p_i$            &        50 \\
\bottomrule
\end{tabular}

\end{minipage}
\end{table}
\bigskip


0.0026949434242697717

In [72]:
# Load the space-delimited data file as text: column 0 holds the category
# labels and column 1 the observed counts.
rawData     = np.loadtxt('GOF_test_specified.txt', dtype=('str', 'str'), delimiter=' ')
names       = rawData[:, 0].squeeze().astype(str)
X           = rawData[:, 1].squeeze().astype(int)



[0.4511883639059735,
 0.2476174241818243,
 0.13589532369061563,
 0.16529888822158656]

In [60]:
# Spot check: exponential CDF at 30 with scale (mean) 50, i.e. the value
# getDiscreteProbsExpon computes for a '<30' label with meanIn = 50.
stats.expon.cdf(30, scale=50)

0.4511883639059735

In [56]:
# Display the category labels currently held in `names`.
names

array(['<30', '30-60', '60-90', '>90'], dtype='<U5')