In [1]:
## housekeeping
import numpy as np
import matplotlib.pyplot as plt

#### Use the chi-squared test to evaluate goodness of fit of a generalized binomial distribution  
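This answers the question: "Does my data come from some member of the binomial family?" The success probability is not fixed in advance; it is estimated from the data by maximum likelihood, which costs one degree of freedom in the test.  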

In [2]:
## define problem: observed counts and the binomial pmf fitted to them
i = np.arange(4)  # modality indices 0..3
K = i[-1]  # largest index; used as the number of binomial trials below
N = np.array([339, 455, 180, 26])  # observed count for each modality
n = N.sum()  # total number of observations
tMLE = 1 / (n * K) * np.dot(i, N)  # MLE of the success probability: mean index / K
that = N / n  # empirical frequency of each modality
choices = np.array([1, 3, 3, 1])  # binomial coefficients, K choose i
# binomial pmf at the MLE, one entry per modality
ft = np.array([c * tMLE**j * (1 - tMLE)**(K - j) for j, c in enumerate(choices)])
# test statistic: n * sum of (empirical - fitted)^2 / fitted
T = n * np.sum((that - ft)**2 * ft**-1)

In [3]:
## helper functions

## define gamma function
def gamma(z, lo=0.001, hi=10000, step=0.001):
    """Approximate the gamma function with a left Riemann sum.

    Sums ``step * t**(z-1) * exp(-t)`` over the grid ``np.arange(lo, hi, step)``.

    Parameters
    ----------
    z : scalar
        Argument of the gamma function.
    lo, hi : float
        Bounds of the integration grid (``hi`` is excluded, as in ``np.arange``).
        The grid is expected to be non-negative.
    step : float
        Grid spacing, also used as the width of each rectangle.

    Returns
    -------
    float
        The Riemann-sum approximation of Gamma(z).

    Notes
    -----
    The interval (0, lo) is not covered by the grid. This matters most for
    z < 1, where the integrand is unbounded near t = 0.
    """
    t = np.arange(lo, hi, step)  # domain of integration
    if z == 1:
        # t**0 is 1 everywhere (including t == 0), so no logarithm is needed
        log_integrand = -t
    else:
        # Work in log space: for large z, t**(z-1) overflows to inf at the same
        # grid points where exp(-t) underflows to 0, and inf * 0 would be nan.
        with np.errstate(divide="ignore"):  # log(0) = -inf is handled by exp
            log_integrand = (z - 1) * np.log(t) - t
    return step * np.sum(np.exp(log_integrand))

## define chi-squared pdf
def X2f(x, k):
    """Chi-squared probability density with k degrees of freedom.

    Evaluates ``x**(k/2 - 1) * exp(-x/2) / (2**(k/2) * gamma(k/2))``
    element-wise, where ``gamma`` is this notebook's numerical gamma function.
    Entries with ``x <= 0`` get density 0; the remaining entries are still
    evaluated (previously a single non-positive entry zeroed the whole result).

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the density.
    k : scalar
        Degrees of freedom.

    Returns
    -------
    numpy scalar for scalar ``x``, otherwise an ndarray shaped like ``x``.
    """
    x = np.asarray(x, dtype=float)
    positive = x > 0
    if not positive.any():
        # nothing to evaluate, so the gamma() call can be skipped entirely
        return np.zeros(x.shape)[()]
    den = 2**(k/2) * gamma(k/2)  # denominator
    # put a harmless 1.0 where x <= 0 so the power/exp never sees those entries
    x_safe = np.where(positive, x, 1.0)
    num = np.multiply(x_safe**(k/2 - 1), np.exp(-x_safe/2))  # numerator
    # [()] turns a 0-d result into a scalar and leaves real arrays unchanged
    return np.where(positive, num / den, 0.0)[()]
    
## approximate chi-squared cdf numerically
def X2F(k, T, lo=0.001, hi=10000, step=0.001):
    """Approximate the chi-squared CDF at T by a running left Riemann sum.

    Builds the grid ``np.arange(lo, hi, step)``, finds the grid point nearest
    to ``T`` and returns ``step * sum(X2f(x_i, k))`` over all grid points up to
    and including that one.

    Parameters
    ----------
    k : scalar
        Degrees of freedom passed to ``X2f``.
    T : scalar
        Point at which the CDF is wanted. Values outside the grid are
        effectively clamped to the nearest grid end.
    lo, hi, step : float
        Integration grid (``hi`` excluded, as in ``np.arange``).

    Returns
    -------
    float
        The accumulated sum at the grid point closest to ``T``.
    """
    x = np.arange(lo, hi, step)  # independent variable
    # argmin yields exactly one index even when T is equidistant from two grid
    # points; the boolean-mask lookup used before broke on such ties.
    nearest = np.argmin(np.abs(x - T))
    # Only grid points up to the nearest one contribute, so the pdf is not
    # evaluated on the rest of the grid; the partial sums are unchanged.
    F = np.cumsum(step * X2f(x[:nearest + 1], k))  # integrate
    return F[-1]

In [4]:
## report the upper-tail probability of the test statistic
k = K - 1  # degrees of freedom: (K + 1) modalities - 1 - 1 estimated parameter
p = X2F(k, T)  # approximate chi-squared cdf at T; the tail probability is 1 - p
print("P(T > %s), %s degrees of freedom = %s"
      % (np.round(T, decimals=4), k, np.round(1 - p, decimals=5)))

P(T > 0.8829), 2 degrees of freedom = 0.64298
