In [1]:
## housekeeping
import numpy as np
import matplotlib.pyplot as plt

#### Use the chi-squared test to evaluate goodness of fit on a categorical (multinomial) discrete distribution

In [2]:
## define problem
n = 100 # number of observations
k = 3 # number of modalities
phat = np.array([20/100, 20/100, 40/100]) # vector of MLE for each modality
p0 = np.array([1/3, 1/3, 1/3]) # vector of true parameter under null
T = n * np.sum(np.multiply((phat - p0)**2, p0**-1)) # test statistic

In [3]:
## helper functions

## define gamma function
def gamma(z, lo=0.001, hi=10000, step=0.001):
    t = np.arange(lo, hi, step) # domain of integration
    return np.sum(step * np.multiply(t**(z-1), np.exp(-t)))

## define chi-squared pdf
def X2f(x, k):
    if np.min(x) <= 0: # return 0 if x <= 0
        return 0
    else:
        num = np.multiply(x**(k/2 - 1), np.exp(-x/2)) # numerator
        den = 2**(k/2) * gamma(k/2) # denominator
        return num / den
    
## approximate chi-squared cdf numerically
def X2F(k, T, lo=0.001, hi=10000, step=0.001):
    x = np.arange(lo, hi, step) # independent variable
    F = np.cumsum(step * X2f(x, k)) # integrate
    d = np.abs(x - T) # T argument distance 
    xstar = x[d == np.min(d)]# get x closest to T
    return F[x == xstar][0]

In [4]:
## test output for tail probability
k = 2 # degrees of freedom
p = X2F(k, T) # complement probability
print("P(T > " + str(np.round(T, decimals=4)) + ") = " + str(np.round(1 - p, decimals=5)))

P(T > 12.0) = 0.00223
