In [16]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    import scipy.stats
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

## Z-критерий для разности долей (независимые выборки)

  |   -   | $X_1$ | $X_2$  
   ------|---    | ------
  1      | a     | b 
  0      | c     | d 
  $\sum$ | $n_1$ | $n_2$
  

$$ \hat{p}_1 = \frac{a}{n_1}$$

$$ \hat{p}_2 = \frac{b}{n_2}$$


$$\text{Доверительный интервал для }p_1 - p_2\colon \;\; \hat{p}_1 - \hat{p}_2 \pm z_{1-\frac{\alpha}{2}}\sqrt{\frac{\hat{p}_1(1 - \hat{p}_1)}{n_1} + \frac{\hat{p}_2(1 - \hat{p}_2)}{n_2}}$$

$$Z-статистика: Z({X_1, X_2}) =  \frac{\hat{p}_1 - \hat{p}_2}{\sqrt{P(1 - P)(\frac{1}{n_1} + \frac{1}{n_2})}}$$
$$P = \frac{\hat{p}_1{n_1} + \hat{p}_2{n_2}}{{n_1} + {n_2}} $$

In [17]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Confidence interval for two independent proportions
    
    """
    import scipy.stats
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

In [18]:
def proportions_diff_z_stat_ind(sample1, sample2):
    import scipy.stats
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [19]:
import numpy as np
X1 = np.array([1]*944 + [0]*(1600-944))
X2 = np.array([1]*880 + [0]*(1600-880))

In [20]:
proportions_diff_confint_ind(X1, X2)

(0.00572163631706115, 0.07427836368293869)

In [28]:
proportions_diff_z_stat_ind(X1,X2)

2.285247956160165

In [29]:
proportions_diff_z_test(proportions_diff_z_stat_ind(X1,X2))

0.022298292683339493

## Z-критерий для разности долей (связанные выборки)

  |$X_1$ \ $X_2$ | 1| 0 | $\sum$
  ------|------- | -----|--------|
  1  | e | f | e + f
  0  | g | h | g + h
  $\sum$ | e + g| f + h | n  
  

$$ \hat{p}_1 = \frac{e + f}{n}$$

$$ \hat{p}_2 = \frac{e + g}{n}$$

$$ \hat{p}_1 - \hat{p}_2 = \frac{f - g}{n}$$


$$\text{Доверительный интервал для }p_1 - p_2\colon \;\;  \frac{f - g}{n} \pm z_{1-\frac{\alpha}{2}}\sqrt{\frac{f + g}{n^2} - \frac{(f - g)^2}{n^3}}$$

$$Z-статистика: Z({X_1, X_2}) = \frac{f - g}{\sqrt{f + g - \frac{(f-g)^2}{n}}}$$

In [23]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    import scipy.stats
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    sample = list(zip(sample1, sample2))
    n = len(sample)
        
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    
    left_boundary = float(f - g) / n  - z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    right_boundary = float(f - g) / n  + z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (left_boundary, right_boundary)

In [24]:
def proportions_diff_z_stat_rel(sample1, sample2):
    import scipy.stats
    sample = list(zip(sample1, sample2))
    n = len(sample)
    
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    
    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

In [25]:
proportions_diff_confint_rel(X1, X2)

(0.03039817664728938, 0.04960182335271062)

In [26]:
proportions_diff_z_stat_rel(X1, X2)

8.16496580927726

In [27]:
proportions_diff_z_test(proportions_diff_z_stat_rel(X1, X2))

2.220446049250313e-16