In [20]:
import math

import numpy

from scipy import stats

## Exercise 4.3, Comparison of proportions

> You want to gather data to determine which of two students is a better
> basketball shooter. One of them shoots with 30% accuracy and the other is a
> 40% shooter. Each student takes 20 shots and you then compare their shooting
> percentages. What is the probability that the better shooter makes more shots
> in this small experiment?

In [21]:
total_prob = 0.0
for good_hits in range(21):
    log_pr_good_hits = stats.binom.logpmf(good_hits, 20, 0.4)
    log_sf_bad_hits = stats.binom.logsf(good_hits + 1, 20, 0.3)
    increment = math.exp(log_pr_good_hits + log_sf_bad_hits)
    total_prob += increment
    print(f'{good_hits}:\t{math.exp(log_sf_bad_hits):0.5f}, {increment:0.5f}, {total_prob:0.5f}')

0:	0.99236, 0.00004, 0.00004
1:	0.96452, 0.00047, 0.00051
2:	0.89291, 0.00276, 0.00326
3:	0.76249, 0.00942, 0.01268
4:	0.58363, 0.02042, 0.03310
5:	0.39199, 0.02926, 0.06236
6:	0.22773, 0.02833, 0.09069
7:	0.11333, 0.01880, 0.10949
8:	0.04796, 0.00862, 0.11811
9:	0.01714, 0.00274, 0.12085
10:	0.00514, 0.00060, 0.12145
11:	0.00128, 0.00009, 0.12154
12:	0.00026, 0.00001, 0.12155
13:	0.00004, 0.00000, 0.12155
14:	0.00001, 0.00000, 0.12155
15:	0.00000, 0.00000, 0.12155
16:	0.00000, 0.00000, 0.12155
17:	0.00000, 0.00000, 0.12155
18:	0.00000, 0.00000, 0.12155
19:	0.00000, 0.00000, 0.12155
20:	0.00000, 0.00000, 0.12155


# Exercise 4.4, Designing an experiment

> You want to gather data to determine which of two students is a better
> basketball shooter. You plan to have each student take $N$ shots and then
> compare their shooting percentages. Roughly how large does $N$ have to be for
> you to have a good chance of distinguishing a 30% shooter from a 40% shooter?

In [22]:
N = 20
while True:
    se_N = math.sqrt(0.45 / N)
    t_term = stats.t.ppf(0.975, 2*N - 2)
    prod = se_N * t_term
    if prod < 0.1:
        print(f'{N}:\t{prod:0.5f}')
        break
    N += 1

175:	0.09974


## Exercise 4.6, Hypothesis testing

> The following are the proportions of girl births in Vienna for each month in 
> 1908 and 1909 (out of an average of 3900 births per month):
> 
>  `.4777 .4875 .4859 .4754 .4874 .4864 .4813 .4787 .4895 .4797 .4876 .4859`
>  `.4857 .4907 .5010 .4903 .4860 .4911 .4871 .4725 .4822 .4870 .4823 .4973`
> 
> The data are in the folder `Girls`. These proportions were used by
> von Mises (1957) to support a claim that that the sex ratios were less
> variable than would be expected under the binomial distribution. We think
> von Mises was mistaken in that he did not account for the possibility that
> this discrepancy could arise just by chance.
> 
> (a) Compute the standard deviation of these proportions and compare to the
>   standard deviation that would be expected if the sexes of babies were
>   independently decided with a constant probability over the 24-month period.
> 
> (b) The observed standard deviation of the 24 proportions will not be
>   identical to its theoretical expectation. In this case, is this difference
>   small enough to be explained by random variation?  Under the randomness
>   model, the actual variance should have a distribution with expected value
>   equal to the theoretical variance, and proportional to a $\chi^2$ random
>   variable with 23 degrees of freedom; see page 53.

In [25]:
GIRLS = [0.4777, 0.4875, 0.4859, 0.4754, 0.4874, 0.4864,
         0.4813, 0.4787, 0.4895, 0.4797, 0.4876, 0.4859,
         0.4857, 0.4907, 0.5010, 0.4903, 0.4860, 0.4911,
         0.4871, 0.4725, 0.4822, 0.4870, 0.4823, 0.4973]
print(f'std: {numpy.std(GIRLS):0.5e}')
print(f'mean: {numpy.mean(GIRLS):0.5e}')

std: 6.27477e-03
mean: 4.85675e-01


In [35]:
count_sd = stats.binom.std(3900, numpy.mean(GIRLS))
print(count_sd)
print(f'{count_sd/3900:0.5e}')

31.212172274010342
8.00312e-03


In [38]:
print(math.sqrt(3900))
se = numpy.mean(GIRLS) * (1 - numpy.mean(GIRLS)) / math.sqrt(3900)
print(se)
print(stats.chi2.cdf(numpy.std(GIRLS), df=23, scale=(count_sd/3900)/23))

62.44997998398398
0.0039999179253390115
0.24429885784736377
