In [4]:
## Import required Python modules
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy, scipy.stats
import io
import base64
#from IPython.core.display import display
from IPython.display import display, HTML, Image
from urllib.request import urlopen

try:
    import astropy as apy
    import astropy.table
    _apy = True
    #print('Loaded astropy')
except:
    _apy = False
    #print('Could not load astropy')

## Base font size used for all matplotlib figures
plt.rcParams['font.size'] = 14

## Notebook appearance: stretch the page container to 95% of the available width
display(HTML("<style>.container { width:95% !important; }</style>"))
## Custom stylesheet; the file is adapted from
## https://github.com/lmarti/jupyter_custom/blob/master/custom.include
## It is the last expression of the cell, so it is what the cell displays.
HTML('custom.css')
## Disabled alternative: read the stylesheet text from the course GitHub repository
#HTML(urlopen('https://raw.githubusercontent.com/bretonr/intro_data_science/master/custom.css').read().decode('utf-8'))

In [5]:
## Custom imports
from scipy.stats import binom, poisson, chi2, norm, uniform
from scipy.optimize import curve_fit
from math import ceil, pi
from numpy import exp
from matplotlib.collections import PatchCollection
from matplotlib.patches import Circle, Rectangle
from matplotlib.colors import makeMappingArray

In [16]:
## Button to show/hide the Python source code.
## The embedded script toggles the visibility of every `div.input` element via `$`
## and is run once when the document is ready, so the code starts out hidden.
## The form button below calls the same toggle on each click.
HTML(data='''<script>
code_show=true;
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the Python code."></form>''')

<div class="container-fluid">
    <div class="row">
        <div class="col-md-8" align="center">
            <h1>PHYS 10791: Introduction to Data Science</h1>
            <!--<h3>2019-2020 Academic Year</h3><br>-->
        </div>
        <div class="col-md-3">
            <img align='center' style="border-width:0" src="images/UoM_logo.png"/>
        </div>
    </div>
</div>

<div class="container-fluid">
    <div class="row">
        <div class="col-md-2" align="right">
            <b>Course instructors:&nbsp;&nbsp;</b>
        </div>
        <div class="col-md-9" align="left">
            <a href="http://www.renebreton.org">Prof. Rene Breton</a> - Twitter <a href="https://twitter.com/BretonRene">@BretonRene</a><br>
            <a href="http://www.hep.manchester.ac.uk/u/gersabec">Dr. Marco Gersabeck</a> - Twitter <a href="https://twitter.com/MarcoGersabeck">@MarcoGersabeck</a>
        </div>
    </div>
</div>

# Chapter 7 - Problem Sheet

### Problem 1: $\chi^2$ test

Tables to help with this problem are given below.

#### Problem 1.1:
You conduct a series of $\chi^2$ tests resulting in $\chi^2$ values of 34, 37, and 40 for 25 degrees of freedom. Which of these results would you accept when checking against a probability of at least $5\%$ to exceed the $\chi^2$ value in question?

#### Problem 1.2:
You are conducting 50 experiments involving one $\chi^2$ test each. 
- If you conducted these tests with a $5\%$ probability as used in the previous problem, how many experiments would you expect to fail due to random fluctuations?
- What would a reasonable probability be for your $\chi^2$ tests to avoid random rejections of your experiments?
- If your experiments have 50 degrees of freedom, what would the largest acceptable $\chi^2$ value be?

In [37]:
## Degrees of freedom (table rows) and chi2/NDF values (columns of the first table)
ndfs = np.array([1,2,3,4,5,6,7,8,9,10,15,20,25,30,40,50,75,100,150,200,300,500,750,1000])
chi2Ns = np.array([1.0,1.02,1.05,1.1,1.25,1.5,1.75,2.0,3.0,5.0,7.5,10.0])


def chi2_exceedance_prob(chi2_per_ndf, ndf):
    """Return P(chi2 > chi2_per_ndf * ndf) for `ndf` degrees of freedom.

    Parameters
    ----------
    chi2_per_ndf : float or array-like
        Reduced chi2 value(s), i.e. chi2 divided by the number of degrees of freedom.
    ndf : int
        Number of degrees of freedom.

    Returns
    -------
    float or numpy.ndarray
        Upper-tail probability for each entry of `chi2_per_ndf`.
    """
    ## The survival function is used instead of 1 - cdf, which rounds to
    ## exactly 0 once the cdf is within machine precision of 1.
    return chi2.sf(np.asarray(chi2_per_ndf) * ndf, ndf)


def chi2_per_ndf_for_prob(prob, ndf):
    """Return the chi2/NDF value that is exceeded with probability `prob`.

    Parameters
    ----------
    prob : float or array-like
        Upper-tail probability (or probabilities), e.g. 0.05 for 5%.
    ndf : int
        Number of degrees of freedom.

    Returns
    -------
    float or numpy.ndarray
        Reduced chi2 threshold for each entry of `prob`.
    """
    ## Inverse survival function: equivalent to ppf(1 - prob) without the subtraction.
    return chi2.isf(np.asarray(prob), ndf) / ndf


## Table 1: probability to exceed a given chi2/NDF
np.set_printoptions(precision=2,suppress=True)
print('This table gives probabilities for exceeding the chi2/NDF value at the top of the column for the NDF at the beginning of the row.')
## Raw string: '\ ' is not a valid escape sequence in a normal string literal.
print(r'N \ chi2/N',chi2Ns)

np.set_printoptions(precision=3,suppress=True)
for n in ndfs:
    print('{0:10d}'.format(n),chi2_exceedance_prob(chi2Ns, n))


## Table 2: chi2/NDF threshold for a given probability to exceed it
ps = np.array([0.2,0.1,0.05,0.01,0.001])
print('\n\n\n')
print('This table gives chi2/NDF values that are exceeded with the probability at the top of the column for the NDF at the beginning of the row.')
print(r'N \ prob',ps)

for n in ndfs:
    print('{0:8d}'.format(n),chi2_per_ndf_for_prob(ps, n))

This table gives probabilities for exceeding the chi2/NDF value at the top of the column for the NDF at the beginning of the row.
N \ chi2/N [ 1.    1.02  1.05  1.1   1.25  1.5   1.75  2.    3.    5.    7.5  10.  ]
         1 [0.317 0.313 0.306 0.294 0.264 0.221 0.186 0.157 0.083 0.025 0.006 0.002]
         2 [0.368 0.361 0.35  0.333 0.287 0.223 0.174 0.135 0.05  0.007 0.001 0.   ]
         3 [0.392 0.382 0.369 0.348 0.29  0.212 0.154 0.112 0.029 0.002 0.    0.   ]
         4 [0.406 0.395 0.38  0.355 0.287 0.199 0.136 0.092 0.017 0.    0.    0.   ]
         5 [0.416 0.404 0.386 0.358 0.283 0.186 0.119 0.075 0.01  0.    0.    0.   ]
         6 [0.423 0.41  0.39  0.359 0.277 0.174 0.105 0.062 0.006 0.    0.    0.   ]
         7 [0.429 0.414 0.393 0.36  0.271 0.162 0.093 0.051 0.004 0.    0.    0.   ]
         8 [0.433 0.418 0.395 0.359 0.265 0.151 0.082 0.042 0.002 0.    0.    0.   ]
         9 [0.437 0.421 0.397 0.359 0.259 0.141 0.072 0.035 0.001 0.    0.    0.   ]
        10 [0.44  0.

### Problem 2: Comparing fit models

Assess the following fit results according to both the Akaike and Bayesian Information Criteria. Determine the optimal model and judge which models would be acceptable at the $1\%$ and $10\%$ level.

#### Problem 2.1

You conduct a number of maximum likelihood fits to an unbinned dataset of 300 values, which have constant errors. The results are as follows:
- A fit with a parabola returns a maximum log likelihood value of $\ln{\mathcal L}=-145$.
- A fit with a constant returns a maximum log likelihood value of $\ln{\mathcal L}=-160$.
- A fit with the sum of a Gaussian distribution and a linear function returns a maximum log likelihood value of $\ln{\mathcal L}=-144$.

#### Problem 2.2

You conduct a number of $\chi^2$ fits to an unbinned dataset of 1000 values, which have constant errors. The results are as follows:
- A fit with an exponential function returns a minimum $\chi^2$ value of $\chi^2=1050$.
- A fit with the sum of an exponential function and a constant returns a minimum $\chi^2$ value of $\chi^2=980$.
- A fit with a fifth-order polynomial returns a minimum $\chi^2$ value of $\chi^2=970$.

### Solution to Problem 2

#### Solution to 2.1

| Model &nbsp; | $-2\ln{\mathcal L}=\chi^2$ | $k$ | $n$ | AIC | $\exp((AIC_{\rm min}-AIC_i)/2)$ | BIC | $\exp((BIC_{\rm min}-BIC_i)/2)$ |
| :- | :- | :- | :- | :- | :- | :- | :- |
| Parabola  | 290 | 4 | 300 | 298 | 1 | 312.8 | 1 |
| Constant  | 320 | 2 | 300 | 324 | 0.000002 | 331.4 | 0.00009 |
| Gauss+lin | 288 | 6 | 300 | 300 | 0.37 | 322.2 | 0.009 |

#### Solution to 2.2

| Model &nbsp; | $-2\ln{\mathcal L}=\chi^2$ | $k$ | $n$ | AIC | $\exp((AIC_{\rm min}-AIC_i)/2)$ | BIC | $\exp((BIC_{\rm min}-BIC_i)/2)$ |
| :- | :- | :- | :- | :- | :- | :- | :- |
| Exponential    | 1050 | 3 | 1000 | 1056 | $2\times10^{-16}$ | 1070.7 | $2\times10^{-14}$ |
| Exp+const      |  980 | 4 | 1000 |  988 | 0.14 | 1007.6 | 1 |
| 5th ord. poly. |  970 | 7 | 1000 |  984 | 1 | 1018.4 | 0.005 |

### Problem 3: Two ensembles of measurements

Assume that we have two ensembles with a known spread and we want to test the compatibility of their means. An example of this scenario is your exams, which should neither be too easy nor too hard.

Let's assume one course has 310 students taking the exam and an average grade of $65.8\%$. Another course has 55 students who achieve on average $72.3\%$. Both distributions have a standard deviation of $8\%$.

Evaluate the statistical compatibility of the average grades. Discuss whether this level of compatibility is acceptable. Are there any other aspects to be considered when making this comparison?

### Solution to Problem 3

The difference to assess is $6.5\%$. 

The uncertainty on the mean of the first course is $8\%/\sqrt{310}=0.45\%$ and for the second one it is $8\%/\sqrt{55}=1.08\%$. Hence, the uncertainty on the difference is $1.17\%$.

The discrepancy is over 5 standard deviations and is therefore statistically significant.

#### Additional information:

However, also here **systematic uncertainties** apply. For example, the smaller course might have attracted students who, on average, tend to do better in exams. It is also not possible to set exams with the level of predictability suggested by the $1/\sqrt{N}$ scaling of the large courses.

Nevertheless, while the numbers here are made up, the rest of the discussion is definitely part of reality and exam outcomes are analysed very carefully and appropriate actions are taken where required.

### Problem 4: Kolmogorov-Smirnov test

You are given the following test results for $D={\rm max}|{\rm cum}(x)-{\rm cum}(y)|$ alongside the sample sizes involved. Where only one sample size is given, the test reflects a comparison of a sample with a function, otherwise it refers to a two-sample comparison.

Evaluate whether you would accept or reject each sample at the $1\%$, $5\%$, and $10\%$ level according to the table given in the lecture.

| $D$ | $N_1$ | $N_2$ |
| :- | -: | -: |
|  0.256 | 30 | - |
|  0.058 | 450 | - |
|  0.249 | 30 | 300 |
|  0.233 | 100 | 100 |
|  0.066 | 1000 | 500 |

### Solution to Problem 4

Based on the formula and table given in the lecture:

$$d=\sqrt{\frac{N_xN_y}{N_x+N_y}}D.$$

The critical values are:

| &nbsp; $c(\alpha)$ | $\alpha$ |
|-----|-----|
|  1.63 | 0.01 |
|  1.36 | 0.05 |
|  1.22 | 0.10 |
|  1.07 | 0.20 |

we can evaluate the table as follows, where $\checkmark$ implies accept and $\times$ implies rejecting the result:

| $D$ | $N_1$ | $N_2$ | $d$ | $\alpha=0.10$ | $\alpha=0.05$ | $\alpha=0.01$ |
| :- | -: | -: | :- | :-: | :-: | :-: |
|  0.256 | 30 | - | 1.40 | $\times$ | $\times$ | $\checkmark$ |
|  0.058 | 450 | - | 1.23 | $\times$ | $\checkmark$ | $\checkmark$ |
|  0.249 | 30 | 300 | 1.30 | $\times$ | $\checkmark$ | $\checkmark$ |
|  0.233 | 100 | 100 | 1.65 | $\times$ | $\times$ | $\times$ |
|  0.066 | 1000 | 500 | 1.20 | $\checkmark$ | $\checkmark$ | $\checkmark$ |

<div class="well" align="center">
    <div class="container-fluid">
        <div class="row">
            <div class="col-md-3" align="center">
                <img align="center" alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" width="60%">
            </div>
            <div class="col-md-8">
            This work is licensed under a <a href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.
            </div>
        </div>
    </div>
    <br>
    <br>
    <i>Note: The content of this Jupyter Notebook is provided for educational purposes only.</i>
</div>