# PH2255 Week 17. Statistical Data Analysis 1.

Generic Python header: here the interactive/inline plotting style is selected, relevant libraries are imported, python2/3 compatibility is addressed.

In [1]:
#%matplotlib inline 
# this line is required for the plots to appear in the Jupyter cells, rather than launching the matplotlib GUI
%matplotlib widget 
#this allows interactive view but you need to be in classic rather than CoCalc Jupyter notebook for this to work
from __future__ import division,print_function
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit

# Example 1: Straight line fit
Consider the following set of $(x, y, \sigma)$ data points:

In [2]:
# Measurements, one row per data point: (x, y, sigma), where sigma is the
# standard deviation of y. Transposing turns the columns into the three arrays.
x, y, sig = np.array([
    [1.0, 2.7, 0.3],
    [2.0, 3.9, 0.5],
    [3.0, 5.5, 0.7],
    [4.0, 5.8, 0.6],
    [5.0, 6.5, 0.4],
    [6.0, 6.3, 0.3],
    [7.0, 7.7, 0.7],
    [8.0, 8.5, 0.8],
    [9.0, 8.7, 0.5],
]).T.copy()

In this example we will fit a straight line parameterised by a two-dimensional vector $\theta = (\theta_0, \theta_1)$
$$
f(x; \theta) = \theta_0 + \theta_1 x
$$
through the data.

In [3]:
# fit function: straight line
def func(x, *theta):
    """Evaluate the straight line f(x; theta) = theta[0] + theta[1]*x.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which the line is evaluated.
    *theta : two values
        The intercept followed by the slope; any other number of
        parameters fails at the unpacking step with a ValueError.

    Returns
    -------
    The value(s) of the line at x.
    """
    intercept, slope = theta
    return intercept + slope*x

The following code finds the least-square estimator vector $\hat\theta$ and the covariance matrix $\mathrm{cov}[\hat\theta_i, \hat\theta_j]$ by minimising
$$
\chi^2 = \sum_i (y_i - f(x_i;\theta))^2 / \sigma_i^2.
$$
The input to 'scipy.optimize.curve_fit' function includes the initial guess 'p0' for $\theta$, here set to an array of ones.
For some fit functions it is important that 'p0' is sufficiently close to the least-square estimator;
polynomial fits converge even if 'p0' and the estimator are far apart.

In [4]:
# Starting point for the minimiser: every parameter set to one
p0 = np.ones(2)
# Weighted least-squares fit. absolute_sigma=True tells curve_fit to take sig
# as the actual standard deviations of y (see the scipy curve_fit documentation).
thetaHat, cov = curve_fit(func, x, y, p0=p0, sigma=sig, absolute_sigma=True)

Having obtained the estimator $\hat\theta$, we can calculate $\chi^2$ and compare it
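to the number of degrees of freedom (number of data points minus number of fitted parameters). This comparison indicates the quality of the fit: for a good fit $\chi^2$ is expected to be comparable to the number of degrees of freedom.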

In [5]:
# Goodness of fit: minimised chi-squared and the number of degrees of freedom
numPoints, numPar = len(x), len(p0)
# degrees of freedom = data points minus fitted parameters
ndof = numPoints - numPar
# chi-squared is the sum of the squared residuals, each in units of its sigma
chisq = sum(np.square((y - func(x, *thetaHat))/sig))
print ("chisq = ", chisq, ",     ndof = ", ndof)

chisq =  8.25153611783541 ,     ndof =  7


The diagonal elements of the covariance matrix represent the variances (squares of standard deviations) of the obtained fit parameters $\hat\theta_0$ and $\hat\theta_1$: 

In [6]:
# Report every fitted parameter together with its standard deviation
print ("\n", "Fitted parameters and standard deviations:")
# the variances sit on the diagonal of cov; their square roots are the standard deviations
sigThetaHat = np.sqrt(cov.diagonal())
for i, (estimate, stdDev) in enumerate(zip(thetaHat, sigThetaHat)):
    print ("thetaHat[", i, "] = ", estimate, "  +-  ", stdDev)


 Fitted parameters and standard deviations:
thetaHat[ 0 ] =  2.2576981889195182   +-   0.29218909382046193
thetaHat[ 1 ] =  0.7409333605720615   +-   0.05723132195270343


In general these uncertainties are correlated as discussed in Section 4.1 and Fig. 4 of the "Introduction to Statistical Methods" Script. This is represented by the off-diagonal elements of the covariance matrix $\mathrm{cov}[\hat\theta_i, \hat\theta_j]$. The entire matrix is printed below, together with its normalised version $\rho$.

In [7]:
# Tabulate the covariance matrix next to the correlation coefficients
print ("\n", "i, j, cov[i,j], rho[i,j]:")
indices = range(len(thetaHat))
for i in indices:
    for j in indices:
        # correlation coefficient: covariance divided by both standard deviations
        rho = cov[i, j] / (sigThetaHat[i]*sigThetaHat[j])
        print (i, "  ", j, "  ", cov[i, j], "  ", rho)


 i, j, cov[i,j], rho[i,j]:
0    0    0.08537446654762271    1.0
0    1    -0.014376325915897156    -0.8597063424480256
1    0    -0.014376325915897159    -0.8597063424480258
1    1    0.0032754242124539935    0.9999999999999999


Finally, we plot the data and the fitted straight line:

In [8]:
# one figure holding a single set of axes
fig, ax = plt.subplots()

# data points with their error bars
ax.errorbar(x, y, yerr=sig, fmt='o', color='black')
# an empty series supplies the 'data' entry of the legend
ax.plot([], [], 'o', color='black', label='data')

# axis labels and the hand-picked x and y ranges of the plot
xMin, xMax = 0, 10
yMin, yMax = 0, 10
ax.set(xlabel=r'$x$', ylabel=r'$y$', xlim=(xMin, xMax), ylim=(yMin, yMax))

# evaluate the fitted function on a fine grid of x so the curve looks smooth
xPlot = np.linspace(xMin, xMax, 100)
fit = func(xPlot, *thetaHat)
ax.plot(xPlot, fit, color='red', linewidth=2, label='fit result')

ax.legend(loc='lower right', frameon=False)

# Lay out, display and store the plot
plt.tight_layout()
plt.show()
fig.savefig("simpleFit.pdf", format='pdf')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Exercise 1(a)
Using the same data carry out the least-squares fit of an $M$th order polynomial,
$$
f(x; \theta) = \sum_{i=0}^{M} \theta_i x^i,
$$
(with $M + 1$ adjustable parameters), for $M = 1, 2, 3$. The code needs to be general enough to allow arbitrary positive integer $M$.

In [9]:
# Exercise 1a


# Exercise 1(b)
For each fit, use the error propagation formula
$$
\sigma_f^2 \approx \sum_{i,j=0}^{M} \frac{\partial f(x; \hat\theta)}{\partial \hat\theta_i}\frac{\partial f(x; \hat\theta)}{\partial \hat\theta_j} \mathrm{cov}[\hat\theta_i, \hat\theta_j]
$$
(this is a special case of Eq.&nbsp;(26) from "Introduction to Statistical Methods") to find the standard deviation of the fitted function $\sigma_f$ as a function of $x$. Note that to do this you will need to compute
the derivatives of $f(x; \hat\theta)$ with respect to the components of $\hat\theta$. Display the fitted curve plus-or-minus one standard deviation as a shaded band, and extend the $x$ axis to at least $x = 20$.
(The shaded band can be made with the function `matplotlib.pyplot.fill_between`, or equivalently with the `fill_between` method of an `Axes` object.) Note how the size of the error band increases when one goes to $x$ values outside the region where data are available;
investigate how this behaviour changes as the order of the polynomial is increased.

In [10]:
# Exercise 1b

# Exercise 1(c - optional)
Consider the fit with $M = 3$ and define the difference
$$
\Delta_{ab}(\hat\theta) = f(a; \hat\theta) - f(b; \hat\theta).
$$
Using error propagation, find the standard deviation of $\Delta_{ab}(\hat\theta)$ for $a = 5$ and $b = 6, 10, 20$.
Compare the values you find with the standard deviation of $f$ that you plotted for this fit as a shaded band, evaluated at both $a$ and $b$.

In [11]:
# Exercise 1c