# Computing model residuals

This notebook computes the residuals between a dataset and a model fitted to that dataset, and then performs a statistical analysis of those residuals. It builds upon tools developed earlier in this class.

#### First, import useful libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize

#### Plotting options

In [None]:
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook
%matplotlib inline
# apply the matplotlib style sheet "astr19_matplotlib_style.txt"
# NOTE(review): presumably this file sits next to the notebook -- confirm the path
plt.style.use("astr19_matplotlib_style.txt")

#### Define a model

In [None]:
# reference model underlying the data: a sine wave with a constant offset
def sinusoid(x,amplitude,start,period,offset):
    """Evaluate amplitude * sin(2*pi*(x - start)/period) + offset.

    x         : scalar or numpy array of positions
    amplitude : multiplicative factor applied to the sine
    start     : value of x at which the sine argument is zero
    period    : length in x of one full oscillation
    offset    : constant added to the result
    """
    # argument of the sine: one full turn (2*pi) per period, counted from start
    phase = 2.0 * np.pi * (x - start) / period
    return amplitude * np.sin(phase) + offset

In [None]:
#function that will generate some data
def generate_data(xmin=0.0,xmax=10.0,amplitude=1.0,start=0.5,period=1.0,offset=0.0,sigma=0.25,n=100):
    """Draw n noisy samples of the sinusoid model, sorted by increasing x.

    xmin, xmax : bounds of the uniform distribution the x values are drawn from
    amplitude, start, period, offset : parameters passed on to sinusoid()
    sigma : standard deviation of the gaussian noise added to y, also used as
            the (constant) error bar of every point
    n : number of samples

    Returns (x, y, y_err): x as drawn by numpy (float64), y and y_err as
    float32 arrays, all of length n and ordered by increasing x.
    """
    #create n points randomly (uniformly) distributed between xmin and xmax
    x = np.random.uniform(low=xmin,high=xmax,size=n)

    #y have a value centered on the real curve, but with gaussian error
    y = sinusoid(x,amplitude,start,period,offset) + sigma*np.random.randn(n)
    y = np.asarray(y,dtype=np.float32)

    # sort the data by increasing x; y is reordered with the same index so that
    # every (x, y) pair stays together. The random draws above are made in the
    # same order as before, so a given seed still yields the same set of points.
    idx_sorted = np.argsort(x)
    x = x[idx_sorted]
    y = y[idx_sorted]

    # every point carries the same error bar
    y_err = np.full(n,sigma,dtype=np.float32)

    #return x, y, and y_err values
    return x,y,y_err

#### Generate the data

In [None]:
# choose a seed (the same value is reused later in the notebook)
n_seed = 11
# seed numpy's global random generator before any random number is drawn,
# so that the generated dataset is reproducible
np.random.seed(n_seed)
# generate data with all default values
x, y, y_err = generate_data()

#### Make the fit

In [None]:
# fit the sinusoid to the data, using the error bars as uncertainties;
# p0 is the starting guess, in the order (amplitude, start, period, offset)
params,pcov = optimize.curve_fit(sinusoid,xdata=x,ydata=y,p0=[1,0.5,1,0],sigma=y_err)
# best-fit parameters come back in the order of the sinusoid arguments
a_fit,s_fit,p_fit,o_fit = params
print(a_fit,s_fit,p_fit,o_fit)

#### Plot the data and the model

In [None]:
# dense, evenly spaced grid on which the best-fit curve is evaluated for plotting
x_fit = np.linspace(0,10,num=1000)
y_fit = sinusoid(x_fit,amplitude=a_fit,start=s_fit,period=p_fit,offset=o_fit)

In [None]:
# single square panel showing the data and the best-fit curve
f,ax = plt.subplots(figsize=(7,7))

# data points with their error bars; the higher zorder draws the model curve on top
ax.errorbar(x,y,yerr=y_err,fmt='o',zorder=5,label='Data')
ax.plot(x_fit,y_fit,color='magenta',zorder=6,label='Best-fit Model')

# fixed axis ranges
ax.set_xlim(-0.1,10.1)
ax.set_ylim(-2.,2.)

ax.set_xlabel('x',fontsize=20)
ax.set_ylabel('y',fontsize=20)
ax.legend(frameon=True,fontsize=10,handletextpad=1)

plt.show()

#### Now let's have a look at residuals between data and model

In [None]:
# evaluate the best-fit model at the x position of every data point
y_model = sinusoid(x,amplitude=a_fit,start=s_fit,period=p_fit,offset=o_fit)

# residuals: what is left of the data once the model is subtracted (data - model)
residuals = y - y_model

#### Plot the residuals

In [None]:
# single square panel showing the residuals with the data error bars
f,ax = plt.subplots(figsize=(7,7))

ax.errorbar(x,residuals,yerr=y_err,fmt='o',label='Residuals')

# same axis ranges as the data plot
ax.set_xlim(-0.1,10.1)
ax.set_ylim(-2.,2.)

ax.set_xlabel('x',fontsize=20)
ax.set_ylabel('y',fontsize=20)
ax.legend(frameon=True,fontsize=10,handletextpad=1)

plt.show()

#### What are the statistical properties of the residuals?

In [None]:
# summary statistics of the residuals
residuals_mean = np.mean(residuals)
residuals_std = np.std(residuals)
# the RMS is taken over the residuals themselves, not over the raw data y;
# with np.std's default normalization, rms**2 = std**2 + mean**2, so RMS and
# stdev coincide only when the mean of the residuals is zero
residuals_rms = np.sqrt(np.mean(residuals**2))

print(f"Residuals mean is  {residuals_mean:.16f}")
print(f"Residuals stdev is {residuals_std:.16f}")
print(f"Residuals RMS is   {residuals_rms:.16f}")

#### Define a gaussian function

In [None]:
def gaussian(x,mu,s):
    """Normal probability density with mean mu and standard deviation s, evaluated at x.

    x may be a scalar or a numpy array; the result has the same shape.
    """
    # prefactor that makes the curve integrate to one
    norm = 1./np.sqrt(2.0*np.pi*s**2)
    # distance from the mean, in units of s
    z = (x-mu)/s
    return norm * np.exp(-0.5*z**2)

#### Histogram the residuals

In [None]:
f,ax = plt.subplots(1,1,figsize=(7,7))

# 50 equal-width bins spanning [-2, 2]; bins_custom holds the bin edges
counts, bins_custom = np.histogram(residuals,bins=50,range=(-2,2))
# density=True normalizes the histogram to unit area, so it can be compared
# with the gaussian probability density drawn below
ax.hist(residuals,bins=bins_custom,alpha=0.5,edgecolor="white",density=True)

# draw the gaussian function based on mean and stdev of residuals,
# on a grid spanning +/- 5 stdev around the mean (not around zero)
x_g = residuals_mean + np.linspace(-5*residuals_std,5*residuals_std,1000)
ax.plot(x_g,gaussian(x_g,residuals_mean,residuals_std),color="red")

ax.set_xlim([-2,2])
ax.set_xlabel('x',fontsize=20)
ax.set_ylabel('freq of x',fontsize=20)

plt.show()

#### Can we test whether any point is an outlier? 

In [None]:
from scipy.special import erf, erfc #importing the error function (and its complement) from scipy

# returns the probability of being x sigma away from the mean
def event_probability(x,mu=0.0,s=1.0):
    """Two-sided gaussian tail probability of a value x.

    Returns the probability that a draw from a normal distribution with mean
    mu and standard deviation s lies at least as far from mu as x does
    (on either side).

    x  : value (scalar or numpy array) to evaluate
    mu : mean of the distribution
    s  : standard deviation of the distribution
    """
    # distance from the mean, in units of the standard deviation
    z = np.fabs((x-mu)/s)
    # erfc(t) equals 1 - erf(t) but is evaluated without the subtraction,
    # which would round to exactly 0 for large deviations
    return erfc(z/np.sqrt(2))

# given prior measurements, determines if the new value is an outlier
def chauvenet_criterion(prior_measurements, outlier):
    """Apply Chauvenet's criterion to a candidate value.

    prior_measurements : measurements used to estimate the mean and stdev
                         (the candidate itself is not part of them)
    outlier            : candidate value to test

    Returns True when the number of measurements expected to deviate at least
    as much as the candidate is below 0.5, False otherwise.
    """
    center = np.mean(prior_measurements)
    spread = np.std(prior_measurements)
    # sample size once the candidate is counted together with the prior values
    n_total = len(prior_measurements) + 1
    expected_count = n_total * event_probability(outlier, mu=center, s=spread)
    return bool(expected_count < 0.5)

# give a set of measurements, and test each individual measurement for being an outlier
def test_dataset(x,measurements_all):
    """Print every measurement flagged as an outlier by Chauvenet's criterion.

    x                : positions of the measurements, used only in the printout
    measurements_all : values to test (the notebook passes the residuals)

    Each value is tested against all the other values. Nothing is returned;
    the flagged points and their total number are printed.
    """
    # mean and stdev of the full set, used only for the printed significance
    meas_mean = np.mean(measurements_all)
    meas_std = np.std(measurements_all)

    flagged = 0

    for i, meas_test in enumerate(measurements_all):
        # leave the tested value out of the reference sample
        others = np.delete(measurements_all, i)
        if not chauvenet_criterion(others, meas_test):
            continue
        significance = np.abs(meas_test-meas_mean)/meas_std
        probability = 100*event_probability(meas_test,mu=meas_mean,s=meas_std)
        print(f"Residual {i:3d} at x={x[i]:.3f} is an outlier, "
              +f"significance: {significance:.5f} sigmas, "
              +f"probability: {probability:.5f}%")
        flagged += 1

    print(f"Number of outliers: {flagged}")

In [None]:
# test our dataset: apply the outlier test to the residuals of the original data
# (x is passed only so that flagged points can be reported with their position)
test_dataset(x,residuals)

#### Add an outlier to the data

In [None]:
# generating a random outlier location and value
# (the generator is re-seeded with n_seed first, so the outlier is reproducible)
np.random.seed(n_seed)
x_out,y_out,y_out_err = generate_data(n=1)
# push the point upward by 5 times its error bar
y_out += 5*y_out_err

# adding outlier to the data
x_new = np.append(x,x_out)
y_new = np.append(y,y_out)
y_err_new = np.append(y_err,y_out_err)

# sort the data by increasing x; all three arrays are reordered with the same
# index so that every point keeps its own y value and error bar
idx_model = np.argsort(x_new)
x_new = x_new[idx_model]
y_new = y_new[idx_model]
y_err_new = y_err_new[idx_model]

In [None]:
# show the position, value and error bar of the added outlier
print(x_out,y_out,y_out_err)

#### Re-evaluate the model at every data location, including the new point (the fit itself is not redone)

In [None]:
# evaluate the existing best-fit model at every x position, outlier included
# (the fit parameters are reused as they are)
y_model_new = sinusoid(x_new,amplitude=a_fit,start=s_fit,period=p_fit,offset=o_fit)

# residuals of the extended dataset (data - model)
residuals_new = y_new - y_model_new

#### Plot the data, including the outlier

In [None]:
# single square panel showing the extended dataset and the best-fit curve
f,ax = plt.subplots(figsize=(7,7))

# data points with their error bars; the higher zorder draws the model curve on top
ax.errorbar(x_new,y_new,yerr=y_err_new,fmt='o',zorder=5,label='Data')
ax.plot(x_fit,y_fit,color='magenta',zorder=6,label='Best-fit Model')

# fixed axis ranges
ax.set_xlim(-0.1,10.1)
ax.set_ylim(-2.,2.)
ax.set_xlabel('x',fontsize=20)
ax.set_ylabel('y',fontsize=20)
ax.legend(frameon=True,fontsize=10,handletextpad=1)

plt.show()

#### Plot the residuals, including the outlier

In [None]:
# single square panel showing the residuals of the extended dataset
f,ax = plt.subplots(figsize=(7,7))

ax.errorbar(x_new,residuals_new,yerr=y_err_new,fmt='o',label='Residuals')

# fixed axis ranges
ax.set_xlim(-0.1,10.1)
ax.set_ylim(-2.,2.)
ax.set_xlabel('x',fontsize=20)
ax.set_ylabel('y',fontsize=20)
ax.legend(frameon=True,fontsize=10,handletextpad=1)

plt.show()

#### Histogram the residuals, with the outlier

In [None]:
f,ax = plt.subplots(1,1,figsize=(7,7))

# histogram of the new residuals on the bin edges stored in bins_custom,
# normalized to unit area (density=True)
ax.hist(residuals_new,bins=bins_custom,alpha=0.5,edgecolor="white",density=True)

# overlay the gaussian built from residuals_mean and residuals_std,
# on a grid spanning +/- 5 stdev around that mean (not around zero)
x_g = residuals_mean + np.linspace(-5*residuals_std,5*residuals_std,1000)
ax.plot(x_g,gaussian(x_g,residuals_mean,residuals_std),color="red")

ax.set_xlim([-2,2])
ax.set_xlabel('x',fontsize=20)
ax.set_ylabel('freq of x',fontsize=20)

plt.show()

#### Can we diagnose the outlier?

In [None]:
# test our new dataset: apply the outlier test to the residuals that include the added point
# (x_new is passed only so that flagged points can be reported with their position)
test_dataset(x_new,residuals_new)

#### Summary of encountered cases:

- Easy case ($n_{seed}=11$): 
- Realistic case
- Difficult case
- Ideal case