Example of linear fitting.

In [None]:
# Start by importing relevant libraries:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Read the raw, whitespace-separated data file into a DataFrame.
# `sep=r'\s+'` is the documented replacement for `delim_whitespace=True`,
# which is deprecated since pandas 2.2.
data = pd.read_csv('data-a.txt', sep=r'\s+')

In [None]:
# Display the first few rows as a quick check that the file was parsed
# into the expected columns (the cell's last expression is rendered).
data.head()

In [None]:
# It is always a good idea to look at your raw data:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is installed.
# If neither is present the current style is simply left untouched.
for style_name in ('seaborn-v0_8-talk', 'seaborn-talk'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
fig1, axes = plt.subplots(nrows=2, ncols=2)
# Column names of the four (x, y) data sets; reused by the cells below.
xvar = ['x1', 'x2', 'x3', 'x4']
yvar = ['y1', 'y2', 'y3', 'y4']
# One panel per data set: scatter the raw (x, y) pairs.
for i, (x, y, axi) in enumerate(zip(xvar, yvar, axes.flatten())):
    axi.set_title('Data set no. {}'.format(i + 1))
    axi.scatter(data[x], data[y], s=200)
    axi.set(xlabel=x, ylabel=y)
fig1.tight_layout()

**Q: How would you describe the four plots above? Linear? Constant? Do they contain outliers?**

In [None]:
# We will try to do a linear fitting to this data.
# For this purpose we will use "polyfit" from numpy.

# First we make a helper method to estimate R² (the coefficient of determination):
def calculate_r_squared(y, y_hat):
    """Calculate the coefficient of determination, R².

    R² is computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum
    of squared residuals ``(y - y_hat)**2`` and ``SS_tot`` is the sum of
    squared deviations of ``y`` from its mean.

    Parameters
    ----------
    y : array-like
        Measured values. NumPy arrays and pandas Series are used as given;
        plain sequences such as lists or tuples are converted to a float
        array first.
    y_hat : array-like
        Estimated values, one per element of ``y``.

    Returns
    -------
    float
        The R² value. If all ``y`` values are equal, ``SS_tot`` is zero and
        the result is not finite.
    """
    if not hasattr(y, 'mean'):
        # Plain sequences support neither .mean() nor element-wise
        # arithmetic; promote them so the expressions below work.
        y = np.asarray(y, dtype=float)
    ss_tot = np.sum((y - y.mean())**2)
    ss_res = np.sum((y - y_hat)**2)
    return 1.0 - (ss_res / ss_tot)

# Fit a first-degree polynomial to every (x, y) data set. polyfit returns
# the coefficients with the highest power first, i.e. [slope, intercept].
parameters = [np.polyfit(data[x], data[y], 1) for x, y in zip(xvar, yvar)]

# Evaluate each fitted line at the measured x-values (kept for plotting).
y_hat_values = [
    np.polyval(coeffs, data[x]) for coeffs, x in zip(parameters, xvar)
]

# R² of every fit, measured y against estimated y.
r_squared = [
    calculate_r_squared(data[y], estimate)
    for y, estimate in zip(yvar, y_hat_values)
]

# Report intercept (a) and slope (b) for each data set:
print('Parameters for y = a + b*x:')
for set_no, (slope, intercept) in enumerate(parameters, start=1):
    print('- Data set no. {}: a = {:4.2f}, b = {:4.2f}'.format(set_no, intercept, slope))

In [None]:
# Show each data set together with its fitted line; the legend carries R².
fig2, axes2 = plt.subplots(nrows=2, ncols=2)
fit_panels = zip(xvar, yvar, y_hat_values, r_squared, axes2.flatten())
for set_no, (x, y, y_hat, rsq, axi) in enumerate(fit_panels, start=1):
    axi.set_title('Data set no. {}'.format(set_no))
    axi.scatter(data[x], data[y], s=200)
    fit_label = 'R² = {:4.2f}'.format(rsq)
    axi.plot(data[x], y_hat, label=fit_label, color='darkorange')
    axi.set_xlabel(x)
    axi.set_ylabel(y)
    axi.legend()
fig2.tight_layout()

**Q: How would you describe the fitted lines above? Do you think they look good? Why or why not?**

In [None]:
# If we have several x variables, it can be difficult to visualize the fitted lines.
# One option is then to plot the measured y vs. the estimated y. Let's see what this
# looks like.

# We first define a helper method:
def add_x_y_line_to_axes(axi):
    """Draw the identity line y = x in black on the given axes.

    The line spans from the smallest to the largest value found among the
    current x- and y-axis limits of ``axi``.
    """
    limits = np.array([axi.get_xlim(), axi.get_ylim()])
    lower = limits.min()
    upper = limits.max()
    axi.plot([lower, upper], [lower, upper], 'black')

# Create the plot: measured y on the horizontal axis, estimated y on the
# vertical axis, with the y = x line as the reference for a perfect fit.
fig3, axes3 = plt.subplots(nrows=2, ncols=2)
for i, (y, y_hat, axi) in enumerate(zip(yvar, y_hat_values, axes3.flatten())):
    axi.set_title('Data set no. {}'.format(i + 1))
    axi.scatter(data[y], y_hat, s=200)
    # Raw string: '\h' is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning. The resulting text is unchanged.
    axi.set(xlabel=y, ylabel=r'$\hat{{{}}}$'.format(y))
    add_x_y_line_to_axes(axi)
fig3.tight_layout()

In [None]:
# The residuals can be visualized to give some indication about the fit.
# We plot those as well (residual = measured y - estimated y, per point):
fig4, axes4 = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
for i, (y, y_hat, axi) in enumerate(zip(yvar, y_hat_values, axes4.flatten())):
    axi.set_title('Data set no. {}'.format(i + 1))
    # Number the points 1..N along the horizontal axis.
    axi.scatter(np.arange(1, len(data[y]) + 1), data[y] - y_hat, s=200)
    # Raw string: '\h' is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning. The resulting text is unchanged.
    axi.set(xlabel='Point no.', ylabel=r'${0:} - \hat{{{0:}}}$'.format(y))
    # Zero line: residuals of a good fit should scatter around it.
    axi.axhline(y=0, ls=':', color='black')
fig4.tight_layout()

In [None]:
# There are also some other statistical quantities we can calculate:
from scipy.stats import pearsonr, spearmanr, kendalltau

# One dict per data set. Each scipy function returns the statistic first,
# so element [0] is the correlation coefficient itself.
numbers = [
    {
        'pearson': pearsonr(data[x], data[y])[0],
        'spearman': spearmanr(data[x], data[y])[0],
        'kendall': kendalltau(data[x], data[y])[0],
        'rsq': rsq,
    }
    for x, y, rsq in zip(xvar, yvar, r_squared)
]

In [None]:
# Let us show these as well in a plot for comparison:
fig5, axes5 = plt.subplots(nrows=2, ncols=2)
text_fmt = r'$\rho = {pearson:4.2f}, r_s = {spearman:4.2f}, \tau = {kendall:4.2f}, R^2 = {rsq:4.2f}$'
summary_panels = zip(xvar, yvar, y_hat_values, numbers, axes5.flatten())
for set_no, (x, y, y_hat, number, axi) in enumerate(summary_panels, start=1):
    axi.scatter(data[x], data[y], s=200)
    axi.plot(data[x], y_hat, color='darkorange')
    # Two-line title: data set label on top, the statistics underneath.
    heading = 'Data set no. {}'.format(set_no)
    stats_line = text_fmt.format(**number)
    axi.set_title(heading + '\n' + stats_line)
    axi.set_xlabel(x)
    axi.set_ylabel(y)
fig5.tight_layout()