In [None]:
# Import libraries:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from IPython.display import Markdown, display
from matplotlib import pyplot as plt
from tabulate import tabulate

In [None]:
# Load the whitespace-delimited data file. `sep=r'\s+'` is the documented
# equivalent of `delim_whitespace=True`, which is deprecated since pandas 2.2.
data = pd.read_csv('data-b.txt', sep=r'\s+')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Short aliases for the three columns used throughout the notebook.
y, x1, x2 = (data[column] for column in ('y1', 'x1', 'x2'))

In [None]:
# Plot the raw data: y against each of x1 and x2, and x1 against x2.
# Original note on this cell: x1 and x2 are strongly correlated.
#
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names, so prefer the new name and
# fall back to the legacy one only on older installations.
style_name = 'seaborn-v0_8-darkgrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-darkgrid'
plt.style.use(style_name)
fig1, axes = plt.subplots(nrows=1, ncols=3)
axes[0].scatter(x1, y, s=200)
axes[1].scatter(x2, y, s=200)
axes[2].scatter(x1, x2, s=200)
axes[0].set(xlabel='$x_1$', ylabel='$y$')
axes[1].set(xlabel='$x_2$', ylabel='$y$')
axes[2].set(xlabel='$x_1$', ylabel='$x_2$')
fig1.tight_layout()

In [None]:
# Linear model via the normal equations: b = (X^T X)^{-1} X^T y.
# Build the design matrix X: a column of ones followed by x1 and x2.
ones = np.ones_like(x1)
X = np.column_stack([ones, x1, x2])
# Show X as a formatted table.
display(Markdown('$X=$'))
X_table = tabulate(X, tablefmt='fancy_grid')
print(X_table)

In [None]:
# Form the matrix X^T X (only the product; no inverse is taken in this cell).
mat = X.T.dot(X)
display(Markdown('$(X^T X)=$'))
print(tabulate(mat, tablefmt='fancy_grid'))

In [None]:
# Try to invert X^T X. np.linalg.inv raises LinAlgError for a singular
# matrix; catch it and show the message so that "Restart & Run All" continues
# past this cell instead of stopping here.
try:
    mat_inv = np.linalg.inv(mat)
except np.linalg.LinAlgError as error:
    mat_inv = None  # No inverse is available.
    print('Could not invert X^T X: {}'.format(error))

In [None]:
# The direct inverse is not usable (original note: the matrix is singular
# since x2 = 2 * x1), so compute the Moore-Penrose pseudo-inverse of X instead.
pinv = np.linalg.pinv(X)
display(Markdown('$X^+=$'))
pd.DataFrame(data=pinv)

In [None]:
# Coefficients of the linear model y = a_0 + a_1 * x1 + a_2 * x2, obtained by
# applying the pseudo-inverse of X to y.
b = pinv.dot(y)
report = 'a_0 = {:4.2f}\na_1 = {:4.2f}\na_2 = {:4.2f}'
print(report.format(*b))

In [None]:
# Plot the measured y against the prediction y_hat = X b.
y_hat = np.dot(X, b)
fig2, ax2 = plt.subplots(nrows=1, ncols=1)
ax2.scatter(y, y_hat, s=200)
# Raw string: '\h' is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning on recent Python versions.
ax2.set(xlabel='$y$', ylabel=r'$\hat{y}$')
# Add the y = y_hat reference line, spanning the combined range of both axes.
lims = [
    np.min([ax2.get_xlim(), ax2.get_ylim()]),  # min of both axes
    np.max([ax2.get_xlim(), ax2.get_ylim()]),  # max of both axes
]
ax2.plot(lims, lims, 'black')
fig2.tight_layout()

In [None]:
# Fit the same model with another library, statsmodels (imported as `sm`
# together with the other imports at the top of the notebook).
model = sm.OLS(y, X).fit()  # Do the least-squares fitting

In [None]:
# Print the text summary of the fitted statsmodels model.
print(model.summary())

In [None]:
# Recalculate the predicted y with the fitted statsmodels model; this
# overwrites the y_hat computed earlier from the pseudo-inverse coefficients.
y_hat = model.predict(X)

In [None]:
# Plot the measured y against the predicted y_hat.
fig2, ax2 = plt.subplots(nrows=1, ncols=1)
ax2.scatter(y, y_hat, s=200)
# Raw string: '\h' is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning on recent Python versions.
ax2.set(xlabel='$y$', ylabel=r'$\hat{y}$')
# Add the y = y_hat reference line, spanning the combined range of both axes.
lims = [
    np.min([ax2.get_xlim(), ax2.get_ylim()]),  # min of both axes
    np.max([ax2.get_xlim(), ax2.get_ylim()]),  # max of both axes
]
ax2.plot(lims, lims, 'black')
fig2.tight_layout()