In [None]:
# Imports
import pandas as pd # To read the data
import matplotlib.pyplot as plt # For plotting
from pandas.plotting import scatter_matrix  # For plotting
import numpy as np # For matrices and vectors
from sklearn.preprocessing import StandardScaler  # For scaling data
from sklearn.linear_model import LinearRegression, Lasso  # Alternative for fitting
from sklearn.metrics import r2_score  # For calculating R²
from sklearn.model_selection import train_test_split  # For splitting into test and training set
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name is available.
plt.style.use(
    'seaborn-v0_8-notebook'
    if 'seaborn-v0_8-notebook' in plt.style.available
    else 'seaborn-notebook'
)

The data we will look at is:

| Label  | Description              |             Unit |
|:-------|:-------------------------|-----------------:|
| BP     | Blood pressure           |             mmHg |
| Age    | Age                      |            years |
| Weight | Weight                   |               kg |
| BSA    | Body surface area        |            m$^2$ |
| DUR    | Duration of hypertension |            years |
| Pulse  | Basal heart rate         | beats per minute |
| Stress | Stress index             |              --- |

And we will attempt to model the blood pressure with a linear model:

$$\text{BP} = \text{constant} + a_0 \times \text{Age} + a_1 \times \text{Weight} + a_2 \times \text{BSA} + a_3 \times \text{DUR} + a_4 \times \text{Pulse} + a_5 \times \text{Stress}$$

In [None]:
# We start by getting the raw, whitespace-delimited data.
# `sep=r'\s+'` is the documented replacement for `delim_whitespace=True`,
# which pandas has deprecated.
data = pd.read_csv('Data/bloodpress.txt', sep=r'\s+')
data

In [None]:
# Let us standardise the raw data: every column gets zero mean and unit variance.
data = pd.DataFrame(
    data=StandardScaler().fit_transform(data),
    columns=data.columns,
)
data

Before we start making our model, it is a good idea to check possible correlations between the different variables.

One way of investigating correlations (to see what variables might be correlated) is to calculate the
[Pearson correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) between all pairs of variables. This is relatively easy with pandas as there is a [method to do just that](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html). Let us do this, and display the results to get some idea about possible correlations between variables:

In [None]:
# Pearson correlation coefficient for every pair of columns,
# rendered as a table shaded with the 'Blues' colour map:
correlation = data.corr(method='pearson')
correlation.style.background_gradient(cmap='Blues')

In [None]:
# Correlations can also be inspected visually: one scatter plot per pair of
# variables, with a histogram of each variable on the diagonal.
scatter_matrix(
    data,
    figsize=(12, 12),
    diagonal='hist',
    alpha=0.8,
);

In [None]:
# Response: the blood pressure column as a NumPy array.
y = data['BP'].to_numpy()
# Predictors: the explanatory columns, in the order the fitted coefficients will follow.
variables = ['Age', 'Weight', 'BSA', 'Dur', 'Pulse', 'Stress']
X = data.loc[:, variables].to_numpy()

In [None]:
# Randomly partition the samples into a training set and a test set.
# The seed is fixed so that the same split is obtained on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=10,
)

In [None]:
# Display the (scaled) blood pressure values that ended up in the training set:
y_train

In [None]:
# Display the (scaled) blood pressure values that ended up in the test set:
y_test

In [None]:
def fit_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on both data sets.

    Parameters
    ----------
    model : object
        Estimator offering ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Predictors and response used for fitting.
    X_test, y_test
        Predictors and response that are only used for evaluation.

    Returns
    -------
    y_hat : dict
        Predictions of the fitted model, under the keys ``'train'`` and ``'test'``.
    r2 : dict
        R² of those predictions, under the same two keys.
    """
    model.fit(X_train, y_train)
    # Keep predictors and response of each set together so both dictionaries
    # can be built the same way.
    splits = {
        'train': (X_train, y_train),
        'test': (X_test, y_test),
    }
    y_hat = {
        name: model.predict(predictors)
        for name, (predictors, _) in splits.items()
    }
    r2 = {
        name: r2_score(response, y_hat[name])
        for name, (_, response) in splits.items()
    }
    return y_hat, r2


def add_xy_line(axi):
    """Add a dotted x=y line to a plot without changing its axis limits.

    Parameters
    ----------
    axi : matplotlib.axes.Axes
        The axes to draw on. Its current x- and y-limits determine the
        extent of the line and are restored after drawing.
    """
    xlim = axi.get_xlim()
    ylim = axi.get_ylim()
    # Span the line over the union of both axis ranges so that it crosses
    # the whole visible area.
    limits = min(*xlim, *ylim), max(*xlim, *ylim)
    axi.plot(limits, limits, color='black', ls=':')
    # Drawing the line may rescale the axes, so put both limits back.
    # (The y-limits were previously passed to set_xlim by mistake.)
    axi.set_xlim(xlim)
    axi.set_ylim(ylim)

In [None]:
# Ordinary least squares, fitted without an intercept term:
linear = LinearRegression(fit_intercept=False)
y_hat_linear, r2_linear = fit_model(linear, X_train, y_train, X_test, y_test)

# Measured values (horizontal axis) against predicted values (vertical axis):
fig_linear, ax_linear = plt.subplots(constrained_layout=True)
ax_linear.scatter(y_train, y_hat_linear['train'], s=150, label='Training')
ax_linear.scatter(y_test, y_hat_linear['test'], s=150, marker='X', label='Test')
ax_linear.set_xlabel('y')
ax_linear.set_ylabel('ŷ')
ax_linear.legend()
add_xy_line(ax_linear)

# Report R² for both the training and the test set:
print(f"R² (training) = {r2_linear['train']:6.4f}")
print(f"R² (test) = {r2_linear['test']:6.4f}")

In [None]:
# Pair every least-squares coefficient with the variable it belongs to:
table_linear = pd.DataFrame({'b': linear.coef_, 'variable': variables})
table_linear

In [None]:
# Bar chart of the least-squares coefficients, with a dotted reference line at zero:
ax0 = table_linear.plot(kind='bar', x='variable', y='b')
ax0.axhline(0, color='black', ls=':');

In [None]:
# Lasso regression (regularisation strength alpha = 0.05), fitted without an intercept term:
lasso = Lasso(fit_intercept=False, alpha=0.05)
y_hat_lasso, r2_lasso = fit_model(lasso, X_train, y_train, X_test, y_test)

# Measured values (horizontal axis) against predicted values (vertical axis):
fig_lasso, ax_lasso = plt.subplots(constrained_layout=True)
ax_lasso.scatter(y_train, y_hat_lasso['train'], s=150, label='Training')
ax_lasso.scatter(y_test, y_hat_lasso['test'], s=150, marker='X', label='Test')
ax_lasso.set_xlabel('y')
ax_lasso.set_ylabel('ŷ')
ax_lasso.legend()
add_xy_line(ax_lasso)

# Report R² for both the training and the test set:
print(f"R² (training) = {r2_lasso['train']:6.4f}")
print(f"R² (test) = {r2_lasso['test']:6.4f}")

In [None]:
# Put the least-squares and the lasso coefficients side by side, one row per variable:
table_lasso = pd.DataFrame({
    'b (least squares)': linear.coef_,
    'b (lasso)': lasso.coef_,
    'variable': variables,
})
table_lasso

In [None]:
# Bar chart of the lasso coefficients alone:
table_lasso.plot(kind='bar', x='variable', y='b (lasso)');

In [None]:
# Grouped bar chart comparing the lasso and least-squares coefficients,
# with a dotted reference line at zero:
axl = table_lasso.plot(
    kind='bar',
    x='variable',
    y=['b (lasso)', 'b (least squares)'],
)
axl.axhline(0, color='black', ls=':');

In [None]:
# A hand-built model that uses a single variable (weight) and all samples:
y = data['BP'].to_numpy()
W = data.loc[:, ['Weight']].to_numpy()  # selected with a list, so W stays two-dimensional
linear2 = LinearRegression(fit_intercept=False)
linear2.fit(W, y)
r2_linear2 = r2_score(y, linear2.predict(W))

# Scatter the samples and overlay the fitted line, evaluated at weight = -2 and weight = 2:
fig_linear2, ax_linear2 = plt.subplots(constrained_layout=True)
ax_linear2.scatter(W, y, s=150)
ax_linear2.set_xlabel('weight')
ax_linear2.set_ylabel('BP')
x = np.array([[-2], [2]])
y_p = linear2.predict(x)
ax_linear2.plot(
    x,
    y_p,
    color='black',
    label=f'BP = {linear2.coef_[0]:4.2f} × weight, R² = {r2_linear2:6.4f}',
)
ax_linear2.legend();