Get the Coronary Risk-Factor Study (CORIS)
data from the book web site (https://www.stat.cmu.edu/~larry/all-of-statistics/=data/coris.dat).
Use backward stepwise logistic regression based on AIC to select a model.
Summarize your results.

In [1]:
import functools # For the reduce() function
from collections import namedtuple

import numpy as np
import pandas as pd
import scipy.special # For the expit() function
import scipy.stats # For the norm.isf() function
from tabulate import tabulate

In [2]:
# Read the data into a pandas data frame.
# NOTE(review): this reads a local 'coris_clean.dat', presumably a cleaned-up
# copy of the coris.dat file from the book web site -- the cleaning step is
# not shown in this notebook, confirm how the file was produced.
coris_df = pd.read_csv('../data/coris_clean.dat')

# Display the data frame, as a sanity check
# (the bare last expression of a cell is rendered as a table)
coris_df

Unnamed: 0,row.names,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,1,160,12.00,5.73,23.11,1,49,25.30,97.20,52,1
1,2,144,0.01,4.41,28.61,0,55,28.87,2.06,63,1
2,3,118,0.08,3.48,32.28,1,52,29.14,3.81,46,0
3,4,170,7.50,6.41,38.03,1,51,31.99,24.26,58,1
4,5,134,13.60,3.50,27.78,1,60,25.99,57.34,49,1
...,...,...,...,...,...,...,...,...,...,...,...
457,458,214,0.40,5.98,31.72,0,64,28.45,0.00,58,0
458,459,182,4.20,4.41,32.10,0,52,28.61,18.72,52,1
459,460,108,3.00,1.59,15.23,0,40,20.09,26.64,55,0
460,461,118,5.40,11.61,30.79,0,64,27.35,23.97,40,0


Note that performing the weighted least squares minimization
$$
\beta = \text{argmin} {|| \mathbb{Z} - \mathbb{X}\beta ||}_{ \mathbb{W} }^2
$$
is equivalent to performing the *unweighted* least squares minimization
$$
\beta = \text{argmin} {|| \mathbb{W}^{1/2} \mathbb{Z} - \mathbb{W}^{1/2} \mathbb{X}\beta ||}^2.
$$
Since `numpy` only implements unweighted least squares minimization, we will use the latter formulation.

In [3]:
# Inputs of a logistic regression problem:
#   X          -- design matrix (the cells below prepend a column of ones)
#   Y          -- response vector
#   covariates -- names of the covariates, in column order
Logistic_data = namedtuple('Logistic_data', 'X Y covariates')

# Quantities recomputed at every Newton iteration (see compute_auxiliary_tensors):
#   p -- fitted probabilities
#   W -- diagonal weight matrix
#   Z -- working response
#   J -- inverse Fisher information matrix
Auxiliary_tensors = namedtuple('Auxiliary_tensors', 'p W Z J')

# Fitted coefficients together with their estimated standard errors
Logistic_result = namedtuple('Logistic_result', 'beta std_err')

# The logistic function, under a shorter name
sigmoid = scipy.special.expit

def augment_with_leading_ones(np_array):
    """Return a new array equal to `np_array` with a column of ones prepended.

    The extra first column lets the first regression coefficient play the
    role of the constant (affine) term.
    """
    row_count = np_array.shape[0]
    ones_column = np.ones(row_count)
    return np.column_stack((ones_column, np_array))
    
def compute_auxiliary_tensors(logistic_data, beta):
    """Compute the quantities needed by one Newton iteration at `beta`.

    Parameters:
        logistic_data: Logistic_data; only its X and Y fields are read.
        beta: current coefficient vector, one entry per column of X.

    Returns:
        Auxiliary_tensors(p, W, Z, J) where
          p -- fitted probabilities sigmoid(X beta)
          W -- diagonal matrix with diagonal entries p_i (1 - p_i)
          Z -- working response X beta + W^{-1} (Y - p)
          J -- inverse Fisher information matrix (X^T W X)^{-1}

    Raises:
        np.linalg.LinAlgError: if W or X^T W X is singular.
    """
    linear_predictor = np.matmul(logistic_data.X, beta)
    p = sigmoid(linear_predictor)
    weights = p*(1-p)

    # W is diagonal, so W^{-1} (Y - p) is an element-wise division; there is
    # no need to invert a dense n x n matrix. A zero weight (a fitted
    # probability of exactly 0 or 1) makes W singular: keep reporting that as
    # a LinAlgError rather than silently producing inf/nan.
    if np.any(weights == 0):
        raise np.linalg.LinAlgError("Singular matrix")
    Z = linear_predictor + (logistic_data.Y - p) / weights

    # The dense diagonal matrix is still returned, for the consumers of
    # Auxiliary_tensors.W
    W = np.diag(weights)

    # Inverse Fisher information matrix.
    # X^T W X is computed by scaling row i of X by the i-th weight.
    J = np.linalg.inv(
        np.matmul(
            logistic_data.X.transpose(),
            weights[:, np.newaxis] * logistic_data.X
        )
    )

    return Auxiliary_tensors(p, W, Z, J)

def newton_step(logistic_data, auxiliary_tensors):
    """Return the next Newton iterate for beta.

    The iterate is the solution of the weighted least squares problem
    argmin || Z - X beta ||_W^2, solved as the unweighted problem
    argmin || W^{1/2} Z - W^{1/2} X beta ||^2.

    Parameters:
        logistic_data: Logistic_data; only its X field is read.
        auxiliary_tensors: Auxiliary_tensors; its W and Z fields are read.
            W is taken to be diagonal (as built by compute_auxiliary_tensors),
            so only its diagonal is used.

    Returns:
        The least squares solution, one entry per column of X.
    """
    # W is diagonal, so multiplying by W^{1/2} just scales row i by sqrt(W_ii)
    sqrt_weights = np.sqrt(np.diagonal(auxiliary_tensors.W))

    # rcond=None selects numpy's current default cutoff for small singular
    # values and avoids the FutureWarning older numpy versions emit when
    # rcond is left unspecified
    beta, _, _, _ = np.linalg.lstsq(
        sqrt_weights[:, np.newaxis] * logistic_data.X,
        sqrt_weights * auxiliary_tensors.Z,
        rcond=None
    )

    return beta

def newton_decrement_square(logistic_data, auxiliary_tensors):
    """Return the squared Newton decrement lambda^2 = v^T J v.

    Here v = X^T (Y - p), with p and J taken from `auxiliary_tensors`.
    """
    residual = logistic_data.Y - auxiliary_tensors.p
    score = np.matmul(logistic_data.X.transpose(), residual)

    # Quadratic form v^T J v, evaluated left to right
    return np.matmul(
        np.matmul(score.transpose(), auxiliary_tensors.J),
        score
    )

def logistic_regression(logistic_data, stopping_value=1e-10, max_iter=100):
    """Fit a logistic regression with Newton's method.

    Starting from beta = 0, Newton steps are taken until half the squared
    Newton decrement is at most `stopping_value`.

    Parameters:
        logistic_data: Logistic_data holding the design matrix X and the
            response vector Y.
        stopping_value: tolerance on half the squared Newton decrement.
        max_iter: maximum number of Newton steps. The iteration is not
            damped, so it is not guaranteed to converge; this bound turns a
            potential endless loop into an explicit error.

    Returns:
        Logistic_result(beta, std_err), where std_err is the square root of
        the diagonal of the inverse Fisher information matrix at beta.

    Raises:
        RuntimeError: if the stopping criterion is still not met after
            `max_iter` Newton steps.
    """
    # Initialize beta
    (_, l) = logistic_data.X.shape # Note that l = k + 1 (because of the affine term)
    beta = np.zeros(l)
    auxiliary_tensors = compute_auxiliary_tensors(logistic_data, beta)

    iteration = 0
    while newton_decrement_square(logistic_data, auxiliary_tensors) > 2*stopping_value:
        if iteration >= max_iter:
            raise RuntimeError(
                "Newton's method did not converge in {} iterations".format(max_iter)
            )
        beta = newton_step(logistic_data, auxiliary_tensors)
        auxiliary_tensors = compute_auxiliary_tensors(logistic_data, beta)
        iteration += 1

    # Compute the standard error.
    # auxiliary_tensors was already evaluated at the final beta (before the
    # loop or at the end of its last pass), so it is not recomputed here.
    std_err = np.sqrt(np.diag(auxiliary_tensors.J))

    return Logistic_result(beta, std_err)

def report_results(logistic_data, logistic_result, alpha=0.05):
    """Print a table of the fitted coefficients with Normal confidence intervals.

    Parameters:
        logistic_data: Logistic_data; only its `covariates` field (the
            covariate names, any sequence) is read.
        logistic_result: Logistic_result whose beta and std_err hold one
            entry for the constant term followed by one per covariate.
        alpha: the intervals are beta_j +/- z * std_err_j, where z is the
            upper alpha/2 quantile of the standard Normal distribution.

    Returns:
        None; the table is printed.
    """
    # Compute the Normal adjustment to the standard error estimate
    # used to produce Normal confidence intervals
    # (this needs scipy.stats to be imported, not only scipy.special)
    z = scipy.stats.norm.isf(alpha/2)

    # list() lets `covariates` be a tuple or a pandas Index as well as a list
    covariates_list_local = ["Constant term"] + list(logistic_data.covariates)
    table = [
        [
            covariate,
            logistic_result.beta[j],
            logistic_result.std_err[j],
            logistic_result.beta[j] - z*logistic_result.std_err[j],
            logistic_result.beta[j] + z*logistic_result.std_err[j],
        ]
        for j, covariate in enumerate(covariates_list_local)
    ]

    print(tabulate(
        table,
        headers = ["Feature", "Beta_j", "Std. error", "Lower bound", "Upper bound"],
        floatfmt=".3" # Only print three significant digits
    ))

In [4]:
# Names of all the candidate covariates: every column of the data frame
# except the row identifiers and the response 'chd'
covariates_list = ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age']

# Start from the full model, which uses every covariate
current_covariates = covariates_list
X = augment_with_leading_ones(coris_df[current_covariates].to_numpy())
# The response vector does not depend on the selected covariates;
# it is reused as-is by the next cell
Y = coris_df['chd'].to_numpy()

# Fit the full model and print its coefficient table
data = Logistic_data(X, Y, current_covariates)
report_results(
    data,
    logistic_regression(data)
)

Feature           Beta_j    Std. error    Lower bound    Upper bound
-------------  ---------  ------------  -------------  -------------
Constant term  -6.15           1.31          -8.71          -3.59
sbp             0.0065         0.00573       -0.00473        0.0177
tobacco         0.0794         0.0266         0.0272         0.132
ldl             0.174          0.0597         0.057          0.291
adiposity       0.0186         0.0293        -0.0388         0.076
famhist         0.925          0.228          0.479          1.37
typea           0.0396         0.0123         0.0154         0.0637
obesity        -0.0629         0.0442        -0.15           0.0238
alcohol         0.000122       0.00448       -0.00867        0.00891
age             0.0452         0.0121         0.0215         0.069


In [5]:
# We remove all covariates
# whose parameter confidence interval contains 0
# (which means that we cannot reject the null
# that this parameter is zero with a Wald test)
#
# NOTE(review): this is a single round of selection based on Wald confidence
# intervals; the problem statement asks for backward stepwise selection
# based on AIC, which is not what is computed here -- confirm whether an
# AIC-based search should be added.

# The retained covariates are hard-coded from the table printed for the full model
current_covariates = ['tobacco', 'ldl', 'famhist', 'typea', 'age']
X = augment_with_leading_ones(coris_df[current_covariates].to_numpy())

# Y is the response vector defined in the previous cell
data = Logistic_data(X, Y, current_covariates)
report_results(
    data,
    logistic_regression(data)
)

Feature          Beta_j    Std. error    Lower bound    Upper bound
-------------  --------  ------------  -------------  -------------
Constant term   -6.45          0.921         -8.25          -4.64
tobacco          0.0804        0.0259         0.0297         0.131
ldl              0.162         0.055          0.0543         0.27
famhist          0.908         0.226          0.466          1.35
typea            0.0371        0.0122         0.0133         0.061
age              0.0505        0.0102         0.0305         0.0705
