In [25]:
import pandas as pd
import numpy as np
import re
from numpy.linalg import inv
from sklearn.metrics import accuracy_score

First let's process the data into a dataframe.

In [26]:
# An array to store the rows of data.
rows = []

In [27]:
with open('coris.dat') as f:
    for line in list(f)[4:]:
        row = re.split(',| |\n', line)
        row = list(filter(lambda x: x != '', row))
        row = row[1:]
        rows.append(row)

In [28]:
with open('coris.dat') as f:
    # An array for the column names of the dataframe.
    col_names = list(f)[0].split(',')[1:]

In [29]:
col_names[-1] = 'chd'

In [30]:
df = pd.DataFrame.from_records(rows)

In [31]:
df.columns = col_names

In [32]:
# Process some strings into integers.
for col in ['sbp', 'famhist', 'typea', 'age', 'chd']:
    df[col] = df[col].astype('int')

In [33]:
# Process other strings into floats.
for col in ['tobacco', 'ldl', 'adiposity', 'obesity', 'alcohol']:
    df[col] = df[col].astype('float')

In [34]:
df

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.00,5.73,23.11,1,49,25.30,97.20,52,1
1,144,0.01,4.41,28.61,0,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,1,52,29.14,3.81,46,0
3,170,7.50,6.41,38.03,1,51,31.99,24.26,58,1
4,134,13.60,3.50,27.78,1,60,25.99,57.34,49,1
...,...,...,...,...,...,...,...,...,...,...
457,214,0.40,5.98,31.72,0,64,28.45,0.00,58,0
458,182,4.20,4.41,32.10,0,52,28.61,18.72,52,1
459,108,3.00,1.59,15.23,0,40,20.09,26.64,55,0
460,118,5.40,11.61,30.79,0,64,27.35,23.97,40,0


Now let's carry out the backwards stepwise selection.

In [35]:
# The variables to use in the logistic regression.
variables = ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age']

In [36]:
# The variables encounted so far with the lowest AIC encountered so far during backward stepwise selection.
cur_best_vars = variables[:]

In [37]:
logistic = lambda x: 1 / (1 + np.exp(-x))

In [38]:
logit = lambda x: np.log(x / (1 - x))

In [39]:
Y = df['chd'].to_numpy()

This cell performs the first round of stepwise selection, computing the AIC with all the variables used in the model.

In [40]:
base_vars = cur_best_vars[:]
X_base = df[cur_best_vars].to_numpy()
X_base = np.concatenate((np.ones((len(X_base), 1)), X_base), axis=1)
beta_base = np.zeros(shape=(X_base.shape[1],1))

# Perform 1000 steps of the reweighted least squares algorithm.
for i in range(1000):
    p = logistic(X_base @ beta_base)
    p = p.flatten()
    Z = logit(p) + (Y - p)/(p*(1 - p))
    likelihood = np.prod(p**Y * (1 - p)**(1 - Y))
    print(f'Iteration {i} - likelihood: {likelihood}')
    W = np.diag((p * (1 - p)))
    beta_base = inv(X_base.T @ W @ X_base) @ X_base.T @ W @ Z

likelihood = np.prod(p**Y * (1 - p)**(1 - Y))
log_lik = np.log(likelihood)
# These variables will store the best AIC computed over all
# rounds of stepwise selection together with the corresponding vector beta.
best_aic = log_lik - (len(beta_base) - 1)
best_beta = beta_base

Iteration 0 - likelihood: 8.397345134458861e-140
Iteration 1 - likelihood: 1.097513659608382e-105
Iteration 2 - likelihood: 2.426120859398368e-103
Iteration 3 - likelihood: 2.991430103915645e-103
Iteration 4 - likelihood: 2.9929166762294787e-103
Iteration 5 - likelihood: 2.9929166858183677e-103
Iteration 6 - likelihood: 2.99291668581836e-103
Iteration 7 - likelihood: 2.9929166858183657e-103
Iteration 8 - likelihood: 2.992916685818393e-103
Iteration 9 - likelihood: 2.9929166858183707e-103
Iteration 10 - likelihood: 2.9929166858183925e-103
Iteration 11 - likelihood: 2.992916685818371e-103
Iteration 12 - likelihood: 2.992916685818362e-103
Iteration 13 - likelihood: 2.9929166858184024e-103
Iteration 14 - likelihood: 2.9929166858183603e-103
Iteration 15 - likelihood: 2.9929166858183613e-103
Iteration 16 - likelihood: 2.992916685818382e-103
Iteration 17 - likelihood: 2.992916685818381e-103
Iteration 18 - likelihood: 2.9929166858183484e-103
Iteration 19 - likelihood: 2.9929166858183707e-103
I

Iteration 352 - likelihood: 2.99291668581836e-103
Iteration 353 - likelihood: 2.992916685818374e-103
Iteration 354 - likelihood: 2.9929166858183573e-103
Iteration 355 - likelihood: 2.992916685818381e-103
Iteration 356 - likelihood: 2.9929166858183796e-103
Iteration 357 - likelihood: 2.99291668581837e-103
Iteration 358 - likelihood: 2.9929166858183652e-103
Iteration 359 - likelihood: 2.9929166858183945e-103
Iteration 360 - likelihood: 2.992916685818374e-103
Iteration 361 - likelihood: 2.9929166858183925e-103
Iteration 362 - likelihood: 2.992916685818374e-103
Iteration 363 - likelihood: 2.9929166858183836e-103
Iteration 364 - likelihood: 2.9929166858183667e-103
Iteration 365 - likelihood: 2.9929166858183553e-103
Iteration 366 - likelihood: 2.9929166858183806e-103
Iteration 367 - likelihood: 2.9929166858183534e-103
Iteration 368 - likelihood: 2.992916685818363e-103
Iteration 369 - likelihood: 2.9929166858183885e-103
Iteration 370 - likelihood: 2.9929166858183866e-103
Iteration 371 - likel

Iteration 732 - likelihood: 2.992916685818366e-103
Iteration 733 - likelihood: 2.992916685818359e-103
Iteration 734 - likelihood: 2.9929166858183955e-103
Iteration 735 - likelihood: 2.9929166858183697e-103
Iteration 736 - likelihood: 2.992916685818377e-103
Iteration 737 - likelihood: 2.992916685818382e-103
Iteration 738 - likelihood: 2.9929166858183543e-103
Iteration 739 - likelihood: 2.992916685818383e-103
Iteration 740 - likelihood: 2.992916685818371e-103
Iteration 741 - likelihood: 2.9929166858184183e-103
Iteration 742 - likelihood: 2.992916685818401e-103
Iteration 743 - likelihood: 2.9929166858183925e-103
Iteration 744 - likelihood: 2.9929166858183707e-103
Iteration 745 - likelihood: 2.9929166858183836e-103
Iteration 746 - likelihood: 2.992916685818391e-103
Iteration 747 - likelihood: 2.9929166858183563e-103
Iteration 748 - likelihood: 2.992916685818379e-103
Iteration 749 - likelihood: 2.9929166858183876e-103
Iteration 750 - likelihood: 2.9929166858183895e-103
Iteration 751 - likel

In [41]:
print('Round 0')
print('-----')
print(f'{cur_best_vars} - AIC: {best_aic:.4f}')
print('')

Round 0
-----
['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age'] - AIC: -245.0700



The following cell performs the remaining steps of backwards stepwise selection.

In [42]:
for i in range(len(variables)-1):
    print(f'Round {i+1}')
    print('-----')
    
    # The best AIC, variables, and beta for the current round
    # of stepwise selection.
    cur_best_aic = -float('inf')
    cur_best_var = None
    cur_best_beta = None
    
    for var in variables:
        the_vars = cur_best_vars[:]
        the_vars.remove(var)
        X = df[the_vars].to_numpy()
        X = np.concatenate((np.ones((len(X), 1)), X), axis=1)
        beta = np.zeros(shape=(X.shape[1],1))
        
        for i in range(1000):
            p = logistic(X @ beta)
            p = p.flatten()
            Z = logit(p) + (Y - p)/(p*(1 - p))
            likelihood = np.prod(p**Y * (1 - p)**(1 - Y))
            W = np.diag((p * (1 - p)))
            beta = inv(X.T @ W @ X) @ X.T @ W @ Z
        
        likelihood = np.prod(p**Y * (1 - p)**(1 - Y))
        log_lik = np.log(likelihood)
        aic = log_lik - (len(beta) - 1)
        
        print(f'{the_vars} - AIC: {aic:.4f}'.format(the_vars, beta, aic))
        if aic > cur_best_aic:
            cur_best_aic = aic
            cur_best_var = var
            cur_best_beta = beta
    
    if cur_best_aic > best_aic:
        cur_best_vars.remove(cur_best_var)
        best_aic = cur_best_aic
        best_beta = cur_best_beta
        best_vars = cur_best_vars[:]
        
        variables.remove(cur_best_var)
        print('')
    else:
        break

Round 1
-----
['tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age'] - AIC: -244.7186
['sbp', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age'] - AIC: -248.8372
['sbp', 'tobacco', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age'] - AIC: -248.5351
['sbp', 'tobacco', 'ldl', 'famhist', 'typea', 'obesity', 'alcohol', 'age'] - AIC: -244.2725
['sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol', 'age'] - AIC: -252.4425
['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'obesity', 'alcohol', 'age'] - AIC: -249.5233
['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'alcohol', 'age'] - AIC: -245.1166
['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'age'] - AIC: -244.0704
['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol'] - AIC: -251.2642

Round 2
-----
['tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'age'] - AIC: -243.7325
['sbp', 'ldl', 'adiposity'

So there were four rounds of backwards selection, and AIC failed to improve after on the fifth round.

In [43]:
X_best = df[best_vars].to_numpy()
X_best = np.concatenate((np.ones((len(X_best), 1)), X), axis=1)

In [44]:
Y_hat = logistic(X_best @ best_beta) > 0.5

In [45]:
print(f'Best set of variables found for AIC: {best_vars}')
print(f'Corresponding coefficients for log. reg.: {best_beta}')
print(f'Corresponding accuracy score: {accuracy_score(Y_hat, Y):.4f}')

Best set of variables found for AIC: ['tobacco', 'ldl', 'famhist', 'typea', 'age']
Corresponding coefficients for log. reg.: [-6.44644451  0.08037533  0.16199164  0.90817526  0.03711521  0.05046038]
Corresponding accuracy score: 0.5779


In [46]:
Y_base = logistic(X_base @ beta_base) > 0.5

In [47]:
print(f'Baseline set of variables: {base_vars}')
print(f'Corresponding coefficients for log. reg.: {beta_base}')
print(f'Corresponding accuracy score: {accuracy_score(Y_base, Y):.4f}')

Baseline set of variables: ['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity', 'alcohol', 'age']
Corresponding coefficients for log. reg.: [-6.15072086e+00  6.50401713e-03  7.93764457e-02  1.73923898e-01
  1.85865682e-02  9.25370419e-01  3.95950250e-02 -6.29098693e-02
  1.21662401e-04  4.52253496e-02]
Corresponding accuracy score: 0.7338


So we see that the best variables identified for AIC using backward stepwise selection are tobacco, ldl, famhist, typea, and age. The coefficients for these variables are fairly close to the coefficients of the same variables in the baseline model that uses all the variables. On the other hand, the accuracy of the model using only these variables is quite a bit less than the accuracy using all the variables. So one would hope that the model identified by the stepwise selection generalizes better.