In [1]:
import numpy as np
import pandas as pd
import scipy
from numpy.linalg import inv, det
from sklearn.linear_model import LogisticRegression
from functools import partial
from scipy.stats import norm

First let's prepare the data.

In [2]:
with open('spam.txt', 'r') as f:
    arr = []
    for line in f:
        row = line.split()
        row = list(map(lambda x: float(x), row))
        arr.append(row)

In [3]:
df = pd.DataFrame(arr)

In [4]:
df[57] = df[57].astype(int)

In [5]:
ds = df.to_numpy()

In [6]:
X = ds[:,:-1]

In [7]:
Y = ds[:,-1]

In [8]:
np.random.seed(42)

# Part (a)

## Part (i)

First we classify the data using LDA. The Python function below carries out LDA and returns the resulting classifier.

In [9]:
def LDA(X, Y):
    """
        Construct the LDA classifier defined by explanatory variable X
        and response variable Y and return it.
        Returns: h, a Python function taking a row with length X.shape[0]
        and returning a class 0 or 1.
    """
    
    # The probabilities of the two classes 0 and 1
    pi0 = (1-Y).mean()
    pi1 = Y.mean()
    
    # The rows in X corresponding to the classes Y == 0
    # and Y == 1, respectively
    X0 = X[Y == 0.]
    X1 = X[Y == 1.]
    
    # The number of examples in each class
    n0 = len(X0)
    n1 = len(X1)
    
    # The means of each feature in X taken over each
    # class Y == 0 and Y == 1
    mu0 = X0.mean(axis=0)
    mu1 = X1.mean(axis=0)
    
    # The matrices defining the quadratic forms for QDA
    mat0 = \
        np.array([(X0 - mu0)[i].reshape(-1,1) @ (X0 - mu0)[i].reshape(1,-1) for i in range(n0)])
    mat1 = \
        np.array([(X1 - mu1)[i].reshape(-1,1) @ (X1 - mu1)[i].reshape(1,-1) for i in range(n1)])
    S0 = mat0.mean(axis=0)
    S1 = mat1.mean(axis=0)

    # The averaged matrix S defining the quadratic form for LDA
    S = (n0 * S0 + n1 * S1) / (n0 + n1)
    
    # The two quadratic forms used in LDA classification
    delta0 = lambda x: x @ inv(S) @ mu0.T - 0.5 * mu0 @ inv(S) @ mu0.T + np.log(pi0)
    delta1 = lambda x: x @ inv(S) @ mu1.T - 0.5 * mu1 @ inv(S) @ mu1.T + np.log(pi1)
    
    # The LDA classifier
    h = lambda x: np.argmax([delta0(x), delta1(x)])
    
    return h

Below we construct an LDA classifier and use it to predict the response for $X$.

In [10]:
# The LDA classifier
h = LDA(X,Y)

In [11]:
# The predicted classes from applying h to X
Y_pred = np.array(list(map(h, X)))

Below we display the table with entries $2x2$ table specified in the problem statement and compute the overall misclassification rate.

In [12]:
def get_table(Y, Y_pred):
    """
        Compute a 2x2 matrix with (i,j) entry
        equal to the number of entries with
        Y == i and Y_pred == j.
    """
    
    m00 = ((Y == 0) & (Y_pred == 0)).sum()
    m01 = ((Y == 0) & (Y_pred == 1)).sum()
    m10 = ((Y == 1) & (Y_pred == 0)).sum()
    m11 = ((Y == 1) & (Y_pred == 1)).sum()
    
    return np.array([[m00, m01],[m10, m11]])

In [13]:
print(f'Table for LDA: \n{get_table(Y, Y_pred)}')

Table for LDA: 
[[2663  125]
 [ 387 1426]]


|&nbsp;|$\hat h(x) = 0$| $\hat h(x) = 1$|
| :--- | :--- | :--- |
|$Y=0$|2663|125|
|$Y=1$|387|1426|

In [14]:
((m00, m01), (m10,m11)) = get_table(Y, Y_pred)

In [15]:
mis = (m01 + m10) / (m00 + m01 + m10 + m11)
print(f'LDA misclassification rate: {mis:.5f}')

LDA misclassification rate: 0.11128


## Part (ii)

Now we classify the data using QDA. The Python function below carries out QDA and returns the resulting classifier.

In [16]:
def QDA(X, Y):
    """
        Construct the LDA classifier defined by explanatory variable X
        and response variable Y and return it.
        Returns: h, a Python function taking a row with length X.shape[0]
        and returning a class 0 or 1.
    """
    
    # The probabilities of the two classes 0 and 1
    pi0 = (1-Y).mean()
    pi1 = Y.mean()

    # The rows in X corresponding to the classes Y == 0
    # and Y == 1, respectively
    X0 = X[Y == 0.]
    X1 = X[Y == 1.]

    # The number of examples in each class
    n0 = len(X0)
    n1 = len(X1)
    
    # The means of each feature in X taken over each
    # class Y == 0 and Y == 1
    mu0 = X0.mean(axis=0)
    mu1 = X1.mean(axis=0)

    # The matrices defining the quadratic forms for QDA
    mat0 = \
        np.array([(X0 - mu0)[i].reshape(-1,1) @ (X0 - mu0)[i].reshape(1,-1) for i in range(n0)])
    mat1 = \
        np.array([(X1 - mu1)[i].reshape(-1,1) @ (X1 - mu1)[i].reshape(1,-1) for i in range(n1)])
    S0 = mat0.mean(axis=0)
    S1 = mat1.mean(axis=0)

    # The two quadratic forms used in QDA classification
    delta0 = lambda x: -0.5 * np.log(det(S0)) - 0.5 * (x - mu0) @ inv(S0) @ (x - mu0).T + np.log(pi0)
    delta1 = lambda x: -0.5 * np.log(det(S1)) - 0.5 * (x - mu1) @ inv(S1) @ (x - mu1).T + np.log(pi1)

    # The QDA classifier
    h = lambda x: np.argmax([delta0(x), delta1(x)])
    
    return h

Again we construct the QDA classifier and predict $Y$ using it.

In [17]:
# The QDA classifier
h = QDA(X,Y)

In [18]:
# The predicted Y values
Y_pred = np.array(list(map(h, X)))

In [19]:
print(f'Table for QDA: \n{get_table(Y, Y_pred)}')

Table for QDA: 
[[2101  687]
 [  82 1731]]


In [20]:
((m00, m01), (m10, m1)) = get_table(Y, Y_pred)

|&nbsp;|$\hat h(x) = 0$| $\hat h(x) = 1$|
| :--- | :--- | :--- |
|$Y=0$|2101|687|
|$Y=1$|82|1731|

In [21]:
mis = (m01 + m10) / (m00 + m01 + m10 + m11)
print(f'QDA misclassification rate: {mis:.5f}')

QDA misclassification rate: 0.17900


## Part (iii)

Now we try logistic regression, using scikit-learn's built-in models.

In [22]:
# A logistic regression model
clf = LogisticRegression(penalty=None, fit_intercept=True,
                         random_state=42, max_iter=10000)

In [23]:
# Fit the model to the data
clf.fit(X, Y)

In [24]:
# The Y values
Y_pred = clf.predict(X)

In [25]:
print(f'Table for logistic regression: \n{get_table(Y, Y_pred)}')

Table for logistic regression: 
[[2665  123]
 [ 194 1619]]


|&nbsp;|$\hat h(x) = 0$| $\hat h(x) = 1$|
| :--- | :--- | :--- |
|$Y=0$|2665|123|
|$Y=1$|194|1619|

In [26]:
((m00, m01), (m10, m11)) = get_table(Y, Y_pred)

In [27]:
mis = (m01 + m10) / (m00 + m01 + m10 + m11)
print(f'Logistic regression misclassification rate: {mis:.5f}')

Logistic regression misclassification rate: 0.06890


## Part (iv)

Below we train a decision tree on the pair $(X,Y)$. We'll do this from scratch which necessitates defining some helper functions and a class to handle the classification. Additionally, we allow weights on the different examples in the dataset, in order to handle the general setting of AdaBoost later on.

In [28]:
def proportion_with_weights(Y, weights, indices):
    """
        Compute the (weighted) proportion of classes
        from the rows specified by indices in the dataset (X,Y)
        Y: classes for all rows of the dataset
        weights: an array with shape equal to Y.shape
        indices: a subset of the indices from 0 to Y.shape[0]
        Returns:
            p0: proportion in class 0
            p1: proportion in class 1
    """
    p0 = np.dot(Y[indices] == 0, weights[indices]) / len(Y[indices])
    p1 = np.dot(Y[indices] == 1, weights[indices]) / len(Y[indices])
    
    return p0, p1

In [29]:
def split_to_indices(X, Y, indices, col, t):
    """
        Calculate new indices obtained by splitting on the column col
        according to col <= t or col > t. The new indices are obtained
        by restricting (X,Y) to the rows specified by indices.
        X, Y: arrays with X.shape[0] = Y.shape[0]
        indices: a subset of the indices from 0 to Y.shape[0]
        col: the column specifying a feature to split on
        t: the value defining the split according to col <= or col > t
        Returns:
            new_indices: a pair of subsets of indices which
            partition indices when taken together
    """
    new_indices = (np.intersect1d(indices, np.where(X[:, col] <= t)),
            np.intersect1d(indices, np.where(X[:, col] > t)))
    return new_indices

In [30]:
def calc_impurity_with_weights(X, Y, weights, indices, col, t):
    """
        Calculate the Gini impurity (with weights) after splitting (X,Y)
        on the column col according to col <= t or col > t.
        X, Y: arrays with X.shape[0] = Y.shape[0]
        indices: a subset of the indices from 0 to Y.shape[0]
        weights: an array with weights.shape = Y.shape
        col: the column specifying a feature to split on
        t: the value defining the split according to col <= or col > t
    """
    
    new_indices = split_to_indices(X, Y, indices, col, t)
    
    gammas = []
    
    for new_ind in new_indices:
        p0, p1 = proportion_with_weights(Y, weights, new_ind)
        gamma = 1 - p0**2 - p1**2
        gammas.append(gamma)
    
    impurity = np.array(gammas).sum()
    
    return impurity

In [31]:
def generate_split_with_weights(X, Y, indices, weights, n_0=500, frac=4, num_ts=20):
    """
        Calculate a split to minimize the gini impurity.
        X, Y: arrays with X.shape[0] = Y.shape[0]
        indices: a subset of the indices from 0 to Y.shape[0]
        weights: an array with weights.shape = Y.shape
        n_0: do not compute a split if there are fewer than this
            number of indices in indices
        frac: only create a split in which both subsets of the
            partition of indices contain at least n_0 / frac elements
        num_ts: the number of values of t to test as in col <= t
            and col < t
        Returns:
            best_split: a partition of indices according to the best
            split
    """
    
    if len(indices) <= n_0:
        return (None, None)
    else:
        best_split = (None, None)
        min_impurity = float('inf')

        k = X.shape[1]
        for col in range(k):
            low = X[indices, col].min()
            high = X[indices, col].max()
            ts = np.linspace(low, high, num_ts)

            for t in ts:
                impurity = calc_impurity_with_weights(X, Y, weights, indices, col, t)
                new_indices = split_to_indices(X, Y, indices, col, t)
                if len(new_indices[0]) >= n_0 / frac and len(new_indices[1]) >= n_0 / frac:
                    if impurity < min_impurity:
                        min_impurity = impurity
                        best_split = (col, t)

        return best_split

In [32]:
class Node:
    """
        Represents a node of a decision tree.
    """
    
    def __init__(self, X, Y, indices, col=None, t=None, lchild=None, rchild=None):
        """
            (X, Y): the dataset to train on
            indices: the indices corresponding to the split
                represented by this node
            col: a column to split on, if the node has children
            t: the value for the split according to col <= t or col > t
            lchild, rchild: nodes representing the left and right child
        """
        self.X = X
        self.Y = Y
        self.avg = Y[indices].mean()
        self.indices = indices
        self.col = col
        self.t = t
        self.lchild = lchild
        self.rchild = rchild
        
        # Whether this node is a leaf or not
        self.is_leaf = True
    
    def split(self, weights, n_0, frac, num_ts):
        """
            Split the node using generate_split_with_weights and assign
            its children, column, and t
        """
        if self.is_leaf:
            (col, t) = generate_split_with_weights(self.X, self.Y, self.indices, weights, n_0, frac, num_ts)
            if col is not None:
                new_indices = split_to_indices(self.X, self.Y, self.indices, col, t)
                self.col = col
                self.t = t
                self.lchild = Node(self.X, self.Y, new_indices[0])
                self.rchild = Node(self.X, self.Y, new_indices[1])
            self.is_leaf = False

In [33]:
class DecisionTree:
    """
        Represents a decision tree.
    """
    
    def __init__(self, X, Y, weights=np.ones(len(Y)) / len(Y), n_0=500, frac=4, num_ts=20):
        """
            Initialize the tree with a single node.
            (X, Y): the dataset to train on
            weights: an array with weights.shape=Y.shape representing
                weights for the different examples of the dataset
            n_0: an integer such that nodes of size <= n_0 are not split
                further
            frac: an integer such that each node has size >= n_0/frac
            num_ts: the number of values of t to test for each column
        """
        self.X = X
        self.Y = Y
        self.weights = weights      
        self.n_0 = n_0
        self.frac = frac
        self.num_ts = num_ts
        
        # Initialize the tree with a single node
        # representing all indices of the dataset.
        # The children of node i in self.tree
        # lie at indices 2*i+1 and 2*i+2.
        self.root = Node(X, Y, list(range(len(X))))
        self.tree = [self.root]
        
        self.num_leaves = 1
    
    def split_leaves(self):
        """
            Split each leaf node.
            Returns:
                add_new_nodes: a Boolean saying whether any new
                nodes were created by splitting or not
        """
        
        # An array to hold the new children of the leaves
        children = []
        
        # Split each node. If the node has been split already
        # it won't be split again.
        for node in self.tree[-self.num_leaves:]:
            
            # Add the children created by the split for each leaf node.
            # If the node is None then its children will be None.
            if node is not None:
                node.split(self.weights, self.n_0, self.frac, self.num_ts)
                children += [node.lchild, node.rchild]
        
        # A boolean recording whether any new children have been created
        add_new_nodes = any(children)
        
        # If there are new children, add them to the tree.
        if add_new_nodes:
            self.tree += children
            self.num_leaves = len(children)
        
        return add_new_nodes
        
    def train(self):
        """
            Split all leaf nodes until none can be split further
            according to our end condition.
        """
        continue_splitting = True
        
        while continue_splitting:
            continue_splitting = self.split_leaves()
    
    def predict_flt_one(self, x):
        """
            Predict a float for the array x of shape
            X.shape[1]. This float will be the average
            of the classes for the node in which x lies
            in the decision tree.
        """
        node = self.root
        
        
        # Traverse the decision tree until coming to the
        # node which contains x.
        while node.col:
            if x[node.col] > node.t:
                node = node.rchild
            else:
                node = node.lchild
        
        return node.avg
    
    def predict_one(self, x):
        """
            Predict a class for the array x of
            shape X.shape[1].
        """
        return np.sign(self.predict_flt_one(x))
    
    def predict_flt(self, x):
        """
            Predict floats for the array x with
            x.shape[1] = X.shape[1].
        """
        return np.array(list(map(self.predict_flt_one, x)))
    
    def predict(self, x):
        """
            Predict classes for the array x with
            x.shape[1] = X.shape[1].
        """
        return np.array(list(map(self.predict_one, x)))    

In [34]:
clf = DecisionTree(X, Y, n_0=20, num_ts=20)

In [35]:
clf.train()

  p0 = np.dot(Y[indices] == 0, weights[indices]) / len(Y[indices])
  p1 = np.dot(Y[indices] == 1, weights[indices]) / len(Y[indices])


In [36]:
Y_pred = clf.predict(X)

In [37]:
print(f'Table for decision tree: \n{get_table(Y, Y_pred)}')

Table for decision tree: 
[[1672 1116]
 [   0 1813]]


|&nbsp;|$\hat h(x) = 0$| $\hat h(x) = 1$|
| :--- | :--- | :--- |
|$Y=0$|1672|1116|
|$Y=1$|0|1813|

In [38]:
((m00, m01), (m10, m11)) = get_table(Y, Y_pred)

In [39]:
mis = (m01 + m10) / (m00 + m01 + m10 + m11)
print(f'Tree misclassification rate: {mis:.5f}')

Tree misclassification rate: 0.24256


# Part (b)

Below we create a couple of functions to create the chunks for cross-validation and ultimately perform cross-validation.

In [40]:
def create_random_splits(X, Y, num_ch):
    """
        Split the dataset (X, Y) into num_ch random chunks
        of equal size (except for possibly the last chunk).
        (X, Y): the dataset
        num_ch: the number of chunks
    """
    
    # Randomly permute (X, Y)
    n = len(X)
    indices = np.random.permutation(n)
    X_perm = X[indices]
    Y_perm = Y[indices]
    
    k = n // num_ch
    
    # Arrays to hold the chunks
    Xs = []
    Ys = []
    
    # Append one chunk at a time
    for i in range(num_ch):
        if i < num_ch-1:
            Xs.append(X_perm[i*k:i*k + k, :])
            Ys.append(Y_perm[i*k:i*k + k])
        else:
            Xs.append(X_perm[i*k:, :])
            Ys.append(Y_perm[i*k:])
    
    return Xs, Ys

In [41]:
def cross_validation(X, Y, num_ch, model_name):
    """
        Perform n-fold cross-validation where
        (X, Y): the dataset
        num_ch: the number of folds
        model_name: either 'LDA' or 'log_reg'
        Returns:
            errors: a list of errors for each fold
    """
    errors = []
    Xs, Ys = create_random_splits(X, Y, num_ch)
    
    # Train the model for each fold and compute
    # the error on the held out test set.
    for i in range(num_ch):
        
        # (X_left_over, Y_left_over) is the training dataset
        # for this fold.
        # (X_ch, Y_ch) is the held out test set.
        X_left_over = np.concatenate(Xs[:i] + Xs[i+1:], axis=0)
        Y_left_over = np.concatenate(Ys[:i] + Ys[i+1:], axis=0)
        X_ch = Xs[i]
        Y_ch = Ys[i]
        
        if model_name == 'LDA':
            h = LDA(X_left_over, Y_left_over)
            Y_pred = np.array(list(map(h, X_ch))) 
            ((m00, m01), (m10, m11)) = get_table(Y_ch, Y_pred)
            misc = (m01 + m10) / (m00 + m01 + m10 + m11)
            errors.append(misc)
            
        elif model_name == 'log_reg':
            clf = LogisticRegression(penalty=None, fit_intercept=True,
                         random_state=42, max_iter=10000)
            clf.fit(X_left_over, Y_left_over)
            Y_pred = clf.predict(X_ch)
            ((m00, m01), (m10, m11)) = get_table(Y_ch, Y_pred)
            misc = (m01 + m10) / (m00 + m01 + m10 + m11)
            errors.append(misc)
    
    return errors

We report the CV errors below.

In [42]:
LDA_errors = cross_validation(X, Y, 5, 'LDA')

In [43]:
print(f'LDA errors under 5-fold cross validation: \n {LDA_errors}')

LDA errors under 5-fold cross validation: 
 [0.11847826086956521, 0.10652173913043478, 0.11956521739130435, 0.10652173913043478, 0.11183496199782844]


In [44]:
print(f'LDA cross validation error: {np.mean(LDA_errors):.5f}')

LDA cross validation error: 0.11258


In [45]:
log_errors = cross_validation(X, Y, 5, 'log_reg')

In [46]:
print(f'Logistic regression errors under 5-fold cross validation: \n {log_errors}')

Logistic regression errors under 5-fold cross validation: 
 [0.06956521739130435, 0.07065217391304347, 0.09347826086956522, 0.07282608695652174, 0.05537459283387622]


In [47]:
print(f'Logistic regression cross validation error: {np.mean(log_errors):.5f}')

Logistic regression cross validation error: 0.07238


It appears that logistic regression generalizes better.

# Part (c)

Now we compare means across the different variables for the two different groups.

In [48]:
N = X.shape[1]

In [49]:
X_spam = X[Y == 1.]

In [50]:
X_norm = X[Y == 0.]

The cell below reports p-values for the Wald test on difference of means between the spam and normal emails.

In [51]:
pvals = []
m = len(X_spam)
n = len(X_norm)

for i in range(N):
    mu1 = X_spam[:, i].mean()
    mu2 = X_norm[:, i].mean()
    
    s1 = X_spam[:, i].var()
    s2 = X_norm[:, i].var()
    se = np.sqrt((s1**2/m) + (s2**2/n))
    
    stat = mu1 - mu2
    Wald = stat / se
    
    dist = norm()
    pval = 2*dist.cdf(-np.abs(Wald))
    
    pvals.append(pval)
    
    print(f"Covariate {i}\nobserved mean for spam emails: {mu1:.4f}\n"
          f"Observed mean for normal emails: {mu2:.4f}\n"
          f"Estimated standard error: {se:.4f}\n"
          f"Wald statistic: {Wald:.4f}\n"
          f"Estimated p-value: {pval}\n")

Covariate 0
observed mean for spam emails: 0.1523
Observed mean for normal emails: 0.0735
Estimated standard error: 0.0028
Wald statistic: 27.9665
Estimated p-value: 4.1564769180713155e-172

Covariate 1
observed mean for spam emails: 0.1646
Observed mean for normal emails: 0.2445
Estimated standard error: 0.0506
Wald statistic: -1.5780
Estimated p-value: 0.1145663100599748

Covariate 2
observed mean for spam emails: 0.4038
Observed mean for normal emails: 0.2006
Estimated standard error: 0.0072
Wald statistic: 28.0833
Estimated p-value: 1.565013197321854e-173

Covariate 3
observed mean for spam emails: 0.1647
Observed mean for normal emails: 0.0009
Estimated standard error: 0.1156
Wald statistic: 1.4170
Estimated p-value: 0.15648638835252

Covariate 4
observed mean for spam emails: 0.5140
Observed mean for normal emails: 0.1810
Estimated standard error: 0.0137
Wald statistic: 24.2208
Estimated p-value: 1.341904234721225e-129

Covariate 5
observed mean for spam emails: 0.1749
Observed m

In [52]:
# The indices ranked according to p-value,
# from smallest p-value to largest
indices = np.argsort(pvals)

In [53]:
# The 10 covariates with smallest p-value
covars_to_keep = indices[:10]

In [54]:
# A reduced version of X with all covariates
# dropped except for the 10 with smallest p-value
X_red = X[:, covars_to_keep]

Now we compute the predictions for LDA.

In [55]:
h = LDA(X_red,Y)
Y_pred = np.array(list(map(h, X_red)))
((m00, m01), (m10, m11)) = get_table(Y, Y_pred)

In [56]:
get_table(Y, Y_pred)

array([[2699,   89],
       [ 802, 1011]])

|&nbsp;|$\hat h(x) = 0$| $\hat h(x) = 1$|
| :--- | :--- | :--- |
|$Y=0$|2699|89|
|$Y=1$|802|1011|

In [57]:
mis = (m01 + m10) / (m00 + m01 + m10 + m11)
print(f'LDA misclassification rate: {mis:.5f}')

LDA misclassification rate: 0.19365


And finally we compute the predictions for logistic regression.

In [58]:
clf = LogisticRegression(penalty=None, fit_intercept=True,
                         random_state=42, max_iter=10000)
clf.fit(X_red, Y)
Y_pred = clf.predict(X_red)
((m00, m01), (m10, m11)) = get_table(Y, Y_pred)

In [59]:
get_table(Y, Y_pred)

array([[2669,  119],
       [ 560, 1253]])

|&nbsp;|$\hat h(x) = 0$| $\hat h(x) = 1$|
| :--- | :--- | :--- |
|$Y=0$|2669|119|
|$Y=1$|560|1253|

In [60]:
mis = (m01 + m10) / (m00 + m01 + m10 + m11)
print(f'Logistic regression misclassification rate: {mis:.5f}')

Logistic regression misclassification rate: 0.14758
