In [7]:
import math
a = 0.00137714

In [8]:
math.log(a)

-6.587746394105945

In [32]:
import numpy as np
training_spam = np.loadtxt(open("training_spam.csv"), delimiter=",")
testing_spam = np.loadtxt(open("testing_spam.csv"), delimiter=",")


In [25]:
# This skeleton code simply classifies every input as ham
#
# Here you can see there is a parameter k that is unused, the
# point is to show you how you could set up your own. You might
# also pass in extra data via a train method (also does nothing
# here). Modify this code as much as you like so long as the 
# accuracy test in the cell below runs.

class SpamClassifier:
    def train(self):
        training_labels = training_spam[:,0]
        training_features = training_spam[:, 1:]
        
        # estimating log class priors from the training data
        spam_proportion = sum(training_labels) / len(training_labels)
        self.log_class_priors = np.array([np.log(1 - spam_proportion), np.log(spam_proportion)])
        
        # separating the ham from the spam
        hams_indices = np.where(training_labels == 0)
        hams = training_features[hams_indices] 
    
        spams_indices = np.where(training_labels == 1)
        spams = training_features[spams_indices]
        
        # estimating log class conditional likelihoods
        alpha = 1
        k = training_features.shape[1] # total number of keywords

        spam_featurewise_totals = np.sum(spams, axis = 0) # an array showing the total for each keyword for spam with shape = (1, n_features) 
        spam_total_features = sum(spam_featurewise_totals) # total number of all features
        spam_theta = (spam_featurewise_totals + alpha) / (spam_total_features + (k * alpha)) # relative spam feature frequencies calculated with laplace smoothing

        ham_featurewise_totals = np.sum(hams, axis = 0)
        ham_total_features = sum(ham_featurewise_totals)
        ham_theta = (ham_featurewise_totals + alpha) / (ham_total_features + (k * alpha))
        
        self.log_class_conditionals = np.array([np.log(ham_theta), np.log(spam_theta)])
        
    def predict(self, data):
        rows, cols = data.shape
    
        class_predictions = np.zeros(rows)

        for i in range(rows):
            # initialising log posterior probabilities
            ham_email = 0
            spam_email = 0
            for j in range(cols):
                # for each feature, if it is in the email add the respective log probabilities for spam and ham 
                feature_response = data[i,j]

                ham_class_likelihood = self.log_class_conditionals[0,j]
                spam_class_likelihood = self.log_class_conditionals[1,j]

                ham_email += (feature_response * ham_class_likelihood)
                spam_email += (feature_response * spam_class_likelihood)
                
            # finally add the log_class priors to each, then compare whether spam or ham is more likely
            ham_email += self.log_class_priors[0]
            spam_email += self.log_class_priors[1]

            if spam_email > ham_email: # if spam more likely than ham, classify as 1, otherwise leave as 0 for ham
                class_predictions[i] = 1

        return class_predictions
    

def create_classifier():
    classifier = SpamClassifier()
    classifier.train()
    return classifier

classifier = create_classifier()

In [52]:
class SpamClassifier:
    def __init__(self, alpha = 1):
        # separating the ham from the spam
        hams_indices = np.where(training_spam[:,0] == 0)
        self.hams = training_spam[:,1:][hams_indices] 
    
        spams_indices = np.where(training_spam[:,0] == 1)
        self.spams = training_spam[:,1:][spams_indices]
        
        # for laplace smoothing:
        self.alpha = alpha
        self.k = training_spam.shape[1] - 1 # number of keyword columns
        
        
    def train(self):
        # estimating log class priors from the training data
        spam_proportion = self.spams.shape[0] / training_spam.shape[0]
        self.log_class_priors = np.array([np.log(1 - spam_proportion), np.log(spam_proportion)])
        
        # estimating log class conditional likelihoods
        spam_featurewise_totals = np.sum(self.spams, axis = 0) # an array showing the total for each keyword for spam with shape = (1, n_features) 
        spam_total_features = sum(spam_featurewise_totals) # total number of all features
        spam_theta = (spam_featurewise_totals + self.alpha) / (spam_total_features + (self.k * self.alpha)) # relative spam feature frequencies calculated with laplace smoothing

        ham_featurewise_totals = np.sum(self.hams, axis = 0)
        ham_total_features = sum(ham_featurewise_totals)
        ham_theta = (ham_featurewise_totals + self.alpha) / (ham_total_features + (self.k * self.alpha))
        
        self.log_class_conditionals = np.array([np.log(ham_theta), np.log(spam_theta)])
        
    def predict(self, data):
        rows, cols = data.shape
        class_predictions = np.zeros(rows)

        for i in range(rows):
            # initialising log posterior probabilities for both classes
            ham_email = 0
            spam_email = 0
            for j in range(cols):
                # for each feature, if it is in the email add the respective log probabilities for spam and ham 
                feature_response = data[i,j]

                ham_class_likelihood = self.log_class_conditionals[0,j]
                spam_class_likelihood = self.log_class_conditionals[1,j]

                ham_email += (feature_response * ham_class_likelihood)
                spam_email += (feature_response * spam_class_likelihood)
                
            # finally add the log_class priors to each, then compare whether spam or ham is more likely
            ham_email += self.log_class_priors[0]
            spam_email += self.log_class_priors[1]

            if spam_email > ham_email: # if spam more likely than ham, classify as 1, otherwise leave as 0 for ham
                class_predictions[i] = 1

        return class_predictions
    

def create_classifier():
    classifier = SpamClassifier()
    classifier.train()
    return classifier

classifier = create_classifier()

In [53]:
# You can use this cell to check whether the returned objects of your function are of the right data type.

training_predictions = classifier.predict(training_spam[:, 1:])
testing_predictions = classifier.predict(testing_spam[:, 1:])

# Check data type(s)
assert(isinstance(class_predictions, np.ndarray))

# Check shape of numpy array
assert(class_predictions.shape == (1000,))

# Check data type of array elements
assert(np.all(np.logical_or(class_predictions == 0, class_predictions == 1)))

# Check accuracy

training_true_classes = training_spam[:, 0]
training_set_accuracy = np.mean(np.equal(training_predictions, training_true_classes))

testing_true_classes = testing_spam[:, 0]
testing_set_accuracy = np.mean(np.equal(testing_predictions, testing_true_classes))

print(f"Accuracy on the training set: {training_set_accuracy}")
print(f"Accuracy on the testing set: {testing_set_accuracy}")

Accuracy on the training set: 0.892
Accuracy on the testing set: 0.898


In [57]:
training_accuracy = {}
testing_accuracy = {}

for i in range(100):
    classifier = SpamClassifier(alpha = i) 
    classifier.train()
    training_predictions = classifier.predict(training_spam[:, 1:])
    testing_predictions = classifier.predict(testing_spam[:, 1:])

    training_true_classes = training_spam[:, 0]
    training_set_accuracy = np.mean(np.equal(training_predictions, training_true_classes))

    testing_true_classes = testing_spam[:, 0]
    testing_set_accuracy = np.mean(np.equal(testing_predictions, testing_true_classes))
    
    training_accuracy[i] = training_set_accuracy
    testing_accuracy[i] = testing_set_accuracy

print(f"Accuracy on the training set: {training_accuracy}\n")
print(f"Accuracy on the testing set: {testing_accuracy}")
    


  self.log_class_conditionals = np.array([np.log(ham_theta), np.log(spam_theta)])
  spam_email += (feature_response * spam_class_likelihood)


Accuracy on the training set: {0: 0.613, 1: 0.892, 2: 0.893, 3: 0.891, 4: 0.889, 5: 0.889, 6: 0.889, 7: 0.887, 8: 0.888, 9: 0.888, 10: 0.888, 11: 0.888, 12: 0.888, 13: 0.888, 14: 0.888, 15: 0.887, 16: 0.888, 17: 0.888, 18: 0.888, 19: 0.887, 20: 0.887, 21: 0.89, 22: 0.89, 23: 0.89, 24: 0.89, 25: 0.89, 26: 0.889, 27: 0.889, 28: 0.888, 29: 0.888, 30: 0.888, 31: 0.888, 32: 0.889, 33: 0.888, 34: 0.888, 35: 0.889, 36: 0.889, 37: 0.889, 38: 0.889, 39: 0.894, 40: 0.894, 41: 0.894, 42: 0.894, 43: 0.896, 44: 0.896, 45: 0.896, 46: 0.896, 47: 0.895, 48: 0.896, 49: 0.897, 50: 0.896, 51: 0.896, 52: 0.896, 53: 0.895, 54: 0.894, 55: 0.894, 56: 0.894, 57: 0.894, 58: 0.894, 59: 0.892, 60: 0.892, 61: 0.893, 62: 0.893, 63: 0.893, 64: 0.893, 65: 0.894, 66: 0.894, 67: 0.894, 68: 0.894, 69: 0.894, 70: 0.894, 71: 0.894, 72: 0.895, 73: 0.895, 74: 0.895, 75: 0.893, 76: 0.893, 77: 0.897, 78: 0.897, 79: 0.897, 80: 0.897, 81: 0.896, 82: 0.896, 83: 0.896, 84: 0.896, 85: 0.896, 86: 0.896, 87: 0.896, 88: 0.896, 89: 0