In [7]:
import math
a = 0.00137714

In [8]:
math.log(a)

-6.587746394105945

In [81]:
import numpy as np
testing_spam = np.loadtxt(open("testing_spam.csv"), delimiter=",")
training_spam = np.loadtxt(open("training_spam.csv"), delimiter=",")


In [82]:
# This skeleton code simply classifies every input as ham
#
# Here you can see there is a parameter k that is unused, the
# point is to show you how you could set up your own. You might
# also pass in extra data via a train method (also does nothing
# here). Modify this code as much as you like so long as the 
# accuracy test in the cell below runs.

class SpamClassifier:
    def train(self):
        training_labels = training_spam[:,0]
        training_features = training_spam[:, 1:]
        
        # estimating log class priors from the training data
        spam_proportion = sum(training_labels) / len(training_labels)
        self.log_class_priors = np.array([np.log(1 - spam_proportion), np.log(spam_proportion)])
        
        # separating the ham from the spam
        hams_indices = np.where(training_labels == 0)
        hams = training_features[hams_indices] 
    
        spams_indices = np.where(training_labels == 1)
        spams = training_features[spams_indices]
        
        # estimating log class conditional likelihoods
        alpha = 1
        k = training_features.shape[1] # total number of keywords

        spam_featurewise_totals = np.sum(spams, axis = 0) # an array showing the total for each keyword for spam with shape = (1, n_features) 
        spam_total_features = sum(spam_featurewise_totals) # total number of all features
        spam_theta = (spam_featurewise_totals + alpha) / (spam_total_features + (k * alpha)) # relative spam feature frequencies calculated with laplace smoothing

        ham_featurewise_totals = np.sum(hams, axis = 0)
        ham_total_features = sum(ham_featurewise_totals)
        ham_theta = (ham_featurewise_totals + alpha) / (ham_total_features + (k * alpha))
        
        self.log_class_conditionals = np.array([np.log(ham_theta), np.log(spam_theta)])
        
    def predict(self, data):
        rows, cols = data.shape
    
        class_predictions = np.zeros(rows)

        for i in range(rows):
            # initialising log posterior probabilities
            ham_email = 0
            spam_email = 0
            for j in range(cols):
                # for each feature, if it is in the email add the respective log probabilities for spam and ham 
                feature_response = data[i,j]

                ham_class_likelihood = self.log_class_conditionals[0,j]
                spam_class_likelihood = self.log_class_conditionals[1,j]

                ham_email += (feature_response * ham_class_likelihood)
                spam_email += (feature_response * spam_class_likelihood)
                
            # finally add the log_class priors to each, then compare whether spam or ham is more likely
            ham_email += self.log_class_priors[0]
            spam_email += self.log_class_priors[1]

            if spam_email > ham_email: # if spam more likely than ham, classify as 1, otherwise leave as 0 for ham
                class_predictions[i] = 1

        return class_predictions
    

def create_classifier():
    classifier = SpamClassifier()
    classifier.train()
    return classifier

classifier = create_classifier()

In [84]:
class SpamClassifier:
    def __init__(self, alpha = 10):
        # separating the ham from the spam
        hams_indices = np.where(training_spam[:,0] == 0)
        self.hams = training_spam[:,1:][hams_indices] 
    
        spams_indices = np.where(training_spam[:,0] == 1)
        self.spams = training_spam[:,1:][spams_indices]
        
        # for laplace smoothing:
        self.alpha = alpha
        self.k = training_spam.shape[1] - 1 # number of keyword columns
        
        
    def train(self):
        # estimating log class priors from the training data
        spam_proportion = self.spams.shape[0] / training_spam.shape[0]
        self.log_class_priors = np.array([np.log(1 - spam_proportion), np.log(spam_proportion)])
        
        # estimating log class conditional likelihoods
        spam_featurewise_totals = np.sum(self.spams, axis = 0) # an array showing the total for each keyword for spam with shape = (1, n_features) 
        spam_total_features = sum(spam_featurewise_totals) # total number of all features
        spam_theta = (spam_featurewise_totals + self.alpha) / (spam_total_features + (self.k * self.alpha)) # relative spam feature frequencies calculated with laplace smoothing

        ham_featurewise_totals = np.sum(self.hams, axis = 0)
        ham_total_features = sum(ham_featurewise_totals)
        ham_theta = (ham_featurewise_totals + self.alpha) / (ham_total_features + (self.k * self.alpha))
        
        self.log_class_conditionals = np.array([np.log(ham_theta), np.log(spam_theta)])
        
    def predict(self, data):
        rows, cols = data.shape
        class_predictions = np.zeros(rows)

        for i in range(rows):
            # initialising log posterior probabilities for both classes
            ham_email = 0
            spam_email = 0
            for j in range(cols):
                # for each feature, if it is in the email add the respective log probabilities for spam and ham 
                feature_response = data[i,j]

                ham_class_likelihood = self.log_class_conditionals[0,j]
                spam_class_likelihood = self.log_class_conditionals[1,j]

                ham_email += (feature_response * ham_class_likelihood)
                spam_email += (feature_response * spam_class_likelihood)
                
            # finally add the log_class priors to each, then compare whether spam or ham is more likely
            ham_email += self.log_class_priors[0]
            spam_email += self.log_class_priors[1]

            if spam_email > ham_email: # if spam more likely than ham, classify as 1, otherwise leave as 0 for ham
                class_predictions[i] = 1

        return class_predictions
    

def create_classifier():
    classifier = SpamClassifier()
    classifier.train()
    return classifier

classifier = create_classifier()

In [85]:
# You can use this cell to check whether the returned objects of your function are of the right data type.

training_predictions = classifier.predict(training_spam[:, 1:])
testing_predictions = classifier.predict(testing_spam[:, 1:])

# Check data type(s)
assert(isinstance(class_predictions, np.ndarray))

# Check shape of numpy array
assert(class_predictions.shape == (1000,))

# Check data type of array elements
assert(np.all(np.logical_or(class_predictions == 0, class_predictions == 1)))

# Check accuracy

training_true_classes = training_spam[:, 0]
training_set_accuracy = np.mean(np.equal(training_predictions, training_true_classes))

testing_true_classes = testing_spam[:, 0]
testing_set_accuracy = np.mean(np.equal(testing_predictions, testing_true_classes))

print(f"Accuracy on the training set: {training_set_accuracy}")
print(f"Accuracy on the testing set: {testing_set_accuracy}")

Accuracy on the training set: 0.888
Accuracy on the testing set: 0.906


In [80]:
training_accuracy = {}
testing_accuracy = {}

for i in range(100):
    classifier = SpamClassifier(alpha = i)
    classifier.train()
    training_predictions = classifier.predict(training_spam[:, 1:])
    testing_predictions = classifier.predict(testing_spam[:, 1:])

    training_true_classes = training_spam[:, 0]
    training_set_accuracy = np.mean(np.equal(training_predictions, training_true_classes))

    testing_true_classes = testing_spam[:, 0]
    testing_set_accuracy = np.mean(np.equal(testing_predictions, testing_true_classes))
    
    training_accuracy[i] = training_set_accuracy
    testing_accuracy[i] = testing_set_accuracy

print(f"Accuracy on the training set: {training_accuracy}\n")
print(f"Accuracy on the testing set: {testing_accuracy}")

score_training = max(training_accuracy.values())
score_testing = max(testing_accuracy.values())

alpha_training = max(training_accuracy, key = training_accuracy.get)
alpha_testing = max(testing_accuracy, key = testing_accuracy.get)

print(f"\nMax alpha, accurary for training: {alpha_training}, {score_training}")
print(f"Max alpha, accurary for testing: {alpha_testing}, {score_testing}")


  self.log_class_conditionals = np.array([np.log(ham_theta), np.log(spam_theta)])
  spam_email += (feature_response * spam_class_likelihood)


Accuracy on the training set: {0: 0.602, 1: 0.9, 2: 0.9, 3: 0.898, 4: 0.898, 5: 0.896, 6: 0.894, 7: 0.894, 8: 0.89, 9: 0.892, 10: 0.894, 11: 0.894, 12: 0.894, 13: 0.894, 14: 0.892, 15: 0.892, 16: 0.89, 17: 0.89, 18: 0.89, 19: 0.89, 20: 0.892, 21: 0.892, 22: 0.892, 23: 0.892, 24: 0.892, 25: 0.892, 26: 0.9, 27: 0.9, 28: 0.9, 29: 0.902, 30: 0.9, 31: 0.9, 32: 0.898, 33: 0.898, 34: 0.898, 35: 0.898, 36: 0.898, 37: 0.898, 38: 0.898, 39: 0.898, 40: 0.898, 41: 0.898, 42: 0.898, 43: 0.898, 44: 0.898, 45: 0.898, 46: 0.898, 47: 0.898, 48: 0.896, 49: 0.896, 50: 0.898, 51: 0.898, 52: 0.898, 53: 0.898, 54: 0.896, 55: 0.898, 56: 0.898, 57: 0.898, 58: 0.898, 59: 0.898, 60: 0.898, 61: 0.898, 62: 0.898, 63: 0.898, 64: 0.898, 65: 0.9, 66: 0.9, 67: 0.9, 68: 0.9, 69: 0.9, 70: 0.9, 71: 0.898, 72: 0.898, 73: 0.898, 74: 0.894, 75: 0.894, 76: 0.894, 77: 0.894, 78: 0.894, 79: 0.894, 80: 0.894, 81: 0.894, 82: 0.894, 83: 0.894, 84: 0.894, 85: 0.894, 86: 0.894, 87: 0.894, 88: 0.894, 89: 0.894, 90: 0.894, 91: 0.896

In [94]:
train_hams_indices = np.where(training_spam[:,0] == 0)
train_hams = training_spam[:,1:][train_hams_indices]
    
train_spams_indices = np.where(training_spam[:,0] == 1)
train_spams = training_spam[:,1:][train_spams_indices]

test_hams_indices = np.where(testing_spam[:,0] == 0)
test_hams = testing_spam[:,1:][test_hams_indices] 
    
test_spams_indices = np.where(testing_spam[:,0] == 1)
test_spams = testing_spam[:,1:][test_spams_indices]

ham_training_predictions = classifier.predict(train_hams)
spam_training_predictions = classifier.predict(train_spams)

ham_testing_predictions = classifier.predict(test_hams)
spam_testing_predictions = classifier.predict(test_spams)

ham_class = 0
spam_class = 1

hams_training_accuracy = np.mean(np.equal(ham_training_predictions, 0))
spams_training_accuracy = np.mean(np.equal(spam_training_predictions, 1))

hams_testing_accuracy = np.mean(np.equal(ham_testing_predictions, 0))
spams_testing_accuracy = np.mean(np.equal(spam_testing_predictions, 1))

print(f"Spam training: {spams_training_accuracy}")
print(f"Ham training: {hams_training_accuracy}")

print(f"\nSpam testing: {spams_testing_accuracy}")
print(f"Ham testing: {hams_testing_accuracy}")

Spam training: 0.9457364341085271
Ham training: 0.8515497553017944

Spam testing: 0.9346733668341709
Ham testing: 0.8870431893687708
