In [1]:
from math import exp
from math import pi

import numpy as np
from numpy import log

import pandas as pd
import scipy.io

In [2]:
data = scipy.io.loadmat('../data/spamData.mat')

In [3]:
def log_transform(feature):
    return log(feature + 0.1)

In [4]:
x_train = pd.DataFrame(data['Xtrain'])
y_train = pd.DataFrame(data['ytrain'], columns=['y_true'])
x_test = pd.DataFrame(data['Xtest'])
y_test = pd.DataFrame(data['ytest'], columns=['y_true'])

In [5]:
# binarize features in training and test data
x_train_l = x_train.apply(log_transform)
x_test_l = x_test.apply(log_transform)

In [6]:
training_set = pd.concat([x_train_l, y_train], axis=1)
training_set['y_predict'] = 0
test_set = pd.concat([x_test_l, y_test], axis=1)
test_set['y_predict'] = 0

In [7]:
lambda_ML = y_train.sum() / y_train.count()
num_features = 57
n_train = len(training_set.index)
n_test = len(test_set.index)

In [8]:
def gaussian_pdf(x, mu, sigma):
    p = (1 / ((2 * pi * (sigma ** 2)) ** 0.5)) * exp((-0.5 * ((x - mu) ** 2)) / (sigma ** 2))
    return p

In [9]:
training_set_spam = training_set[training_set['y_true']==1]
eta_c1_array = []

for j in range(num_features):
    mu_j1 = np.mean(training_set_spam.iloc[:,j])
    sigma_j1 = np.std(training_set_spam.iloc[:,j])       
    eta_c1_array.append(tuple((mu_j1, sigma_j1)))

In [10]:
training_set_not_spam = training_set[training_set['y_true']==0]
eta_c0_array = []
    
for j in range(num_features):
    mu_j0 = np.mean(training_set_not_spam.iloc[:,j])
    sigma_j0 = np.std(training_set_not_spam.iloc[:,j])      
    eta_c0_array.append(tuple((mu_j0, sigma_j0)))

In [11]:
# Test Error Rate
test_error_rate = 0
error_count = 0

for n in range(n_test):

    post_predict_c0 = log(1 - lambda_ML)
    post_predict_c1 = log(lambda_ML)
    
    for j in range(num_features):
        post_predict_c0 += log(gaussian_pdf(test_set[j][n],eta_c0_array[j][0] , eta_c0_array[j][1]))
        post_predict_c1 += log(gaussian_pdf(test_set[j][n],eta_c1_array[j][0] , eta_c1_array[j][1]))
   
    post_predict_c0 = float(post_predict_c0)
    post_predict_c1 = float(post_predict_c1)
    
    if (post_predict_c1 > post_predict_c0):
        test_set.iloc[n,58] = 1
    else:
        test_set.iloc[n,58] = 0
    
    if test_set.iloc[n,58] != test_set.iloc[n,57]:
        error_count += 1

test_error_rate = error_count / n_test
print("Test Error Rate is: "+ str(test_error_rate))

  if sys.path[0] == '':
  # This is added back by InteractiveShellApp.init_path()


Test Error Rate is: 0.16341145833333334


In [14]:
# Train Error Rate
train_error_rate = 0
error_count = 0

for n in range(n_train):

    post_predict_c0 = log(1 - lambda_ML)
    post_predict_c1 = log(lambda_ML)
    
    for j in range(num_features):
        post_predict_c0 += log(gaussian_pdf(training_set[j][n],eta_c0_array[j][0] , eta_c0_array[j][1]))
        post_predict_c1 += log(gaussian_pdf(training_set[j][n],eta_c1_array[j][0] , eta_c1_array[j][1]))
   
    post_predict_c0 = float(post_predict_c0)
    post_predict_c1 = float(post_predict_c1)
    
    if (post_predict_c1 > post_predict_c0):
        training_set.iloc[n,58] = 1
    else:
        training_set.iloc[n,58] = 0
    
    if training_set.iloc[n,58] != training_set.iloc[n,57]:
        error_count += 1

train_error_rate = error_count / n_train
print("Training Error Rate is: "+ str(train_error_rate))

  if sys.path[0] == '':
  # This is added back by InteractiveShellApp.init_path()


Training Error Rate is: 0.16802610114192496
