In [102]:
import csv
import math
import random
import numpy as np

In [103]:
def load_dataset(x_filename, y_filename):
    with open(x_filename, 'r') as x_csv:
        lines_x = csv.reader(x_csv, delimiter=',')
        dataset_x = np.array([[int(row[col_i]) for col_i in range(len(row))] for row in lines_x]) # List comprehension is used.

    with open(y_filename, 'r') as y_csv:
        lines_y = csv.reader(y_csv)
        dataset_y = np.array([int(row[0]) for row in lines_y])

    set_size = int(len(dataset_x))

    return dataset_x, dataset_y, set_size

In [104]:
train_x, train_y, set_size = load_dataset("x_train.csv", "y_train.csv")

In [105]:
print( "Question 3.1\nSpam ratio in train_y: ")
spam_ratio = np.count_nonzero(train_y == 1) / set_size
normal_ratio = 1 - spam_ratio
print(spam_ratio * 100)

Question 3.1
Spam ratio in train_y: 
71.26070991432069


In [106]:
def naive_bayes_train(train_x, train_y, set_size, alpha, mult):
  # if mult is true => multinomial, if mult is false => bernoulli naive bayes model
  spam_ratio = np.count_nonzero(train_y == 1) / set_size
  normal_ratio = 1 - spam_ratio
  spam_data = np.array([(train_x[:,i] * train_y) for i in range(train_x.shape[1])]).T
  if mult:
    spam_occurences = np.sum(spam_data, axis = 0) # for each j take train_x[k][j] * train_y[]
    normal_occurences = np.sum(train_x - spam_data, axis = 0)
    spam_occurence_ratios = (spam_occurences + alpha) / (np.sum(spam_occurences, axis = 0) + alpha * len(train_y)) 
    normal_occurence_ratios = (normal_occurences + alpha) / (np.sum(normal_occurences, axis = 0) + alpha * len(train_y))
  else:
    spam_occurences = np.count_nonzero(spam_data, axis = 0)
    normal_occurences = np.count_nonzero(train_x - spam_data, axis = 0)
    spam_occurence_ratios = (spam_occurences + alpha) / (np.count_nonzero(train_y) + alpha * len(train_y)) 
    normal_occurence_ratios = (normal_occurences + alpha) / (set_size - np.count_nonzero(train_y) + alpha * len(train_y))
  
  # P(Xj|y=spam) = Nj,spam / Nspam
  # alpha -> smoothing for part 3.3
  
  return spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios

In [107]:
def naive_bayes_test(test_x, test_y, spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios, mult):
  if mult:
    spam_prob = np.array([np.log(spam_ratio) + np.nansum(test_x[i] * np.log(spam_occurence_ratios)) for i in range(test_x.shape[0])])
    normal_prob = np.array([np.log(normal_ratio) + np.nansum(test_x[i] * np.log(normal_occurence_ratios)) for i in range(test_x.shape[0])])
  else:
    spam_test_x = np.array([[spam_occurence_ratios[i] if row[i] != 0 else 1 - spam_occurence_ratios[i] for i in range(len(row))] for row in test_x])    
    spam_contr = np.log(np.prod(spam_test_x, axis = 1))
    spam_prob = np.log(spam_ratio) + np.where(np.isnan(spam_contr), 0, spam_contr)
    normal_test_x = np.array([[normal_occurence_ratios[i] if row[i] != 0 else 1 - normal_occurence_ratios[i] for i in range(len(row))] for row in test_x])    
    normal_contr = np.log(np.prod(normal_test_x, axis = 1))
    normal_prob = np.log(normal_ratio) + np.where(np.isnan(normal_contr), 0, normal_contr)
  results = np.array([1 if spam_prob[i] > normal_prob[i] else 0 for i in range(len(spam_prob))])
  print(results.shape)
  # both are -inf => choose 0
  # on is -inf => choose other
  no_of_wrong = np.sum(np.array([1 if results[i] != test_y[i] else 0 for i in range(len(results))]))
  accuracy = (len(results) - no_of_wrong) / len(results)
  true_positive = np.sum(results * test_y)
  true_negative = len(results) - no_of_wrong - true_positive # all trues - true positives
  false_positive = np.sum(np.array([1 if results[i] == 1 and test_y[i] == 0 else 0 for i in range(len(results))]))
  false_negative = no_of_wrong - false_positive
  confusion_matrix = np.array([[true_positive, false_positive], [false_negative, false_positive]])
  return results, no_of_wrong, accuracy, confusion_matrix
  

In [108]:
test_x, test_y, test_set_size = load_dataset("x_test.csv", "y_test.csv")

In [109]:
spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios = naive_bayes_train(train_x, train_y, set_size, 0, True)
# alpha = 0, Multinomial

In [110]:
results, no_of_wrong, accuracy, confusion_matrix = naive_bayes_test(test_x, test_y, spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios, True)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


(1086,)


In [111]:
print("\nQuestion 3.2")
print("Accuracy = ", accuracy)
print("Number of wrong estimations = ", no_of_wrong)
print("Confusion matrix = \n", confusion_matrix)


Question 3.2
Accuracy =  0.85451197053407
Number of wrong estimations =  158
Confusion matrix = 
 [[611   8]
 [150   8]]


In [112]:
spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios = naive_bayes_train(train_x, train_y, set_size, 1, True)
# alpha = 1, Multinomial
results, no_of_wrong, accuracy, confusion_matrix = naive_bayes_test(test_x, test_y, spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios, True)

(1086,)


In [113]:
print("\nQuestion 3.3")
print("Accuracy = ", accuracy)
print("Number of wrong estimations = ", no_of_wrong)
print("Confusion matrix = \n", confusion_matrix)


Question 3.3
Accuracy =  0.9751381215469613
Number of wrong estimations =  27
Confusion matrix = 
 [[742   8]
 [ 19   8]]


In [114]:
spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios = naive_bayes_train(train_x, train_y, set_size, 0, False)
# alpha = 1, Binomial
results, no_of_wrong, accuracy, confusion_matrix = naive_bayes_test(test_x, test_y, spam_ratio, normal_ratio, spam_occurence_ratios, normal_occurence_ratios, False)

  import sys


(1086,)


  # Remove the CWD from sys.path while we load stuff.


In [115]:
print("\nQuestion 3.4")
print("Accuracy = ", accuracy)
print("Number of wrong estimations = ", no_of_wrong)
print("Confusion matrix = \n", confusion_matrix)


Question 3.4
Accuracy =  0.8360957642725598
Number of wrong estimations =  178
Confusion matrix = 
 [[608  25]
 [153  25]]
