In [19]:
import numpy as np
import math

class NaiveBayes:
  def __init__(self, output, ngram=1):
    self.count = np.zeros((output, ngram))
    self.prob = []
    self.ngram = ngram
    for i in range(output):
      dicarr = []
      for j in range(ngram):
        dicarr.append({"unk": 1})
      self.prob.append(dicarr)
  
  def calcProb1(self, i, x, ngram):
    prob = 0
    for k in range(len(x) - ngram):
      word = ' '.join(x[k:k+ngram+1])
      # print(k, ngram, word)
      if word not in self.prob[i][ngram]:
        prob += math.log(self.prob[i][ngram]["unk"] / self.count[i, ngram])
      else:
        prob += math.log(self.prob[i][ngram][word] / self.count[i, ngram])

    counts = 0
    for j in range(len(self.count[i])):
      counts += self.count[i, j]

    prob += math.log(self.count[i, ngram] / counts)

    return prob
  
  def maxProb(self, x):
    maxProb = -100000000000000000000
    maxIndex = -1
    for i in range(len(self.count)):
      prob = 0
      for j in range(self.ngram):
        prob += self.calcProb1(i, x, j)
      if prob > maxProb:
        maxProb = prob
        maxIndex = i
    return maxIndex
  
  def trainNgram(self, x, y, ngram):
    for i in range(len(x)):
      for j in range(len(x[i]) - ngram):
        self.count[y[i], ngram - 1] += 1
        word = ' '.join(x[i][j:j+ngram+1])
        if word not in self.prob[y[i]][ngram]:
          self.prob[y[i]][ngram][word] = 1
        else:
          self.prob[y[i]][ngram][word] += 1
  
  def train(self, x, y):
    for i in range(self.ngram):
      self.trainNgram(x, y, i)

  def predict(self, x):
    # return (0 if self.maxProb(x) == 1 else 1)
    return self.maxProb(x)
  
  def test(self, x, y):
    correct = 0
    for i in range(len(x)):
      if self.predict(x[i]) == y[i]:
        correct += 1
    return correct / len(x)
  
  


In [20]:
import pandas as pd
import re

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', str(text).strip())

def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)

df = pd.read_csv('Data/spam_or_not_spam/spam_or_not_spam.csv')
# df['Text'] = df['Message'] if not NaN else df['Subject']

# df['Text'] = df['Message'].fillna(df['Subject'])

# df = df.drop(columns=['Message', 'Subject', 'Message ID', 'Date'], axis=1)

df["Text"] = df["email"].apply(remove_punctuation)
df["Text"] = df["Text"].str.replace('\n', ' ')
df["Text"] = df["Text"].apply(remove_extra_spaces)
df["Text"] = df["Text"].str.strip()
df["Text"] = df["Text"].str.lower()
df["Text"] = df["Text"].str.split(' ')

df['Spam'] = df['label']

df = df.drop(columns=['email', 'label'], axis=1)
# df["Spam/Ham"] = df["Spam/Ham"].map({'spam': 1, 'ham': 0})

df

Unnamed: 0,Text,Spam
0,"[date, wed, number, aug, number, number, numbe...",0
1,"[martin, a, posted, tassos, papadopoulos, the,...",0
2,"[man, threatens, explosion, in, moscow, thursd...",0
3,"[klez, the, virus, that, won, t, die, already,...",0
4,"[in, adding, cream, to, spaghetti, carbonara, ...",0
...,...,...
2995,"[abc, s, good, morning, america, ranks, it, th...",1
2996,"[hyperlink, hyperlink, hyperlink, let, mortgag...",1
2997,"[thank, you, for, shopping, with, us, gifts, f...",1
2998,"[the, famous, ebay, marketing, e, course, lear...",1


In [21]:
df['Spam'].value_counts()

Spam
0    2500
1     500
Name: count, dtype: int64

In [22]:
X = df['Text']
Y = df['Spam'].values

random = np.random.permutation(len(X))
theshold = int(0.9*len(random))
X_train = [X[i] for i in random[0:theshold]]
Y_train = Y[random[0:theshold]]

X_test = [X[i] for i in random[theshold:]]
Y_test = Y[random[theshold:]]

# print(len(X_train), len(Y_train), len(X_test), len(Y_test))
# print(Y_train[0:5])

In [23]:
model = NaiveBayes(2, 1)
model.train(X, Y)

In [24]:
model.test(X, Y)

0.9913333333333333

In [25]:
print("Train accuracy:", model.test(X_train, Y_train))
print("Test accuracy:", model.test(X_test, Y_test))

Train accuracy: 0.9911111111111112
Test accuracy: 0.9933333333333333


In [26]:
# model.prob[1][3]