In [11]:
import numpy as np
import math

class NaiveBayes:
  def __init__(self, output, ngram=1):
    self.count = np.zeros((output, ngram))
    self.prob = []
    self.ngram = ngram
    for i in range(output):
      dicarr = []
      for j in range(ngram):
        dicarr.append({"unk": 1})
      self.prob.append(dicarr)
  
  def calcProb1(self, i, x, ngram):
    prob = 0
    for k in range(len(x) - ngram):
      word = ' '.join(x[k:k+ngram+1])
      # print(k, ngram, word)
      if word not in self.prob[i][ngram]:
        prob += math.log(self.prob[i][ngram]["unk"] / self.count[i, ngram])
      else:
        prob += math.log(self.prob[i][ngram][word] / self.count[i, ngram])

    counts = 0
    for j in range(len(self.count[i])):
      counts += self.count[i, j]

    prob += math.log(self.count[i, ngram] / counts)

    return prob
  
  def maxProb(self, x):
    maxProb = -100000000000000000000
    maxIndex = -1
    for i in range(len(self.count)):
      prob = 0
      for j in range(self.ngram):
        prob += self.calcProb1(i, x, j)
      if prob > maxProb:
        maxProb = prob
        maxIndex = i
    return maxIndex
  
  def trainNgram(self, x, y, ngram):
    for i in range(len(x)):
      for j in range(len(x[i]) - ngram):
        self.count[y[i], ngram - 1] += 1
        word = ' '.join(x[i][j:j+ngram+1])
        if word not in self.prob[y[i]][ngram]:
          self.prob[y[i]][ngram][word] = 1
        else:
          self.prob[y[i]][ngram][word] += 1
  
  def train(self, x, y):
    for i in range(self.ngram):
      self.trainNgram(x, y, i)

  def predict(self, x):
    # return (0 if self.maxProb(x) == 1 else 1)
    return self.maxProb(x)
  
  def test(self, x, y):
    correct = 0
    for i in range(len(x)):
      if self.predict(x[i]) == y[i]:
        correct += 1
    return correct / len(x)
  
  def precision(self, x, y):
    tp = 0
    fp = 0
    for i in range(len(x)):
      if self.predict(x[i]) == 1 and y[i] == 1:
        tp += 1
      if self.predict(x[i]) == 1 and y[i] == 0:
        fp += 1
    return tp / (tp + fp)
  
  


In [12]:
# import pandas as pd
# import re

# def remove_punctuation(text):
#     return re.sub(r'[^\w\s]', '', str(text).strip())

# def remove_extra_spaces(text):
#     return re.sub(' +', ' ', text)

# df = pd.read_csv('Data/spam_or_not_spam/spam_or_not_spam.csv')
# # df['Text'] = df['Message'] if not NaN else df['Subject']

# # df['Text'] = df['Message'].fillna(df['Subject'])

# # df = df.drop(columns=['Message', 'Subject', 'Message ID', 'Date'], axis=1)

# df["Text"] = df["email"].apply(remove_punctuation)
# df["Text"] = df["Text"].str.replace('\n', ' ')
# df["Text"] = df["Text"].apply(remove_extra_spaces)
# df["Text"] = df["Text"].str.strip()
# df["Text"] = df["Text"].str.lower()
# df["Text"] = df["Text"].str.split(' ')

# df['Spam'] = df['label']

# df = df.drop(columns=['email', 'label'], axis=1)
# # df["Spam/Ham"] = df["Spam/Ham"].map({'spam': 1, 'ham': 0})

# df

In [13]:
import pandas as pd
import re

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', str(text).strip())

def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)

df = pd.read_csv('Data/enron_spam_data/enron_spam_data.csv')
# df['Text'] = df['Message'] if not NaN else df['Subject']

df['Text'] = df['Message'].fillna(df['Subject'])

df = df.drop(columns=['Message', 'Subject', 'Message ID', 'Date'], axis=1)

df["Text"] = df["Text"].apply(remove_punctuation)
df["Text"] = df["Text"].str.replace('\n', ' ')
df["Text"] = df["Text"].apply(remove_extra_spaces)
df["Text"] = df["Text"].str.strip()
df["Text"] = df["Text"].str.lower()
df["Text"] = df["Text"].str.split(' ')

df["Spam"] = df["Spam/Ham"].map({'spam': 1, 'ham': 0})

df = df.drop('Spam/Ham', axis=1)

df

Unnamed: 0,Text,Spam
0,"[christmas, tree, farm, pictures]",0
1,"[gary, production, from, the, high, island, la...",0
2,"[calpine, daily, gas, nomination, 1, doc]",0
3,"[fyi, see, note, below, already, done, stella,...",0
4,"[fyi, forwarded, by, lauri, a, allen, hou, ect...",0
...,...,...
33711,"[hello, welcome, to, gigapharm, onlinne, shop,...",1
33712,"[i, got, it, earlier, than, expected, and, it,...",1
33713,"[are, you, ready, to, rock, on, let, the, man,...",1
33714,"[learn, how, to, last, 5, 10, times, longer, i...",1


In [14]:
df['Spam'].value_counts()

Spam
1    17171
0    16545
Name: count, dtype: int64

In [15]:
X = df['Text']
Y = df['Spam'].values

random = np.random.permutation(len(X))
theshold = int(0.9*len(random))
X_train = [X[i] for i in random[0:theshold]]
Y_train = Y[random[0:theshold]]

X_test = [X[i] for i in random[theshold:]]
Y_test = Y[random[theshold:]]

# print(len(X_train), len(Y_train), len(X_test), len(Y_test))
# print(Y_train[0:5])

In [16]:
model = NaiveBayes(2, 4)
model.train(X_train, Y_train)

In [17]:
# model.test(X, Y)

In [18]:
print("Train accuracy:", model.test(X_train, Y_train))
print("Test accuracy:", model.test(X_test, Y_test))

print("Train precision:", model.precision(X_train, Y_train))
print("Test precision:", model.precision(X_test, Y_test))

Train accuracy: 0.991003163722647
Test accuracy: 0.9881376037959668
Train precision: 0.9888609877020154
Test precision: 0.9832466782206817


In [19]:
# model.prob[1][3]

In [20]:
'the', 'is', 'a'
'the is', 'is a'
'this is a'

'this is a'