In [6]:
import pandas as pd
# from google.colab import files

In [7]:
# Read in data and make all texts lowercase
spam = pd.read_csv("SpamData.csv")
spam.Message = spam.Message.str.lower() # convert to lower case

In [9]:
# Drop unnecessary columns
spam.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
spam.head(20)

KeyError: "['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'] not found in axis"

In [4]:
# Create a list of stop words
with open('StopWords.txt') as file:
    lines = [line.rstrip() for line in file]

FileNotFoundError: [Errno 2] No such file or directory: 'StopWords.txt'

In [None]:
def getTopSpam(df, commonWords, num):
  '''
  Returns the top 'num' spam words in the dataset 'df'
  '''
  spam = {}
  ham = []

  # Gets count of all words in spam that are not in the list of common words
  for idx in range(len(df.index)):
    message = df.iat[idx, 1]
    words = message.split()
    for word in words:
      if word not in commonWords and df.iat[idx, 0] == "spam":
        if word not in spam:
          spam[word] = 1
        elif word in spam:
          spam[word] = spam[word] + 1

  spam = sorted(spam.items(), key=lambda x:x[1], reverse=True)
  return dict(spam[:num])

result = getTopSpam(spam, lines, 5)
result

In [None]:
def vectorize(df, top):
  '''
  Convert texts to vectors of integers.
  0 is not present and 1 is present.
  The order of the words in the vector are dependent on their rank.
  Highest count is at index 0.
  '''
  vectors = []
  for i in range(len(df.index)):
    message = df.iat[i, 1]
    message = message.split()
    vector = []
    for word in top:
      if word in message:
        vector.append(1)
      else:
        vector.append(0)
    vectors.append(vector)

  df['Vectors'] = vectors

vectorize(spam, result)

In [None]:
def trainTestSplit(df):
  length = len(spam.index)
  split_limit = int(length * 0.7)
  train = spam[0:split_limit]
  test = spam[split_limit:length]
  return train, test

train, test = trainTestSplit(spam)

In [None]:
def makeProbTable(trainDF):
  '''
  Compute probability table given a training dataframe.
  Need to look up P(x_i | y) where x_i is one of the 'spam words' and y is spam/ham
  '''
  feature_records = []
  for idx in range(len(trainDF)):
    feature_vector = []
    msg = trainDF.iat[idx, 1]
    label = trainDF.iat[idx, 0]
    for word in result.keys():
      feature_vector.append(1 if word in msg else 0)
    feature_vector.append(label)
    feature_records.append(feature_vector)
  feature_df = pd.DataFrame(feature_records)
  columns = list(result.keys())
  columns.append('label')
  feature_df.columns = columns
  return feature_df

probTable = makeProbTable(train)
probTable

In [None]:
# Get probabilty of labels appearing in dataset
p_spam = probTable[probTable.label == 'spam'].label.count() / probTable.shape[0]
p_ham = probTable[probTable.label == 'ham'].label.count() / probTable.shape[0]
print(p_spam, p_ham)

In [None]:
def getProbWord(label, probTable):
  '''
  Calculates the probability of a 'top' word appearing in the dataset given a label
  '''
  df = probTable[probTable.label == label]
  counts = df[result.keys()].sum(axis=0) # compute # of 1s
  word_probs = counts / df.shape[0]
  word_probs = word_probs.to_dict()
  return word_probs

# each of these dictionaries has key (feature word): value (p(feature=1|label))
spam_probs = getProbWord("spam", probTable)
ham_probs = getProbWord("ham", probTable)

In [None]:
def predict(feature_vector):
  '''
  Performs the Naive Bayes calculation P(y | x_i) = P(x_i) * P(y)
  Returns the label with the higher probability.
  '''
  ham_prob = p_ham
  spam_prob = p_spam
  feature_words = list(result.keys())
  for idx in range(len(feature_vector)):
    current_word = feature_words[idx]
    if feature_vector[idx] == 1:
      ham_prob = ham_prob * ham_probs[current_word]
      spam_prob = spam_prob * spam_probs[current_word]
    else:
      # to compute p(x_i=0|label), we just subtract that
      # probably from 1: so p(x_i=0|label) = 1 - p(x_i=1|label).
      # Why do you think that is? Would that work with multiclass
      # naive bayes?
      ham_prob = ham_prob * (1-ham_probs[current_word])
      spam_prob = spam_prob * (1-spam_probs[current_word])
  print(f"P(spam) = {spam_prob:.2f}")
  print(f"P(ham) = {ham_prob:.2f}")
  if spam_prob > ham_prob:
    return 'spam'
  else:
    return 'ham'

In [None]:
# Calculates the percent of messages correctly labeled of the test dataset
correct = 0
for idx, (label, msg, feature_vector) in test.iterrows():
  print(msg)
  predicted_label = predict(feature_vector)
  print(label, predicted_label)
  if label == predicted_label:
    correct += 1
print(correct / test.shape[0])