<a href="https://colab.research.google.com/github/apriljoymiller4592/NaiveBayes/blob/main/CS471Assignment3_MillerApril.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from collections import defaultdict
import string

#Step one: Load the dataset and split into training and testing sets
#(rows 0-19 into training, rows 20-29 into testing).
#names=True consumes the first CSV line as a header and dtype=None lets numpy
#infer the column types; the loops below unpack each row as (label, phrase).
dataset = np.genfromtxt('SpamDetection.csv', delimiter=',', dtype=None, names=True, encoding='utf-8')

trainingSet = dataset[:20]
testingSet = dataset[20:30]

#Step two: Compute the prior probabilities: P(spam) and P(ham)

trainingSetLength = len(trainingSet)
if trainingSetLength == 0:
    raise ValueError("trainingSet is empty; cannot estimate P(spam) and P(ham)")

#Each sum already scans the whole training set, so it is evaluated exactly once
#(no enclosing loop over the rows is needed).
spamCount = sum(1 for item in trainingSet if item[0] == 'spam')
hamCount = sum(1 for item in trainingSet if item[0] == 'ham')

pSpam = spamCount/trainingSetLength
pHam = hamCount/trainingSetLength

#Step three: Compute the conditional probabilities P(sentence/spam)

spamWordCounts = defaultdict(int)
hamWordCounts = defaultdict(int)

#Count how many times a word is in a class.
#Words are the raw whitespace-separated tokens of the phrase; every label
#other than "spam" is counted as ham.
for label, phrase in trainingSet:
    classCounts = spamWordCounts if label == "spam" else hamWordCounts
    for word in phrase.split():
        classCounts[word] += 1

totalSpamWords = sum(spamWordCounts.values())
totalHamWords = sum(hamWordCounts.values())

uniqueWords = set()

#Clean the string and make a set of all the unique words
#NOTE(review): this vocabulary is built from lower-cased, punctuation-stripped
#words over the whole dataset (training and testing rows), while the counts
#above use the raw tokens of the training rows only - confirm the mismatch is
#intended before relying on len(uniqueWords) as the smoothing vocabulary size.
stripPunctuation = str.maketrans('', '', string.punctuation)
for label, phrase in dataset:
    cleanedPhrase = phrase.lower().translate(stripPunctuation)
    words = cleanedPhrase.split()
    uniqueWords.update(words)

totalWords = totalSpamWords + totalHamWords

#Laplace smoothing numerator and denominator:
#laplaceNumer is added to every word count and laplaceDenom (the vocabulary
#size) is added to every class total.
laplaceNumer = 1
laplaceDenom = len(uniqueWords)
#Laplace-smoothed estimate of P(word | label) taken from the training counts
def compute_conditional_probability(word, label):
  """Return the smoothed probability of `word` given the class `label`.

  The count of `word` in the chosen class is increased by laplaceNumer and
  divided by that class's total word count plus laplaceDenom. Any label other
  than "spam" is scored against the ham counts.
  """
  if label == "spam":
    classCounts, classTotal = spamWordCounts, totalSpamWords
  else:
    classCounts, classTotal = hamWordCounts, totalHamWords

  smoothedCount = classCounts[word] + laplaceNumer
  smoothedTotal = classTotal + laplaceDenom
  return smoothedCount / smoothedTotal

#Helper: fold an iterable of numbers into a single product
def multiply(lis):
  """Return the product of every item in `lis`.

  `lis` may be any iterable (list, tuple, generator); an empty iterable
  yields 1.
  """
  running = 1
  for factor in lis:
    running *= factor
  return running

#Step four: Compute the posterior probabilities (probability of a sentence belonging to a spam or ham)

correctPredictions = 0

#Score every test sentence under both classes and keep the likelier label
for trueLabel, sentence in testingSet:

  #Split the sentence into words (plain whitespace split)
  wordsList = sentence.split()

  #Smoothed P(word|class) for every word of the sentence, one list per class
  spamWordProbs = [compute_conditional_probability(word, "spam") for word in wordsList]
  hamWordProbs = [compute_conditional_probability(word, "ham") for word in wordsList]

  #Unnormalised posteriors: P(class) * product of P(word|class).
  #These are the values printed below (they include the class prior).
  pSentenceGivenSpam = pSpam * multiply(spamWordProbs)
  pSentenceGivenHam = pHam * multiply(hamWordProbs)

  #Decide in log space: on long sentences the raw products can underflow to
  #0.0 for both classes, which would always fall through to "ham".
  #A zero prior maps to -inf so that class can never win.
  logSpamScore = (np.log(pSpam) if pSpam > 0 else -np.inf) + np.sum(np.log(spamWordProbs))
  logHamScore = (np.log(pHam) if pHam > 0 else -np.inf) + np.sum(np.log(hamWordProbs))

  #Make a prediction based off of which one has a higher score (ties go to ham)
  if logSpamScore > logHamScore:
    predictedLabel = "spam"
  else:
    predictedLabel = "ham"

  #If the prediction is correct, increment the counter of correct predictions
  if predictedLabel == trueLabel:
    correctPredictions += 1

  #Print the results
  print(f"Sentence | {sentence}")
  print(f"P(sentence|spam) | {pSentenceGivenSpam}")
  print(f"P(sentence|ham) | {pSentenceGivenHam}")
  print(f"Prediction | {predictedLabel}")
  print(f"Actual | {trueLabel}")
  print("----------------------------------------------------\n")

#Calculate the accuracy (reported as 0.0 when there are no test sentences,
#instead of dividing by zero)
totalSentencesCount = len(testingSet)
accuracy = correctPredictions/totalSentencesCount if totalSentencesCount else 0.0

print(f"Accuracy | {accuracy}")



Sentence | Tell where you reached
P(sentence|spam) | 1.4823612450921087e-10
P(sentence|ham) | 6.396976624774373e-10
Prediction | ham
Actual | ham
----------------------------------------------------

Sentence | Your gonna have to pick up a burger for yourself on your way home
P(sentence|spam) | 4.935312943730125e-33
P(sentence|ham) | 2.1419375154813147e-32
Prediction | ham
Actual | ham
----------------------------------------------------

Sentence | As a valued customer I am pleased to advise you that for your recent review you are awarded a Bonus Prize
P(sentence|spam) | 1.0025444045979708e-47
P(sentence|ham) | 6.988890714711307e-47
Prediction | ham
Actual | spam
----------------------------------------------------

Sentence | Urgent you are awarded a complimentary trip to EuroDisinc To claim text immediately
P(sentence|spam) | 1.9797655351420274e-30
P(sentence|ham) | 1.0763236015293605e-30
Prediction | spam
Actual | spam
----------------------------------------------------

Sentence 