# CS344 - Homework 2
## Andrew Quist
3/8/2019


## 1. spam filter

In [None]:
"""
spamFilter.py

provides a class and test case for spam recognition by using statistical probability models.

author:Andrew Quist
"""

class spamFilterProbability:
    """
    Word-frequency spam filter built from a spam corpus and a non-spam corpus.

    input:  spamCorpus is a list of word vectors, one list of words per spam message
            legitCorpus is a list of word vectors, one list of words per non-spam message

    attributes:
            spamDict maps each lower-cased spam word to its number of occurrences
            nbad is the number of spam messages
            legitDict maps each lower-cased non-spam word to its number of occurrences
            ngood is the number of non-spam messages
    """
    def __init__(self, spamCorpus, legitCorpus):
        # Each corpus is counted over its own length, so the two corpora
        # may contain different numbers of messages.
        self.spamDict, self.nbad = self._countWords(spamCorpus)
        self.legitDict, self.ngood = self._countWords(legitCorpus)

    @staticmethod
    def _countWords(corpus):
        """
        Returns (word counts, number of messages) for one corpus.
        Words are lower-cased before they are counted.
        """
        counts = {}
        for message in corpus:
            for word in message:
                word = word.lower()
                counts[word] = counts.get(word, 0) + 1
        return counts, len(corpus)

    def getWordFrequency(self, word):
        """
        Returns the good/bad distribution of a word (case-insensitive) in the
        order [num spam occurrences, num non-spam occurrences].
        """
        word = word.lower()
        return [self.spamDict.get(word, 0), self.legitDict.get(word, 0)]

    def getWordProbability(self, word):
        """
        Calculates the probability that the word is from a spam email,
        clamped to the range [0.01, 0.99]. If the word is not found in
        either corpus, the function returns 0.5.
        """
        b, g = self.getWordFrequency(word)
        # Non-spam occurrences count double, which pushes borderline words
        # toward "not spam".
        g *= 2

        if b + g == 0:
            return 0.5

        # An empty corpus contributes a ratio of 0 instead of dividing by zero.
        badRatio = min(1.0, b / self.nbad) if self.nbad else 0.0
        goodRatio = min(1.0, g / self.ngood) if self.ngood else 0.0

        return max(0.01, min(0.99, badRatio / (goodRatio + badRatio)))

    def getProbabilityList(self, fullText):
        """
        Returns a list of [word, probability] for all distinct words in the
        full text, in order of first appearance. Duplicates are detected by
        exact (case-sensitive) match.
        """
        allWordProbabilities = {}
        for newWord in fullText:
            if newWord not in allWordProbabilities:
                allWordProbabilities[newWord] = self.getWordProbability(newWord)
        return [[word, probability] for word, probability in allWordProbabilities.items()]

    def getCombinedProbability(self, fullText):
        """
        Returns the combined probability of all words, which gives an estimate
        of how likely it is that the message is spam. An empty message carries
        no evidence either way and yields 0.5.
        """
        # Starting both products at 1.0 (the multiplicative identity) handles
        # the empty message and avoids re-seeding a product that reached 0.
        product = 1.0
        invProduct = 1.0
        for _, probability in self.getProbabilityList(fullText):
            product *= probability
            invProduct *= (1 - probability)
        return product / (product + invProduct)

# Small training set (two spam messages, two legitimate messages) and two
# sample message bodies to classify.
spamTest = spamFilterProbability(
    spamCorpus=[
        ["I", "am", "spam", "spam", "I", "am"],
        ["I", "do", "not", "like", "that", "spamiam"],
    ],
    legitCorpus=[
        ["do", "i", "like", "green", "eggs", "and", "ham"],
        ["i", "do"],
    ],
)
testBody = "I am not spam spamiam please don't delete me"
testBody2 = "I like you and also green eggs and ham"

Implementation code here:

In [None]:
# Spam probability of two individual words.
for heading, word in (
    ("The word probability of 'I':", "I"),
    ("\nThe word probability of 'Spam':", "Spam"),
):
    print(heading)
    print(spamTest.getWordProbability(word))

# Per-word and combined probabilities for the first sample message.
firstWords = testBody.split()
print("\nThe list of probabilities for testBody:")
print(spamTest.getProbabilityList(firstWords))
print("\nThe combined probabilities of testBody (likelihood of it being spam):")
print(spamTest.getCombinedProbability(firstWords))

# Per-word and combined probabilities for the second sample message.
secondWords = testBody2.split()
print("\nThe list of probabilities for testBody2:")
print(spamTest.getProbabilityList(secondWords))
print("\nThe combined probabilities of testBody2")
print(spamTest.getCombinedProbability(secondWords))

>Graham argues that this is a Bayesian approach to SPAM. What makes it Bayesian?

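Graham's filter is Bayesian because it applies Bayes' rule: it starts from word-level probabilities estimated from the spam and non-spam corpora and combines them into the probability that a message is spam given the words it contains. The statistical model boils down to the likelihood of two outcomes: either the message is spam or it is not. The word hashes are just a giant probability distribution table, and the filter uses these distributions to reach a reasonable expectation (i.e., "This is very likely to be spam").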



## 2. Cloudy with a chance

>Implement the network using the AIMA Python tools.

Implementation below

In [None]:
from probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask

# Utility variables
T, F = True, False

# Sprinkler network: Cloudy influences Sprinkler and Rain, which in turn
# determine WetGrass. Each entry is (variable, parents, probability table),
# where the table gives P(variable = True) for each assignment of the parents.
sprinklerProblem = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.10, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.80, F: 0.2}),
    # WetGrass depends on Sprinkler and Rain (not on Cloudy directly);
    # the keys are (Sprinkler, Rain) and both sources together give 0.99.
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.9, (F, T): 0.9, (F, F): 0.0})
    ])

>Compute the number of independent values in the full joint probability distribution for this domain. Assume that no conditional independence relations are known to hold between these values.

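2x2x2x2 = 16 entries, since all four nodes have 2 outcomes each. Because the entries of the joint distribution must sum to 1, only 16 - 1 = 15 of them are independent.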

>Compute the number of independent values in the Bayesian network for this domain. Assume the conditional independence relations implied by the Bayes network.

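The network contains 9 independent values: 1 for Cloudy, 2 for Sprinkler, 2 for Rain, and 4 for WetGrass (one per combination of each node's parents). The remaining probabilities can be calculated from these.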

>Compute probabilities for the following:

In [None]:
# i. This one is easy: Cloudy has no parents, so the answer is the 0.5 prior
# given in the network definition.
print("i. P(Cloudy) = ")
print("0.5")

# ii.-v. Each row is (heading, query variable, evidence); the answers are
# computed by exact inference with enumeration_ask.
conditionalQueries = (
    # The network gives the sprinkler a 0.1 (10%) chance of running on a cloudy day.
    ("\nii. (Sprinker | cloudy) = ",
     'Sprinkler', dict(Cloudy=T)),
    ("\niii. P(Cloudy| the sprinkler is running and it’s not raining) =",
     'Cloudy', dict(Sprinkler=T, Rain=F)),
    ("\niv. P(WetGrass | it’s cloudy, the sprinkler is running and it’s raining) =",
     'WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T)),
    ("\nv. P(Cloudy | the grass is not wet) =",
     'Cloudy', dict(WetGrass=F)),
)
for heading, queryVariable, evidence in conditionalQueries:
    print(heading)
    print(enumeration_ask(queryVariable, evidence, sprinklerProblem).show_approx())

i.
This one is easy: Cloudiness is already defined as a 0.5 chance. This means the likelihood of sunny weather is also .5

ii.
We know that the Sprinkler has a .1 chance of going off on a cloudy day. This means that it has a .9 chance of not going off on a cloudy day.

iii.
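With $\alpha$ as the normalization constant (WetGrass is unobserved and sums out to 1):

$$\mathbf{P}(Cloudy \mid s, \lnot r) = \alpha \, \mathbf{P}(Cloudy) \, P(s \mid Cloudy) \, P(\lnot r \mid Cloudy) = \alpha \langle 0.5 \cdot 0.1 \cdot 0.2,\; 0.5 \cdot 0.5 \cdot 0.8 \rangle = \alpha \langle 0.01,\; 0.2 \rangle = \langle 0.0476,\; 0.952 \rangle$$

T = 0.0476, F = 0.952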

iv.
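If Sprinkler and Rain are both True, then we already have the answer: WetGrass depends only on Sprinkler and Rain, so the Cloudy evidence adds nothing and the value is read directly from the WetGrass table: .99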