# CS 344 HW2
# Part 1

In [2]:
'''
A spam filter based off http://www.paulgraham.com/spam.html
For CS 344 at Calvin College

@student: ajs94
@version March 7, 2019
'''


class Filter:
    """Toy word-frequency spam filter inspired by Paul Graham's "A Plan for Spam".

    The filter is built from a spam corpus, a ham (good) corpus and the
    message to score; each of the three is a list of sentences, and each
    sentence is a list of word tokens.

    NOTE: the per-word counts used in the score are the occurrences of the
    word in the message being scored (``corpus_counts``), not its occurrences
    in the training corpora; the corpora only decide whether a word counts as
    "good", "bad" or both, and supply the normalising lengths.

    Attributes:
        spam_corpus: flat list of every word in the spam corpus.
        good_corpus: flat list of every word in the ham corpus.
        combined_corpus: the message to score, as passed in.
        corpus_len: total number of word tokens in the message.
        corpus_counts: dict mapping each message word to its occurrence count.
        corpus_probs: dict filled by evaluate() with per-word spam scores.
    """

    def __init__(self, bad_corpus, good_corpus, text):
        self.spam_corpus = self.sentences_to_corpus(bad_corpus)
        self.good_corpus = self.sentences_to_corpus(good_corpus)
        self.combined_corpus = text
        self.corpus_len = 0
        self.corpus_counts = self.hash_words()
        self.corpus_probs = {}

    def sentences_to_corpus(self, text):
        """Flatten a list of sentences (lists of words) into one list of words."""
        return [word for sentence in text for word in sentence]

    def hash_words(self):
        """Count the occurrences of each word in the message.

        Also sets ``self.corpus_len`` to the total number of tokens. The
        length is recomputed from scratch, so calling this method again does
        not inflate it.

        Returns:
            dict mapping word -> number of occurrences in the message.
        """
        word_dict = {}
        total = 0
        for sentence in self.combined_corpus:
            for word in sentence:
                word_dict[word] = word_dict.get(word, 0) + 1
                total += 1
        self.corpus_len = total
        return word_dict

    def evaluate(self) -> float:
        """Score the message and record per-word scores in ``corpus_probs``.

        For every token that appears in at least one training corpus, a word
        score ``bad / (good + bad)`` is computed, where ``good`` is twice the
        word's message count over the ham corpus length, ``bad`` is its
        message count over the spam corpus length, each ratio is capped at
        1.0 and the result is clamped to [0.01, 0.99]. Tokens that appear in
        neither corpus contribute nothing to the sum but still count in the
        denominator.

        Returns:
            Sum of the word scores divided by the number of tokens in the
            message, or 0.0 when the message has no tokens.
        """
        # An empty message has no evidence either way; avoid dividing by zero.
        if not self.corpus_len:
            return 0.0

        # Build the lookup sets once instead of scanning the lists per token.
        spam_words = set(self.spam_corpus)
        good_words = set(self.good_corpus)
        spam_total = len(self.spam_corpus)
        good_total = len(self.good_corpus)

        prob = 0
        for sentence in self.combined_corpus:
            for word in sentence:
                # Ham hits are doubled to bias against false positives.
                g = 2 * self.corpus_counts[word] if word in good_words else 0
                b = self.corpus_counts[word] if word in spam_words else 0
                if g + b < 1:
                    continue

                # An empty corpus contributes a ratio of zero rather than 0/0.
                # At least one ratio is positive here because the word was
                # found in a (therefore non-empty) corpus.
                good_ratio = min(1.0, g / good_total) if good_total else 0.0
                bad_ratio = min(1.0, b / spam_total) if spam_total else 0.0

                score = max(0.01, min(0.99, bad_ratio / (good_ratio + bad_ratio)))
                prob += score
                self.corpus_probs[word] = score
        return prob / self.corpus_len


if __name__ == '__main__':

    # Tiny training corpora: each is a list of sentences, each sentence a
    # list of word tokens.
    spam_corpus = [
        ["I", "am", "spam", "spam", "I", "am"],
        ["I", "do", "not", "like", "that", "spamiam"],
    ]
    ham_corpus = [
        ["do", "i", "like", "green", "eggs", "and", "ham"],
        ["i", "do"],
    ]

    # Message to score; the commented alternatives are other inputs to try.
    sample_mail = [
        ["I", "am", "sam", "sam", "I", "am"],
        ["I", "do", "not", "like", "green", "eggs", "and", "ham"],
    ]
    # sample_mail = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
    # sample_mail = [["Sample", "message", "with", "nothing", "in", "common"], ["Idk", "what", "else", "to", "say"]]

    # Named spam_filter so the builtin filter() is not shadowed.
    spam_filter = Filter(spam_corpus, ham_corpus, sample_mail)
    print("Spam Probability: ", spam_filter.evaluate())
    print("Nonzero word probabilities:\n\t" + str(spam_filter.corpus_probs))

    # This approach is Bayesian in that it determines the spam probabilities
    # of the individual words and combines them into an overall probability
    # that the email is spam, i.e. the conditional probability that the email
    # is spam given that its words are spam.

Spam Probability:  0.466103896103896
Nonzero word probabilities:
	{'I': 0.99, 'am': 0.99, 'do': 0.27272727272727276, 'not': 0.99, 'like': 0.27272727272727276, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01}


# Part 2

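    b. The domain has 4 Boolean (T/F) variables, so the full joint distribution has 2^4 = 16 entries (15 of them independent, since the entries must sum to 1).
    c. The Bayesian network needs only 9 independent values: Cloudy has 1, Rain and Sprinkler each have 2 (one per value of Cloudy), and WetGrass has 4 (one per Sprinkler/Rain combination).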
    d. ->

In [1]:
'''
This module implements the Bayesian network shown in the text, Figure 14.12.
It's taken from the AIMA Python code.

@student: ajs94
@version March 8, 2019
'''

from probability import BayesNet, enumeration_ask, elimination_ask, gibbs_ask

# Boolean shorthands used as CPT keys and as evidence values
T, F = True, False

# The sprinkler network of Figure 14.12, built with the AIMA BayesNet class
# (probability.py). Cloudy is the parent of Sprinkler and Rain, which are in
# turn the parents of WetGrass. Each CPT entry is P(node = true | parents).
weather = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),
    ('WetGrass', 'Sprinkler Rain',
     {(T, T): 0.99, (T, F): 0.9, (F, T): 0.9, (F, F): 0.0}),
])

# Hand-computed answers below are written as <P(true), P(false)>, with "a" as
# the normalisation constant; note that the printed results list False first.

# i.    P(Cloudy)
#       Read directly from the prior.
#       = <0.5, 0.5>
print(enumeration_ask('Cloudy', {}, weather).show_approx())

# ii.   P(Sprinkler | cloudy)
#       Read directly from the Sprinkler CPT.
#       = <0.1, 0.9>
print(enumeration_ask('Sprinkler', {'Cloudy': T}, weather).show_approx())

# iii.  P(Cloudy | sprinkler, -rain)
#       = a * P(Cloudy) * P(sprinkler | Cloudy) * P(-rain | Cloudy)
#       cloudy:  0.5 * 0.1 * 0.2 = 0.01
#       -cloudy: 0.5 * 0.5 * 0.8 = 0.2
#       = a * <0.01, 0.2>
#       = <0.0476, 0.952>
print(enumeration_ask('Cloudy', {'Sprinkler': T, 'Rain': F}, weather).show_approx())

# iv.   P(WetGrass | cloudy, sprinkler, rain)
#       WetGrass depends only on its parents Sprinkler and Rain, so the
#       answer is read directly from the WetGrass CPT.
#       = <0.99, 0.01>
print(enumeration_ask('WetGrass', {'Cloudy': T, 'Sprinkler': T, 'Rain': T}, weather).show_approx())

# v.    P(Cloudy | -wetgrass)
#       = a * P(Cloudy) * sum over s, r of
#             P(s | Cloudy) * P(r | Cloudy) * P(-wetgrass | s, r)
#       cloudy:  0.5 * (0.1*0.8*0.01 + 0.1*0.2*0.1 + 0.9*0.8*0.1 + 0.9*0.2*1.0)
#                = 0.5 * 0.2548 = 0.1274
#       -cloudy: 0.5 * (0.5*0.2*0.01 + 0.5*0.8*0.1 + 0.5*0.2*0.1 + 0.5*0.8*1.0)
#                = 0.5 * 0.451 = 0.2255
#       = a * <0.1274, 0.2255>
#       = <0.361, 0.639>
print(enumeration_ask('Cloudy', {'WetGrass': F}, weather).show_approx())


False: 0.5, True: 0.5
False: 0.9, True: 0.1
False: 0.952, True: 0.0476
False: 0.01, True: 0.99
False: 0.639, True: 0.361
