# Homework 2

## Problem 1

In [1]:
from random import *
from math import *
from functools import reduce
import operator

# Toy training data. Each corpus is a list of emails and each email is a list
# of word tokens; spam_corpus holds the spam examples, ham_corpus the ham ones.
spam_corpus = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
ham_corpus = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]

# from some stack exchange post
def prod(iterable):
    '''multiply together every element of iterable, left to right;
    an empty iterable yields the multiplicative identity 1'''
    result = 1
    for factor in iterable:
        result = result * factor
    return result

def to_lower(corpus):
    '''return a new corpus with every token lower-cased

    corpus is a list of emails, each email a list of string tokens.
    Both the outer list and every inner list are rebuilt, so the caller's
    corpus is left untouched (a shallow `corpus[:]` copy would still share
    the inner lists and overwrite the original tokens in place).'''
    return [[token.lower() for token in mail] for mail in corpus]

def concat(lst1, lst2):
    '''merge two iterables into one list without duplicates, keeping
    the order of first appearance (all of lst1 first, then lst2)'''
    merged = []
    for source in (lst1, lst2):
        for item in source:
            if item in merged:
                continue
            merged.append(item)
    return merged

class SpamFilter():
    '''Bayesian spam filter trained on a ham ("good") and a spam ("bad") corpus.

    A corpus is a list of emails; an email is a list of hashable tokens.
    The filter keeps one token-count table per corpus and uses them to score
    individual tokens (pr_token) and the current email (pr_mail).'''

    def __init__(self, good_corpus, bad_corpus, email=None):
        '''store the corpora and build the token count tables

        good_corpus -- list of ham emails
        bad_corpus  -- list of spam emails
        email       -- optional token list to score with pr_mail; when
                       omitted every instance gets its own empty list
                       (a literal `[]` default would be shared by all
                       instances)'''
        self.email = [] if email is None else email
        self.good_corpus = good_corpus
        self.bad_corpus = bad_corpus
        # these count tables are intrinsic to the filter
        # b/c we may want to update them as new spam is processed
        self.good_table = self.token_count(good_corpus)
        self.bad_table = self.token_count(bad_corpus)

    def token_count(self, corpus):
        '''takes corpus and returns dictionary with each token of the corpus
        as a key and the number of times the token has appeared as a value'''
        words = {}
        for mail in corpus:
            for token in mail:
                words[token] = words.get(token, 0) + 1
        return words

    def gen_mail(self, isGood=None, length=20):
        '''returns a randomly generated email and stores it in self.email

        isGood -- True draws tokens from the good corpus, False from the
                  bad corpus; None (the default) flips a fair coin on every
                  call (a `bool(getrandbits(1))` default would be evaluated
                  only once, when the method is defined)
        length -- number of tokens to generate

        Each token is picked by choosing a random email from the corpus and
        then a random token from that email.'''
        if isGood is None:
            isGood = bool(getrandbits(1))
        corpus = self.good_corpus if isGood else self.bad_corpus
        email = [choice(choice(corpus)) for _ in range(length)]
        self.email = email
        return email

    def pr_token(self, word):
        '''calculates probability token is a spam token

        Returns a float clamped to [0.01, 0.99], or None when the token does
        not occur in either count table. Occurrences in the good corpus are
        counted double, which biases the score towards ham.'''
        ngood = len(self.good_corpus)
        nbad = len(self.bad_corpus)
        g = 2 * self.good_table.get(word, 0)
        b = self.bad_table.get(word, 0)
        if (g + b) < 1:
            return None
        # an empty corpus contributes no evidence instead of dividing by zero
        good_ratio = min(1, g / ngood) if ngood else 0
        bad_ratio = min(1, b / nbad) if nbad else 0
        total = good_ratio + bad_ratio
        if total == 0:
            return None
        return max(0.01, min(0.99, float(bad_ratio / total)))

    def dump_token_pr(self):
        '''prints a header line and returns a dictionary mapping every known
        token to its spam probability; tokens of the good table come first,
        followed by the tokens that only occur in the bad table'''
        print('Token, Probability')
        out = {}
        for table in (self.good_table, self.bad_table):
            for token in table:
                if token not in out:
                    out[token] = self.pr_token(token)
        return out

    def pr_mail(self, ntokens=5, unknown_pr=0.4):
        '''returns the confidence (0..1) that self.email is spam

        ntokens    -- number of distinct "interesting" tokens to use, i.e.
                      the tokens whose spam probability is furthest from 0.5
        unknown_pr -- probability assigned to tokens that pr_token cannot
                      score (never seen in either corpus); without it such
                      a token made the interest computation fail on None

        Every occurrence of an interesting token in the email contributes
        its probability p; the result is prod(p) / (prod(p) + prod(1 - p)).'''

        # score each distinct token once; dict order = order of first appearance
        mail_table = {}
        for token in self.email:
            if token not in mail_table:
                pr = self.pr_token(token)
                mail_table[token] = unknown_pr if pr is None else pr

        # keep the ntokens spammiest or hammiest distinct tokens
        # (sorted is stable, so ties keep their order of first appearance)
        ranked = sorted(mail_table, key=lambda t: fabs(0.50 - mail_table[t]), reverse=True)
        interesting_tokens = set(ranked[:ntokens])

        # record the value of each instance of an interesting token
        interesting_token_values = [mail_table[t] for t in self.email if t in interesting_tokens]
        polynom = [1 - v for v in interesting_token_values]

        # combine the per-token probabilities into one confidence
        spam_evidence = prod(interesting_token_values)
        ham_evidence = prod(polynom)
        return spam_evidence / (spam_evidence + ham_evidence)
            
        
# build the filter from the lower-cased ham ("good") and spam ("bad") corpora
filter0 = SpamFilter(good_corpus=to_lower(ham_corpus), bad_corpus=to_lower(spam_corpus))

# dump probabilities: one "token probability" pair per line
for k, v in filter0.dump_token_pr().items():
    print(k, v)

# randomly generate a 10-token email; isGood=True draws it from the ham corpus
email = filter0.gen_mail(isGood=True, length=10)
print('generated email:', email)

# confidence it is spam, based on the 3 most interesting tokens
print(filter0.pr_mail(3))



Token, Probability
do 0.3333333333333333
i 0.5
like 0.3333333333333333
green 0.01
eggs 0.01
and 0.01
ham 0.01
am 0.99
spam 0.99
not 0.99
that 0.99
spamiam 0.99
generated email: ['green', 'do', 'i', 'i', 'ham', 'do', 'do', 'do', 'and', 'i']
1.0306090899721738e-06


This approach is Bayesian because of the 
`prod(interesting_token_values) / (prod(interesting_token_values) + prod(polynom))` 
expression. The confidence that an email is spam is calculated with Bayes' rule: the probabilities P(spam | token) of the interesting tokens in the email are multiplied together, and that product is normalized against the product of their complements, 1 - P(spam | token).

## Problem 2

### a.

In [2]:
from probability import BayesNet, elimination_ask

# Utility variables
T, F = True, False

# Sprinkler network: Cloudy has no parents, Sprinkler and Rain each list
# Cloudy as parent, and WetGrass lists both Sprinkler and Rain.
# NOTE(review): each entry appears to be (variable, space-separated parents,
# P(variable=True | parent values)) -- confirm against the `probability` module.
cloudy = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.9, (F, T): 0.9, (F, F): 0.00})
    ])

### b.

4 variables each with two possible values gives

In [3]:
# 2 possible values for each of the 4 variables
print(2**4)

16


### c.

two values for cloudy + two times two values for sprinkler and rain + two for wet grass

In [4]:
# 2 (Cloudy) + 2*2 (Sprinkler and Rain) + 2 (WetGrass), as counted in the text above
print(2+4+2)

8


### d.

#### i.

In [5]:
# query Cloudy with an empty evidence dict, i.e. with nothing observed
print('P(Cloudy)=', elimination_ask('Cloudy',{}, cloudy).show_approx())

P(Cloudy)= False: 0.5, True: 0.5


The probability distribution can be read directly from the table: <0.5, 0.5>.

#### ii.

In [6]:
# query Sprinkler with the evidence Cloudy=True
print('P(Sprinker | cloudy)=', elimination_ask('Sprinkler', dict(Cloudy=T), cloudy).show_approx())

P(Sprinker | cloudy)= False: 0.9, True: 0.1


The probability distribution can be read directly from the table: F,T = <0.9, 0.1>.

#### iii.

In [7]:
# query Cloudy with the evidence Sprinkler=True and Rain=False
print('P(Cloudy| the sprinkler is running and it’s not raining)=', elimination_ask('Cloudy', dict(Sprinkler=T, Rain=F), cloudy).show_approx())

P(Cloudy| the sprinkler is running and it’s not raining)= False: 0.952, True: 0.0476


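$$
\begin{aligned}
P(C \mid s, \neg r) &= \alpha\, P(C)\, P(s \mid C)\, P(\neg r \mid C)
= \alpha \langle 0.5 \cdot 0.1 \cdot 0.2,\ 0.5 \cdot 0.5 \cdot 0.8 \rangle\\
&= \left\langle \frac{0.5 \cdot 0.1 \cdot 0.2}{0.5 \cdot 0.1 \cdot 0.2 + 0.5 \cdot 0.5 \cdot 0.8},\ \frac{0.5 \cdot 0.5 \cdot 0.8}{0.5 \cdot 0.1 \cdot 0.2 + 0.5 \cdot 0.5 \cdot 0.8} \right\rangle\\
&= \langle 0.0476,\ 0.952 \rangle
\end{aligned}
$$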

#### iv.

In [8]:
# query WetGrass with the evidence Cloudy=True, Rain=True and Sprinkler=True
print('P(WetGrass | it’s cloudy, the sprinkler is running and it’s raining)=', elimination_ask('WetGrass', dict(Cloudy=T, Rain=T, Sprinkler=T), cloudy).show_approx())

P(WetGrass | it’s cloudy, the sprinkler is running and it’s raining)= False: 0.01, True: 0.99


This probability can be read from the table. The cloudy value has no bearing on the outcome since it is already given that it's raining and the sprinkler is running. F,T = <0.01, 0.99>

#### v.

In [9]:
# query Cloudy with the evidence WetGrass=False
print('P(Cloudy | the grass is not wet)=', elimination_ask('Cloudy', dict(WetGrass=F), cloudy).show_approx())

P(Cloudy | the grass is not wet)= False: 0.639, True: 0.361


$$
\begin{aligned}
P(C \mid \neg g) &= \alpha\, P(C) \sum_{s,r} P(s \mid C)\, P(r \mid C)\, P(\neg g \mid s, r)\\
&= \alpha \big\langle\, P(C)\big(P(s \mid C)\,P(r \mid C)\,P(\neg g \mid s, r)\\
&\qquad\quad + P(\neg s \mid C)\,P(r \mid C)\,P(\neg g \mid \neg s, r)\\
&\qquad\quad + P(s \mid C)\,P(\neg r \mid C)\,P(\neg g \mid s, \neg r)\\
&\qquad\quad + P(\neg s \mid C)\,P(\neg r \mid C)\,P(\neg g \mid \neg s, \neg r)\big),\\
&\qquad P(\neg C)\big(P(s \mid \neg C)\,P(r \mid \neg C)\,P(\neg g \mid s, r)\\
&\qquad\quad + P(\neg s \mid \neg C)\,P(r \mid \neg C)\,P(\neg g \mid \neg s, r)\\
&\qquad\quad + P(s \mid \neg C)\,P(\neg r \mid \neg C)\,P(\neg g \mid s, \neg r)\\
&\qquad\quad + P(\neg s \mid \neg C)\,P(\neg r \mid \neg C)\,P(\neg g \mid \neg s, \neg r)\big) \big\rangle\\
&= \alpha \big\langle\, 0.5 \cdot (0.1 \cdot 0.8 \cdot 0.01 + 0.9 \cdot 0.8 \cdot 0.1 + 0.1 \cdot 0.2 \cdot 0.1 + 0.9 \cdot 0.2 \cdot 1),\\
&\qquad\ \ 0.5 \cdot (0.5 \cdot 0.2 \cdot 0.01 + 0.5 \cdot 0.2 \cdot 0.1 + 0.5 \cdot 0.8 \cdot 0.1 + 0.5 \cdot 0.8 \cdot 1) \big\rangle\\
&= \alpha \langle 0.5 \cdot 0.2548,\ 0.5 \cdot 0.451 \rangle\\
&= \left\langle \frac{0.2548}{0.2548 + 0.451},\ \frac{0.451}{0.2548 + 0.451} \right\rangle\\
&= \langle 0.361,\ 0.639 \rangle
\end{aligned}
$$