In [4]:
# Toy training data: each message is a list of already-tokenized, lowercase words.
spam_corpus = [
    ["i", "am", "spam", "spam", "i", "am"],
    ["i", "do", "not", "like", "that", "spamiam"],
]
ham_corpus = [
    ["do", "i", "like", "green", "eggs", "and", "ham"],
    ["i", "do"],
]

# Message totals per class: nbad counts spam messages, ngood counts ham messages.
nbad, ngood = len(spam_corpus), len(ham_corpus)

In [3]:
# Tally how often each token occurs across all messages of each class.
spam_word_count = {}
for message in spam_corpus:
    for word in message:
        # get() supplies 0 the first time a word is seen.
        spam_word_count[word] = spam_word_count.get(word, 0) + 1

ham_word_count = {}
for message in ham_corpus:
    for word in message:
        ham_word_count[word] = ham_word_count.get(word, 0) + 1

In [15]:
# Vocabulary: every token that occurs in at least one of the two corpora.
known_words = list(set(spam_word_count.keys()) | set(ham_word_count.keys()))

# Maps each sufficiently frequent word to its estimated probability of indicating spam.
spam_probability = {}

for word in known_words:
    # Ham occurrences are weighted double; spam occurrences count once.
    g = 2 * ham_word_count.get(word, 0)
    b = spam_word_count.get(word, 0)

    if g + b <= 1:
        # Words below the frequency threshold deliberately get NO entry. Giving them a
        # probability of 0 instead would force the combined probability of any message
        # containing one to 0, so a single rare "magic word" could trick the spam filter.
        continue

    # Ratio of the (capped) spam frequency to the sum of both capped frequencies,
    # clamped to [0.01, 0.99] so no word is ever treated as absolute proof.
    spam_probability[word] = max(
        0.01,
        min(0.99, min(1.0, b / nbad) / (min(1.0, g / ngood) + min(1.0, b / nbad))),
    )

In [19]:
# Show the learned word -> spam-probability table.
# NOTE(review): the saved output lists an 'I' entry, which the all-lowercase corpora defined
# earlier cannot produce -- the output looks stale. Re-run the notebook from a fresh kernel
# (Restart & Run All) to refresh it.
print(spam_probability)

{'eggs': 0.01, 'do': 0.3333333333333333, 'ham': 0.01, 'i': 0.01, 'spam': 0.99, 'am': 0.99, 'and': 0.01, 'green': 0.01, 'I': 0.99, 'like': 0.3333333333333333}


In [20]:
def probability_is_spam(message, spam_prob_table):
    """Combine per-word spam probabilities into one spam probability for a message.

    The message is split on whitespace, tokens that are missing from
    ``spam_prob_table`` are ignored, and the remaining probabilities
    p_1..p_n are combined as::

        prod(p_i) / (prod(p_i) + prod(1 - p_i))

    Args:
        message: Text to classify. Tokens are looked up verbatim (no case folding).
        spam_prob_table: Mapping from word to its spam probability in [0, 1].

    Returns:
        A float in [0, 1]. 0.5 is returned when no token of the message is in
        the table, and also when the evidence is contradictory (the message
        contains both a word with probability 0 and a word with probability 1).
    """
    import math  # local import: only the underflow fallback below needs it

    # split() without an argument splits on any run of whitespace, so tabs, newlines
    # and repeated spaces separate tokens instead of being glued onto them.
    tokens = message.split()
    probabilities = [spam_prob_table[word] for word in tokens if word in spam_prob_table]

    prod = 1
    complement_prod = 1
    for prob in probabilities:
        prod *= prob
        complement_prod *= (1 - prob)

    denominator = prod + complement_prod
    if denominator != 0:
        return prod / denominator

    # Both products are exactly 0: either the message is long enough that the
    # floating-point products underflowed, or the table holds hard 0/1 entries.
    # Dividing here would raise ZeroDivisionError, so resolve the cases explicitly.
    has_certain_ham = any(prob <= 0 for prob in probabilities)
    has_certain_spam = any(prob >= 1 for prob in probabilities)
    if has_certain_ham or has_certain_spam:
        if has_certain_ham and has_certain_spam:
            return 0.5  # contradictory certainties carry no usable information
        return 1.0 if has_certain_spam else 0.0

    # Underflow case: redo the combination in log-odds space, which cannot underflow.
    log_odds = sum(math.log(prob) - math.log1p(-prob) for prob in probabilities)
    # Evaluate the logistic function in the branch whose exp() argument is <= 0
    # so that exp() can never overflow.
    if log_odds >= 0:
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)


# Two hand-written probes: one message that should score as ham, one as spam.
ham_message = "i am not spam because i like green eggs and ham"
spam_message = "i am spam not spamiam"

# Score each probe against the learned table; the ham message is printed first.
print(probability_is_spam(ham_message, spam_probability))
print(probability_is_spam(spam_message, spam_probability))

5.2051017513329894e-09
0.99


In [21]:
from probability import BayesNet, enumeration_ask

# Utility variables
T, F = True, False

# Sprinkler network: Cloudy influences Sprinkler and Rain, which both influence WetGrass.
# Each entry appears to be (variable, space-separated parent names, CPT) -- TODO confirm
# against the `probability` module. CPT values are P(variable = True | parent values).
grass = BayesNet([
    ('Cloudy', '', 0.5),  # no parents: prior P(Cloudy) = 0.5
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),    # keyed by the value of Cloudy
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),  # keyed by the value of Cloudy
    # Keyed by (Sprinkler, Rain), in the order the parents are listed.
    ('WetGrass', 'Sprinkler Rain', {(T, T): 0.99, (T, F): 0.90,
                                    (F, T): 0.90, (F, F): 0.00})

])

In [4]:
# No evidence is supplied, so the answer is just the prior P(Cloudy) = 0.5 that is stored
# in the network; nothing has to be derived by hand.
print(enumeration_ask('Cloudy', {}, grass).show_approx())

False: 0.5, True: 0.5


In [6]:
# P(Sprinkler | Cloudy) = 0.1. This is given directly by the Sprinkler entry of the
# network for Cloudy = True, so no hand calculation is needed.
print(enumeration_ask('Sprinkler', dict(Cloudy=T), grass).show_approx())

False: 0.9, True: 0.1


In [23]:
# P(Cloudy | Sprinkler ^ -Rain)   (the evidence is Sprinkler = True, Rain = False)
# = alpha * <P(Cloudy) * P(Sprinkler | Cloudy) * P(-Rain | Cloudy),
#            P(-Cloudy) * P(Sprinkler | -Cloudy) * P(-Rain | -Cloudy)>
# = alpha * <0.5 * 0.1 * 0.2, 0.5 * 0.5 * 0.8>
# = alpha * <0.01, 0.2>
# = <0.0476, 0.952>
print(enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), grass).show_approx())

False: 0.952, True: 0.0476


In [25]:
# P(WetGrass | Cloudy ^ Sprinkler ^ Rain)
# WetGrass has only Sprinkler and Rain as parents, so the Cloudy-dependent terms are the
# same in both components and are absorbed by the normalization constant alpha.
# = alpha * P(Rain | Cloudy) * P(Sprinkler | Cloudy) * <P(WG | Sprinkler ^ Rain),
#                                                       P(-WG | Sprinkler ^ Rain)>
# = alpha * 0.8 * 0.1 * <0.99, 0.01>
# = <0.99, 0.01>
print(enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), grass).show_approx())

False: 0.01, True: 0.99


In [19]:
# P(Cloudy | -WetGrass)
# Sprinkler and Rain are unobserved, so each component sums over all four of their
# joint assignments.
# = alpha * <P(Cloudy) * P(-WetGrass | Sprinkler ^ Rain) * P(Rain | Cloudy) * P(Sprinkler | Cloudy)
#          + P(Cloudy) * P(-WetGrass | Sprinkler ^ -Rain) * P(-Rain | Cloudy) * P(Sprinkler | Cloudy)
#          + P(Cloudy) * P(-WetGrass | -Sprinkler ^ Rain) * P(Rain | Cloudy) * P(-Sprinkler | Cloudy)
#          + P(Cloudy) * P(-WetGrass | -Sprinkler ^ -Rain) * P(-Rain | Cloudy) * P(-Sprinkler | Cloudy),
#
#            P(-Cloudy) * P(-WetGrass | Sprinkler ^ Rain) * P(Rain | -Cloudy) * P(Sprinkler | -Cloudy)
#          + P(-Cloudy) * P(-WetGrass | Sprinkler ^ -Rain) * P(-Rain | -Cloudy) * P(Sprinkler | -Cloudy)
#          + P(-Cloudy) * P(-WetGrass | -Sprinkler ^ Rain) * P(Rain | -Cloudy) * P(-Sprinkler | -Cloudy)
#          + P(-Cloudy) * P(-WetGrass | -Sprinkler ^ -Rain) * P(-Rain | -Cloudy) * P(-Sprinkler | -Cloudy)>
#
# = alpha * <(0.5 * 0.01 * 0.8 * 0.1) + (0.5 * 0.1 * 0.2 * 0.1) + (0.5 * 0.1 * 0.8 * 0.9) + (0.5 * 1 * 0.2 * 0.9),
#            (0.5 * 0.01 * 0.2 * 0.5) + (0.5 * 0.1 * 0.8 * 0.5) + (0.5 * 0.1 * 0.2 * 0.5) + (0.5 * 1 * 0.8 * 0.5)>
# = alpha * <0.1274, 0.2255>
# = <0.361, 0.639>
print(enumeration_ask('Cloudy', dict(WetGrass=F), grass).show_approx())

False: 0.639, True: 0.361
