# CS 229 
## PS 2.2 – Spam Classification


Code can be found in <u>spam.py</u>

**(a)** <br>
Size of dictionary: 1721 

**(b)** <br>
To avoid the numerical underflow caused by multiplying many small probabilities together, it is valuable to note that $\prod_{i=1}^n P(x_i) = \exp\left(\log \prod_{i=1}^n P(x_i)\right) = \exp\left(\sum_{i=1}^n \log P(x_i)\right)$

Thus the class that is more likely will be the one that has a larger sum of the log probabilities.

**Test set accuracy:** 0.978494623655914


**(c)** <br>
The top 5 indicative words for Naive Bayes are:  ['claim' 'won' 'prize' 'tone' 'urgent!']

**(d)** <br>
The optimal SVM radius was 10. <br>
The SVM model had an accuracy of 0.8799283154121864 on the testing set.


In [2]:
import numpy as np

In [2]:
import spam

In [3]:
# Run the assignment's entry point from spam.py; the lines printed below this
# cell (dictionary size, Naive Bayes accuracy, top words, SVM results) are its output.
spam.main()

Size of dictionary:  1721
Naive Bayes had an accuracy of 0.978494623655914 on the testing set
The top 5 indicative words for Naive Bayes are:  ['claim', 'won', 'prize', 'tone', 'urgent!']
The optimal SVM radius was 10
The SVM model had an accuracy of 0.8799283154121864 on the testing set


In [20]:
def get_words(message):
    """Get the normalized list of words from a message string.

    The message is split on single space characters only, so other
    whitespace (tabs, newlines) stays inside a word. Empty fragments produced
    by leading, trailing or repeated spaces are dropped, and every remaining
    word is converted to lowercase.

    Args:
        message: A string containing an SMS message

    Returns:
       The list of normalized words from the message, in their original order.
    """

    # *** START CODE HERE ***
    # An empty string is falsy, so `if token` filters out the empty pieces
    # that str.split(' ') yields for consecutive spaces.
    return [token.lower() for token in message.split(' ') if token]
    # *** END CODE HERE ***


def create_dictionary(messages):
    """Build the vocabulary as a mapping from word to integer index.

    Each message is tokenized with get_words. A word is kept only if it
    occurs in at least five distinct messages; repeated occurrences inside a
    single message count once. Kept words receive consecutive indices
    starting at 0, in the order the words were first encountered.

    Args:
        messages: A list of strings containing SMS messages

    Returns:
        A python dict mapping words to integers.
    """

    # *** START CODE HERE ***
    # Number of messages each word appears in.
    messages_per_word = {}
    for message in messages:
        # dict.fromkeys removes duplicates within the message while keeping
        # first-occurrence order, so later index assignment stays deterministic.
        for word in dict.fromkeys(get_words(message)):
            messages_per_word[word] = messages_per_word.get(word, 0) + 1

    # Keep only the words seen in 5+ messages and number them consecutively.
    frequent_words = (
        word for word, count in messages_per_word.items() if count >= 5
    )
    return {word: index for index, word in enumerate(frequent_words)}
    # *** END CODE HERE ***


In [41]:
# Toy training data for trying out the Naive Bayes fit below:
# one row per example, one column per vocabulary entry (counts).
matrix = np.array([
    [3, 2, 1],
    [5, 1, 1],
    [0, 2, 7],
    [1, 1, 8],
])
# One label per row of `matrix`; 1 marks the positive class.
labels = np.array([1, 1, 0, 0])

In [52]:
# Fit a multinomial Naive Bayes model with Laplace (add-one) smoothing.

# Class prior: mean of the labels.
phi_y = np.mean(labels)

# Select the rows of each class once instead of branching per example.
is_pos = np.asarray(labels) == 1
pos_rows = matrix[is_pos]
neg_rows = matrix[~is_pos]

# Denominators: total count over all columns for the class, plus one
# pseudo-count per column from the smoothing.
vocab_size = matrix.shape[1]
pos_denom = vocab_size + pos_rows.sum()
neg_denom = vocab_size + neg_rows.sum()

# Numerators: per-column counts for the class, plus the smoothing pseudo-count.
phi_given_pos = (1.0 + pos_rows.sum(axis=0)) / pos_denom
phi_given_neg = (1.0 + neg_rows.sum(axis=0)) / neg_denom

# Bundle the three fitted parameters under the keys the later cells read.
dict_ = {
    'phi_y': phi_y,
    'phi_pos': phi_given_pos,
    'phi_neg': phi_given_neg,
}

# *** END CODE HERE ***

In [64]:
# Reuse the parameters fitted in the previous cell as the model to predict with.
model = dict_
# Toy data to predict on; same column layout as the training matrix.
matrix = np.array([
    [3, 1, 1],
    [0, 3, 3],
    [0, 1, 7],
])

In [65]:
# Predict a 0/1 label for every row of `matrix` using the fitted parameters.
phi_y = model['phi_y']
phi_pos = model['phi_pos']
phi_neg = model['phi_neg']

# Work in log space to avoid underflow. A row's class score is
#   log prior + sum_v count_v * log(phi_v),
# which is a matrix-vector product. This replaces adding log(phi_v) once per
# occurrence in a Python loop.
p_1 = np.log(phi_y) + matrix @ np.log(phi_pos)
p_0 = np.log(1 - phi_y) + matrix @ np.log(phi_neg)

# Strict comparison: ties are assigned to class 0.
preds = np.where(p_1 > p_0, 1, 0)
preds

array([1, 0, 0])

In [90]:
# Score each vocabulary column by how much more likely it is under the
# positive class than the negative class.
phi_pos = model['phi_pos']
phi_neg = model['phi_neg']

# Log-ratio log(phi_pos / phi_neg), computed as a difference of logs.
metric = np.log(phi_pos) - np.log(phi_neg)

# Scores in descending order. Note these are the score values themselves,
# not the column indices or words.
top_five_sorted = np.sort(metric)[::-1]
top_five = top_five_sorted[:5]
top_five

array([ 1.82253113,  0.31845373, -1.3555227 ])

0.5