# NAIVE BAYES CLASSIFIER
**Author: Zak Hussain**  
**Date: 2019/09/31**

In [1]:
import os
import math 

class HamSpamClassifier:
    """Naive Bayes classifier that labels emails as 'ham' or 'spam'.

    Every training and test file is read line by line and each line is
    treated as one token. Word likelihoods are Laplace-smoothed with the
    constants ``alpha`` and ``V`` and stored as log-probabilities so that
    scoring an email is a sum rather than a product.
    """

    def __init__(self, alpha: float = 1.0, V: int = 200000):
        """Create an untrained classifier.

        Args:
            alpha (float): Laplace smoothing pseudo-count added to every
                word frequency.
            V (int): vocabulary size used in the smoothing denominator.
        """
        # constants
        self._alpha = alpha
        self._V = V

        # model: ham table, spam table
        # each table maps word -> {'freq', 'prob_word', 'laplace', 'log_prob'}
        self.ham_model = {}
        self.total_ham_words = 0
        self.logProb_ham_email = 0

        self.spam_model = {}
        self.total_spam_words = 0
        self.logProb_spam_email = 0

    def fit(self, HAM_train: str, SPAM_train: str) -> None:
        """Build the ham and spam tables from the training directories.

        Args:
            HAM_train (str): directory containing the ham training files.
            SPAM_train (str): directory containing the spam training files.

        Raises:
            ValueError / ZeroDivisionError: propagated from the class-prior
                computation when one or both directories contain no files.
        """
        # count the word-type frequencies.
        self.ham_model = self.__count_freq(HAM_train)
        self.spam_model = self.__count_freq(SPAM_train)

        # compute the total words in ham, then the total words in spam
        self.total_ham_words = sum(entry['freq'] for entry in self.ham_model.values())
        self.total_spam_words = sum(entry['freq'] for entry in self.spam_model.values())

        # compute the probability of seeing the word in the given class: P(w_i | C).
        self.__compute_prob(self.ham_model, self.total_ham_words)
        self.__compute_prob(self.spam_model, self.total_spam_words)

        # use Laplace smoothing to pull the distribution towards a uniform state.
        self.__laplace(self.ham_model, self.total_ham_words)
        self.__laplace(self.spam_model, self.total_spam_words)

        # map the P(w_i | C) from the multiplicative domain to the additive.
        self.__compute_log_prob(self.ham_model)
        self.__compute_log_prob(self.spam_model)

        # compute the log-probability of each class from the file counts
        self.__compute_log_P_classes(HAM_train, SPAM_train)

    def predict(self, X_test_dir: str) -> dict:
        """Classify every file in the test directory as ham or spam.

        Args:
            X_test_dir (str): directory containing the files to classify.

        Return:
            dict mapping filename -> (ham_score, spam_score, label), where
            the scores are log-domain values and label is 'ham' or 'spam'.
            Ties are labelled 'ham'.
        """
        predictions = {}

        # sum logprob(w_i | c) over all the words of each test file
        for filename in os.listdir(X_test_dir):
            LogProb_ham_sum = 0
            LogProb_spam_sum = 0

            # join the path so the directory works with or without a trailing
            # separator; the context manager closes the file even on error
            with open(os.path.join(X_test_dir, filename), 'r') as email:
                for line in email:
                    # strip only the newline: the last line of a file may not
                    # end with one, and slicing would cut a real character
                    word = line.rstrip('\n')

                    ham_entry = self.ham_model.get(word)
                    spam_entry = self.spam_model.get(word)

                    # a word missing from a class table gets that class's
                    # smoothed zero-count log-probability instead
                    if ham_entry is not None:
                        LogProb_ham_sum += ham_entry['log_prob']
                    else:
                        LogProb_ham_sum += self.__unseen_log_prob(self.total_ham_words)

                    if spam_entry is not None:
                        LogProb_spam_sum += spam_entry['log_prob']
                    else:
                        LogProb_spam_sum += self.__unseen_log_prob(self.total_spam_words)

            # c_i = logprob(c) + sum(logprob(w_i | c))
            ham_pred = self.logProb_ham_email + LogProb_ham_sum
            spam_pred = self.logProb_spam_email + LogProb_spam_sum

            # prediction = argmax(c_1, c_2)
            label = 'ham' if ham_pred >= spam_pred else 'spam'
            predictions[filename] = (ham_pred, spam_pred, label)

        return predictions

    def evaluate_model(self, predictions: list, y_test: list):
        """Evaluate the accuracy of the model given predictions and actual labels.

        Not implemented yet: the method currently does nothing and returns None.
        """
        pass

    def tune_Laplace_params(self, a: float, newV: int = 200000) -> None:
        """Update alpha and V for Laplace smoothing and refresh both tables.

        Args:
            a (float): new smoothing pseudo-count (alpha).
            newV (int): new vocabulary size for the smoothing denominator.
        """
        self._alpha = a
        self._V = newV

        # automatically update the model on the new parameters
        self.__laplace(self.ham_model, self.total_ham_words)
        self.__laplace(self.spam_model, self.total_spam_words)

        # update the log-probabilities for both the ham and spam models
        self.__compute_log_prob(self.ham_model)
        self.__compute_log_prob(self.spam_model)

    def get_models(self):
        """Return the ham and spam tables as a (ham_model, spam_model) tuple."""
        return self.ham_model, self.spam_model

    def __count_freq(self, folder_path: str) -> dict:
        """Count how often each word occurs across all files in a folder.

            Args:
                folder_path (str): folder path containing the corpus.

            Return:
                dict mapping word -> {'freq': word_count}.
        """
        d = {}  # stores the frequency of a word
        for filename in os.listdir(folder_path):
            with open(os.path.join(folder_path, filename), 'r') as file:
                for line in file:
                    # strip only the newline so an unterminated final line
                    # keeps its last character
                    key = line.rstrip('\n')

                    entry = d.get(key)
                    if entry is None:
                        d[key] = {'freq': 1}
                    else:
                        entry['freq'] += 1
        return d

    def __compute_prob(self, words: dict, total_words: int) -> None:
        """Add the unsmoothed probability of each word to its table entry.

            Args:
                words (dict): word -> dict table; each value dict must
                already hold the word's 'freq'. A 'prob_word' key is added.

                total_words (int): the total number of words found within a
                class's training set.
        """
        for entry in words.values():
            entry['prob_word'] = entry['freq'] / total_words

    def __laplace(self, words: dict, total_words: int) -> None:
        """Add the Laplace-smoothed probability of each word under key 'laplace'."""
        # the denominator does not depend on the word, so compute it once
        denominator = total_words + (self._V * self._alpha)
        for entry in words.values():
            entry['laplace'] = (entry['freq'] + self._alpha) / denominator

    def __unseen_log_prob(self, total_words: int) -> float:
        """Return the smoothed log-probability of a word with zero count in a class."""
        return math.log(self._alpha / (total_words + (self._V * self._alpha)))

    def __compute_log_prob(self, words: dict) -> None:
        """Store log(entry['laplace']) under key 'log_prob' for every word."""
        for entry in words.values():
            entry['log_prob'] = math.log(entry['laplace'])

    def __compute_log_P_classes(self, HAM_path: str, SPAM_path: str) -> None:
        """Compute log(P(Ham)) and log(P(Spam)) from the number of files per class.

        Updates logProb_ham_email and logProb_spam_email.

            Args:
                HAM_path (str): the filepath to the directory containing HAM emails.
                SPAM_path (str): the filepath to the directory containing SPAM emails.

            Raises:
                ZeroDivisionError: both directories are empty.
                ValueError: exactly one directory is empty (log of zero).
        """
        # for each directory, tally the number of files
        HAM_file_tally = len(os.listdir(HAM_path))
        SPAM_file_tally = len(os.listdir(SPAM_path))
        total_tally = HAM_file_tally + SPAM_file_tally

        # compute the log-probability of the classes
        self.logProb_ham_email = math.log(HAM_file_tally / total_tally)
        self.logProb_spam_email = math.log(SPAM_file_tally / total_tally)

### Test Model

In [2]:
# Directories holding the ham training files, the spam training files,
# and the unlabeled files to classify.
ham_path = '../Data/data/ham/'
spam_path = '../Data/data/spam/'
test_path = '../Data/data/test/'

# Train the classifier on the two labeled directories.
HS_clf = HamSpamClassifier() 
HS_clf.fit(ham_path, spam_path)

# Grab the per-class word tables so individual entries can be inspected.
ham_model, spam_model = HS_clf.get_models()

# Show the stats stored for the token 'At' in the ham table.
ham_model['At']

{'freq': 60,
 'prob_word': 0.0007300691132093838,
 'laplace': 0.0002161710089870439,
 'log_prob': -8.439440755242437}

In [3]:
# Show the stats stored for the same token 'At' in the spam table, for comparison.
spam_model['At']

{'freq': 1,
 'prob_word': 1.2644143232854542e-05,
 'laplace': 7.166198475032965e-06,
 'log_prob': -11.84613524269797}

In [4]:
# Classify every file in the test directory; each result is
# (ham_score, spam_score, label) keyed by filename.
HS_clf.predict(test_path)

{'1.words': (-14163.06312692431, -13406.153539262092, 'spam'),
 '10.words': (-2893.846357314009, -2431.070884969585, 'spam'),
 '100.words': (-144.25109229726633, -144.843518949235, 'ham'),
 '11.words': (-1732.0446505019277, -1869.855967297329, 'ham'),
 '12.words': (-1011.2871210617694, -1089.831069063449, 'ham'),
 '13.words': (-1884.450426450807, -1674.5782742060924, 'spam'),
 '14.words': (-828.3443877993997, -793.3529100259112, 'spam'),
 '15.words': (-88.54534951647017, -82.80516366322941, 'spam'),
 '16.words': (-6365.249062147799, -6403.513175139366, 'ham'),
 '17.words': (-353.5568665161027, -340.7739697723117, 'spam'),
 '18.words': (-995.1632353139026, -870.1977477472184, 'spam'),
 '19.words': (-3955.387807962444, -3542.5238305386806, 'spam'),
 '2.words': (-883.0994758574856, -941.6617586498218, 'ham'),
 '20.words': (-2405.4062853667547, -2029.1792808699968, 'spam'),
 '21.words': (-772.0053100190362, -712.7061191993727, 'spam'),
 '22.words': (-604.2741977289121, -713.0585612706298, 

In [24]:
# Lower the smoothing pseudo-count alpha to 0.085 (V keeps its default);
# the call also recomputes the smoothed log-probabilities in both tables.
HS_clf.tune_Laplace_params(.085)
# Re-run the predictions with the re-tuned model; `temp` is reused by later cells.
temp = HS_clf.predict(test_path)
temp

{'1.words': (-13889.803866366132, -12112.818810123108, 'spam'),
 '10.words': (-2806.472228896054, -2110.3206908330685, 'spam'),
 '100.words': (-142.8497197789069, -144.89242416522552, 'ham'),
 '11.words': (-1592.0246317527174, -1816.3439224762537, 'ham'),
 '12.words': (-939.2535584999176, -1067.7662681193776, 'ham'),
 '13.words': (-1873.2061964393417, -1500.4177414335259, 'spam'),
 '14.words': (-853.7749157228097, -771.6825300411613, 'spam'),
 '15.words': (-98.48199926759459, -79.12684755573657, 'spam'),
 '16.words': (-6520.260319432509, -6677.7621600451575, 'ham'),
 '17.words': (-358.85465514657017, -330.68649746241846, 'spam'),
 '18.words': (-1026.2760689863478, -798.4221803715476, 'spam'),
 '19.words': (-3738.600427964295, -3098.7829811053357, 'spam'),
 '2.words': (-801.2186044767166, -873.3344744827514, 'ham'),
 '20.words': (-2378.865791659679, -1806.0637582703584, 'spam'),
 '21.words': (-723.0139175060073, -623.98181291675, 'spam'),
 '22.words': (-540.0117286578485, -702.754955847

In [29]:
# Split the predictions in `temp` by label. Each key looks like '<id>.words',
# so the numeric id is the part before the first dot; the label is the third
# element of the prediction tuple. Anything not labelled 'ham' counts as spam.
ham_pred = sorted(
    int(filename.split('.')[0])
    for filename, outcome in temp.items()
    if outcome[2] == 'ham'
)
spam_pred = sorted(
    int(filename.split('.')[0])
    for filename, outcome in temp.items()
    if outcome[2] != 'ham'
)

print('spam: ', spam_pred)
print()
print('ham: ', ham_pred)

spam:  [1, 5, 10, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 28, 31, 32, 35, 36, 37, 38, 40, 42, 43, 44, 45, 46, 50, 51, 55, 58, 63, 65, 66, 68, 69, 70, 71, 72, 74, 75, 76, 81, 82, 83, 84, 85, 87, 88]

ham:  [2, 3, 4, 6, 7, 8, 9, 11, 12, 16, 22, 26, 27, 29, 30, 33, 34, 39, 41, 47, 48, 49, 52, 53, 54, 56, 57, 59, 60, 61, 62, 64, 67, 73, 77, 78, 79, 80, 86, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


In [37]:
# Ground-truth ids of the spam emails in the test set.
actual_spam = [
    1, 5, 10, 13, 14, 15, 17, 18, 19, 20, 21, 23,
    24, 25, 28, 31, 32, 35, 36, 37, 38, 40, 42,
    43, 44, 45, 46, 50, 51, 55, 58, 63, 65, 66,
    68, 73, 88,
]

# Every classified id that is not known spam is treated as actual ham.
known_spam = set(actual_spam)
actual_ham = [
    email_id
    for email_id in ham_pred + spam_pred
    if email_id not in known_spam
]

print('num spam predictions: ', len(spam_pred))
print('num actual spam labels: ', len(actual_spam))

num spam predictions:  49
num actual spam labels:  37


In [39]:
# TP: things correctly classified as spam :  
Tp = sum([i for i in spam_pred if i in actual_spam])

# TN: Classified as ham, but it is ham
Tn = sum([i for i in ham_pred if i in actual_ham]) 

# FP Classified as spam, but it is ham
Fp = sum([i for i in spam_pred if i in actual_ham]) 

# FN: Classified as ham, but it is spam  
Fn = sum([i for i in ham_pred if i in actual_spam]) 

In [42]:
# Precision: tp / (tp + fp) 
precision = Tp / (Tp + Fp)

# Recall: tp / (tp+fn) # what proportion of the world is spam 
recall = Tp / (Tp + Fn) 

# Accuracy: (tp + tn) / (tp + tn + fp + fn)
acc = (Tp + Tn) / (Tp + Tn + Fp + Fn)

f1 = 2 * (precision*recall) / (precision + recall) 

print('precision: ', precision) 
print()
print('recall: ', recall) 
print() 
print('accuracy: ', acc)
print()
print('f1-score: ', f1)

precision:  0.5603485838779957

recall:  0.9462840323767476

accuracy:  0.7857425742574258

f1-score:  0.7038861521620142


https://classeval.wordpress.com/introduction/basic-evaluation-measures/

## REPORT

**Naive bayes:**  
    Naive Bayes classification is a means of finding the most likely class label given a set of words. In this case we worked
    with a binary classification model, where the classes were ham and spam, and we pre-generated tables based on co-occurrences
    of words from a training set. The pre-computed values could then be used to classify test instances. Overall, Naive Bayes
    can be simply thought of as a way of pre-counting co-occurrences of words in order to recognise the same patterns in new data. 

**Explanation of Code:**   
I wrote the HamSpamClassifier to predict whether an email is ham or spam. The user starts by building a model from the Ham and Spam directories using the .fit() method, which generates a table for ham and a table for spam holding the logProb of each word. Once the fit is complete, the user calls the .predict() method, passing in the test set directory. The user can also call the tune method, which updates all the ham and spam logProbs based on a new alpha. Later I will implement an evaluation method to compute the f1_score of the model: the cells above, where I compute the precision, recall, and accuracy, would be built into the classifier so a user could evaluate their outcome directly. 