# Naive bayes classifier for email classification
# ----
# Set Directories

In [1]:
# Input locations: labelled training e-mails, held-out test e-mails and the
# stop-word list consumed by the stop-word experiments.
train_dir = "/Users/aboud/Desktop/train"
test_dir = "/Users/aboud/Desktop/test"
stop_words_dir = "/Users/aboud/Desktop/English-Stop-Words.txt"

# Folder that receives the generated model and result files.
output_dir = "/Users/aboud/Desktop/"

# Class

In [2]:
#Dependencies
import numpy as np
import re,os,math

os.chdir(train_dir)

# Load the stop-word list once. The file is split on every non-letter
# character, exactly like the e-mail tokenizer. Empty fragments produced by
# consecutive separators are dropped, and the words are lower-cased because
# e-mail tokens are lower-cased before they are compared against this list.
# The `with` block guarantees the file handle is closed.
with open(stop_words_dir, "r") as stop_words_file:
    stop_words = [
        word.lower()
        for word in re.split('[^a-zA-Z]', stop_words_file.read())
        if word
    ]

class EmailNaiveBayesClassifier(object):
    """Naive Bayes ham/spam classifier built from word counts of labelled e-mails.

    Workflow: call one of the ``parse_txt*`` methods while the current working
    directory is the training folder, then ``train()``, then
    ``write_result_output()`` while the current working directory is the test
    folder, and finally ``metrics()``.

    The class of a file is read from its name: the piece between the first and
    the second ``-`` must be ``ham`` or ``spam``. Files whose name does not
    follow that pattern are skipped.

    Only words seen in BOTH classes during training (the intersection of the
    two vocabularies) are kept in the model.

    ``write_output_model`` and ``write_result_output`` write into the
    module-level ``output_dir``.
    """

    # Every character that is not an ASCII letter separates two words.
    _WORD_SEPARATOR = re.compile('[^a-zA-Z]')

    def __init__(self):
        # Raw per-class word counts over everything that was parsed.
        self.words_ham_freq = {}
        self.words_spam_freq = {}
        # Counts restricted to the shared vocabulary. The smoothing constant
        # is added to these in place by generate_conditional_probabilities.
        self.word_given_ham_freq = {}
        self.word_given_spam_freq = {}
        # P(word | class) for every word of the shared vocabulary.
        self.word_given_ham_prob = {}
        self.word_given_spam_prob = {}
        # Fixed class priors -- presumably the ham/spam file counts of the
        # original training corpus (TODO confirm); they are NOT derived from
        # the files that get parsed.
        self.ham_prior = 1000/1997
        self.spam_prior = 997/1997
        # Sum of the (unsmoothed) shared-vocabulary counts per class.
        self.ham_total_words = 0
        self.spam_total_words = 0
        # Words present in both classes, in first-seen order of the ham counts.
        self.shared_vocab = []
        # Parallel lists filled while classifying: predicted and true class.
        self.predictions = []
        self.labels = []

    # ------------------------------------------------------------------
    # TRAINING
    # ------------------------------------------------------------------

    def generate_word_ham_freq(self,string):
        """Add every word of the iterable ``string`` to the ham word counts."""
        for word in string:
            self.words_ham_freq[word] = self.words_ham_freq.get(word, 0) + 1

    def generate_word_spam_freq(self,string):
        """Add every word of the iterable ``string`` to the spam word counts."""
        for word in string:
            self.words_spam_freq[word] = self.words_spam_freq.get(word, 0) + 1

    def find_shared_vocab(self):
        """Rebuild ``shared_vocab`` as the words counted in both ham and spam.

        The list is rebuilt from scratch so that calling this method more than
        once does not duplicate entries.
        """
        self.shared_vocab = [
            word for word in self.words_ham_freq if word in self.words_spam_freq
        ]

    def update_ham_and_spam_freq(self):
        """Copy the raw counts of the shared-vocabulary words into the model tables.

        Both tables are rebuilt from the raw counts, which discards any
        smoothing added by an earlier training run.
        """
        vocab = set(self.shared_vocab)  # O(1) membership instead of a list scan
        self.word_given_ham_freq = {
            word: count
            for word, count in self.words_ham_freq.items()
            if word in vocab
        }
        self.word_given_spam_freq = {
            word: count
            for word, count in self.words_spam_freq.items()
            if word in vocab
        }

    def generate_total_words(self):
        """Store the per-class sum of the shared-vocabulary counts.

        The totals are recomputed, not accumulated, so repeated calls do not
        inflate them.
        """
        self.ham_total_words = sum(self.word_given_ham_freq.values())
        self.spam_total_words = sum(self.word_given_spam_freq.values())

    def generate_conditional_probabilities(self,smoothing):
        """Compute smoothed P(word | class) for every shared-vocabulary word.

        ``smoothing`` is added to each count and ``len(shared_vocab) * smoothing``
        to each class total. NOTE: the smoothing is added to the
        ``word_given_*_freq`` tables in place (those smoothed counts are what
        ``write_output_model`` writes), so this method must run once per
        ``update_ham_and_spam_freq`` call; ``train()`` takes care of that.
        """
        extra_mass = len(self.shared_vocab)*smoothing

        ham_denominator = self.ham_total_words + extra_mass
        for word in self.word_given_ham_freq:
            self.word_given_ham_freq[word] += smoothing
            self.word_given_ham_prob[word] = self.word_given_ham_freq[word] / ham_denominator

        spam_denominator = self.spam_total_words + extra_mass
        for word in self.word_given_spam_freq:
            self.word_given_spam_freq[word] += smoothing
            self.word_given_spam_prob[word] = self.word_given_spam_freq[word] / spam_denominator

    def predict(self,ham,spam):
        """Record and return the class with the larger score (ties go to spam).

        ``ham`` and ``spam`` are the two class scores to compare. The decision
        is appended to ``self.predictions`` as a side effect.
        """
        decision = "ham" if ham > spam else "spam"
        self.predictions.append(decision)
        return decision

    def train(self):
        """Build the model from the word counts gathered by a ``parse_txt*`` call.

        Safe to call repeatedly: every step recomputes its result from the raw
        counts.
        """
        self.find_shared_vocab()  # intersection of the ham and spam vocabularies
        self.update_ham_and_spam_freq()  # keep only the shared words' counts
        self.generate_total_words()  # total shared-word count per class
        self.generate_conditional_probabilities(0.5)  # smoothed P(word | class)

    # ------------------------------------------------------------------
    # METRICS
    # ------------------------------------------------------------------

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
        return numerator / denominator if denominator else 0.0

    def metrics(self):
        """Print the confusion matrix and metrics; return them as a dict.

        ``self.labels`` and ``self.predictions`` are compared pairwise, with
        ham as the positive class. The per-class "accuracy" is the share of
        that class's e-mails that were classified correctly, so it uses the
        real class size and equals the class recall. The overall
        precision/recall/F1 printed under "AVERAGED RESULTS" are the ham
        (positive-class) values. Any ratio whose denominator is zero is
        reported as 0.0 instead of raising ``ZeroDivisionError``.

        Returns:
            dict with keys ``confusion`` (TP/FP/FN/TN counts), ``accuracy``,
            ``precision``, ``recall``, ``f1`` and per-class dicts ``ham`` and
            ``spam`` (each with accuracy/precision/recall/f1), all as
            fractions in [0, 1].
        """
        TP = TN = FN = FP = 0
        for label, prediction in zip(self.labels, self.predictions):
            if label == "ham" and prediction == "ham":
                TP += 1
            elif label == "spam" and prediction == "spam":
                TN += 1
            elif label == "ham" and prediction == "spam":
                FN += 1
            elif label == "spam" and prediction == "ham":
                FP += 1

        ratio = self._ratio

        percision_ham = ratio(TP, TP + FP)
        percision_spam = ratio(TN, TN + FN)
        recall_ham = ratio(TP, TP + FN)
        recall_spam = ratio(TN, TN + FP)
        f1_score_ham = 2 * ratio(percision_ham*recall_ham, percision_ham+recall_ham)
        f1_score_spam = 2 * ratio(percision_spam*recall_spam, percision_spam+recall_spam)

        # Correctly classified share of each class, based on the real class
        # size rather than a fixed test-set size.
        accuracy_ham = ratio(TP, TP + FN)
        accuracy_spam = ratio(TN, TN + FP)

        # Overall figures; ham is treated as the positive class.
        accuracy = ratio(TP + TN, len(self.labels))
        percision = percision_ham
        recall = recall_ham
        f1_score = f1_score_ham

        print("CONFUSION-MATRIX:")
        cm = np.zeros((2,2),dtype=object)
        cm[0,0]= "TP:"+str(TP)
        cm[0,1]= "FP:"+str(FP)
        cm[1,0]= "FN:"+str(FN)
        cm[1,1]= "TN:"+str(TN)
        print(cm)
        print("-------------------------------------------")
        print("METRICS:")
        print("Class Ham: ")
        print("Accuracy: " + str(accuracy_ham*100) + "%")
        print("Percision: " + str(percision_ham*100) + "%")
        print("Recall: " + str(recall_ham*100) + "%")
        print("F1-Score: " + str(f1_score_ham*100)+"%")
        print("Class Spam: ")
        print("Accuracy: " + str(accuracy_spam*100) + "%")
        print("Percision: " + str(percision_spam*100) + "%")
        print("Recall: " + str(recall_spam*100) + "%")
        print("F1-Score: " + str(f1_score_spam*100)+"%")
        print("-------------------------------------------")
        print("AVERAGED RESULTS:")
        print("ACCURACY:"+str((accuracy)*100) +"%")
        print("PERCISION:"+str((percision)*100) +"%")
        print("RECALL:"+str((recall)*100) +"%")
        print("F1-Score:"+str((f1_score)*100) +"%")

        return {
            "confusion": {"TP": TP, "FP": FP, "FN": FN, "TN": TN},
            "accuracy": accuracy,
            "precision": percision,
            "recall": recall,
            "f1": f1_score,
            "ham": {
                "accuracy": accuracy_ham,
                "precision": percision_ham,
                "recall": recall_ham,
                "f1": f1_score_ham,
            },
            "spam": {
                "accuracy": accuracy_spam,
                "precision": percision_spam,
                "recall": recall_spam,
                "f1": f1_score_spam,
            },
        }

    # ------------------------------------------------------------------
    # PARSING & WRITING
    # ------------------------------------------------------------------

    @staticmethod
    def _label_of(filename):
        """Return "ham" or "spam" if it is the second ``-`` separated piece, else None."""
        pieces = filename.split("-")
        if len(pieces) > 1 and pieces[1] in ("ham", "spam"):
            return pieces[1]
        return None

    @classmethod
    def _read_words(cls, path):
        """Return the lower-cased, non-empty letter runs of the file at ``path``."""
        with open(path, 'r', encoding='utf-8',errors='ignore') as handle:
            text = handle.read()
        return [piece.lower() for piece in cls._WORD_SEPARATOR.split(text) if piece]

    def _parse_training_files(self, stop_words=None, length_filter=False):
        """Count the words of every labelled ``.txt`` file in the current directory.

        Files are visited in sorted name order so the resulting vocabulary
        order is reproducible. Entries that are not regular ``.txt`` files or
        whose name carries no ham/spam label are skipped instead of raising.

        ``stop_words``: optional iterable of words to drop.
        ``length_filter``: when true, keep only words of 4 to 8 letters.
        """
        blocked = set(stop_words) if stop_words is not None else None
        for name in sorted(os.listdir()):
            if os.path.splitext(name)[1] != ".txt":
                continue
            label = self._label_of(name)
            if label is None or not os.path.isfile(name):
                continue

            words = self._read_words(name)
            if blocked is not None:
                words = [word for word in words if word not in blocked]
            if length_filter:
                words = [word for word in words if len(word) > 3 and len(word) < 9]

            if label == "ham":
                self.generate_word_ham_freq(words)
            else:
                self.generate_word_spam_freq(words)

    def parse_txt(self):
        """Count all words of the training files in the current directory."""
        self._parse_training_files()

    def parse_txt_swords(self,stop_words):
        """Like ``parse_txt`` but ignore every word contained in ``stop_words``."""
        self._parse_training_files(stop_words=stop_words)

    def parse_txt_wlength(self):
        """Like ``parse_txt`` but keep only words of 4 to 8 letters."""
        self._parse_training_files(length_filter=True)

    def parse_txt_all(self,stop_words):
        """Apply the stop-word filter first and the word-length filter second."""
        self._parse_training_files(stop_words=stop_words, length_filter=True)

    def write_output_model(self,txt):
        """Write the trained model to the file ``txt`` inside ``output_dir``.

        One line per shared-vocabulary word: running number, word, smoothed ham
        count, P(word | ham), smoothed spam count, P(word | spam). Each field
        is followed by two spaces; probabilities are rounded to 8 decimals.
        """
        with open(os.path.join(output_dir,txt),"w") as output:
            for number, word in enumerate(self.shared_vocab, 1):
                fields = [
                    str(number),
                    word,
                    str(self.word_given_ham_freq[word]),
                    str(round(float(self.word_given_ham_prob[word]),8)),
                    str(self.word_given_spam_freq[word]),
                    str(round(float(self.word_given_spam_prob[word]),8)),
                ]
                output.write("  ".join(fields) + "  \n")

    # ------------------------------------------------------------------
    # MODEL TESTING + WRITING
    # ------------------------------------------------------------------

    def write_result_output(self,otxt):
        """Classify every labelled file of the current directory and log the results.

        For each file the score of a class is log10(prior) plus the sum of
        log10 P(word | class) over the file's words that belong to the model.
        The true label is appended to ``self.labels`` and the decision to
        ``self.predictions`` (via ``predict``). One line per file is written to
        ``otxt`` inside ``output_dir``: running number, file name, predicted
        class, ham score, spam score, true class (scores rounded to 5 decimals).

        Files are visited in sorted name order. Entries without a ham/spam
        label in their name (for example ``.DS_Store``) and non-regular files
        are skipped instead of raising.
        """
        counter = 1
        with open(os.path.join(output_dir,otxt),"w") as output:
            for txt in sorted(os.listdir()):
                label = self._label_of(txt)
                if label is None or not os.path.isfile(txt):
                    continue

                self.labels.append(label)
                words = self._read_words(txt)

                # Words outside the shared vocabulary carry no evidence.
                p_ham = 0
                p_spam = 0
                for word in words:
                    if word in self.word_given_ham_freq:
                        p_ham += np.log10(self.word_given_ham_prob[word])
                        p_spam += np.log10(self.word_given_spam_prob[word])

                p_ham = np.log10(self.ham_prior) + (p_ham)
                p_spam = np.log10(self.spam_prior) + (p_spam)

                fields = [
                    str(counter),
                    txt,
                    self.predict(p_ham,p_spam),
                    str(round(p_ham,5)),
                    str(round(p_spam,5)),
                    label,
                ]
                output.write("  ".join(fields) + "  \n")

                counter += 1



## Experiment 1 - Baseline

# Training

In [3]:
# Baseline: train on every word of the training e-mails, without filtering.
os.chdir(train_dir)  # the parser reads the files of the current directory
nbo = EmailNaiveBayesClassifier()
nbo.parse_txt()
nbo.train()
nbo.write_output_model("model.txt")

# Testing

In [4]:
# Classify the held-out e-mails; the result writer scans the current directory.
os.chdir(test_dir)
for leftover in (".DS_Store",):  # macOS Finder metadata, removed before the scan
    if os.path.isfile(leftover):
        os.remove(leftover)
nbo.write_result_output("baseline-result.txt")

# Results

In [5]:
# Print the confusion matrix and the metrics for the predictions recorded above.
results = nbo.metrics()

CONFUSION-MATRIX:
[['TP:394' 'FP:74']
 ['FN:6' 'TN:326']]
-------------------------------------------
METRICS:
Class Ham: 
Accuracy: 98.5%
Percision: 84.1880341880342%
Recall: 98.5%
F1-Score: 90.78341013824884%
Class Spam: 
Accuracy: 81.5%
Percision: 98.19277108433735%
Recall: 81.5%
F1-Score: 89.07103825136612%
-------------------------------------------
AVERAGED RESULTS:
ACCURACY:90.0%
PERCISION:84.1880341880342%
RECALL:98.5%
F1-Score:90.78341013824884%


## Experiment 2 - Stop word filtering

# Training

In [112]:
# Stop-word experiment: words contained in `stop_words` are dropped while counting.
os.chdir(train_dir)  # the parser reads the files of the current directory
nbo = EmailNaiveBayesClassifier()
nbo.parse_txt_swords(stop_words)
nbo.train()
nbo.write_output_model("stopword-model.txt")

# Testing

In [113]:
# Classify the held-out e-mails with the stop-word model.
os.chdir(test_dir)
for leftover in (".DS_Store",):  # macOS Finder metadata, removed before the scan
    if os.path.isfile(leftover):
        os.remove(leftover)
nbo.write_result_output("stopword-result.txt")


# Results

In [114]:
# Print the confusion matrix and the metrics of the stop-word experiment.
results = nbo.metrics()

CONFUSION-MATRIX:
[['TP:394' 'FP:71']
 ['FN:6' 'TN:329']]
-------------------------------------------
METRICS:
Class Ham: 
Accuracy: 98.5%
Percision: 84.73118279569893%
Recall: 98.5%
F1-Score: 91.09826589595376%
Class Spam: 
Accuracy: 82.25%
Percision: 98.2089552238806%
Recall: 82.25%
F1-Score: 89.52380952380953%
-------------------------------------------
AVERAGED RESULTS:
ACCURACY:90.375%
PERCISION:84.73118279569893%
RECALL:98.5%
F1-Score:91.09826589595376%


## Experiment 3 - Word length filtering

# Training

In [115]:
# Word-length experiment: only words longer than 3 and shorter than 9 letters
# are counted. A single chdir is enough -- the parser reads the files of the
# current directory.
os.chdir(train_dir)
nbo = EmailNaiveBayesClassifier() # Naive bayes class object
nbo.parse_txt_wlength()
nbo.train()
nbo.write_output_model("wordlength-model.txt")

# Testing

In [116]:
# Classify the held-out e-mails with the word-length model.
os.chdir(test_dir)
for leftover in (".DS_Store",):  # macOS Finder metadata, removed before the scan
    if os.path.isfile(leftover):
        os.remove(leftover)
nbo.write_result_output("wordlength-result.txt")

# Results

In [117]:
# Print the confusion matrix and the metrics of the word-length experiment.
results = nbo.metrics()

CONFUSION-MATRIX:
[['TP:392' 'FP:60']
 ['FN:8' 'TN:340']]
-------------------------------------------
METRICS:
Class Ham: 
Accuracy: 98.0%
Percision: 86.72566371681415%
Recall: 98.0%
F1-Score: 92.01877934272301%
Class Spam: 
Accuracy: 85.0%
Percision: 97.70114942528735%
Recall: 85.0%
F1-Score: 90.90909090909092%
-------------------------------------------
AVERAGED RESULTS:
ACCURACY:91.5%
PERCISION:86.72566371681415%
RECALL:98.0%
F1-Score:92.01877934272301%


## Experiment 4 - Stop words + word length filtering

# Training

In [118]:
# Combined experiment: stop-word filter first, then the word-length filter.
nbo = EmailNaiveBayesClassifier() # Naive bayes class object
os.chdir(train_dir)
nbo.parse_txt_all(stop_words)
nbo.train()
# Use a file name of its own so that the model written by the stop-word
# experiment ("stopword-model.txt") is not overwritten.
nbo.write_output_model("stopword-wordlength-model.txt")

# Testing

In [119]:
# Classify the held-out e-mails with the combined-filter model.
os.chdir(test_dir)
if os.path.isfile(".DS_Store"):
    # macOS Finder metadata file, removed before the directory is scanned
    os.remove(".DS_Store")
# Use a file name of its own so that the results of the stop-word experiment
# ("stopword-result.txt") are not overwritten.
nbo.write_result_output("stopword-wordlength-result.txt")

# Results

In [120]:
# Print the confusion matrix and the metrics of the combined-filter experiment.
results = nbo.metrics()

CONFUSION-MATRIX:
[['TP:392' 'FP:55']
 ['FN:8' 'TN:345']]
-------------------------------------------
METRICS:
Class Ham: 
Accuracy: 98.0%
Percision: 87.69574944071589%
Recall: 98.0%
F1-Score: 92.56198347107438%
Class Spam: 
Accuracy: 86.25%
Percision: 97.73371104815864%
Recall: 86.25%
F1-Score: 91.63346613545818%
-------------------------------------------
AVERAGED RESULTS:
ACCURACY:92.125%
PERCISION:87.69574944071589%
RECALL:98.0%
F1-Score:92.56198347107438%


# Demo

# Training

In [None]:
# Demo run: same unfiltered pipeline as the baseline, written to demo-* files.
os.chdir(train_dir)  # the parser reads the files of the current directory
nbo = EmailNaiveBayesClassifier()
nbo.parse_txt()
nbo.train()
nbo.write_output_model("demo-model.txt")

# Testing

In [None]:
# Classify the held-out e-mails with the demo model.
os.chdir(test_dir)
for leftover in (".DS_Store",):  # macOS Finder metadata, removed before the scan
    if os.path.isfile(leftover):
        os.remove(leftover)
nbo.write_result_output("demo-result.txt")

# Results

In [None]:
# Print the confusion matrix and the metrics of the demo run.
results = nbo.metrics()