# Importing Libraries

In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest,mutual_info_classif
import os
from sklearn.naive_bayes import GaussianNB
import  sklearn.metrics 


# Setting the train directories

In [2]:

# Base directory of the e-mail data set. The default is the original author's
# location; set the EMAILS_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
EMAILS_DATA_DIR=os.environ.get("EMAILS_DATA_DIR","/home/abhi/Desktop/Sem 5/AML/asn1/Naive Bayes/EmailsData")
# Training directories: one plain-text e-mail per file, one directory per class.
spam_train_dir=os.path.join(EMAILS_DATA_DIR,"spam-train")
notspam_train_dir=os.path.join(EMAILS_DATA_DIR,"nonspam-train")


# Naive Bayes class: contains both a custom and a scikit-learn implementation of the same classifier

In [3]:

class Naive_bayes(object):
    """Gaussian Naive Bayes spam classifier over TF-IDF features.

    The training e-mails are read from two directories (spam / not spam),
    vectorised with ``TfidfVectorizer`` and reduced to the ``k`` best features
    by mutual information.  Two predictors work on those features: a custom
    Gaussian Naive Bayes (``predict``) and scikit-learn's ``GaussianNB``
    (``sklearrnpredict``).

    Labels are 1 for spam and 0 for not spam.
    """

    def __init__(self,spam_dir,non_spam_dir,k=50):
        """Store the training directories and create the feature extractors.

        spam_dir     -- directory containing the spam training e-mails
        non_spam_dir -- directory containing the non-spam training e-mails
        k            -- number of features kept by SelectKBest (default 50)
        """
        self.spam_dir=spam_dir
        self.non_spam_dir=non_spam_dir
        self.X_train=[]
        self.vectorizer = TfidfVectorizer()
        # k is passed by keyword: recent scikit-learn releases make it
        # keyword-only and reject the positional form.
        self.select = SelectKBest(mutual_info_classif,k=k)

    @staticmethod
    def _read_dir(directory):
        """Return the full text of every regular file in ``directory``.

        Files are visited in sorted name order so the result is reproducible;
        sub-directories are skipped instead of crashing ``open``.
        """
        documents=[]
        for name in sorted(os.listdir(directory)):
            path=os.path.join(directory,name)
            if not os.path.isfile(path):
                continue
            with open(path) as email:
                documents.append(email.read())
        return documents

    # method to read the files in the train directories and build the train corpus
    def _make_corpus(self):
        """Rebuild ``self.X_train`` (spam documents first, then non-spam).

        Returns the tuple ``(number of spam docs, number of non-spam docs)``.
        The corpus is rebuilt from scratch, so calling ``train`` more than once
        does not duplicate documents.
        """
        spam_docs=self._read_dir(self.spam_dir)
        notspam_docs=self._read_dir(self.non_spam_dir)
        self.X_train=spam_docs+notspam_docs
        return len(spam_docs),len(notspam_docs)

    # method to fit the tfidf vectorizer and the feature selector on the train corpus
    def train(self):
        """Read the training corpus and fit the vectorizer and feature selector.

        Sets ``self.Y_train`` (1 for spam rows, 0 for non-spam rows) and
        ``self.select_obj`` (the selected training features).

        Raises ValueError if either training directory contains no files.
        """
        n_spam,n_notspam=self._make_corpus()
        if n_spam==0 or n_notspam==0:
            raise ValueError("both training directories must contain at least one email")
        # labels follow the real file counts instead of a hard-coded 350/350 split
        self.Y_train = np.concatenate((np.ones(n_spam),np.zeros(n_notspam)))
        # sparse matrix: one row per training file, one column per unique word,
        # each cell holding the tfidf value of the word in that file
        train_corpus_tf_idf = self.vectorizer.fit_transform(self.X_train)
        # keeps only the k best-scoring columns; cell values are unchanged
        self.select_obj = self.select.fit_transform(train_corpus_tf_idf.toarray(),self.Y_train)

    # method to calculate mean and standard deviation of each selected feature
    # for both classes (i.e. spam and not spam)
    def mean_std(self):
        """Return ``(spam_mean, spam_std, notspam_mean, notspam_std)``.

        Each element is a 1-D array with one entry per selected feature.
        Rows are assigned to a class through ``self.Y_train``.
        """
        features=np.asarray(self.select_obj)
        labels=np.asarray(self.Y_train)
        spam_rows=features[labels==1]
        notspam_rows=features[labels==0]
        # small value added to std to avoid division by zero
        smoothing=10**(-6)
        spam_mean = np.mean(spam_rows,axis=0)
        spam_std = np.std(spam_rows,axis=0)+smoothing
        notspam_mean = np.mean(notspam_rows,axis=0)
        notspam_std = np.std(notspam_rows,axis=0)+smoothing
        return spam_mean,spam_std,notspam_mean,notspam_std

    def _test_features(self,test_dir):
        """Return the selected tfidf features of the files in ``test_dir``.

        Returns None when the directory contains no files.
        """
        X_test=self._read_dir(test_dir)
        if not X_test:
            return None
        # columns follow the vocabulary learnt in train(); unseen words are dropped
        test_corpus_tf_idf = self.vectorizer.transform(X_test)
        return np.asarray(self.select.transform(test_corpus_tf_idf.toarray()))

    @staticmethod
    def _log_likelihood(rows,mean,std):
        """Per-row sum over features of the Gaussian log-density.

        The constant ``-0.5*log(2*pi)`` is omitted because it is identical for
        both classes and cannot change which class wins.
        """
        return np.sum(-((rows-mean)**2)/(2.0*std*std)-np.log(std),axis=1)

    # method to predict the class of the test files
    def predict(self,test_dir):
        """Classify every file in ``test_dir`` with the custom Gaussian Naive Bayes.

        Returns a list with one entry per file (sorted by file name): 1 for
        spam, 0 for not spam.  An empty directory yields an empty list.
        """
        select_test_obj=self._test_features(test_dir)
        if select_test_obj is None:
            return []

        # per-class mean and std of every selected feature
        spam_mean,spam_std,notspam_mean,notspam_std = self.mean_std()

        # class priors from the training labels (equal when the classes are balanced)
        spam_prior=np.mean(np.asarray(self.Y_train)==1)

        # Scores are accumulated in log space.  Multiplying the raw densities
        # under- or overflows: with the 1e-6 std floor a single factor can be
        # as large as 1e6 or as small as 0, which silently turned ties and
        # NaNs into "not spam".
        spam_score=np.log(spam_prior)+self._log_likelihood(select_test_obj,spam_mean,spam_std)
        notspam_score=np.log(1.0-spam_prior)+self._log_likelihood(select_test_obj,notspam_mean,notspam_std)

        # spam only when it is strictly more likely; ties go to not spam
        return [1 if s>n else 0 for s,n in zip(spam_score,notspam_score)]

    # scikit implementation of naive bayes
    def sklearrnpredict(self,test_dir):
        """Classify every file in ``test_dir`` with scikit-learn's GaussianNB.

        The model is fitted on ``self.select_obj`` / ``self.Y_train`` on every
        call.  Returns the array produced by ``GaussianNB.predict`` (one entry
        per file, sorted by file name); an empty directory yields an empty array.
        """
        select_test_obj=self._test_features(test_dir)
        if select_test_obj is None:
            return np.array([])

        clf = GaussianNB()
        # fitting the train select obj with the class labels
        model = clf.fit(self.select_obj,self.Y_train)
        # predicting the class of select test obj
        return model.predict(select_test_obj)



# Object to test Naive bayes methods implemented

## Training the model

In [4]:

# Build the classifier on the two training directories, then fit the TF-IDF
# vectorizer and the feature selector on the training corpus.
test = Naive_bayes(spam_dir=spam_train_dir,
                   non_spam_dir=notspam_train_dir)
test.train()


## Testing it using custom predict method on test files

In [8]:
spam_test_dir="/home/abhi/Desktop/Sem 5/AML/asn1/Naive Bayes/EmailsData/spam-test"
notspam_test_dir="/home/abhi/Desktop/Sem 5/AML/asn1/Naive Bayes/EmailsData/nonspam-test"

# Each test directory is predicted separately so the ground truth can be
# built per class; spam predictions come first, then non-spam.
prediction1=test.predict(spam_test_dir)
prediction2=test.predict(notspam_test_dir)
prediction=list(prediction1)+list(prediction2)

# Ground-truth labels (1 = spam, 0 = not spam), sized from the number of
# predictions per directory instead of a hard-coded 130 + 130.
true=[1]*len(prediction1)+[0]*len(prediction2)

# Single-argument print(...) gives the same output under Python 2 and Python 3.
print("%s %s" % ("My accuracy = ",sklearn.metrics.accuracy_score(true, prediction)))
print("%s %s" % ("F1 score = ",sklearn.metrics.f1_score(true, prediction)))
print("%s %s" % ("Area under ROC curve = ",sklearn.metrics.roc_auc_score(true, prediction)))
print("Confusion Matrix :")
print(sklearn.metrics.confusion_matrix(true, prediction))

My accuracy =  0.965384615385
F1 score =  0.966037735849
Area under ROC curve =  0.965384615385
Confusion Matrix :
[[123   7]
 [  2 128]]


## Testing it using the sklearn predict method on test files

In [10]:
spam_test_dir="/home/abhi/Desktop/Sem 5/AML/asn1/Naive Bayes/EmailsData/spam-test"
notspam_test_dir="/home/abhi/Desktop/Sem 5/AML/asn1/Naive Bayes/EmailsData/nonspam-test"

# Each test directory is predicted separately so the ground truth can be
# built per class; spam predictions come first, then non-spam.
prediction1=test.sklearrnpredict(spam_test_dir)
prediction2=test.sklearrnpredict(notspam_test_dir)
prediction=list(prediction1)+list(prediction2)

# Ground-truth labels (1 = spam, 0 = not spam), sized from the number of
# predictions per directory instead of a hard-coded 130 + 130.
true=[1]*len(prediction1)+[0]*len(prediction2)

# Single-argument print(...) gives the same output under Python 2 and Python 3.
print("%s %s" % ("sklearn accuracy = ",sklearn.metrics.accuracy_score(true, prediction)))
print("%s %s" % ("F1 score = ",sklearn.metrics.f1_score(true, prediction)))
print("%s %s" % ("Area under ROC curve = ",sklearn.metrics.roc_auc_score(true, prediction)))
print("Confusion Matrix :")
print(sklearn.metrics.confusion_matrix(true, prediction))

sklearn accuracy =  0.969230769231
F1 score =  0.969465648855
Area under ROC curve =  0.969230769231
Confusion Matrix :
[[125   5]
 [  3 127]]


# Report

## Accuracy
The accuracy of the custom implementation (about 96.5%) and of the built-in scikit-learn implementation (about 96.9%) is almost the same, and both are pretty good.

## Observations

Most of the emails are correctly classified by my implementation, as can be seen from its confusion matrix.

There are very few misclassifications for both my implementation and the scikit-learn implementation.

## A note on the libraries
After the SelectKBest object is fitted on the training dataset, it conveniently maps each of the 50 selected words to a column of the `select_obj` NumPy array. This is useful because, when we transform the test TF-IDF sparse matrix into `select_test_obj`, the TF-IDF value of each word ends up in the same mapped column.
We can then evaluate the spaminess of each word (feature).

