# **Aditya Khandare** <br>
## ML HW#1 : Naive Bayes and Logistic Regression for Text Classification <br>
## netid : **ark200000** <br>
## utd email : **ark200000@utdallas.edu** <br>


In [1]:


import pandas as pd
from os import listdir
import os as os
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import math
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

#Please use this command to install nltk data if not already installed 
# nltk.download() 

rootDir = "dataset"
enron1 = []
enron4 = []
hw1 = []

All the datasets are moved in the folder named "dataset". <br>
This folder must be present in the root directory; where this ipynb is saved and will be executed.


In [2]:
def get_AllFilePaths():
    filepaths = []
    for directories, subdirs, files in os.walk(rootDir):
        for f in files:
            if f.endswith(".txt"):
                filepaths.append((os.path.join(directories, f)))
    return filepaths

In [3]:
def get_Data(dataset, test_data_type):
    filepaths = get_AllFilePaths()
    data = []
    for f in filepaths:
        file = open(f,'r',encoding = "latin1")
        eg = file.read()
        file.close()
        path = os.path.split(os.path.split(f)[0])[-1]
        if(dataset in f and test_data_type in f):
            data.append([('1' if "ham" in path.lower() else '0'), eg])
    return data

In [4]:
#Extracting the entire Training Data

enron1_df = pd.DataFrame(get_Data("enron1","train"), columns=["is_ham", "_examples"])
enron4_df = pd.DataFrame(get_Data("enron4","train"), columns=["is_ham", "_examples"])
hw1_df = pd.DataFrame(get_Data("hw1","train"), columns=["is_ham", "_examples"])

In [5]:
#Extracting the entire Testing Data

test_enron1_df = pd.DataFrame(get_Data("enron1","test"), columns=["is_ham", "_examples"])
test_enron4_df = pd.DataFrame(get_Data("enron4","test"), columns=["is_ham", "_examples"])
test_hw1_df = pd.DataFrame(get_Data("hw1","test"), columns=["is_ham", "_examples"])

In [6]:
#Use this function to preprocess the data. The function accepts parameters "stemming" and "lemmatization" 
#which can be set true in order to process the data using snowball stemmer or lemmatizer. By default the function will not use stemming or lemmatization techniques on the data.

def preprocess(example, stemming = False, lemmatization = False):
    
    processed_example = nltk.word_tokenize(example)
    processed_example = [word.lower() for word in processed_example if word.isalpha()]
    stopwords = set(nltk.corpus.stopwords.words('english'))
    processed_example = [word for word in processed_example if word not in stopwords]
    if stemming is True:
        snow_stemmer = SnowballStemmer(language='english')
        processed_example = [snow_stemmer.stem(word) for word in processed_example]
    if lemmatization is True:
        lemmatizer = WordNetLemmatizer()
        processed_example = [lemmatizer.lemmatize(word) for word in processed_example]
    return processed_example

In [7]:
#Preprocess Training data

enron1_df['_examples']  = [preprocess(example, lemmatization=True) for example in enron1_df._examples]
enron4_df['_examples']  = [preprocess(example, lemmatization=True) for example in enron4_df._examples]
hw1_df['_examples']  = [preprocess(example, lemmatization=True) for example in hw1_df._examples]


In [8]:
#Preprocess Test data

test_enron1_df['_examples']  = [preprocess(example, lemmatization=True) for example in test_enron1_df._examples]
test_enron4_df['_examples']  = [preprocess(example, lemmatization=True) for example in test_enron4_df._examples]
test_hw1_df['_examples']  = [preprocess(example, lemmatization=True) for example in test_hw1_df._examples]


In [9]:
#Function to create Bag Of Words model

def bow(dataframe):
    cv = CountVectorizer()
    temp = dataframe
    transformed_data = cv.fit_transform(" ".join(x) for x in dataframe._examples)
    dataframe=pd.DataFrame(transformed_data.toarray(),columns=cv.get_feature_names())
    dataframe.insert(0, 'is_ham', temp.is_ham)
    dataframe.insert(1, '_examples', temp._examples)
    return dataframe

In [10]:
#Function to create Bernoulli model


def bernoulli(df):
    cv = CountVectorizer(binary=True)
    temp = df
    transformed_data = cv.fit_transform(" ".join(x) for x in df._examples)
    df=pd.DataFrame(transformed_data.toarray(),columns=cv.get_feature_names())
    df.insert(0, 'is_ham', temp.is_ham)
    df.insert(1, '_examples', temp._examples)
    return df

In [11]:
# Creating Bag of words and Bernoulli models of HW1 dataset

hw1_bow = bow(hw1_df)
hw1_br = bernoulli(hw1_df)

In [12]:
# Creating Bag of words and Bernoulli models of Enron 1 dataset

enron1_bow = bow(enron1_df)
enron1_br = bernoulli(enron1_df)

In [13]:
# Creating Bag of words and Bernoulli models of Enron 4 dataset

enron4_bow = bow(enron4_df)
enron4_br = bernoulli(enron4_df)

In [14]:
# Use this function to calculate the probability of message being spam and ham in the dataset

def get_spamHamProbabilities(dataframe):
  
    prob_spam = dataframe["is_ham"].value_counts()[1] / dataframe.shape[0] 
    prob_ham = dataframe["is_ham"].value_counts()[0] / dataframe.shape[0]
    return prob_spam, prob_ham

In [15]:
# Calculating the probabilities of spam and ham for the respective datasets

enron1_prob_spam , enron1_prob_ham = get_spamHamProbabilities(enron1_bow)
enron4_prob_spam , enron4_prob_ham = get_spamHamProbabilities(enron4_bow)
hw1_prob_spam , hw1_prob_ham = get_spamHamProbabilities(hw1_bow)

In [16]:
# Use this function to calculate the probabilities of all the tokens in the dataset. The function accepts 2 values for the parameter: model.
# Use model = "bernoulli" if you want to calculate the probabilities of tokens of Bernoulli model 
# Use model = "bow" if you want to calculate the probabilities of tokend of Bag Of Words model

def prob_vocab(dataframe, model):
    alfa = 1
    
    count_ham_words_inAllEmails = 0
    count_spam_words_inAllEmails = 0
    vocab_size = 0

    temp = dataframe
    temp = temp.drop(["_examples"], axis=1)

    spam_df = temp[temp["is_ham"] == "0"]
    ham_df = temp[temp["is_ham"] == "1"]

    spam_df = spam_df.drop(["is_ham"], axis = 1)
    ham_df = ham_df.drop(["is_ham"], axis = 1)

    spam_df = pd.DataFrame( spam_df.to_numpy().sum(axis = 0).reshape(1,-1), columns = spam_df.columns)
    ham_df = pd.DataFrame( ham_df.to_numpy().sum(axis = 0).reshape(1,-1), columns = ham_df.columns)

    vocab_size = len(dataframe.columns) - 2

    if model == "bernoulli":

        spam_email_count = len(dataframe[dataframe["is_ham"] == "0"])
        ham_email_count = len(dataframe[dataframe["is_ham"] == "1"])

        for col_name, col_data in spam_df.iteritems():
            spam_df[col_name] = (spam_df[col_name] + 1) / (spam_email_count + 10)

        for col_name, col_data in ham_df.iteritems():
            ham_df[col_name] = (ham_df[col_name] + 1) / (ham_email_count + 2)

    elif model == "bow":

        for eg in range(dataframe.shape[0]):
            if dataframe["is_ham"][eg] == "1":
                count_ham_words_inAllEmails = count_ham_words_inAllEmails + len(dataframe["_examples"][eg])
            elif dataframe["is_ham"][eg] == "0":
                count_spam_words_inAllEmails = count_spam_words_inAllEmails + len(dataframe["_examples"][eg])
        
        for col_name, col_data in spam_df.iteritems():
            spam_df[col_name] = (spam_df[col_name] + alfa) / (count_spam_words_inAllEmails + alfa*vocab_size)

        for col_name, col_data in ham_df.iteritems():
            ham_df[col_name] = (ham_df[col_name] + alfa) / (count_ham_words_inAllEmails + alfa*vocab_size)

    prob_matrix = pd.concat([spam_df,ham_df], ignore_index=True)
    return prob_matrix


In [17]:
# Calculating the probabilities of tokens for the Bag of Words Model

enron1_bow_probability_vocab = prob_vocab(enron1_bow, "bow")
enron4_bow_probability_vocab = prob_vocab(enron4_bow, "bow")
hw1_bow_probability_vocab = prob_vocab(hw1_bow, "bow")

In [18]:
# Calculating the probabilities of tokens for the Bernoulli model


enron1_br_probability_vocab = prob_vocab(enron1_br , "bernoulli")
enron4_br_probability_vocab = prob_vocab(enron4_br , "bernoulli")
hw1_br_probability_vocab = prob_vocab(hw1_br , "bernoulli")

# Algorithm used to train the multinomial naive bayes:


## TRAINMULTINOMIALNB(C, D) <br>
1. V ← EXTRACTVOCABULARY(D) <br>
2. N ←COUNTDOCS(D)<br>
3. for each c ∈C<br>
4. do Nc ←COUNTDOCSINCLASS (D, c)<br>
5. prior[c] ← Nc/N<br>
6. textc ← CONCATENATETEXTOFALLDOCSINCLASS(D, c)<br>
7. for each t ∈V<br>
8. do Tct ← COUNTTOKENSOFTERM(textc, t)<br>
9. for each t ∈V<br>
10. do condprob[t][c] ←Tct+1<br>
∑t′(Tct′+1)<br>
11. return V, prior, condprob<br>


In [19]:
# Use this function for Multinomial Naive Bayes classification. All the calculations are done in log scale in order to avoid underflow.
# 

def multinomial_naiveBayes(message, prob_spam, prob_ham, prob_vocab):
    prob_spam = math.log(prob_spam)
    prob_ham = math.log(prob_ham)
    # prob_spam = (prob_spam)
    # prob_ham = (prob_ham)
    for word in message:
        if word in prob_vocab.columns.to_list():
            prob_spam += math.log(prob_vocab[word][0])
            # prob_spam *= (prob_vocab[word][0])
            prob_ham += math.log(prob_vocab[word][1])
            # prob_ham *= (prob_vocab[word][1])
        else:
             prob_spam += math.log(1)
             prob_ham += math.log(1)
            # prob_spam *= (1)
            # prob_ham *= (1)
    if prob_ham > prob_spam:
        return 'ham'
    elif prob_ham < prob_spam:
        return 'spam'

# Algorithm used to train the Bernoulli Naive Bayes:


## TRAINBERNOULLINB(C, D)
1. V ←EXTRACTVOCABULARY(D)
2. N ← COUNTDOCS(D)
3. for each c ∈C
4. do Nc ←COUNTDOCSINCLASS(D, c)
5. prior[c] ← Nc/N
6. for each t ∈V
7. do Nct ←COUNTDOCSINCLASSCONTAININGTERM (D, c, t)
8. condprob[t][c] ←(Nct + 1)/(Nc + 2)
9. return V, prior, condprob

In [20]:
def bernoulli_naiveBayes(message, prob_spam, prob_ham, prob_vocab):
    prob_spam = math.log(prob_spam)
    prob_ham = math.log(prob_ham)

    for col_name, col_data in prob_vocab.iteritems():
        if col_name in message:
            prob_spam += math.log(prob_vocab[col_name][0])
            prob_ham += math.log(prob_vocab[col_name][1])
        else:
            prob_spam += math.log(1 - prob_vocab[col_name][0])
            prob_ham += math.log(1 - prob_vocab[col_name][1])

    if prob_ham > prob_spam:
        return 'ham'
    elif prob_ham < prob_spam:
        return 'spam'

# Algorithm used to apply the Multinomial Naive Bayes and Bernoulli Naive Bayes

## APPLYMULTINOMIALNB(C, V, prior, condprob, d)<br>
1. W ← EXTRACTTOKENSFROMDOC (V, d)<br>
2. for each c ∈C<br>
3. do score[c] ←log prior[c]<br>
4. for each t ∈W<br>
5. do score[c] += log condprob[t][c]<br>
6. return arg maxc∈C score[c]<br>

## APPLYBERNOULLINB(C, V, prior, condprob, d)
1. Vd ← EXTRACTTERMSFROMDOC(V, d)
2. for each c ∈C
3. do score[c] ←log prior[c]
4. for each t ∈V
5. do if t ∈Vd
6. then score[c] += log condprob[t][c]
7. else score[c] += log(1 −condprob[t][c])
8. return arg maxc∈C score[c]


In [21]:
def run_model(dataframe, prob_spam, prob_ham, prob_vocab ,model):
    accuracy = 0
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for x in range(dataframe.shape[0]):
        if model == "bow":
            typ = multinomial_naiveBayes(dataframe["_examples"][x], prob_spam, prob_ham,prob_vocab)
        elif model == "ber":
            typ = bernoulli_naiveBayes(dataframe["_examples"][x], prob_spam, prob_ham,prob_vocab)

        if typ == "ham" and dataframe["is_ham"][x] == "1":
            accuracy  += 1
            tp += 1

        elif typ == "spam" and dataframe["is_ham"][x] == "0":
            accuracy  += 1
            tn += 1
        elif typ == "ham" and dataframe["is_ham"][x] == "0":
            fp += 1
        elif typ == "spam" and dataframe["is_ham"][x] == "1":
            fn += 1
    print() 
    print("model : ", model)        
    print("Accuracy : ", (accuracy/dataframe.shape[0])*100)
    print("Precision : ", (tp / (tp+fp))*100)
    print("Recall : ", (tp / (tp+fn))*100)
    print("F1 score : ", 2*(((tp / (tp+fp))*100)*(tp / (tp+fn))*100)/(((tp / (tp+fp))*100)+(tp / (tp+fn))*100))


In [22]:
# Finally running the Discrete and Multinomial Naive Bayes on Enron 1, Enron 4 and HW 1 datasets

print("ENRON 1 Dataset: ") 
run_model(test_enron1_df,enron1_prob_spam, enron1_prob_ham, enron1_bow_probability_vocab, model="bow")
run_model(test_enron1_df,enron1_prob_spam, enron1_prob_ham, enron1_bow_probability_vocab, model="ber")

print("ENRON 4 Dataset: ") 
run_model(test_enron4_df,enron4_prob_spam, enron4_prob_ham, enron4_bow_probability_vocab, model="bow")
run_model(test_enron4_df,enron4_prob_spam, enron4_prob_ham, enron4_bow_probability_vocab, model="ber")

print("HW 1 Dataset: ") 
run_model(test_hw1_df,hw1_prob_spam, hw1_prob_ham, hw1_bow_probability_vocab, model="bow")
run_model(test_hw1_df,hw1_prob_spam, hw1_prob_ham, hw1_bow_probability_vocab, model="ber")

ENRON 1 Dataset: 

model :  bow
Accuracy :  92.98245614035088
Precision :  93.65079365079364
Recall :  96.09120521172639
F1 score :  94.85530546623792

model :  ber
Accuracy :  93.2017543859649
Precision :  93.67088607594937
Recall :  96.41693811074919
F1 score :  95.02407704654897
ENRON 4 Dataset: 

model :  bow
Accuracy :  94.47513812154696
Precision :  86.30952380952381
Recall :  95.39473684210526
F1 score :  90.62500000000001

model :  ber
Accuracy :  93.73848987108656
Precision :  85.97560975609755
Recall :  92.76315789473685
F1 score :  89.24050632911393
HW 1 Dataset: 

model :  bow
Accuracy :  94.56066945606695
Precision :  95.48022598870057
Recall :  97.12643678160919
F1 score :  96.29629629629629

model :  ber
Accuracy :  94.56066945606695
Precision :  95.48022598870057
Recall :  97.12643678160919
F1 score :  96.29629629629629


# MCAP Logistic Regression:


In [23]:
def split_dataset(dataset, size):
    train,test = train_test_split(dataset, test_size=size/100)
    return train.reset_index(drop = True), test.reset_index(drop = True)

In [24]:
# Splitting the datasets into 70:30 ratio for training and validation purposes. Please note that the validation data has been referred as test in the below variables. 
# However the variables contain the validation data and not test data.

enron1_lr_train_df, enron1_lr_test_df = split_dataset(enron1_df , 30)
enron4_lr_train_df, enron4_lr_test_df = split_dataset(enron4_df , 30)
hw1_lr_train_df, hw1_lr_test_df = split_dataset(hw1_df ,30)

In [25]:
# Creating Bag of Words and Bernoulli models of the training data
# We will also be creating the X and Y vectors required for LR

enron1_lr_train_bow = bow(enron1_lr_train_df)
enron1_lr_train_bow_X = enron1_lr_train_bow.iloc[:,2:].to_numpy()
enron1_lr_train_bow_Y = enron1_lr_train_bow["is_ham"].to_numpy().astype(int)

enron1_lr_train_br = bernoulli(enron1_lr_train_df)
enron1_lr_train_br_X = enron1_lr_train_bow.iloc[:,2:].to_numpy()
enron1_lr_train_br_Y = enron1_lr_train_bow["is_ham"].to_numpy().astype(int)

enron4_lr_train_bow = bow(enron4_lr_train_df)
enron4_lr_train_bow_X = enron4_lr_train_bow.iloc[:,2:].to_numpy()
enron4_lr_train_bow_Y = enron4_lr_train_bow["is_ham"].to_numpy().astype(int)

enron4_lr_train_br = bernoulli(enron4_lr_train_df)
enron4_lr_train_br_X = enron4_lr_train_br.iloc[:,2:].to_numpy()
enron4_lr_train_br_Y = enron4_lr_train_br["is_ham"].to_numpy().astype(int)

hw1_lr_train_bow = bow(hw1_lr_train_df)
hw1_lr_train_bow_X = hw1_lr_train_bow.iloc[:,2:].to_numpy()
hw1_lr_train_bow_Y = hw1_lr_train_bow["is_ham"].to_numpy().astype(int)


hw1_lr_train_br = bernoulli(hw1_lr_train_df)
hw1_lr_train_br_X = hw1_lr_train_br.iloc[:,2:].to_numpy()
hw1_lr_train_br_Y = hw1_lr_train_br["is_ham"].to_numpy().astype(int)


In [26]:
# The commomnly used functions are defined below:

def get_exponential(vector1, vector2):
    return np.exp(np.dot(vector2,vector1))

def get_dotProduct(vector1, vector2):
    return np.dot(vector2,vector1)

def prob_Y0(W, X):
    return 1/(1+get_exponential(W, X))

def prob_Y1(W, X):
    return 1-prob_Y0(W, X)

def get_conditionalLogLikelihood(W, X, Y, lmbda=0): 
    return np.sum((Y*get_dotProduct(W, X))+np.log(prob_Y1(W, X)))-np.dot(W.T,W)*lmbda/2
    # return np.sum(Y*np.log(prob_Y0(W,X)) + (1 - Y)*(np.log(prob_Y1(W,X))))

def compare_dotProductOfvectors(vector1, vector2):
    # if get_dotProduct(vector1, vector2) > 0:
    #     return 1
    # else:
    #     return 0
    return (get_dotProduct(vector1, vector2)>0).astype(int)
 
    


In [27]:
# Function to calculate the Weights using Gradient Ascent 

def get_gradientAscent(weights, VectorX, VectorY, lmbda=0.1, learningRate=0.01, iterations=500):
    for i in range(iterations):
        weights += learningRate*(np.dot(VectorX.T,(VectorY-prob_Y1(weights, VectorX)))-(lmbda*weights))
    return weights

In [28]:
def mcap_lr(testData ,vocab, W, model):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    test_count = 0
    acc = 0
    for i in range(testData.shape[0]):
        features = np.zeros((1,vocab.shape[1] - 1))
        features[0][0] = 1
        test_count = 0
        acc = 0
        for word in testData["_examples"][i]:
            acc += 1
            if word in vocab.columns:
                test_count += 1 
                if model == "bow":
                    features[0][vocab.columns.get_loc(word) - 1] += 1
                elif model == "ber":
                    features[0][vocab.columns.get_loc(word) - 1] = 1
        cals = compare_dotProductOfvectors(W,features)
        # print(test_count / acc)
        # print(int(testData["is_ham"][i]))
        # print((cals[0][0]))
        predicted = cals[0][0]
        actual = int(testData["is_ham"][i])
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted  == 1 and actual == 0:
            fp += 1
        elif predicted  == 0 and actual == 0:
            tn += 1
        elif predicted  == 0 and actual == 1:
            fn += 1


    
    final_accuracy = ((tp + tn) / testData.shape[0])*100
    print() 
    print("model : ", model)        
    print("Accuracy : ", ((tp+tn)/testData.shape[0])*100)
    print("Precision : ", (tp / (tp+fp))*100)
    print("Recall : ", (tp / (tp+fn))*100)
    print("F1 score : ", 2*(((tp / (tp+fp))*100)*(tp / (tp+fn))*100)/(((tp / (tp+fp))*100)+(tp / (tp+fn))*100))

    print()


In [29]:
def run_lr(testingData , X, Y, vocab, lmbda = 0.005 , lr = 0.01 , iterations = 1000 , model = "bow" ):
   
    X0=np.ones((X.shape[0],1), dtype=int)
    X = np.hstack((X0, X))
   
    Y.shape=(X.shape[0],1)

    # W=np.random.rand(X.shape[1], 1)
    W = np.zeros((X.shape[1], 1))
    W = get_gradientAscent(W, X, Y, lmbda ,lr , iterations)

    mcap_lr(testingData , vocab, W, model)


In [30]:
print("ENRON 1")
run_lr(enron1_lr_test_df , enron1_lr_train_bow_X , enron1_lr_train_bow_Y , enron1_lr_train_bow)
run_lr(enron1_lr_test_df , enron1_lr_train_br_X , enron1_lr_train_br_Y , enron1_lr_train_br , model = "ber")


print("ENRON 4")
run_lr(enron4_lr_test_df , enron4_lr_train_bow_X , enron4_lr_train_bow_Y , enron4_lr_train_bow)
run_lr(enron4_lr_test_df , enron4_lr_train_br_X , enron4_lr_train_br_Y ,enron4_lr_train_br , model = "ber")

print("HW 1")
run_lr(hw1_lr_test_df , hw1_lr_train_bow_X , hw1_lr_train_bow_Y , hw1_lr_train_bow)
run_lr(hw1_lr_test_df , hw1_lr_train_br_X , hw1_lr_train_br_Y , hw1_lr_train_br, model = "ber")


ENRON 1

model :  bow
Accuracy :  97.03703703703704
Precision :  98.9795918367347
Recall :  97.0
F1 score :  97.97979797979798


model :  ber
Accuracy :  94.81481481481482
Precision :  97.9381443298969
Recall :  95.0
F1 score :  96.4467005076142

ENRON 4

model :  bow
Accuracy :  95.03105590062113
Precision :  96.55172413793103
Recall :  80.0
F1 score :  87.5


model :  ber
Accuracy :  95.65217391304348
Precision :  96.66666666666667
Recall :  82.85714285714286
F1 score :  89.23076923076924

HW 1

model :  bow
Accuracy :  91.36690647482014
Precision :  94.89795918367348
Recall :  93.0
F1 score :  93.93939393939395


model :  ber
Accuracy :  89.20863309352518
Precision :  91.2621359223301
Recall :  94.0
F1 score :  92.61083743842363



In [31]:
print("ENRON 1")
run_lr(test_enron1_df , enron1_lr_train_bow_X , enron1_lr_train_bow_Y , enron1_lr_train_bow)
run_lr(test_enron1_df , enron1_lr_train_br_X , enron1_lr_train_br_Y , enron1_lr_train_br , model = "ber")


print("ENRON 4")
run_lr(test_enron4_df , enron4_lr_train_bow_X , enron4_lr_train_bow_Y , enron4_lr_train_bow)
run_lr(test_enron4_df , enron4_lr_train_br_X , enron4_lr_train_br_Y ,enron4_lr_train_br , model = "ber")

print("HW 1")
run_lr(test_hw1_df , hw1_lr_train_bow_X , hw1_lr_train_bow_Y , hw1_lr_train_bow)
run_lr(test_hw1_df , hw1_lr_train_br_X , hw1_lr_train_br_Y , hw1_lr_train_br, model = "ber")


ENRON 1

model :  bow
Accuracy :  95.39473684210526
Precision :  97.98657718120806
Recall :  95.11400651465797
F1 score :  96.52892561983471


model :  ber
Accuracy :  95.6140350877193
Precision :  98.64406779661017
Recall :  94.78827361563518
F1 score :  96.67774086378738

ENRON 4

model :  bow
Accuracy :  95.39594843462247
Precision :  100.0
Recall :  83.55263157894737
F1 score :  91.0394265232975


model :  ber
Accuracy :  95.76427255985267
Precision :  100.0
Recall :  84.86842105263158
F1 score :  91.8149466192171

HW 1

model :  bow
Accuracy :  93.72384937238493
Precision :  96.49122807017544
Recall :  94.82758620689656
F1 score :  95.65217391304347


model :  ber
Accuracy :  93.93305439330544
Precision :  95.44159544159544
Recall :  96.26436781609196
F1 score :  95.85121602288983

