### Packages

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics

### Loading data

In [None]:
# Upload the CSV files through the Colab file picker. Outside Colab the
# google.colab package cannot be imported; in that case skip the upload and
# rely on the CSV files already being present in the working directory
# (they are read by relative path below).
try:
  from google.colab import files
except ImportError:
  uploaded = {}
else:
  uploaded = files.upload()

In [4]:
import io 

# Spam messages
spam_df = pd.read_csv('spam_df2.csv')

# Easy ham: parsed with the python engine; malformed rows are skipped with a
# warning. `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
easy_ham_df = pd.read_csv('easy_ham_df2.csv', engine = 'python', on_bad_lines = 'warn')

# Hard ham
hard_ham_df = pd.read_csv('hard_ham_df2.csv')

### Preprocessing: Train and test split 

In [5]:
# 80/20 split of the spam messages: sample the training rows with a fixed seed;
# every row that was not sampled goes to the test set
spam_train = spam_df.sample(frac = 0.8, random_state = 25)
spam_test = spam_df.drop(index = spam_train.index)




In [6]:
def train_test(df,nr_train,nr_test,random_state = 25):
  """Draw disjoint train and test samples of fixed sizes from `df`.

  Used to build a dataset that is balanced between spam and ham messages.

  Parameters:
    df: DataFrame containing at least the columns 'text' and 'label'.
    nr_train: number of rows to sample for the training set.
    nr_test: number of rows to sample for the test set, drawn from the rows
      that were not selected for training.
    random_state: seed passed to both `DataFrame.sample` calls
      (defaults to 25, the value previously hard-coded here).

  Returns:
    (train, test): two DataFrames restricted to the columns 'text' and 'label'.

  Raises:
    ValueError: raised by `DataFrame.sample` when more rows are requested
      than are available.
  """
  train = df.sample(n = nr_train, random_state = random_state)
  # Remove the training rows first so the test sample cannot overlap with them
  remaining = df.drop(index = train.index)
  test = remaining.sample(n = nr_test, random_state = random_state)

  columns = ['text','label']
  return train[columns], test[columns]






In [7]:
# Easy ham: draw as many train/test rows as there are spam train/test rows
easy_train, easy_test = train_test(easy_ham_df, nr_train = len(spam_train), nr_test = len(spam_test))

# Hard ham: 80/20 split, the rows not sampled for training form the test set
hard_train = hard_ham_df.sample(frac = 0.8, random_state = 25)
hard_test = hard_ham_df.drop(index = hard_train.index)


### Naive Bayes

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# Note: `drop(columns=...)` replaces the positional-axis form `drop(label, 1)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.

# Training set, easy ham vs spam
X_easy = pd.concat([easy_train, spam_train]).drop(columns = 'Unnamed: 0')

# Test set, easy ham vs spam
x_easy = pd.concat([easy_test, spam_test]).drop(columns = 'Unnamed: 0')

# Training set, hard ham vs spam: spam is subsampled to the number of hard ham rows
X_hard = pd.concat([hard_train, spam_train.sample(n = len(hard_train), random_state = 25)]).drop(columns = 'Unnamed: 0')

# Test set, hard ham vs spam: spam is subsampled to the number of hard ham rows
x_hard = pd.concat([hard_test, spam_test.sample(n = len(hard_test), random_state = 25)]).drop(columns = 'Unnamed: 0')



In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Shared estimator instances; Naive_Bayes refits them on every call
multi_NB = MultinomialNB()
bernoulli_NB = BernoulliNB()

def Naive_Bayes(train,test,mtype,dtype,stopwords):
  """Fit a Naive Bayes classifier on `train`, evaluate it on `test` and print the results.

  The texts are turned into word counts with a CountVectorizer fitted on the
  training texts only. The confusion matrix and the per-class accuracies
  (share of correctly classified messages within each true class) are printed.

  Parameters:
    train: DataFrame with the columns 'text' and 'label' used for fitting.
    test: DataFrame with the columns 'text' and 'label' used for evaluation.
    mtype: 'multinomial' (word counts + MultinomialNB) or
      'bernoulli' (binary word occurrence + BernoulliNB).
    dtype: label of the ham class, e.g. 'easy_ham' or 'hard_ham'.
    stopwords: list of words handed to CountVectorizer as `stop_words`.

  Returns:
    The document-term matrix of the training texts.

  Raises:
    ValueError: if `mtype` is neither 'multinomial' nor 'bernoulli'.
  """
  if mtype == "multinomial":
    count_vec = CountVectorizer(stop_words = stopwords)
    model = multi_NB
    title = "Result multinomial NB:"
  elif mtype == "bernoulli":
    # binary = True: only record whether a word occurs, not how often
    count_vec = CountVectorizer(stop_words = stopwords, binary = True)
    model = bernoulli_NB
    title = "Result Bernoulli NB:"
  else:
    # Previously an unknown mtype only failed at `return X` with an UnboundLocalError
    raise ValueError("mtype must be 'multinomial' or 'bernoulli', got %r" % (mtype,))

  # Vocabulary is learned from the training texts; the test texts are only transformed
  X = count_vec.fit_transform(train.text)
  x = count_vec.transform(test.text)

  clf = model.fit(X,train.label)
  pred = clf.predict(x)

  # Rows/columns are ordered as ['spam', dtype]: row 0 = true spam, row 1 = true ham
  matrix = confusion_matrix(test.label,pred,labels = ['spam',dtype])
  acc_spam = matrix[0,0]/(matrix[0,0] + matrix[0,1])
  acc_ham = matrix[1,1]/(matrix[1,1]+matrix[1,0])

  print(title,dtype,sep = " ")
  print(matrix)
  print("Accuracy spam:",acc_spam,sep = " ")
  print("Accuracy ham:",acc_ham, sep = " ")
  print("\n")

  return X


In [None]:
# Baseline runs without stop words: both models on both datasets
X_easy_mn = Naive_Bayes(X_easy, x_easy, mtype = 'multinomial', dtype = 'easy_ham', stopwords = [])
X_hard_mn = Naive_Bayes(X_hard, x_hard, mtype = 'multinomial', dtype = 'hard_ham', stopwords = [])
X_easy_bn = Naive_Bayes(X_easy, x_easy, mtype = 'bernoulli', dtype = 'easy_ham', stopwords = [])
X_hard_bn = Naive_Bayes(X_hard, x_hard, mtype = 'bernoulli', dtype = 'hard_ham', stopwords = [])

### Common Words

In [11]:
def words_list(X,n_common,n_uncommon):
  """Find the most and least frequent words in the 'text' column of `X`.

  Parameters:
    X: DataFrame with a 'text' column.
    n_common: number of most frequent words to return.
    n_uncommon: number of least frequent words to return.

  Returns:
    (return_list, common, uncommon):
      return_list: the common words followed by the uncommon words.
      common: Series of total counts for the `n_common` most frequent words,
        indexed by word.
      uncommon: Series of total counts for the `n_uncommon` least frequent
        words, indexed by word.
  """
  vect = CountVectorizer()
  X_counts = vect.fit_transform(X.text)

  # Sum the columns of the sparse matrix directly; converting it to a dense
  # array first needs (documents x vocabulary) memory for the same totals
  totals = pd.Series(np.asarray(X_counts.sum(axis = 0)).ravel(),
                     index = vect.get_feature_names_out())
  common = totals.sort_values(ascending = False).head(n_common)
  uncommon = totals.sort_values(ascending = True).head(n_uncommon)

  return_list = list(common.index) + list(uncommon.index)

  return return_list, common, uncommon



In [12]:
# The 50 most frequent words (and no infrequent words) of each of the three datasets
# NOTE(review): the word lists are computed on the full datasets, i.e. including
# the rows that end up in the test sets - confirm this is intended.
easy_words, easy_common, easy_uncommon = words_list(easy_ham_df, n_common = 50, n_uncommon = 0)
hard_words, hard_common, hard_uncommon = words_list(hard_ham_df, n_common = 50, n_uncommon = 0)
spam_words, spam_common, spam_uncommon = words_list(spam_df, n_common = 50, n_uncommon = 0)

In [13]:
# Words shared between the spam word list and the easy / hard ham word list
common_easy = list(set(easy_words) & set(spam_words))
common_hard = list(set(hard_words) & set(spam_words))

# Uncommon words of each dataset
uncommon_easy = list(easy_uncommon.index)
uncommon_hard = list(hard_uncommon.index)
uncommon_spam = list(spam_uncommon.index)

# Final stop word lists to pass to the function Naive_Bayes
stop_easy = common_easy + uncommon_easy + uncommon_spam
stop_hard = common_hard + uncommon_hard + uncommon_spam

In [None]:
# Same four runs as the baseline, now with the dataset-specific stop word lists
Naive_Bayes(X_easy, x_easy, mtype = 'multinomial', dtype = 'easy_ham', stopwords = stop_easy)
Naive_Bayes(X_hard, x_hard, mtype = 'multinomial', dtype = 'hard_ham', stopwords = stop_hard)
Naive_Bayes(X_easy, x_easy, mtype = 'bernoulli', dtype = 'easy_ham', stopwords = stop_easy)
Naive_Bayes(X_hard, x_hard, mtype = 'bernoulli', dtype = 'hard_ham', stopwords = stop_hard)

### Filtering out headers and footers

In [15]:
def filter_out(data):
  """Keep only the part of each text between its first and last quotation mark (").

  Parameters:
    data: DataFrame with the columns 'text' and 'label'.

  Returns:
    A new DataFrame with a fresh 0..n-1 index and the columns 'text'
    (the filtered text) and 'label' (copied from `data` in the same order).
    A text with exactly one quotation mark becomes an empty string; a text
    without any quotation mark is kept unchanged.
  """
  def _between_quotes(txt):
    # Return the text enclosed by the first and the last quotation mark.
    start = txt.find('"')
    if start == -1:
      # No quotation mark: find/rfind would both give -1 and the slice
      # txt[0:-1] would silently cut off the last character
      return txt
    return txt[start + 1:txt.rfind('"')]

  # Build the frame in one go; DataFrame.append was removed in pandas 2.0 and
  # appending row by row copies the whole frame on every iteration
  return pd.DataFrame({
    'text': [_between_quotes(txt) for txt in data.text],
    'label': list(data.label),
  })


In [16]:
# Strip everything outside the quotation marks from the training sets ...
X_easy_filtered = filter_out(data = X_easy)
X_hard_filtered = filter_out(data = X_hard)

# ... and from the test sets
x_easy_filtered = filter_out(data = x_easy)
x_hard_filtered = filter_out(data = x_hard)

In [None]:
# Both models on the filtered datasets, without stop words
X_easy_mn = Naive_Bayes(X_easy_filtered, x_easy_filtered, mtype = 'multinomial', dtype = 'easy_ham', stopwords = [])
X_hard_mn = Naive_Bayes(X_hard_filtered, x_hard_filtered, mtype = 'multinomial', dtype = 'hard_ham', stopwords = [])
X_easy_bn = Naive_Bayes(X_easy_filtered, x_easy_filtered, mtype = 'bernoulli', dtype = 'easy_ham', stopwords = [])
X_hard_bn = Naive_Bayes(X_hard_filtered, x_hard_filtered, mtype = 'bernoulli', dtype = 'hard_ham', stopwords = [])

In [None]:
# Both models on the filtered datasets, with the dataset-specific stop word lists
X_easy_mn = Naive_Bayes(X_easy_filtered, x_easy_filtered, mtype = 'multinomial', dtype = 'easy_ham', stopwords = stop_easy)
X_hard_mn = Naive_Bayes(X_hard_filtered, x_hard_filtered, mtype = 'multinomial', dtype = 'hard_ham', stopwords = stop_hard)
X_easy_bn = Naive_Bayes(X_easy_filtered, x_easy_filtered, mtype = 'bernoulli', dtype = 'easy_ham', stopwords = stop_easy)
X_hard_bn = Naive_Bayes(X_hard_filtered, x_hard_filtered, mtype = 'bernoulli', dtype = 'hard_ham', stopwords = stop_hard)