<a href="https://colab.research.google.com/github/ccwbroomfield/NLP_work/blob/main/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
from datasets import load_dataset, load_dataset_builder
# IMDB reviews: load the train and test splits separately
dataset_imdb_train = load_dataset('imdb', split="train")
dataset_imdb_test = load_dataset('imdb', split="test")
# Print the IMDB feature description so the column names are visible
dataset_builder = load_dataset_builder('imdb')
print(dataset_builder.info.features)
# Only the "train" split of sms_spam is loaded; a test set is carved out of it
# further down in this notebook
dataset_sms_train = load_dataset('sms_spam', split="train")

In [None]:
from typing import List
import nltk

##### FEATURIZERS  ########
def bag_of_words(cleaned : List[str]):   #given method
  """Return the distinct lowercased alphabetic tokens of a document.

  Tokens for which ``str.isalpha`` is false (numbers, punctuation, mixed
  strings) are dropped. The result comes from a set, so every word appears
  once and the order of the returned list is not defined.
  """
  unique_words = set()
  for token in cleaned:
    if token.isalpha():
      unique_words.add(token.lower())
  return list(unique_words)

def bag_of_counted_words(tokens: List[str]):
  """Return every alphabetic token, keeping duplicates, order and case.

  Unlike ``bag_of_words`` nothing is lowercased or de-duplicated, so a word
  that occurs several times contributes several features.
  """
  return [token for token in tokens if token.isalpha()]

def text_len(tokens: List[str]):
  """Return the number of tokens in the document as a one-element list.

  The raw count is used as the feature; bucketing documents into "long" and
  "short" was considered but does not generalize across datasets.
  """
  token_count = len(tokens)
  return [token_count]

def word_length(tokens: List[str]):
  """Return the average token length, truncated to an int, as a one-element list.

  The average is truncated to an integer so that the feature takes few
  distinct values and therefore matches between documents more often.

  Args:
    tokens: Tokens of one document.

  Returns:
    ``[average_length]``. An empty document yields ``[0]``; previously the
    division by ``len(tokens)`` raised ZeroDivisionError in that case.
  """
  if not tokens:
    return [0]
  total_chars = sum(len(token) for token in tokens)
  return [total_chars // len(tokens)]

def part_of_speech(tokens: List[str]):
  """Return the part-of-speech tag of each token, in token order.

  The tags are used as features exactly like words, so the model ends up
  counting how often each tag occurs per class.
  """
  tagged = nltk.pos_tag(tokens)          #from https://www.nltk.org/book/ch05.html
  return [tag for _word, tag in tagged]

def bigrams(tokens: List[str]):    #from https://www.nltk.org/book/ch05.html
  """Return the pairs of adjacent tokens produced by ``nltk.bigrams`` as a list."""
  adjacent_pairs = nltk.bigrams(tokens)
  return [pair for pair in adjacent_pairs]



  

In [None]:
from nltk.tokenize import word_tokenize # from https://www.nltk.org/api/nltk.tokenize.html
import nltk
nltk.download('punkt')  # fetch the 'punkt' resource; an NLTK error message recommended it (presumably raised by word_tokenize)


##### CLEANERS ########
def basic_split(document: str):
  """Split a document on single space characters.

  Consecutive spaces therefore produce empty-string tokens, and other
  whitespace such as tabs or newlines is not treated as a separator.
  """
  return document.split(" ")

def nltk_word_tok(document: str):
  """Tokenize a document with NLTK's ``word_tokenize`` and return the tokens."""
  tokens = word_tokenize(document)
  return tokens



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from typing import Callable, List, Tuple
from collections import Counter
import nltk
nltk.download('averaged_perceptron_tagger')  # fetch the tagger resource; an NLTK error message recommended it (presumably raised by nltk.pos_tag)

def learn(data, clean, featurize):
  """Collect the counts needed by the naive Bayes classifier.

  Args:
    data: Iterable of ``(document, label)`` pairs. Documents are counted
      while iterating, so generators and other unsized iterables work too.
    clean: Callable that turns a raw document into a list of tokens.
    featurize: List of callables, each mapping the token list to an iterable
      of hashable features. Their outputs are concatenated per document.

  Returns:
    A tuple ``(num_docs, tot_vocab, count_classes, prob_table)``:
    ``num_docs`` is the number of documents seen, ``tot_vocab`` the set of
    all features seen in any class, ``count_classes`` a Counter of documents
    per label, and ``prob_table`` a dict mapping each label to a Counter of
    feature occurrences within that label.
  """
  prob_table = {}
  count_classes = Counter()
  tot_vocab = set()   # built as a set directly instead of a list of every token
  num_docs = 0

  for document, cls in data:
    num_docs += 1
    count_classes[cls] += 1
    if cls not in prob_table:
      prob_table[cls] = Counter()   # first document of this class

    tokens = clean(document)
    features = []
    for featurizer in featurize:
      features.extend(featurizer(tokens))

    prob_table[cls].update(features)   # per-class feature counts
    tot_vocab.update(features)

  return (num_docs, tot_vocab, count_classes, prob_table)
  

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
from collections import Counter
import math
def classify(text, model, clean, featurize):
  """Return the most probable class for ``text`` under the learned counts.

  Scores are computed in log space as
  ``log P(class) + sum(log P(feature | class))`` with add-one (Laplace)
  smoothing:
  ``P(f | c) = (count(f, c) + 1) / (total feature count in c + |vocab|)``.

  Args:
    text: Raw document to classify.
    model: Tuple ``(num_docs, tot_vocab, count_classes, prob_table)`` as
      returned by ``learn``.
    clean: Callable that turns the raw document into a list of tokens.
    featurize: List of callables, each mapping the token list to a list of
      features. Use the same ones the model was trained with.

  Returns:
    The label with the highest score, or ``None`` if the model has no
    classes. On a tie the class that appears first in ``prob_table`` wins.
  """
  num_docs, tot_vocab, count_classes, prob_table = model
  vocab_size = len(tot_vocab)

  tokens = clean(text)
  features = []
  for featurizer in featurize:
    features += featurizer(tokens)

  # Features never seen in training say nothing about any class, so drop them.
  known_features = [f for f in features if f in tot_vocab]

  best_score = -math.inf
  most_prob_class = None

  for cls, feature_counts in prob_table.items():
    score = math.log(count_classes[cls] / num_docs)   # log prior of the class
    # The smoothing denominator is the total number of feature occurrences
    # in the class plus the vocabulary size. The number of distinct features
    # in the class would leave the likelihoods unnormalised.
    denominator = sum(feature_counts.values()) + vocab_size
    for f in known_features:
      score += math.log((feature_counts[f] + 1) / denominator)
    if score > best_score:
      best_score = score
      most_prob_class = cls

  return most_prob_class

In [None]:
from collections import Counter
def classify_baseline(text, model):
  """Return the class with the largest share of training documents.

  ``text`` is ignored: the baseline always predicts the majority class.
  On a tie the class that appears first in the model's per-class table wins,
  and ``None`` is returned when no class has a share above zero.
  """
  total_docs = model[0]
  docs_per_class = model[2]
  known_classes = model[3]

  best_label = None
  best_share = 0
  for label in known_classes:
    share = docs_per_class[label] / total_docs   # fraction of documents with this label
    if share <= best_share:
      continue
    best_label, best_share = label, share

  return best_label

In [None]:
import random

cleaner = word_tokenize   # tokenizer used for both training and classification

# Featurizers to combine; (un)comment entries to try other combinations.
featurize = [
  bag_of_counted_words,
  #text_len,
  #word_length,
  #part_of_speech,
  bigrams,
]

# Select the corpus. For IMDB, swap in the two commented lines below.
#text = dataset_imdb_train["text"]
#labels = dataset_imdb_train["label"]
text = dataset_sms_train["sms"]
labels = dataset_sms_train["label"]

# Pair every document with its label. zip stops at the shorter column, so a
# length mismatch truncates instead of raising IndexError.
data = list(zip(text, labels))


## ONLY FOR SMS, BUILDING TRAIN AND TEST SETS ########
TRAIN_FRACTION = 0.95   # share of the shuffled data used for training

# The shuffle is unseeded, so every run produces a different split.
random.shuffle(data)
len_data = len(data)
train_data = []
test_data = []
for index, tup in enumerate(data):
  if index / len_data < TRAIN_FRACTION:
    train_data.append(tup)
  else:
    test_data.append(tup)

model = learn(train_data, cleaner, featurize)

In [None]:
import datasets
# NOTE(review): load_metric is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed version.
accuracy = datasets.load_metric("accuracy")
precision = datasets.load_metric("precision")
recall = datasets.load_metric("recall")

print("Running the word_tokenize cleaner and bag_of_counted_words and bigram featurizers, trained on 95% of the sms dataset:")

# Predictions of both classifiers and the true label for every document in
# the held-out test set, collected in a single pass.
results_classify = []
results_classify_baseline = []
real_tags = []
# For IMDB iterate over zip(dataset_imdb_test["text"], dataset_imdb_test["label"]) instead.
for text, lab in test_data:
  results_classify.append(classify(text, model, cleaner, featurize))
  results_classify_baseline.append(classify_baseline(text, model))
  real_tags.append(lab)

#calculating metrics
acc_class = accuracy.compute(predictions = results_classify, references = real_tags)
acc_base = accuracy.compute(predictions = results_classify_baseline, references = real_tags)
prec_class = precision.compute(predictions = results_classify, references = real_tags)
prec_base = precision.compute(predictions = results_classify_baseline, references = real_tags)
rec_class = recall.compute(predictions = results_classify, references = real_tags)
rec_base = recall.compute(predictions = results_classify_baseline, references = real_tags)

# F1 is the harmonic mean of precision and recall. When both are 0 the
# denominator is 0, so report 0.0 instead of raising ZeroDivisionError.
f1_denominator = prec_class["precision"] + rec_class["recall"]
if f1_denominator:
  f_score_class = (2 * prec_class["precision"] * rec_class["recall"]) / f1_denominator
else:
  f_score_class = 0.0

print("The accuracy of the naive bayes classifier was " + str(acc_class["accuracy"]) + " compared to the baseline of " + str(acc_base["accuracy"]))
print("The precision of the naive bayes classifier was " + str(prec_class["precision"]) + " compared to the baseline of " + str(prec_base["precision"]))
print("The recall of the naive bayes classifier was " + str(rec_class["recall"]) + " compared to the baseline of " + str(rec_base["recall"]))
print("This gives an F1 score for the naive bayes classifier of " + str(f_score_class))



Running the word_tokenize cleaner and bag_of_counted_words and bigram featurizers, trained on 95% of the sms dataset:
The accuracy of the naive bayes classifier was 0.9820143884892086 compared to the baseline of 0.8345323741007195
The precision of the naive bayes classifier was 1.0 compared to the baseline of 0.0
The recall of the naive bayes classifier was 0.8913043478260869 compared to the baseline of 0.0
This gives an F1 score for the naive bayes classifier of 0.9425287356321839


  _warn_prf(average, modifier, msg_start, len(result))


# **IMDB DATASET**
# Results
Running the word_tokenize cleaner and bag_of_counted_words featurizer: 
*   The accuracy of the naive bayes classifier was 0.8198
*   The precision of the naive bayes classifier was 0.8217 
*   The recall of the naive bayes classifier was 0.8170 
*   This gives an F1 score for the naive bayes classifier of 0.8193

<br/>Running the word_tokenize cleaner and bag_of_counted_words and bigrams featurizers:
*   The accuracy of the naive bayes classifier was 0.8512
*   The precision of the naive bayes classifier was 0.8827 
*   The recall of the naive bayes classifier was 0.8101
* This gives an F1 score for the naive bayes classifier of 0.8449


<br />Running the word_tokenize cleaner and text_len, word_length, and part_of_speech featurizers:
*   The accuracy of the naive bayes classifier was 0.5909
*   The precision of the naive bayes classifier was 0.5884 
*   The recall of the naive bayes classifier was 0.6051
*   This gives an F1 score for the naive bayes classifier of 0.5966

<br />Running the word_tokenize cleaner and bag_of_counted_words, text_len, word_length, part_of_speech, and bigrams featurizers:
*   The accuracy of the naive bayes classifier was 0.8462
*   The precision of the naive bayes classifier was 0.8783
*   The recall of the naive bayes classifier was 0.8038
*   This gives an F1 score for the naive bayes classifier of 0.8394

<br />For all cases the accuracy of the baseline classifier was 0.5, and the precision and recall were 0. This is because there were an equal number of positive and negative reviews in the testing set, and everything was categorized as 0 (meaning there were no true positives). With a precision and recall of 0, the F1 score is undefined: its denominator (precision + recall) is 0, so computing it raises a divide-by-zero error.

---

# Analysis

Using just the bag_of_counted_words featurizer, performance on all metrics was approximately 0.82. This seems reliable, consistent, and unbiased. 
<br/>By including the bigrams featurizer with bag of counted words, the accuracy and precision are increased to 0.85 and 0.88 respectively. This was the best-performing combination of featurizers that I tested. Recall stays approximately the same, meaning that the number of reviews falsely labelled as positive decreased (if the number of true positives had increased, recall would have changed). 
<br/>Classifying simply on text features, the word length, review length, and parts of speech, is dismal (which did not surprise me) and including all 5 featurizers sees a slight dip in performance, but is similar to the bag of counted words and bigrams. 

<br/>**Future Work**
<br/>Future work could still test other combinations of featurizers; I did not exhaust the permutations. I would also be curious if including 3-grams would increase performance as including bigrams did, and what featurizers could increase recall.
 





# **SMS-SPAM DATASET**
# Results
**Running on a random training set of 75% of the data**
<br/>Running the word_tokenize cleaner and bag_of_counted_words featurizer:
*   The accuracy of the naive bayes classifier was 0.9584 compared to the baseline of 0.8672
*   The precision of the naive bayes classifier was 0.9922
*   The recall of the naive bayes classifier was 0.6919
*   This gives an F1 score for the naive bayes classifier of 0.8153

<br/>Running the word_tokenize cleaner and bag_of_counted_words and bigrams featurizers:
*   The accuracy of the naive bayes classifier was 0.9785 compared to the baseline of 0.8636
*   The precision of the naive bayes classifier was 0.9938
*   The recall of the naive bayes classifier was 0.8474
*   This gives an F1 score for the naive bayes classifier of 0.9148

<br/>Running the word_tokenize cleaner and text_len, word_length, and part_of_speech featurizers:
*   The accuracy of the naive bayes classifier was 0.8557 compared to the baseline of 0.8557
*   The precision of the naive bayes classifier was 0.0
*   The recall of the naive bayes classifier was 0.0

*Note: it appears that, when classified based on text properties, the naive bayes classifier behaved exactly like the baseline classifier*

<br/>Running the word_tokenize cleaner and bag_of_counted_words, text_len, word_length, part_of_speech, and bigrams featurizers:
*   The accuracy of the naive bayes classifier was 0.9512 compared to the baseline of 0.8586
*   The precision of the naive bayes classifier was 1.0
*   The recall of the naive bayes classifier was 0.6548
*   This gives an F1 score for the naive bayes classifier of 0.7914

<br>**To assess the performance of different training sizes, the most successful set of featurizers, the bag of counted words and bigrams, were used**

<br/>Trained on 95% of the sms dataset:
*   The accuracy of the naive bayes classifier was 0.9820 compared to the baseline of 0.8345
*   The precision of the naive bayes classifier was 1.0
*   The recall of the naive bayes classifier was 0.8913
*   This gives an F1 score for the naive bayes classifier of 0.9425

<br/>Trained on 90% of the sms dataset:
*   The accuracy of the naive bayes classifier was 0.9964 compared to the baseline of 0.8743
*   The precision of the naive bayes classifier was 1.0
*   The recall of the naive bayes classifier was 0.9714
*   This gives an F1 score for the naive bayes classifier of 0.9855

<br/>Trained on 80% of the sms dataset:
*   The accuracy of the naive bayes classifier was 0.9776 compared to the baseline of 0.8627
*   The precision of the naive bayes classifier was 0.9923
*   The recall of the naive bayes classifier was 0.8431
*   This gives an F1 score for the naive bayes classifier of 0.9117

<br/>Trained on 70% of the sms dataset:
*   The accuracy of the naive bayes classifier was 0.9743 compared to the baseline of 0.8666
*   The precision of the naive bayes classifier was 0.9945
*   The recall of the naive bayes classifier was 0.8117
*   This gives an F1 score for the naive bayes classifier of 0.8938

<br/>Trained on 50% of the sms dataset:
*   The accuracy of the naive bayes classifier was 0.9706 compared to the baseline of 0.8629
*   The precision of the naive bayes classifier was 0.9902
*   The recall of the naive bayes classifier was 0.7932
*   This gives an F1 score for the naive bayes classifier of 0.8808

<br/> For all cases the precision and recall of the baseline classifier were 0, making the F1 score undefined.

---
# Analysis

Performance was far better, at least in the best cases, on the sms_spam dataset than on the imdb dataset. Using just the bag of counted words featurizer, accuracy was 0.96 and precision 0.99, although I was surprised to see a very low recall of 0.69. I imagine this represents the features of the dataset. Adding the bigram featurizer makes marginal improvements to the accuracy and precision but makes dramatic improvements to recall. The addition of the bigram featurizer bumps the F1 score from 0.81 to 0.91.
<br/>I was a bit surprised by my findings with different featurizers. Using exclusively the text-property featurizers (text length, word length, and parts of speech), the naive bayes classifier appears to operate in the exact same way as the baseline classifier, but using all featurizers bumps precision to a perfect 1.0 while recall drops to a mere 0.65.
<br/> To assess the effects of different training sets, I also ran the best performing combination of featurizers, the bag of counted words and bigrams, with models built from training and testing sets of different percentages of the total data. Interestingly, the optimal training set size appeared to be 90% of the data. 
<br/>Judged off of F1 scores alone:

*   Training on 50% of the data produced an F1 score of 0.8808
*   Training on 70% of the data produced an F1 score of 0.8938
*   Training on 75% of the data produced an F1 score of 0.9148
*   Training on 80% of the data produced an F1 score of 0.9117
*   Training on 90% of the data produced an F1 score of 0.9855
*   Training on 95% of the data produced an F1 score of 0.9425

<br/>**Future Work**
<br/>Future work could include different combinations of featurizers, and different featurizers, as the imdb evaluation could. As the sms_spam dataset is not balanced I am especially interested in the precision and recall tradeoffs, as improvements in one metric do not appear to always correspond with an improvement in the other.
<br/>Another important area of work could be evaluating the average performance of the model. The training and testing sets were not made permanent, and were randomly created with each model. While multiple tests appeared to return similar results, it may be interesting to average the metrics of the same models run on many different splits of the data. The variability can be seen in the performance of the baseline classifier, which ranged from 0.83 to 0.87. Additionally, training on 75% of the data performed better than training on 80% of the data, but far worse than training on 90% of the data. I would expect the F1 scores to appear a little more linear when the metrics are averaged over many splits. 

