## **Naive Bayes Text Classification**

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from text_preprocessing import *
import numpy as np
import math
from collections import defaultdict
from sklearn import metrics

#### **Get documents**

Topics:
- Computer Science
- Languages

In [3]:
urls_and_class = [["https://en.wikipedia.org/wiki/Computer_science", 0], 
                      ["https://en.wikipedia.org/wiki/Social_computing", 0],
                     ["https://en.wikipedia.org/wiki/Computer_science_and_engineering", 0], 
                     ["https://en.wikipedia.org/wiki/Computer_engineering", 0],
                  ["https://en.wikipedia.org/wiki/Language", 1],
                      ["https://en.wikipedia.org/wiki/Linguistics", 1],
                     ["https://en.wikipedia.org/wiki/Cognitive_linguistics", 1],
                     ["https://en.wikipedia.org/wiki/Psycholinguistics", 1]]
class_names = {0: "Computer science", 1: "Languages"}

In [4]:
df = pd.DataFrame.from_records(urls_and_class, columns=['url', 'class'])
df

Unnamed: 0,url,class
0,https://en.wikipedia.org/wiki/Computer_science,0
1,https://en.wikipedia.org/wiki/Social_computing,0
2,https://en.wikipedia.org/wiki/Computer_science...,0
3,https://en.wikipedia.org/wiki/Computer_enginee...,0
4,https://en.wikipedia.org/wiki/Language,1
5,https://en.wikipedia.org/wiki/Linguistics,1
6,https://en.wikipedia.org/wiki/Cognitive_lingui...,1
7,https://en.wikipedia.org/wiki/Psycholinguistics,1


#### **Preprocess text**

In [5]:
class TextPreprocessorNB(TextPreprocessor):
    def preprocess_text_nb(self, text):
        text = self.lower_case(text)
        text = self.remove_extra_spaces(text)
        text = self.remove_numbers(text)
        text = self.remove_square_braces(text)
        text = self.tokenize_words(text)
        text = self.remove_stopwords(text)
        return text

In [6]:
preprocessor = TextPreprocessorNB()
def get_paragraphs_from_link(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text)
    paras = soup.find_all('p')
    paragraphs = []
    for p in paras:
        text = p.getText()
        # preprocess
        text = preprocessor.preprocess_text_nb(text)
        text = " ".join([w for w in text])
        print(text)
        paragraphs.append(text)
    return paragraphs
def combine_strings(strings):
    result = ""
    for p in strings:
        result += p
        result += " "
    return result

In [7]:
df['text'] = [combine_strings(get_paragraphs_from_link(url)) for url in df['url']]
df


computer science study computation , information , automation . computer science spans theoretical disciplines ( algorithms , theory computation , information theory ) applied disciplines ( including design implementation hardware software ) . though often considered academic discipline , computer science closely related computer programming .
algorithms data structures central computer science . theory computation concerns abstract models computation general classes problems solved using . fields cryptography computer security involve studying means secure communication preventing security vulnerabilities . computer graphics computational geometry address generation images . programming language theory considers different ways describe computational processes , database theory concerns management repositories data . human–computer interaction investigates interfaces humans computers interact , software engineering focuses design principles behind developing software . areas operating

Unnamed: 0,url,class,text
0,https://en.wikipedia.org/wiki/Computer_science,0,"computer science study computation , informat..."
1,https://en.wikipedia.org/wiki/Social_computing,0,social computing area computer science concern...
2,https://en.wikipedia.org/wiki/Computer_science...,0,computer science engineering ( cse ) academic ...
3,https://en.wikipedia.org/wiki/Computer_enginee...,0,computer engineering ( coe cpe ) branch compu...
4,https://en.wikipedia.org/wiki/Language,1,language structured system communication cons...
5,https://en.wikipedia.org/wiki/Linguistics,1,linguistics scientific study language . lingu...
6,https://en.wikipedia.org/wiki/Cognitive_lingui...,1,cognitive linguistics interdisciplinary branch...
7,https://en.wikipedia.org/wiki/Psycholinguistics,1,psycholinguistics psychology language study in...


#### **Bag of Words Vectorizer**

In [8]:
class BOW:
    def __init__(self):
        self.bow = {}
        self.total_words = 0
    
    def fit(self, text):
        words = self.tokenize(text)
        for i in range(len(words)):
            word = words[i]
            if word not in self.bow.keys():
                self.bow[word] = 1
            else:
                self.bow[word] += 1
            self.total_words += 1
        self.bow = self.bow
        return self.bow
    
    def tokenize(self, text):
        words = nltk.word_tokenize(text)
        return words
    
    def print_bow(self):
        bow_df = pd.DataFrame.from_records(list(self.bow.items()), columns=['Word', 'Count'])
        print(bow_df)
    
    def get_total_words(self):
        return self.total_words

#### **Naive Bayes Classifier using BOW**

In [31]:
class NBClassifierWithBOW:
    def __init__(self):
        self.total_samples = 0
        self.class_counts = {}
        self.class_conditional_counts = {}
        self.laplace_smoothing_factor = 1
        self.class_total_words = {}
        self.class_unique_words = {}
        self.preprocessor = TextPreprocessorNB()
    
    def fit(self, X, y):
        class_texts = {}
        for i in range(len(X)):
            cls = int(y[i])
            # update class count
            if cls not in self.class_counts:
                self.class_counts[cls] = 1
                self.total_samples += 1
                class_texts[cls] = X[i]
            else:
                self.class_counts[cls] += 1
                class_texts[cls] += (" " + X[i])
        # for each class define new BOW object to store class conditional prob
        for c in self.class_counts.keys():
            bow_object = BOW()
            self.class_conditional_counts[c] = bow_object.fit(class_texts[c])
            self.class_total_words[c] = bow_object.get_total_words()
            # need this while smoothing, for updating the denominator
            self.class_unique_words[c] = len(self.class_conditional_counts[c].keys())
            print("\n-------------------------------------------")
            print(f"Class {c}")
            print("-------------------------------------------")
            print(f"Count = {self.class_counts[c]}")
            bow_object.print_bow()
        # perform smoothing
        self.perform_smoothing()
    
    def perform_smoothing(self):
        for c in self.class_counts.keys():
            k = self.class_unique_words[c]
            self.class_total_words[c] += k
            for w in self.class_conditional_counts[c].keys():
                self.class_conditional_counts[c][w] += 1
    
    def predict(self, X_test):
        predictions = []
        for text in X_test:
            predictions.append(self.get_prediction(text))
        return predictions
    
    def get_prediction(self, text):
        max_prob = -1
        predicted_class = -1
        text = self.preprocessor.preprocess_text_nb(text)
        for class_id in self.class_counts.keys():
            p_class = (self.class_counts[class_id] / self.total_samples)
            p_conditionals = 1
            for word in text:
                if word in self.class_conditional_counts[class_id]:
                    p_conditionals *= (self.class_conditional_counts[class_id][word] / self.class_unique_words[class_id])
                else:
                    p_conditionals *= (1 / self.class_unique_words[class_id])
            p_class_given_text = p_class * p_conditionals
            if p_class_given_text > max_prob:
                max_prob = p_class_given_text
                predicted_class = class_id
        return predicted_class
    
    def get_dataframe_of_class_bow(self):
        df_bow = pd.DataFrame([self.class_conditional_counts[i] for i in self.class_conditional_counts.keys()])
        df_bow = df_bow.fillna(1)
        print(df_bow)
        self.df_bow = df_bow

In [10]:
nb_classifier_with_bow = NBClassifierWithBOW()
nb_classifier_with_bow.fit(df['text'], df['class'])


-------------------------------------------
Class 0
-------------------------------------------
Count = 4
             Word  Count
0        computer    196
1         science    100
2           study     19
3     computation     18
4               ,    684
...           ...    ...
2111         wage      1
2112   categories      1
2113    petroleum      1
2114          top      2
2115      nuclear      1

[2116 rows x 2 columns]

-------------------------------------------
Class 1
-------------------------------------------
Count = 4
               Word  Count
0          language    523
1        structured      3
2            system     35
3     communication     39
4          consists      6
...             ...    ...
3498        therapy      1
3499      suffering      1
3500           list      1
3501          books      1
3502     non-expert      1

[3503 rows x 2 columns]


In [11]:
# predict on test text
testing = "I love Computer Science!!!"
predicted_class = nb_classifier_with_bow.predict(testing)
print(class_names[predicted_class])

Computer science


In [12]:
nb_classifier_with_bow.get_dataframe_of_class_bow()

   computer  science  study  computation     ,  information  automation    .  \
0       197      101     20           19   685           45         3.0  396   
1        12       24     85            2  1406           20         1.0  949   

   spans  theoretical  ...  diminishes  seems  deficits  arise  damage  \
0    3.0           11  ...         1.0    1.0       1.0    1.0     1.0   
1    1.0           19  ...         2.0    2.0       2.0    2.0     2.0   

   therapy  suffering  list  books  non-expert  
0      1.0        1.0   1.0    1.0         1.0  
1      2.0        2.0   2.0    2.0         2.0  

[2 rows x 4662 columns]


#### **Naive Bayes Classifier with TF-IDF**

In [45]:
class NBClassifierWithTfIdf:
    def __init__(self):
        self.N = 0
        self.class_word_counts = defaultdict(lambda: defaultdict(int))
        self.class_doc_counts = defaultdict(int)
        self.vocabulary = set()
        self.preprocessor = TextPreprocessorNB()
        self.tfidf = {}
        self.tf = {}
    
    def fit(self, X, Y):
        self.N = len(X)
        docs = []
        for x, y in zip(X, Y):
            words = self.preprocessor.preprocess_text_nb(x)
            docs.append(set(words))
            self.vocabulary.update(words)
            self.class_doc_counts[y] += 1
            for word in words:
                self.class_word_counts[y][word] += 1

        # compute idf
        self.idf = {}
        for word in self.vocabulary:
            num_docs_containing_word = sum(1 for doc in docs if word in doc)
            self.idf[word] = math.log(self.N / (num_docs_containing_word))
        
        for docid in self.class_word_counts.keys():
            self.tf[docid] = {}
            self.tfidf[docid] = {}
            for word in self.class_word_counts[docid].keys():
                tot = sum(self.class_word_counts[docid].values())
                self.tf[docid][word] = self.class_word_counts[docid][word] / tot
                self.tfidf[docid][word] = (self.tf[docid][word]) * self.idf[word]
        return self.tfidf
    
    def predict(self, X_test):
        predictions = []
        for x in X_test:
            max_score = float('-inf')
            predicted_class = None
            for c in self.class_doc_counts.keys():
                score = 0
                words = self.preprocessor.preprocess_text_nb(x)
                for word in words:
                    if word in self.tfidf[c].keys():
                        # not sure about the implementation, adding scores w/o taking log, but it should still be proportional to the prob that this word is useful for this class
                        score += self.tfidf[c][word]
                if score > max_score:
                    max_score = score
                    predicted_class = c
            predictions.append(predicted_class)
        return predictions
    
    def get_dataframe_of_class_tfidf(self):
        df_tfidf = pd.DataFrame([self.tfidf[i] for i in self.tfidf.keys()])
        df_tfidf = df_tfidf.fillna(0)
        self.df_tfidf = df_tfidf
        return df_tfidf


In [14]:
df

Unnamed: 0,url,class,text
0,https://en.wikipedia.org/wiki/Computer_science,0,"computer science study computation , informat..."
1,https://en.wikipedia.org/wiki/Social_computing,0,social computing area computer science concern...
2,https://en.wikipedia.org/wiki/Computer_science...,0,computer science engineering ( cse ) academic ...
3,https://en.wikipedia.org/wiki/Computer_enginee...,0,computer engineering ( coe cpe ) branch compu...
4,https://en.wikipedia.org/wiki/Language,1,language structured system communication cons...
5,https://en.wikipedia.org/wiki/Linguistics,1,linguistics scientific study language . lingu...
6,https://en.wikipedia.org/wiki/Cognitive_lingui...,1,cognitive linguistics interdisciplinary branch...
7,https://en.wikipedia.org/wiki/Psycholinguistics,1,psycholinguistics psychology language study in...


In [15]:
nb_classifier_with_tfidf = NBClassifierWithTfIdf()
nb_classifier_with_tfidf.fit(df['text'], df['class'])

{0: {'computer': 0.0,
  'science': 0.0,
  'study': 0.0,
  'computation': 0.0,
  ',': 0.0,
  'information': 0.0008601055885637524,
  'automation': 0.000608824928028059,
  '.': 0.0,
  'spans': 0.0004058832853520394,
  'theoretical': 0.0004211419593789794,
  'disciplines': 0.0007568496445180928,
  '(': 0.0,
  'algorithms': 0.0011008722102081349,
  'theory': 0.0,
  ')': 0.0,
  'applied': 0.00033691356750318354,
  'including': 0.0003790277634410815,
  'design': 0.0027521805255203372,
  'implementation': 0.0005743400691036312,
  'hardware': 0.0038558912108443743,
  'software': 0.002442623364398081,
  'though': 8.422839187579589e-05,
  'often': 0.00013683497999877878,
  'considered': 0.0002947993715652856,
  'academic': 0.0005474845471926732,
  'discipline': 0.0006317129390684691,
  'closely': 0.0002029416426760197,
  'related': 0.00016845678375159177,
  'programming': 0.000963263183932118,
  'data': 0.0,
  'structures': 0.00011728712571323898,
  'central': 0.0004058832853520394,
  'concerns'

In [16]:
nb_classifier_with_tfidf.get_dataframe_of_class_tfidf()

Unnamed: 0,computer,science,study,computation,",",information,automation,.,spans,theoretical,...,diminishes,seems,deficits,arise,damage,therapy,suffering,list,books,non-expert
0,0.0,0.0,0.0,0.0,0.0,0.00086,0.000609,0.0,0.000406,0.000421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000157,0.0,0.0,0.0,0.00032,...,0.000129,0.000129,0.000129,6.1e-05,0.000129,0.000129,0.000129,2.9e-05,0.000129,0.000129


In [17]:
test_text = "Language is a process of free creation; its laws and principles are fixed, but the manner in which the principles of generation are used is free and infinite."
predicted = nb_classifier_with_tfidf.predict([test_text])
print(class_names[predicted[0]])

Languages


#### **Naive Bayes with bigram probabilities**

In [18]:
class NGram:
    def __init__(self, text, n=1):
        self.n = n
        self.total_unigrams = 0
        self.unigram_freq = {}
        self.ngram_freq = {}
        self.tokenized_text = self.tokenize(text)
        self.vocab_size = 0
    
    def tokenize(self, text):
        sents = nltk.sent_tokenize(text)
        tokenized_text = []
        for s in sents:
            # remember to add (n - 1) <s> at the beginning of sentence for ngram
            tokenized_text.extend(["<s>"] * (self.n - 1))
            words = nltk.word_tokenize(s)
            tokenized_text.extend(words)
            tokenized_text.append("</s>")
        return tokenized_text
    
    def compute_unigram_count_matrix(self):
        for i in range(len(self.tokenized_text)):
            unigram = self.tokenized_text[i]
            if unigram not in self.unigram_freq.keys():
                self.unigram_freq[unigram] = 1
                self.vocab_size += 1
            else:
                self.unigram_freq[unigram] += 1
            self.total_unigrams += 1
        return self.unigram_freq
    
    def compute_ngram_count_matrix(self):
        for i in range(0, len(self.tokenized_text) - self.n + 1):
            ngram = tuple(self.tokenized_text[i : i + self.n])
            if ngram not in self.ngram_freq.keys():
                self.ngram_freq[ngram] = 1
            else:
                self.ngram_freq[ngram] += 1
        # for Laplace smoothing add one to every count
        for k in self.ngram_freq.keys():
            self.ngram_freq[k] += 1
        return self.ngram_freq
    
    def print_unigram_counts(self):
        print("---------------------------------------------------")
        print("Unigram counts")
        print("---------------------------------------------------")
        df = pd.DataFrame.from_records(list(self.unigram_freq.items()), columns=['Unigram', 'Count'])
        print(df)
        print("---------------------------------------------------")
    
    def print_ngram_counts(self):
        print("---------------------------------------------------")
        print(f"Ngram counts (n = {self.n})")
        print("---------------------------------------------------")
        df = pd.DataFrame.from_records(list(self.ngram_freq.items()), columns=['Ngram', 'Count'])
        print(df)
        print("---------------------------------------------------")
    
    def get_unigram_probability_from_counts(self):
        total = sum(self.unigram_freq.values())
        self.unigram_probability_distribution = {}
        for k, v in self.unigram_freq.items():
            self.unigram_probability_distribution[k] = v/total
        return self.unigram_probability_distribution
    
    def get_ngram_probability_from_counts(self):
        total = sum(self.ngram_freq.values())
        self.ngram_probability_distribution = {}
        for k, v in self.ngram_freq.items():
            # using Laplace smoothing, and hence adding the vocab size in the denominator
            self.ngram_probability_distribution[k] = v/(self.unigram_freq[k[0]] + self.vocab_size)
        del self.ngram_probability_distribution[('</s>', '<s>')]
        return self.ngram_probability_distribution
    
    def print_unigram_probabilities(self):
        print("---------------------------------------------------")
        print(f"Unigram Probabilities")
        print("---------------------------------------------------")
        df = pd.DataFrame.from_records(list(self.unigram_probability_distribution.items()), columns=['Unigram', 'P'])
        print(df)
        print("---------------------------------------------------")
    
    def print_ngram_probabilities(self):
        print("---------------------------------------------------")
        print(f"Ngram Probabilities")
        print("---------------------------------------------------")
        df = pd.DataFrame.from_records(list(self.ngram_probability_distribution.items()), columns=['Ngram', 'P'])
        print(df)
        print("---------------------------------------------------")
    
    def get_unique_words(self):
        return self.total_unigrams

In [19]:
class NBClassifierWithBigram:
    def __init__(self):
        self.class_ngrams = {}
        self.class_counts = {}
        self.class_bigram_prob = {}
        self.total_samples = 0
        self.n = 2
    
    def fit(self, X, y):
        class_texts = {}
        for i in range(len(X)):
            cls = int(y[i])
            # update class count
            if cls not in self.class_counts:
                self.class_counts[cls] = 1
                self.total_samples += 1
                class_texts[cls] = X[i]
            else:
                self.class_counts[cls] += 1
                class_texts[cls] += (" " + X[i])

        for k, v in class_texts.items():
            class_texts[k] = preprocessor.remove_numbers(class_texts[k].lower())
            self.class_ngrams[k] = NGram(class_texts[k], n=self.n)
            self.class_ngrams[k].compute_unigram_count_matrix()
            self.class_ngrams[k].compute_ngram_count_matrix()
            self.class_ngrams[k].get_unigram_probability_from_counts()
            # let this be the smoothed probabilities
            self.class_bigram_prob[k] = self.class_ngrams[k].get_ngram_probability_from_counts()
    
    def predict(self, X_test):
        predictions = []
        for x in X_test:
            x = preprocessor.remove_numbers(x.lower())
            words = self.tokenize(x)
            max_score = float('-inf')
            predicted_class = None
            for c in self.class_counts.keys():
                p_class = self.class_counts[c] / sum(self.class_counts.values())
                score = 1
                for i in range(0, len(words) - 2 + 1):
                    bg = (words[i], words[i+1])
                    if bg in self.class_bigram_prob[c].keys():
                        score *= self.class_bigram_prob[c][bg]
                    else:
                        if words[i] in self.class_ngrams[c].unigram_freq.keys():
                            score *= 1 / (self.class_ngrams[c].unigram_freq[words[i]] + self.class_ngrams[c].vocab_size)
                        else:
                            # unigram count of words[i] will be 1 since we added the bigram (words[i], words[i+1])
                            score *= 1 / (1 + self.class_ngrams[c].vocab_size) 
                if score > max_score:
                    max_score = score
                    predicted_class = c
            predictions.append(predicted_class)
        return predictions

    def tokenize(self, text):
        sents = nltk.sent_tokenize(text)
        tokenized_text = []
        for s in sents:
            # remember to add (n - 1) <s> at the beginning of sentence for ngram
            tokenized_text.extend(["<s>"] * (self.n - 1))
            words = nltk.word_tokenize(s)
            tokenized_text.extend(words)
            tokenized_text.append("</s>")
        return tokenized_text

In [20]:
nb_classifier_with_bigram = NBClassifierWithBigram()
nb_classifier_with_bigram.fit(df['text'], df['class'])

In [21]:
test_text = ["The most important aspect of computer science is problem solving, an essential skill for life.",
             "Computer science deals with the theory, design, development, and application of computational systems and software."]
predictions = nb_classifier_with_bigram.predict(test_text)
for p in predictions:
    print(f"Predicted class = {class_names[p]}")

Predicted class = Computer science
Predicted class = Computer science


#### **Using AG-news dataset for training and testing**

In [42]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [43]:
X_train = train_data['text']
y_train = train_data['label']
X_test = test_data['text']
y_test = test_data['label']

In [34]:
def train_model(classifier, X_train, y_train, X_test, y_test):
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return metrics.accuracy_score(predictions, y_test)

In [35]:
nb_classifier_with_bow2 = NBClassifierWithBOW()
acc1 = train_model(nb_classifier_with_bow2, X_train, y_train, X_test, y_test)


-------------------------------------------
Class 2
-------------------------------------------
Count = 30000
                  Word  Count
0                 Wall   1078
1                  St.     74
2                Bears      5
3                 Claw      2
4                 Back    118
...                ...    ...
48246  northern\export      1
48247           Ceyhan      2
48248       mega-deals      1
48249     Cost-Cutters      1
48250          mantras      1

[48251 rows x 2 columns]

-------------------------------------------
Class 3
-------------------------------------------
Count = 30000
               Word  Count
0           'Madden      9
1                 ,  41668
2                 '   1164
3             'ESPN      2
4          Football     20
...             ...    ...
57628       Naughty      1
57629       Wichita      1
57630     dollhouse      1
57631     Digitized      1
57632  165-year-old      1

[57633 rows x 2 columns]

-----------------------------------------

In [36]:
print(f'Accuracy using BOW: {acc1}')

Accuracy using BOW: 0.8505263157894737


In [46]:
nb_classifier_with_tfidf2 = NBClassifierWithTfIdf()
acc2 = train_model(nb_classifier_with_tfidf2, X_train, y_train, X_test, y_test)

In [47]:
print(f'Accuracy using TF-IDF vectorizer: {acc2}')

Accuracy using TF-IDF vectorizer: 0.8072368421052631


In [48]:
nb_classifier_with_bigram2 = NBClassifierWithBigram()
acc3 = train_model(nb_classifier_with_bigram2, X_train, y_train, X_test, y_test)

In [49]:
print(f'Accuracy using bigrams: {acc3}')

Accuracy using bigrams: 0.890921052631579
