# Naive Bayes text classification

Naive Bayes model with classic and TF/IDF algorithm to solve text classification problem


In [1]:
import re
import csv
import math
from collections import Counter
from collections import defaultdict
import numpy as np
import nltk

In [2]:
demo_data = [["Chinese Beijing Chinese","0"],
             ["Chinese Chinese Shanghai","0"],
             ["Chinese Macao","0"],
             ["Tokyo Japan Chinese","1"]]
demo_pred =  "Chinese Chinese Chinese Tokyo Japan"

In [3]:
class NaiveBayes:
    """
    Abstract parent class for implementation base skeleton Naive Bayes text classification model
    without classification algorithm. Classification algorithm (classic or TF-IDF) will be implementated in inheritors.
    """
    
    def __init__(self, data, isUseLog = False):
        """
        Constructor, init class fields
        Parameters
            data - texts with labels 
            isUseLog - use the sum of logarithms instead of the multiplication of probabilities
        Returns
            no return     
        """
        self.corpus, self.labels  = self.data_split(data)
        self.classes = Counter(self.labels)
        self.isUseLog = isUseLog
     
    def data_split(self, data):
        """
        Split text with labels into apart array of word lists and an array of labels
        Words convert to lowercase
        Parameters
            data - texts with labels 
        Returns
            corpus - array of texts (text - a list of words)
            labels - array of texts classification labels
        """
        corpus = []
        labels = []
        for text, label in data:
            corpus.append(text.lower().split())
            labels.append(label)
        return corpus, labels
    
    def get_data_statistic(self):
        """
        Get main characteristics of the input texts and classes data
        Parameters
            no parameters
        Returns
            no return 
            Print to display main characteristics of the input texts and classes data
        """
        print ("*** NaiveBayes data statistic ***")
        print ("Corpus length = ", len(self.corpus))
        print ("classes count = ", dict(self.classes))
        print ("classes ratio = ", {i: self.classes[i]/len(self.corpus) for i in self.classes} )
        print ("unique words in corpus = ", self.get_unique_words())
        print ("*** ------------------------- ***")
    
    def get_unique_words(self):
        """
        Calculate unique words in corpus count
        Parameters
            no parameters
        Returns
            number of unique words in corpus
        """
        unique_words_in_corpus = Counter()
        for doc in self.corpus:
            unique_words_in_corpus += Counter(doc)
        return len(unique_words_in_corpus)
    
    def get_prior (self, class_label):
        """
        Calculate probability given class in corpus
        Parameters
            class_label - label of class for calc probability
        Returns
            probability class in corpus
        """
        return self.classes[class_label]/len(self.corpus)          
    
    def fit(self):
        # abstract method, will be implemented in the inheritors
        """
        Model fiting by calculating parameters and store in class fields  
        """
        pass
    
    def predict_doc(self, doc):
        # abstract method, will be implemented in the inheritors
        """
        Classification doc to class
        Parameters
            doc - text (one doc) for classification
        """
        pass
    
    def predict(self, docs):
        """
        Predicts and checks matches for a lot of texts with labels
        Parameters
            docs - texts with labels
        Returns
            matches for each doc in docs in Counter format
        """
        matches = []
        for doc, label in docs:
            y,p = self.predict_doc(doc)
            matches.append(y == label)
        return Counter(matches)


In [4]:
class NaiveBayesClassic(NaiveBayes):
    """
    Class implementing Naive Bayes text classification model
    """
    
    def get_class_words(self):
        """
        Scan corpus for build words occurrences dictionary
        Parameters
            no parameters
        Returns
            words - words in corpus for each label, format - dict(label, Counter)
            total - total words in corpus for each label, format - dict(label, count)
        """
        total = defaultdict(int)
        words = defaultdict(Counter)       
        for i in range(len(self.corpus)):           
            words[self.labels[i]] += Counter(self.corpus[i])
            total[self.labels[i]] += len(self.corpus[i])
        return words, total
        
    def get_p (self, class_label, word):
        """
        Calculate conditional probability for word in class
        Parameters
            class_label - label of class
            word - word for calc conditional probability
        Returns
            p - conditional probability for word in class
        """
        # P(word|class) = (word_count_in_class + 1)/(total_words_in_class+total_unique_words_in_corpus) 
        p = (self.words[class_label][word] + 1)/(self.total[class_label] + self.unique)
        return p
    
    def fit(self):
        """
        Model fiting by calculating parameters and store in class fields  
        Parameters
            no parameters
        Returns
            no return  
        """
        self.words, self.total = self.get_class_words()
        self.unique = self.get_unique_words()
        
    def predict_doc(self, doc):
        """
        Classification doc to class
        Parameters
            doc - text (one doc) for classification
        Returns
            y - class label with max probabilitiy
            p - dict of probabilities of affiliation to each class
        """
        p = dict()
        for label in self.classes:
            p[label] = self.get_prior (label)
            if self.isUseLog:
                p[label] = math.log(p[label])
            for word in doc.lower().split():
                if self.isUseLog:
                    p[label] += math.log(self.get_p(label,word))
                else:
                    p[label] *= self.get_p(label,word)
        y = max(p, key=p.get)
        return y, p 

In [5]:
# Test NaiveBayes Model for demo data

# pobability
nbm = NaiveBayesClassic(demo_data)
nbm.get_data_statistic()
nbm.fit()
y,p = nbm.predict_doc(demo_pred)
print("pobability   ", y, p)

# log
nbm = NaiveBayesClassic(demo_data, True)
nbm.fit()
y,p = nbm.predict_doc(demo_pred)
print("log          ", y, p)

# Must return[ ('Chinese Chinese Chinese Tokyo Japan', '0')]
# pobability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}
# or log     {'1': -7.906681345001262, '0': -7.10769031284391}

*** NaiveBayes data statistic ***
Corpus length =  4
classes count =  {'0': 3, '1': 1}
classes ratio =  {'0': 0.75, '1': 0.25}
unique words in corpus =  6
*** ------------------------- ***
pobability    0 {'0': 0.00030121377997263036, '1': 0.00013548070246744226}
log           0 {'0': -8.10769031284391, '1': -8.906681345001262}


In [6]:
%%time

# data.csv prepare dataset

def read_csv_file(file_name):
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file)
        data = [doc for doc in reader]
        csv_file.close()    
    return data

def delete_stopwords(data):
    # nltk.download('stopwords')  # 1 time or nltk.download()
    stopwords = nltk.corpus.stopwords.words('english')
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    for i in range(len(data)):   
        data[i][0] = data[i][0].lower()
        data[i][0] = pattern.sub('', data[i][0])   # filter(lambda x: x not in stopwords, data[i][0])
    return data

def train_verif_split(data, train_persent):
    train_count = int(len(data)*train_persent/100.0)
    train = data[:train_count]
    verif = data[train_count:]
    return train, verif

# print statistic
data = read_csv_file("data.csv")
train, verif = train_verif_split (data, 80.0)

print(len(data), "=", len(train), "+", len(verif))
print("------------")

nbm = NaiveBayes(data)
nbm.get_data_statistic()

nbm = NaiveBayes(train)
nbm.get_data_statistic()

nbm = NaiveBayes(verif)
nbm.get_data_statistic()

1118 = 894 + 224
------------
*** NaiveBayes data statistic ***
Corpus length =  1118
classes count =  {'0': 380, '1': 738}
classes ratio =  {'0': 0.33989266547406083, '1': 0.6601073345259392}
unique words in corpus =  33697
*** ------------------------- ***
*** NaiveBayes data statistic ***
Corpus length =  894
classes count =  {'0': 297, '1': 597}
classes ratio =  {'0': 0.33221476510067116, '1': 0.6677852348993288}
unique words in corpus =  26275
*** ------------------------- ***
*** NaiveBayes data statistic ***
Corpus length =  224
classes count =  {'1': 141, '0': 83}
classes ratio =  {'1': 0.6294642857142857, '0': 0.3705357142857143}
unique words in corpus =  15295
*** ------------------------- ***
Wall time: 1.67 s


In [7]:
%%time

# data.csv model
data = read_csv_file("data.csv")
train, verif = train_verif_split (data, 80.0)

nbm = NaiveBayesClassic(train)
nbm.fit()
matches = nbm.predict(verif)
print ("pobability")
print ("Matches:  ", matches) 
print ("Accuracy: ", {i: matches[i]/len(verif) for i in matches}) 
print ("----------")

nbm = NaiveBayesClassic(train, True)
nbm.fit()
matches = nbm.predict(verif)
print ("log")
print ("Matches:  ", matches) 
print ("Accuracy: ", {i: matches[i]/len(verif) for i in matches}) 
print ("----------")

nbm = NaiveBayesClassic(delete_stopwords(train), True)
nbm.fit()
matches = nbm.predict(delete_stopwords(verif))
print ("log without stopwords")
print ("Matches:  ", matches) 
print ("Accuracy: ", {i: matches[i]/len(verif) for i in matches}) 
print ("----------")


pobability
Matches:   Counter({True: 117, False: 107})
Accuracy:  {False: 0.47767857142857145, True: 0.5223214285714286}
----------
log
Matches:   Counter({True: 214, False: 10})
Accuracy:  {True: 0.9553571428571429, False: 0.044642857142857144}
----------
log without stopwords
Matches:   Counter({True: 211, False: 13})
Accuracy:  {True: 0.9419642857142857, False: 0.05803571428571429}
----------
Wall time: 3.6 s


# TF-IDF algorithm

## Term Frequency
TF — это частотность термина, которая измеряет, насколько часто термин встречается в документе. Логично предположить, что в длинных документах термин может встретиться в больших количествах, чем в коротких, поэтому абсолютные числа тут не катят. Поэтому применяют относительные — делят количество раз, когда нужный термин встретился в тексте, на общее количество слов в тексте. 

## Inverse Document Frequency
IDF — это обратная частотность документов. Она измеряет непосредственно важность термина. То есть, когда мы считали TF, все термины считаются как бы равными по важности друг другу. Но всем известно, что, например, предлоги встречаются очень часто, хотя практически не влияют на смысл текста. И что с этим поделать? Ответ прост — посчитать IDF. Он считается как логарифм от общего количества документов, делённого на количество документов, в которых встречается термин а.

#### TF термина а = (Количество раз, когда термин а встретился в тексте / количество всех слов в тексте)
#### IDF термина а = (Общее количество документов / Количество документов, в которых встречается термин а)

In [8]:
class NaiveBayesTfIdf(NaiveBayes):
    """
    Class implementing Naive Bayes text classification model with tf-idf algorithm
    """
    
    def calc_tf(self, doc):
        """
        Calculate term frequency for text
        Parameters
            doc - text, array of words
        Returns
            tf - term frequency for each word in Counter format 
        """
        tf = Counter(doc)
        for i in tf:
            tf[i] = tf[i]/float(len(doc))
        return tf

    def calc_idf(self, word):
        """
        Calculate inverse document frequency for word in corpus
        Parameters
            word - word for calc
        Returns
            inverse document frequency word in corpus for each class in format dict(class, idf)
        """
        idf = defaultdict(float)
        for i in range(len(self.corpus)):
            if word in self.corpus[i]:
                idf[self.labels[i]] += 1
        return {label: math.log((self.classes[label]+2)/(idf[label]+1)) for label in self.classes}
    
    def get_words_idf(self):
        """
        Calculate inverse document frequency for each unique word in corpus
        Parameters
            no parameters
        Returns
            idfs - inverse document frequency in format dictionary(word, idf)
        """
        idfs = defaultdict(dict)
        
        unique_words_in_corpus = Counter()
        for doc in self.corpus:
            unique_words_in_corpus += Counter(doc)
        for word in unique_words_in_corpus:
            idfs[word] = self.calc_idf(word)
        return idfs
    
    def fit(self):
        """
        Model fiting by calculating parameters and store in class fields  
        Parameters
            no parameters
        Returns
            no return  
        """
        self.idf = self.get_words_idf()
        
    def predict_doc(self, doc):
        """
        Classification doc to class
        Parameters
            doc - text (one doc) for classification
        Returns
            y - class label with max probabilitiy
            p - dict of probabilities of affiliation to each class
        """
        doc = doc.lower().split()
        tf = self.calc_tf(doc)
        p = dict()
        for label in self.classes:
            p[label] = self.get_prior (label)
            if self.isUseLog:
                p[label] = math.log(p[label])
            for word in doc:
                if word in self.idf:
                    idf_ = self.idf[word][label]
                else:
                    idf_ = 1.0 # 1.0/tf[word]
                if self.isUseLog:
                    p[label] += math.log(tf[word]/idf_)
                else:
                    p[label] *= (tf[word]/idf_)
        y = max(p, key=p.get)
        return y, p 

In [9]:
# TF-IDF

# pobability
nbm = NaiveBayesTfIdf(demo_data)
nbm.fit()
y,p = nbm.predict_doc(demo_pred)
print("pobability   ", y, p)

# log
nbm = NaiveBayesTfIdf(demo_data, True)
nbm.fit()
y,p = nbm.predict_doc(demo_pred)
print("log          ", y, p)

pobability    0 {'0': 0.22515081802568107, '1': 0.19709985929564233}
log           0 {'0': -1.4909847989936278, '1': -1.6240447786966632}


In [10]:
%%time

# data.csv model
data = read_csv_file("data.csv")
train, verif = train_verif_split (data, 80.0)  #data[:400]

nbm = NaiveBayesTfIdf(train)
nbm.get_data_statistic()
nbm.fit()
matches = nbm.predict(verif)
print ("pobability")
print ("Matches:  ", matches) 
print ("Accuracy: ", {i: matches[i]/len(verif) for i in matches}) 
print ("----------")

nbm = NaiveBayesTfIdf(train, True)
nbm.fit()
matches = nbm.predict(verif)
print ("log")
print ("Matches:  ", matches) 
print ("Accuracy: ", {i: matches[i]/len(verif) for i in matches}) 
print ("----------")

nbm = NaiveBayesTfIdf(delete_stopwords(train), True)
nbm.fit()
matches = nbm.predict(delete_stopwords(verif))
print ("log without stopwords")
print ("Matches:  ", matches) 
print ("Accuracy: ", {i: matches[i]/len(verif) for i in matches}) 
print ("----------")


*** NaiveBayes data statistic ***
Corpus length =  894
classes count =  {'0': 297, '1': 597}
classes ratio =  {'0': 0.33221476510067116, '1': 0.6677852348993288}
unique words in corpus =  26275
*** ------------------------- ***
pobability
Matches:   Counter({True: 133, False: 91})
Accuracy:  {True: 0.59375, False: 0.40625}
----------
log
Matches:   Counter({True: 181, False: 43})
Accuracy:  {True: 0.8080357142857143, False: 0.19196428571428573}
----------
log without stopwords
Matches:   Counter({True: 206, False: 18})
Accuracy:  {True: 0.9196428571428571, False: 0.08035714285714286}
----------
Wall time: 3min 47s


- http://nlpx.net/archives/57
- https://stevenloria.com/tf-idf/
- https://ru.wikipedia.org/wiki/TF-IDF