In [1]:
import csv
import numpy as np
import my_stemmer
from sklearn.model_selection import train_test_split
import utils

In [2]:
import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

In [6]:
def load_news_dataset(training_data_file_path):
    """Load headlines and integer labels from a tab-separated file.

    Every non-empty row contributes its first column (index 0) as the
    headline text and its fifth column (index 4), parsed with ``int``, as the
    label. Blank lines are skipped.

    Args:
        training_data_file_path: Path to a UTF-8 encoded TSV file.

    Returns:
        A ``(headlines, labels)`` tuple of two parallel lists.

    Raises:
        IndexError: If a non-empty row has fewer than five columns.
        ValueError: If the fifth column of a row cannot be parsed as an int.
    """
    headlines = []
    labels = []
    # newline='' hands newline handling to the csv module, as its docs require,
    # so newlines embedded in quoted fields are interpreted correctly.
    with open(training_data_file_path, 'r', encoding='utf8', newline='') as tsv_file:
        for row in csv.reader(tsv_file, delimiter='\t'):
            if not row:
                # csv.reader yields [] for a blank line; indexing it would raise.
                continue
            headlines.append(row[0])
            labels.append(int(row[4]))

    return headlines, labels


def create_dictionary(headlines, min_occurrences=1):
    """Build a word -> column-index vocabulary from the given headlines.

    Words are the tokens returned by ``utils.get_words`` for each headline.
    A word enters the vocabulary at the moment its running occurrence count
    reaches ``min_occurrences``; indices are assigned consecutively from 0 in
    the order words are admitted.

    Args:
        headlines: Iterable of headline texts.
        min_occurrences: Number of occurrences a word needs before it is added.
            The default of 1 admits every word on its first appearance, which
            is the behaviour this function always had.

    Returns:
        Dict mapping each admitted word to its integer index.

    Raises:
        ValueError: If ``min_occurrences`` is smaller than 1.
    """
    if min_occurrences < 1:
        raise ValueError("min_occurrences must be at least 1")

    occurrence_counts = {}
    vocabs = {}
    for headline in headlines:
        for word in utils.get_words(headline):
            occurrence_counts[word] = occurrence_counts.get(word, 0) + 1

            # The count passes through the threshold exactly once, so each word
            # is indexed at most once; len(vocabs) is the next free index.
            if occurrence_counts[word] == min_occurrences:
                vocabs[word] = len(vocabs)
    return vocabs


def bag_of_words(headlines, word_dictionary, multinomial):
    """Turn headlines into a document-term matrix over ``word_dictionary``.

    Args:
        headlines: Sequence of headline texts (one matrix row each).
        word_dictionary: Mapping of word -> column index.
        multinomial: When truthy, each cell holds the number of times the word
            occurs in the headline; otherwise each cell is 1 if the word occurs
            at least once and 0 if it does not.

    Returns:
        Float array of shape ``(len(headlines), len(word_dictionary))``.
        Tokens missing from ``word_dictionary`` are ignored.
    """
    features = np.zeros((len(headlines), len(word_dictionary)))
    for row_idx, text in enumerate(headlines):
        for token in utils.get_words(text):
            if token not in word_dictionary:
                continue
            col_idx = word_dictionary[token]
            # Count occurrences for the multinomial model, mark presence otherwise.
            features[row_idx, col_idx] = features[row_idx, col_idx] + 1 if multinomial else 1
    return features


def refine_to_one_vs_rest_dataset(headlines, labels, one_label=None):
    """Relabel a multi-class dataset as a binary one-vs-rest problem.

    Args:
        headlines: Headlines, passed through unchanged.
        labels: Iterable of original class labels.
        one_label: The class treated as positive. It is effectively required,
            despite the ``None`` default.

    Returns:
        ``(headlines, binary_labels)`` where ``binary_labels`` is a NumPy array
        holding 1 where the label equals ``one_label`` and 0 elsewhere.

    Raises:
        TypeError: If ``one_label`` is ``None``.
    """
    if one_label is None:
        raise TypeError("Missing positional argument: one_label")

    binary_targets = [1 if current == one_label else 0 for current in labels]
    return headlines, np.array(binary_targets)


def fit_naive_bayes_model(matrix, labels, multinomial):
    """Fit a two-class Naive Bayes model with Laplace (add-one) smoothing.

    Args:
        matrix: Array-like of shape ``(n_samples, n_words)`` with per-headline
            word counts (multinomial) or presence flags (Bernoulli).
        labels: 1-D array-like of 0/1 labels, one per row of ``matrix``.
        multinomial: Selects the event model. When truthy, each row of
            ``phi_x`` is a distribution over words (it sums to 1). Otherwise
            each entry is the smoothed fraction of the class's headlines that
            contain the word.

    Returns:
        ``(phi_x, phi_y)`` where ``phi_x`` has shape ``(2, n_words)`` (row 0 for
        label 0, row 1 for label 1) and ``phi_y`` is a shape ``(1,)`` array
        holding the fraction of samples with label 1.
    """
    matrix = np.asarray(matrix)
    labels = np.asarray(labels)

    positive_count = labels.sum()
    negative_count = labels.shape[0] - positive_count
    # Kept as a one-element array: that is the shape this function has always returned.
    phi_y = np.atleast_1d(positive_count / labels.shape[0])

    phi_x = np.zeros((2, matrix.shape[1]))
    phi_x[0] = matrix[labels == 0].sum(axis=0)
    phi_x[1] = matrix[labels == 1].sum(axis=0)

    # Laplace smoothing: one pseudo-observation per word in each class.
    phi_x += 1
    if multinomial:
        # Normalise over the vocabulary; the +1 per word is already in the sum.
        phi_x = phi_x / phi_x.sum(axis=1, keepdims=True)
    else:
        # A Bernoulli feature has two outcomes (present / absent), so add-one
        # smoothing adds 2 to the per-class headline count in the denominator.
        phi_x[0] = phi_x[0] / (negative_count + 2)
        phi_x[1] = phi_x[1] / (positive_count + 2)

    return phi_x, phi_y


def predict_from_naive_bayes_model(model, matrix):
    """Score samples under both classes of a fitted two-class model.

    Args:
        model: ``(phi_x, phi_y)`` pair as returned by the fitting routine;
            ``phi_x`` has shape ``(2, n_words)``.
        matrix: Feature array of shape ``(n_samples, n_words)``; a 1-D vector
            is treated as a single sample.

    Returns:
        Array of shape ``(n_samples, 2)``. Column 0 holds the unnormalised
        log-posterior of label 0 and column 1 that of label 1.

    NOTE(review): only the ``x * log(phi_x)`` term is accumulated. For the
    Bernoulli event model the ``(1 - x) * log(1 - phi_x)`` term for absent
    words is not included -- confirm whether that is intended.
    """
    if matrix.ndim == 1:
        # Promote a single feature vector to a one-row batch.
        matrix = np.expand_dims(matrix, axis=-2)
    word_probs, class_prior = model

    # Broadcast (n_samples, 1, n_words) against (1, 2, n_words), then reduce over words.
    log_word_probs = np.log(word_probs[np.newaxis])
    weighted = matrix[:, np.newaxis] * log_word_probs
    scores = np.sum(weighted, axis=-1)

    # Add the class log-priors.
    scores[:, 0] += np.log(1 - class_prior)
    scores[:, 1] += np.log(class_prior)

    return np.array(scores)

def stem_headlines(headlines):
    """Run every headline through ``my_stemmer.stem_it``.

    A progress line is printed for every tenth headline, starting with index 0.

    Args:
        headlines: Iterable of headline texts.

    Returns:
        List with one single-element list per headline, each wrapping that
        headline's stemmer result.
    """
    collected = []
    for position, text in enumerate(headlines):
        stemmed = my_stemmer.stem_it(text)
        if not position % 10:
            print(f"{position} Headlines Stemming Complete")
        # Each result stays wrapped in its own one-element list.
        collected.append([stemmed])

    return collected

In [10]:
def naive_bayes_model(train_headlines, test_headlines, raw_train_labels, raw_test_labels, multinomial=False,
                      classification_labels=(0, 1, 2)):
    """Train one-vs-rest Naive Bayes classifiers and print test accuracy.

    One binary model is fitted per entry of ``classification_labels``. Each test
    headline is assigned the class whose "positive" log-score is highest. The
    accuracy is printed next to that of the matching scikit-learn estimator,
    fitted on the same feature matrices.

    Args:
        train_headlines: Headlines used to build the vocabulary and fit the models.
        test_headlines: Headlines used for evaluation.
        raw_train_labels: Multi-class labels for ``train_headlines``.
        raw_test_labels: Multi-class labels for ``test_headlines``.
        multinomial: Truthy selects the multinomial event model (word counts,
            compared against ``MultinomialNB``); otherwise the Bernoulli event
            model (word presence, compared against ``BernoulliNB``).
        classification_labels: Class values to build one-vs-rest models for.
            Defaults to the three classes ``0, 1, 2``.

    Returns:
        None. Results are reported through ``print``.
    """
    # Creating dictionary of words
    dictionary = create_dictionary(train_headlines)
    print(f"Total dictionary words: {len(dictionary)}")

    train_matrix = bag_of_words(train_headlines, dictionary, multinomial)
    test_matrix = bag_of_words(test_headlines, dictionary, multinomial)

    class_values = np.asarray(classification_labels)
    expected_labels = np.asarray(raw_test_labels)

    # One row of "positive class" log-scores per class label.
    log_likelihoods = []
    for class_label in classification_labels:
        # Only the training labels need the one-vs-rest relabelling; the feature
        # matrices are shared by every binary model.
        _, train_labels = refine_to_one_vs_rest_dataset(train_headlines, raw_train_labels, class_label)

        # Fit and predict
        model = fit_naive_bayes_model(train_matrix, train_labels, multinomial)
        log_likelihood = predict_from_naive_bayes_model(model, test_matrix)
        log_likelihoods.append(log_likelihood[:, 1])

    # Calculating accuracy
    # The stacked scores have shape (n_classes, n_test), so the winning class is
    # the argmax over axis 0. Reshaping to (n_test, n_classes) would NOT
    # transpose the data -- np.reshape only reinterprets the flat element order
    # and mixes scores from different headlines; np.transpose(...) with axis=1
    # would be the equivalent alternative.
    best_class_index = np.argmax(log_likelihoods, axis=0)
    test_predictions = class_values[best_class_index]
    accuracy = np.mean(test_predictions == expected_labels)
    print(f"Accuracy obtained: {accuracy}")

    # Reference result from scikit-learn's estimator for the same event model.
    if multinomial:
        sklearn_model, sklearn_name = MultinomialNB(), "Multinomial"
    else:
        sklearn_model, sklearn_name = BernoulliNB(), "Bernoulli"
    sklearn_model.fit(train_matrix, raw_train_labels)
    sklearn_predictions = sklearn_model.predict(test_matrix)
    sklearn_accuracy = np.mean(sklearn_predictions == expected_labels)
    print(f"{sklearn_name} sklearn: {sklearn_accuracy}")

In [7]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where this exact file exists.
training_data_file_path = r'/home/jay/projectWorks/Inception/data_preprocessing/data_to_label/final_data_to_label/final_training_data/first_training_data.tsv'

# Loading headlines and raw_labels
headlines, raw_labels = load_news_dataset(training_data_file_path)

# Splitting dataset for training and validation set (30% held out, fixed random_state so the split is repeatable)
train_headlines, test_headlines, raw_train_labels, raw_test_labels = train_test_split(headlines, raw_labels, test_size=0.3, random_state=1)

# Stemming (stem_headlines prints a progress line every 10 headlines and wraps each result in a one-element list)
stemmed_train_headlines = stem_headlines(train_headlines)
stemmed_test_headlines = stem_headlines(test_headlines)

# Collapse the per-headline one-element lists into 1-D arrays with one entry per headline
# (presumably each stemmer result is a single string -- TODO confirm against my_stemmer.stem_it)
stemmed_train_headlines = np.reshape(stemmed_train_headlines, (len(stemmed_train_headlines), ))
stemmed_test_headlines = np.reshape(stemmed_test_headlines, (len(stemmed_test_headlines), ))

0 Headlines Stemming Complete
10 Headlines Stemming Complete
20 Headlines Stemming Complete
30 Headlines Stemming Complete
40 Headlines Stemming Complete
50 Headlines Stemming Complete
60 Headlines Stemming Complete
70 Headlines Stemming Complete
80 Headlines Stemming Complete
90 Headlines Stemming Complete
100 Headlines Stemming Complete
110 Headlines Stemming Complete
120 Headlines Stemming Complete
130 Headlines Stemming Complete
140 Headlines Stemming Complete
150 Headlines Stemming Complete
160 Headlines Stemming Complete
170 Headlines Stemming Complete
180 Headlines Stemming Complete
190 Headlines Stemming Complete
200 Headlines Stemming Complete
210 Headlines Stemming Complete
220 Headlines Stemming Complete
230 Headlines Stemming Complete
240 Headlines Stemming Complete
250 Headlines Stemming Complete
260 Headlines Stemming Complete
270 Headlines Stemming Complete
280 Headlines Stemming Complete
290 Headlines Stemming Complete
300 Headlines Stemming Complete
310 Headlines Stemm

In [13]:
# Fitting multinomial NB (word-count features; prints our accuracy and sklearn's MultinomialNB accuracy)
# NOTE(review): this passes the raw train/test headlines; the stemmed_* arrays built in the
# stemming cell are not used here -- confirm whether the stemmed text was meant to be passed.
naive_bayes_model(train_headlines, test_headlines, raw_train_labels, raw_test_labels, multinomial=True)

Total dictionary words: 3618
Accuracy obtained: 0.6287625418060201
Multinomial sklearn: 0.6287625418060201


In [14]:
# Fitting bernoulli NB (word-presence features; prints our accuracy and sklearn's BernoulliNB accuracy)
# NOTE(review): this passes the raw train/test headlines; the stemmed_* arrays built in the
# stemming cell are not used here -- confirm whether the stemmed text was meant to be passed.
naive_bayes_model(train_headlines, test_headlines, raw_train_labels, raw_test_labels, multinomial=False)

Total dictionary words: 3618
Accuracy obtained: 0.5819397993311036
Bernoulli sklearn: 0.568561872909699
