# Comparison of naive Bayes and logistic regression for text categorization

Adapted from https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

## Load subset of "20 Newsgroups" dataset

In [None]:
# Work with four of the twenty newsgroups only.
categories = [
    "misc.forsale",
    "sci.space",
    "sci.electronics",
    "comp.graphics",
]
# Fetch the train and test splits with identical settings (same categories, shuffled).
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split_name, categories=categories, shuffle=True)
    for split_name in ("train", "test")
)

In [None]:
# Number of documents in the training split of the selected categories.
len(twenty_train.data)

In [None]:
# Number of documents in the test split of the selected categories.
len(twenty_test.data)

In [None]:
# Translate the first five integer targets into their newsgroup names and print them.
first_label_names = [twenty_train.target_names[label_id]
                     for label_id in twenty_train.target[:5]]
for label_name in first_label_names:
    print(label_name)

In [None]:
# Integer class labels for the two splits.
y_train, y_test = twenty_train.target, twenty_test.target

## Normalize and vectorize documents

In [None]:
# TF-IDF features: English stop words removed, terms kept only if they occur
# in at least three training documents.
vectorizer = TfidfVectorizer(min_df=3, stop_words="english")
# The vocabulary and IDF weights are learned from the training split only;
# the test split is projected onto that fitted vocabulary.
X_train = vectorizer.fit_transform(twenty_train.data)
X_test = vectorizer.transform(twenty_test.data)

In [None]:
# (number of training documents, vocabulary size after min_df / stop-word filtering)
X_train.shape

## Naive Bayes model

In [None]:
%%time
# Multinomial naive Bayes with add-one (alpha=1.0) smoothing.
nb_model = MultinomialNB(alpha=1.0)
nb_model.fit(X_train, y_train)
y_hat_nb_test = nb_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of naive Bayes on the test split.
nb_report = classification_report(
    y_test,
    y_hat_nb_test,
    target_names=twenty_train.target_names,
)
print(nb_report)

## Logistic regression model

In [None]:
%%time
# Unregularized multinomial logistic regression.
# penalty=None requests a fit without regularization; the string "none" used
# before was deprecated in scikit-learn 1.2 and is rejected from 1.4 onwards.
# multi_class is no longer passed: it is deprecated since scikit-learn 1.5, and
# with the lbfgs solver and more than two classes the default already fits a
# multinomial (softmax) model.
lr_model = LogisticRegression(penalty=None,
                              solver="lbfgs").fit(X_train, y_train)
y_hat_lr_test = lr_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the unregularized model on the test split.
lr_report = classification_report(
    y_test,
    y_hat_lr_test,
    target_names=twenty_train.target_names,
)
print(lr_report)

## Logistic regression with L2 penalty

In [None]:
%%time
# L2-regularized multinomial logistic regression (C=10, i.e. a fairly weak
# penalty), with a raised iteration cap for lbfgs.
# multi_class is no longer passed: it is deprecated since scikit-learn 1.5, and
# with the lbfgs solver and more than two classes the default already fits a
# multinomial (softmax) model.
lr2_model = LogisticRegression(penalty="l2",
                               solver="lbfgs",
                               max_iter=1000,
                               C=10).fit(X_train, y_train)
y_hat_lr2_test = lr2_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the L2-regularized model on the test split.
lr2_report = classification_report(
    y_test,
    y_hat_lr2_test,
    target_names=twenty_train.target_names,
)
print(lr2_report)

## Comparison of train/test performance across models

In [None]:
# Fitted models to compare, keyed by the label shown in the plot legend.
model_info = {
    "Naive Bayes": nb_model,
    "Logistic Regression": lr_model,
    "L2 Regularized LR": lr2_model,
}
# (split label, features, true labels) for each evaluation split, in plot order.
evaluation_splits = (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
)
# One [model, dataset, accuracy] row per model and split.
plot_data = []
for name, model in model_info.items():
    for split_label, features, true_labels in evaluation_splits:
        split_accuracy = accuracy_score(true_labels, model.predict(features))
        plot_data.append([name, split_label, split_accuracy])

In [None]:
# Long-format table: one row per (model, dataset) pair.
plot_df = pd.DataFrame(plot_data, columns=["model", "dataset", "accuracy"])

# One line per model going from train accuracy to test accuracy;
# sort=False keeps the Train -> Test order in which the rows were added.
plt.figure(figsize=(6, 6))
plt.ylim((0.9, 1))
sns.lineplot(data=plot_df, x="dataset", y="accuracy", hue="model", sort=False)
plt.show()

## Feature importances

In [None]:
# Show floats with four decimals in the tables below.
pd.set_option("display.float_format", '{:.4f}'.format)
# Reverse lookup: feature column index -> vocabulary term.
vocab = dict(zip(vectorizer.vocabulary_.values(), vectorizer.vocabulary_.keys()))

### Naive Bayes

In [None]:
# For every class, collect the ten terms with the largest P(w|c) under naive Bayes.
word_data = {}
for class_idx, class_name in enumerate(twenty_train.target_names):
    class_log_probs = nb_model.feature_log_prob_[class_idx, :]
    # argsort is ascending, so reverse it and keep the first ten indices.
    top_features = np.argsort(class_log_probs)[::-1][:10]
    # The model stores log-probabilities; exponentiate to report probabilities.
    word_data[f"{class_name}_P(w|c)"] = [np.exp(log_p) for log_p in class_log_probs[top_features]]
    word_data[f"{class_name}_words"] = [vocab[feature_idx] for feature_idx in top_features]

In [None]:
# Transposed so that each row is one entry of word_data (a class's probabilities
# or words) and the columns run over the ranked terms.
pd.DataFrame(word_data).T

### Logistic regression (L2-regularized model)

In [None]:
# For every class, collect the ten terms with the largest coefficients in the
# L2-regularized logistic regression model.
word_data = {}
for class_idx, class_name in enumerate(twenty_train.target_names):
    class_coefs = lr2_model.coef_[class_idx, :]
    # argsort is ascending, so reverse it and keep the first ten indices.
    top_features = np.argsort(class_coefs)[::-1][:10]
    word_data[f"{class_name}_beta"] = class_coefs[top_features]
    word_data[f"{class_name}_words"] = [vocab[feature_idx] for feature_idx in top_features]

In [None]:
# Transposed so that each row is one entry of word_data (a class's coefficients
# or words) and the columns run over the ranked terms.
pd.DataFrame(word_data).T

In [None]:
import pandas as pd
# Load the raw dataset; the preprocessing loop below reads the message text
# from column 1 of this frame.
dataset = pd.read_csv('data.csv', encoding='ISO-8859-1');

import re
import nltk

# Fetch the 'punkt' resource before importing the tokenizer.
nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt

# Fetch the 'stopwords' resource before importing the stop-word corpus.
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

from autocorrect import Speller

# English speller, applied to every stemmed token in the loop below.
spell = Speller(lang='en')

# Build the English stop-word set once instead of once per token.
english_stopwords = set(stopwords.words('english'))

# One cleaned, space-joined string per message, in dataset row order.
data = []

for sms in dataset.iloc[:, 1]:
    # Replace non-alphabetic characters with spaces, then lowercase so that
    # "Go" and "go" are treated as the same word.
    sms = re.sub('[^A-Za-z]', ' ', sms).lower()

    # Tokenize, drop stop words, then stem and spell-correct what remains.
    sms_processed = [
        spell(stemmer.stem(word))
        for word in wt(sms)
        if word not in english_stopwords
    ]

    data.append(" ".join(sms_processed))



In [None]:
# Create the feature matrix: bag-of-words counts restricted to the 1000 most
# frequent terms, converted to a dense array.
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)

X = matrix.fit_transform(data).toarray()
# Labels come from the first column of the dataset.
y = dataset.iloc[:, 0]



In [None]:
# split train and test data
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffle, and therefore the split and every
# score derived from it, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(y_test)

In [1]:
import numpy as np
import os
import string
import sys
import time
import json
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.datasets import fetch_20newsgroups
import random as rn
#All this for reproducibility
np.random.seed(1)
rn.seed(1)
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk_stopw = stopwords.words('english')

wvLength = 300  # expected length of every word vector
# Embedding source, taken from the first command-line argument when it is one
# of the recognised values. Anything else falls back to 'none' (plain TF-IDF
# features): this covers a missing argument as well as runs where sys.argv
# holds the launcher's own arguments, e.g. inside a notebook kernel.
_VECTOR_SOURCES = ('none', 'fasttext', 'custom-fasttext')
vectorSource = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] in _VECTOR_SOURCES else 'none'

# Built once at definition time instead of on every call.
# The pattern matches whole words that start with a letter, contain only
# letters and digits, and are 3 to 15 characters long.
_WORD_TOKENIZER = RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b')
# Set membership is O(1); nltk_stopw itself is a list.
_STOPWORD_SET = frozenset(nltk_stopw)

def tokenize (text):
    """Return the lowercased word tokens of *text* that are not stop words.

    Tokens start with a letter, consist only of ASCII letters and digits, and
    are 3-15 characters long, so they can never carry punctuation or be empty;
    no extra stripping or emptiness check is needed.
    """
    lowered = (token.lower() for token in _WORD_TOKENIZER.tokenize(text))
    return [token for token in lowered if token not in _STOPWORD_SET]

def get20News():
    """Load all of 20 Newsgroups and tokenize every article.

    Headers, footers and quotes are removed by the loader. Articles that
    yield no tokens are skipped.

    Returns a tuple ``(X, labels, labelToName, nTokens)``:
    X is a list of token lists, labels a NumPy array of the target index of
    each kept article, labelToName maps target index to newsgroup name for
    the indices seen, and nTokens holds the token count of each kept article.
    """
    twenty_news = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
    groupNames = twenty_news['target_names']
    X, labels, labelToName = [], [], {}
    for article, groupIndex in zip(twenty_news['data'], twenty_news['target']):
        tokens = tokenize(article)
        if not tokens:
            # Nothing survived tokenization; drop the article.
            continue
        X.append(tokens)
        labels.append(groupIndex)
        labelToName[groupIndex] = groupNames[groupIndex]
    nTokens = list(map(len, X))
    return X, np.array(labels), labelToName, nTokens

def getEmbeddingMatrix (word_index, vectorSource):
    """Build the embedding matrix for the words in *word_index*.

    Parameters:
        word_index: mapping of word -> row index in the returned matrix.
        vectorSource: 'fasttext' (whitespace-separated text file, one
            ``word v1 v2 ...`` entry per line) or 'custom-fasttext' (a JSON
            object mapping word -> vector).

    Returns:
        Array of shape ``(len(word_index) + 1, wvLength)``. Rows of words
        that have no vector in the source stay all-zero.

    Raises:
        KeyError: if *vectorSource* is not one of the two supported values.
    """
    wordVecSources = {'fasttext' : './vectors/crawl-300d-2M-subword.vec', 'custom-fasttext' : './vectors/' + '20news-fasttext.json' }
    allWv = {}
    # The context manager closes the file even if parsing raises.
    with open(wordVecSources[vectorSource], encoding='utf-8') as f:
        if vectorSource == 'custom-fasttext':
            allWv = json.load(f)
        elif vectorSource == 'fasttext':
            errorCount = 0
            for line in f:
                values = line.split()
                if not values:
                    # Blank line: there is no word to index.
                    continue
                try:
                    wv = np.asarray(values[1:], dtype='float32')
                except ValueError:
                    # A component is not numeric.
                    errorCount += 1
                    continue
                if len(wv) != wvLength:
                    # Wrong dimensionality; a "<count> <dim>" header line, if
                    # present, is rejected here as well.
                    errorCount += 1
                    continue
                allWv[values[0]] = wv
            print ("# Bad Word Vectors:", errorCount)
    embedding_matrix = np.zeros((len(word_index)+1, wvLength))  # +1 for the masked 0
    for word, i in word_index.items():
        if word in allWv:
            embedding_matrix[i] = allWv[word]
    return embedding_matrix

def sparseMultiply (sparseX, corpus_embedding_matrix):
    """Return the dense product of a sparse matrix and an embedding matrix.

    Row ``r`` of the result is the sum, over the non-zero columns ``j`` of
    ``sparseX`` in that row, of ``sparseX[r, j] * corpus_embedding_matrix[j]``.

    Parameters:
        sparseX: SciPy sparse matrix of shape ``(n_rows, n_columns)``.
        corpus_embedding_matrix: 2-D array with at least ``n_columns`` rows;
            rows beyond ``n_columns`` can never be addressed by a column of
            ``sparseX`` and are ignored.

    Returns:
        ``numpy.ndarray`` of shape ``(n_rows, corpus_embedding_matrix.shape[1])``.
    """
    nColumns = sparseX.shape[1]
    # A single sparse-dense product replaces the per-row Python loop; the
    # output width follows the embedding matrix itself.
    product = sparseX.dot(corpus_embedding_matrix[:nColumns])
    return np.asarray(product)


In [6]:

X, labels, labelToName, nTokens = get20News()

# Show the first document's tokens, label and token count, each followed by
# a block of blank lines as a visual separator.
for first_item in (X[0], labels[0], nTokens[0]):
    print(first_item)
    print("\n\n\n\n")

# Summary statistics of nTokens (min, mean, median, std, percentiles, max)
# can be printed here when needed.

# (label, name) pairs ordered by label number: [ (0, ''), (1, ''), .. ]
labelToNameSortedByLabel = sorted(labelToName.items(), key=lambda pair: pair[0])
print(labelToNameSortedByLabel)
namesInLabelOrder = [groupName for _, groupName in labelToNameSortedByLabel]
numClasses = len(namesInLabelOrder)
print ('X, labels #classes classes {} {} {} {}'.format(len(X), str(labels.shape), numClasses, namesInLabelOrder))



['sure', 'bashers', 'pens', 'fans', 'pretty', 'confused', 'lack', 'kind', 'posts', 'recent', 'pens', 'massacre', 'devils', 'actually', 'bit', 'puzzled', 'bit', 'relieved', 'however', 'going', 'put', 'end', 'non', 'pittsburghers', 'relief', 'bit', 'praise', 'pens', 'man', 'killing', 'devils', 'worse', 'thought', 'jagr', 'showed', 'much', 'better', 'regular', 'season', 'stats', 'also', 'lot', 'fun', 'watch', 'playoffs', 'bowman', 'let', 'jagr', 'lot', 'fun', 'next', 'couple', 'games', 'since', 'pens', 'going', 'beat', 'pulp', 'jersey', 'anyway', 'disappointed', 'see', 'islanders', 'lose', 'final', 'regular', 'season', 'game', 'pens', 'rule']





10





70





[(0, 'alt.atheism'), (1, 'comp.graphics'), (2, 'comp.os.ms-windows.misc'), (3, 'comp.sys.ibm.pc.hardware'), (4, 'comp.sys.mac.hardware'), (5, 'comp.windows.x'), (6, 'misc.forsale'), (7, 'rec.autos'), (8, 'rec.motorcycles'), (9, 'rec.sport.baseball'), (10, 'rec.sport.hockey'), (11, 'sci.crypt'), (12, 'sci.electronics'), (13, 'sci.

In [None]:

# X stays a plain list of token lists (rows: docs, entries: words). The
# documents have different lengths, so packing them into one NumPy array would
# create a ragged array, which NumPy >= 1.24 rejects with a ValueError; the
# vectorizer only needs an iterable of documents.
# The identity analyzer tells the vectorizer that documents are already tokenized.
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1).fit(X)
word_index = vectorizer.vocabulary_
Xencoded = vectorizer.transform(X)
print ('Vocab sparse-Xencoded {} {}'.format(len(word_index), str(Xencoded.shape)))

# With an embedding source selected, replace the sparse TF-IDF rows by
# TF-IDF-weighted sums of word vectors.
if vectorSource != 'none':
    embedding_matrix = getEmbeddingMatrix (word_index, vectorSource)
    Xencoded = sparseMultiply (Xencoded, embedding_matrix)
    print ('Dense-Xencoded {}'.format(str(Xencoded.shape)))

# Test & Train Split: a single stratified 80/20 shuffle split with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1).split(Xencoded, labels)
train_indices, test_indices = next(sss)
train_x, test_x = Xencoded[train_indices], Xencoded[test_indices]
train_labels, test_labels = labels[train_indices], labels[test_indices]

# Time training plus prediction together.
start_time = time.time()
model = LinearSVC(tol=1.0e-6,max_iter=20000,verbose=1)
model.fit(train_x, train_labels)
predicted_labels = model.predict(test_x)
elapsed_time = time.time() - start_time

# Compute the confusion matrix once and reuse it for both the JSON and the printout.
confusionMatrix = confusion_matrix(test_labels, predicted_labels)
results = {}
results['confusion_matrix'] = confusionMatrix.tolist()
results['classification_report'] = classification_report(test_labels, predicted_labels, digits=4, target_names=namesInLabelOrder, output_dict=True)

print (confusionMatrix)
print (classification_report(test_labels, predicted_labels, digits=4, target_names=namesInLabelOrder))
print ('Time Taken:', elapsed_time)
results['elapsed_time'] = elapsed_time        # seconds

# The context manager closes the output file even if serialization or writing fails.
out = json.dumps(results, ensure_ascii=True)
with open ('svm-' + vectorSource + '.json','w') as f:
    f.write(out)