<a href="https://colab.research.google.com/github/adimyth/datascience_stuff/blob/master/nlp/NBSVMClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NBSVM Classifier

[Jeremy Howard's Kernel | Kaggle](https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline/comments?select=submission.csv)

[Sijun's Blog](https://sijunhe.github.io/blog/2018/04/03/nb-svm/)

NB-SVM was introduced by Sida Wang & Chris Manning in 2012. Bag-of-words model that trains with a fraction of resource (time & compute) but performs reasonably well.

* *Naive Bayes (NB)* and *Support Vector Machine (SVM)* are widely used as baselines in text-related tasks but their performance varies significantly across variants, features and datasets.
* *NB* does better than *SVM* for short snippet sentiment tasks, while *SVM* outperforms NB for longer documents
* A SVM variant using *NB* $log-count ratios$ as feature values consistently performs well across tasks and datasets

In [80]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score
import tensorflow_datasets as tfds 

## Loading Dataset

In [None]:
train = tfds.load('imdb_reviews', split='train')

In [3]:
test = tfds.load('imdb_reviews', split='test')

In [4]:
X_train, y_train, X_test, y_test = [], [], [], []

In [5]:
train = train.batch(25000)
for features in train.take(1):
    X_train.append(features['text'].numpy())
    y_train.append(features['label'].numpy())

In [6]:
X_train = X_train[0]
y_train = y_train[0]

In [7]:
len(X_train), len(y_train)

(25000, 25000)

In [8]:
test = test.batch(25000)
for features in test.take(1):
    X_test.append(features['text'].numpy())
    y_test.append(features['label'].numpy())

In [9]:
X_test = X_test[0]
y_test = y_test[0]

In [10]:
len(X_test), len(y_test)

(25000, 25000)

## TF-IDF Vectorizer

Term Document Matrix which as a Bag of Words representation

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),
                             min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                             smooth_idf=1, sublinear_tf=1 )

In [None]:
vectorizer.fit(X_train)

In [13]:
train_trm_doc = vectorizer.transform(X_train)

In [14]:
test_trm_doc = vectorizer.transform(X_test)

In [15]:
train_trm_doc.shape, test_trm_doc.shape

((25000, 267897), (25000, 267897))

Returns sparse matrix stored in compressed sparse row(csr) format

In [16]:
train_trm_doc[1]

<1x267897 sparse matrix of type '<class 'numpy.float64'>'
	with 148 stored elements in Compressed Sparse Row format>

In [17]:
len(X_train[1].decode("utf-8").split(" "))

112

## Feature Probability

In [18]:
def pr(x, y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [19]:
out = pr(train_trm_doc, 1, y_train)

In [20]:
out.shape

(1, 267897)

#### What is the above function doing?

In [21]:
X = ["This movie is good",
     "The movie is good",
     "This movie is bad",
     "The movie is bad"]
y = np.array([1., 1., 0., 0.])

In [22]:
vec = CountVectorizer()

In [23]:
X_doc = vec.fit_transform(X)

In [24]:
reverse = {k:v for v, k in vec.vocabulary_.items()}

In [25]:
X_doc.toarray().T[0]

array([0, 0, 1, 1])

In [26]:
columns = []
values = []
for idx in range(6):
    columns.append(reverse[idx])
    values.append(X_doc.toarray().T[idx])
df = pd.DataFrame.from_dict(dict(zip(columns, values)))
df.index = X

In [27]:
df

Unnamed: 0,bad,good,is,movie,the,this
This movie is good,0,1,1,1,0,1
The movie is good,0,1,1,1,1,0
This movie is bad,1,0,1,1,0,1
The movie is bad,1,0,1,1,1,0


In [28]:
print(f"p(f|y=1): {pr(X_doc, 0., y)}")

p(f|y=1): [[1.         0.33333333 1.         1.         0.66666667 0.66666667]]


In [29]:
print(f"p(f|y=1): {pr(X_doc, 1., y)}")

p(f|y=1): [[0.33333333 1.         1.         1.         0.66666667 0.66666667]]


\begin{array}{ | l | l | l | l | l | l | l | l | l | }
\hline
	FEATURES (f) &  & \  & \  & \  & \  & \  & \  & \  \\ \hline
	TEXT & LABEL &  & bad & good & is & movie & the  & this \\ \hline
	This movie is good & 1.0 &  & 0.0 & 1.0 & 1.0 & 1.0 & 0.0 & 1.0 \\ \hline
	The movie is good & 1.0 &  & 0.0 & 1.0 & 1.0 & 1.0 & 1.0 & 0.0 \\ \hline
	This movie is bad & 0.0 &  & 1.0 & 0.0 & 1.0 & 1.0 & 0.0 & 1.0 \\ \hline
	The movie is bad & 0.0 &  & 1.0 & 0.0 & 1.0 & 1.0 & 1.0 & 0.0 \\ \hline
	 &  &  &  & \  & \  & \  & \  & \  \\ \hline
	 & Ones (smoothing) & 1.0 & 1.0 & 1.0 & 1.0 & 1.0 & 1.0 & \  \\ \hline
	 &  & sum(label=1) & 0.0 & 2.0 & 2.0 & 2.0 & 1.0 & 1.0 \\ \hline
	 &  & p(f | 1) & 0.333 & 1.0 & 1.0 & 1.0 & 0.667 & 0.667 \\ \hline
	 &  &  &  &  &  &  &  &  \\ \hline
	 &  & sum(label=0) & 2.0 & 0.0 & 2.0 & 2.0 & 1.0 & 1.0 \\ \hline
	 &  & p(f | 0) & 1.0 & 0.333 & 1.0 & 1.0 & 0.667 & 0.667 \\ \hline
	 &  &  &  &  &  & \  & \  & \  \\ \hline
\end{array}


## Model

A logistic regression classifier using `log-count ratios` from Naive Bayes

In [37]:
def get_mdl(X, y):
    r = np.log(pr(X, 1,y) / pr(X, 0,y))
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    X_nb = X.multiply(r)
    return m.fit(X_nb, y), r

* `dual=True` - Useful when *n_features* `>` *n_samples*

In [38]:
model, ratio = get_mdl(train_trm_doc, y_train)

In [41]:
ratio.shape

(1, 267897)

In [58]:
y_pred = model.predict_proba(test_trm_doc.multiply(ratio))

In [60]:
y_pred = np.argmax(y_pred, axis=1)

In [62]:
y_test[:5], y_pred[:5]

(array([1, 1, 0, 0, 1]), array([1, 1, 0, 0, 1]))

In [78]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.90112


In [81]:
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")

ROC AUC Score: 0.90112


## Sklearn Estimator

In [71]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C,
                                       solver='liblinear', 
                                       dual=self.dual, 
                                       n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [None]:
model = NbSvmClassifier(C=4, dual=True, n_jobs=-1).fit(train_trm_doc, y_train)

In [73]:
y_preds = model.predict_proba(test_trm_doc)

In [74]:
y_preds = np.argmax(y_preds, axis=1)

In [82]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.90112


In [83]:
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")

ROC AUC Score: 0.90112
