In [27]:
import os.path as op
import numpy as np
import re
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [28]:
# Load data
print("Loading dataset")

from glob import glob


def _read_text(path):
    """Return the full contents of the text file at `path`, closing it afterwards."""
    with open(path) as fh:
        return fh.read()


# Sort the file names so the document order does not depend on the order in
# which the file system happens to list the directory.
filenames_neg = sorted(glob(op.join('data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('data', 'imdb1', 'pos', '*.txt')))

texts_neg = [_read_text(f) for f in filenames_neg]
texts_pos = [_read_text(f) for f in filenames_pos]
texts = texts_neg + texts_pos

# Labels: 0 for the negative reviews (stored first), 1 for the positive ones.
# The builtin `int` replaces the `np.int` alias, which current NumPy no longer provides.
y = np.ones(len(texts), dtype=int)
y[:len(texts_neg)] = 0


print("%d documents" % len(texts))

Loading dataset
2000 documents



# Implementation of the classifier

## Count words

In [29]:
# Read stop words
def read_stop_words(remove_punc=False, filepath="data/english.stop"):
    """Load a stop-word list stored one word per line.

    Parameters
    ----------
    remove_punc : bool, default=False
        If True, every character outside ``a-z`` is deleted from each
        word (e.g. ``"isn't"`` becomes ``"isnt"``).
    filepath : str, default="data/english.stop"
        Path of the stop-word file.

    Returns
    -------
    stop_words : list of str
        The stop words in file order. Lines that are empty after
        stripping (and, if requested, punctuation removal) are skipped.
    """
    stop_words = []
    with open(filepath) as fp:
        for raw_line in fp:
            word = raw_line.strip()
            if remove_punc:
                word = re.sub(r"[^a-z]", "", word)  # remove punctuation
            if word:
                stop_words.append(word)
    # Report the number of words actually kept rather than a separately
    # maintained counter, which started at 1 and over-counted by one.
    print("There are", len(stop_words), "stop words")
    return stop_words

In [30]:
def tokenize(s):
    """Split a string into lowercase alphabetic tokens.

    The text is lowercased and every maximal run of the letters ``a-z``
    becomes one token; digits, punctuation and whitespace only act as
    separators.

    Parameters
    ----------
    s : str
        The text to tokenize.

    Returns
    -------
    tokens : list of str
        The tokens in order of appearance. No token is empty, and the
        list is empty when `s` contains no letter.
    """
    # Matching the letter runs directly (instead of splitting on
    # whitespace) avoids the empty-string tokens that appear when the
    # text starts or ends with a non-letter character.
    return re.findall(r"[a-z]+", s.lower())

def count_words(texts, stop_words=None):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts
    stop_words : iterable of str, optional
        Words to leave out of the vocabulary. ``None`` (the default)
        excludes nothing.

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
        Indices follow the alphabetical order of the words, so the
        column layout is identical from one run to the next.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    # A set gives constant-time membership tests; a list would be scanned
    # once for every token of every document.
    excluded = frozenset(stop_words) if stop_words is not None else frozenset()

    # Tokenize each document once and reuse the result for both the
    # vocabulary and the counts. Empty tokens are dropped as well.
    tokenized = [
        [w for w in tokenize(text) if w and w not in excluded]
        for text in texts
    ]

    # Sorting makes the word -> column mapping deterministic (set iteration
    # order for strings can differ between interpreter sessions).
    words = sorted(set().union(*tokenized))
    vocabulary = {word: i for i, word in enumerate(words)}

    n_features = len(vocabulary)
    print("Number of words in vocabulary:", n_features)
    counts = np.zeros((len(texts), n_features))
    for i, tokens in enumerate(tokenized):
        for word in tokens:
            counts[i, vocabulary[word]] += 1
    return vocabulary, counts


## Create vocabulary

In [31]:
# Load the stop words (punctuation stripped), then build the vocabulary
# and the document-term count matrix from the raw texts.
stop_words = read_stop_words(True)
vocabulary, X = count_words(texts, stop_words)
print("X shape", X.shape)

There are 572 stop words
Number of words in vocabulary: 38395
X shape (2000, 38395)


## Shuffle data

In [32]:
# Draw one permutation and apply it to the count matrix, the labels and the
# raw texts so the three stay aligned. The documents were loaded with all
# negative reviews first, and `cross_validation` cuts contiguous folds, so
# they must be shuffled before evaluation.
# A fixed seed keeps the permutation (and every score computed afterwards)
# identical after a kernel restart.
indices = np.random.RandomState(42).permutation(len(y))
X = X[indices]
y = y[indices]
texts = np.array(texts)[indices]

In [33]:
class NB(BaseEstimator, ClassifierMixin):
    """Multinomial naive Bayes classifier with additive smoothing.

    Parameters
    ----------
    alpha : float, default=1.0
        Value added to every per-class word count before normalising
        (1.0 is Laplace smoothing). With ``alpha=0`` a word never seen in
        a class gets probability 0 and therefore a log-likelihood of -inf.

    Attributes
    ----------
    classes : ndarray, shape (n_classes,)
        Sorted unique labels seen in `fit`.
    prior : list of float, length n_classes
        Fraction of training samples belonging to each class.
    likelihood : ndarray, shape (n_features, n_classes)
        Smoothed probability of each feature given each class; every
        column sums to 1.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.prior = []
        self.likelihood = []
        self.classes = []

    def fit(self, X, y):
        """Estimate class priors and per-class word probabilities.

        Parameters
        ----------
        X : ndarray or scipy sparse matrix, shape (n_samples, n_features)
            Word counts per document.
        y : array-like, shape (n_samples,)
            Class label of each document.

        Returns
        -------
        self : NB
            The fitted estimator.
        """
        # Convert first: comparing a plain list with a label would yield a
        # single bool instead of a per-sample mask.
        y = np.asarray(y)
        self.classes = np.unique(y)
        n_samples = len(y)
        prior = [0] * len(self.classes)
        likelihood = np.zeros((X.shape[1], len(self.classes)))
        for i, c in enumerate(self.classes):
            in_class = y == c
            prior[i] = np.sum(in_class) / n_samples
            # Total count of each token over the documents of class c.
            # Summing a sparse matrix returns a (1, n_features) np.matrix,
            # so flatten explicitly to cover dense and sparse input alike.
            likelihood[:, i] = np.asarray(X[in_class].sum(axis=0)).ravel()
        likelihood = likelihood + self.alpha
        likelihood = likelihood / np.sum(likelihood, axis=0).reshape(1, -1)

        self.prior = prior
        self.likelihood = likelihood

        return self

    def predict(self, X):
        """Return the most probable class for each row of `X`.

        Parameters
        ----------
        X : ndarray or scipy sparse matrix, shape (n_samples, n_features)
            Word counts per document.

        Returns
        -------
        y_pred : ndarray, shape (n_samples,)
            Predicted labels, taken from `classes`.
        """
        # Log-posterior up to an additive constant:
        # sum over words of count * log P(word | class), plus log P(class).
        scores = X @ np.log(self.likelihood) + np.log(self.prior).reshape(1, -1)
        best = np.argmax(np.asarray(scores), axis=1)
        # An ndarray (not a list) keeps `pred == y` element-wise for any
        # array-like `y`.
        return self.classes[best]

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy on `X`, `y`.

        Parameters
        ----------
        X : ndarray or scipy sparse matrix, shape (n_samples, n_features)
            Word counts per document.
        y : array-like, shape (n_samples,)
            True labels.
        sample_weight : array-like, shape (n_samples,), optional
            Per-sample weights; ``None`` gives the plain mean.

        Returns
        -------
        score : float
            Fraction (or weighted fraction) of correct predictions.
        """
        correct = self.predict(X) == np.asarray(y)
        return np.average(correct, weights=sample_weight)

# Performance evaluation and comparison

In [34]:
def accuracy(y, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y : array-like, shape (n_samples,)
        True labels.
    y_pred : array-like, shape (n_samples,)
        Predicted labels.

    Returns
    -------
    acc : float
        Mean of the element-wise matches.

    Raises
    ------
    ValueError
        If `y` and `y_pred` do not have the same shape.
    """
    # Convert first: `==` on two Python lists compares them as whole
    # objects and yields a single bool, not one match flag per sample.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    if y.shape != y_pred.shape:
        raise ValueError(
            "y and y_pred must have the same shape, got %s and %s"
            % (y.shape, y_pred.shape)
        )
    return np.mean(y == y_pred)

In [35]:
def cross_validation(clf, X, y, n_folds=5):
    """Evaluate `clf` with k-fold cross-validation on contiguous folds.

    The samples are cut, in their current order, into `n_folds`
    consecutive slices. Each slice serves once as the test set while the
    estimator is refit on all remaining samples. No shuffling is done
    here, so the caller must shuffle beforehand if the data is ordered.

    Parameters
    ----------
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refit in
        place for every fold.
    X : sliceable, length n_samples
        Samples; slices of it are joined with ``np.concatenate``.
    y : sliceable, length n_samples
        Labels aligned with `X`.
    n_folds : int, default=5
        Number of folds.

    Returns
    -------
    scores : list of float
        Test accuracy of each fold, in fold order.

    Raises
    ------
    ValueError
        If `n_folds` is smaller than 2 or larger than the sample count.
    """
    n_samples = len(y)
    if not 2 <= n_folds <= n_samples:
        raise ValueError(
            "n_folds must be between 2 and the number of samples (%d), got %r"
            % (n_samples, n_folds)
        )

    # Fold boundaries spread the remainder over the folds, so every sample
    # is tested exactly once even when n_samples is not a multiple of
    # n_folds (a fixed fold width would never test the trailing samples).
    bounds = [(i * n_samples) // n_folds for i in range(n_folds + 1)]

    scores = []
    for start, end in zip(bounds[:-1], bounds[1:]):
        X_test = X[start:end]
        y_test = y[start:end]
        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        scores.append(accuracy(y_test, y_pred))

    return scores


## My implementation of NB

In [36]:
# Cross-validate the hand-written naive Bayes on the count matrix.
nb = NB()
scores = cross_validation(clf=nb, X=X, y=y)
print(scores, np.mean(scores))


[0.785, 0.8175, 0.79, 0.795, 0.8075] 0.799


## sklearn MultinomialNB

In [37]:
# Same evaluation with scikit-learn's reference implementation, for comparison.
mnb = MultinomialNB()
scores = cross_validation(clf=mnb, X=X, y=y)
print(scores, np.mean(scores))


[0.785, 0.8175, 0.79, 0.795, 0.8075] 0.799


# Use sklearn and pipeline

In [42]:
def create_pipeline(clf, analyzer='word'):
    """Cross-validate a CountVectorizer + classifier pipeline on the raw texts.

    Reads the notebook-level variables `texts` and `y`.

    Parameters
    ----------
    clf : estimator
        Classifier used as the final pipeline step.
    analyzer : str or callable, default='word'
        Passed to ``CountVectorizer(analyzer=...)``.

    Returns
    -------
    scores : list of float
        Per-fold accuracies returned by `cross_validation`.
    """
    # CountVectorizer only applies `stop_words` when analyzer == 'word',
    # so pass the English list in that case only.
    stop_words = 'english' if analyzer == 'word' else None
    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words=stop_words, analyzer=analyzer)),
        # Use the classifier handed in by the caller.
        ('clf', clf),
    ])
    scores = cross_validation(pipeline, texts, y)
    return scores

## Word-level analyzer

In [None]:
# `create_pipeline` needs the classifier and the analyzer, and it returns the
# per-fold scores (not a pipeline), so store and show them as such.
mnb = MultinomialNB()
scores = create_pipeline(mnb, 'word')
print(scores, np.mean(scores))

In [38]:
# scikit-learn end to end: CountVectorizer features feeding MultinomialNB,
# cross-validated directly on the raw (shuffled) texts.
mnb_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])
scores = cross_validation(clf=mnb_pipeline, X=texts, y=y)
print(scores, np.mean(scores))

[0.785, 0.815, 0.7925, 0.795, 0.815] 0.8004999999999999


In [39]:
# Same CountVectorizer features, but classified by the hand-written NB,
# cross-validated directly on the raw (shuffled) texts.
nb_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', NB()),
])
scores = cross_validation(clf=nb_pipeline, X=texts, y=y)
print(scores, np.mean(scores))

[0.785, 0.815, 0.7925, 0.795, 0.815] 0.8004999999999999
