In [43]:
import os.path as op
import numpy as np
import re
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [69]:
# Load data
def load_data():
    print("Loading dataset")

    from glob import glob
    filenames_neg = sorted(glob(op.join('data', 'imdb1', 'neg', '*.txt')))
    filenames_pos = sorted(glob(op.join('data', 'imdb1', 'pos', '*.txt')))

    texts_neg = [open(f).read() for f in filenames_neg]
    texts_pos = [open(f).read() for f in filenames_pos]
    texts = texts_neg + texts_pos
    y = np.ones(len(texts), dtype=np.int)
    y[:len(texts_neg)] = 0.


    print("%d documents" % len(texts))
    return texts, y

def shuffle_data(X, y):
    indices = list(range(len(y)))
    np.random.shuffle(indices)    
    y = y[indices]
    X = X[indices]
    return X, y

In [132]:
texts, y = load_data()
texts, y = shuffle_data(np.array(texts), y)

Loading dataset
2000 documents



# Implementation of the classifier

## Count words

In [71]:
# Read stop word
def read_stop_words(remove_punc=False):
    filepath = "data/english.stop"
    stop_words = []
    with open(filepath) as fp:
        line = fp.readline()
        cnt = 1
        while line:
            line = line.strip()
            if remove_punc:
                line = re.sub(r"[^a-z]", "", line) #remove punctuation            
            stop_words.append(line) 
            cnt += 1
            line = fp.readline()
    print("There are", cnt, "stop words")
    return stop_words

In [159]:
def tokenize(s):    
    s = s.lower().strip()
    s = re.sub(r"[^a-z]", " ", s) # remove punctuation 
    ws = re.compile(r"\s+").split(s)    
    return [w for w in ws if len(w) > 0]

def count_words(texts, stop_words=[]):
    """Vectorize text : return count of each word in the text snippets

    Parametersl
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    
    words = set()
    for text in texts:      
        ws = tokenize(text)      
        for w in ws:   
            if w not in stop_words:
                words.add(w)
    
    vocabulary = dict()
    for i, word in enumerate(words):
        vocabulary[word] = i
        
    n_features = len(vocabulary)
    print("Number of words in vocabulary:", n_features)
    counts = np.zeros((len(texts), n_features))
    for i, text in enumerate(texts):
        ws = tokenize(text)
        for word in ws: 
            if word in vocabulary:
                word_index = vocabulary[word]
                counts[i, word_index] += 1
    return vocabulary, counts


## Create vocabulary

In [73]:
stop_words = read_stop_words(remove_punc=True)
vocabulary, X = count_words(texts, stop_words=stop_words)
print("X shape", X.shape)

There are 572 stop words
Number of words in vocabulary: 38395
X shape (2000, 38395)


In [74]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self):        
        self.prior = []
        self.likelihood = []
        self.classes = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        prior = [0] * len(self.classes)
        N = len(y)
        likelihood = np.zeros((X.shape[1], len(self.classes)))
        for i, c in enumerate(self.classes):
            prior[i] = np.sum(y == c) / N 
            X_class = np.sum(X[y == c], axis=0) # count number of occurences of token in each class        
            likelihood[:, i] = X_class        
        likelihood = likelihood + 1
        likelihood = likelihood / np.sum(likelihood, axis=0).reshape(1, -1)
          
        self.prior = prior
        self.likelihood = likelihood
                    
        return self

    def predict(self, X):                
        scores = X @ np.log(self.likelihood) + np.log(self.prior).reshape(1, -1)
        y_pred = np.argmax(scores, axis=1)        
        return [self.classes[i] for i in y_pred]

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

# Performance evaluation and comparison

In [75]:
def accuracy(y, y_pred):
    return np.mean(y == y_pred)

In [76]:
def cross_validation(clf, X, y, n_folds=5):
    interval = len(y) // n_folds
    scores = []
    for i in range(n_folds):
        start = int(i * interval)
        end = int((i + 1) * interval)

        X_test = X[start:end]
        y_test = y[start:end]
        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        score = accuracy(y_pred, y_test)
        scores.append(score)

    return scores


## My implementation of NB

In [77]:
nb = NB()
scores = cross_validation(nb, X, y)
print(scores, np.mean(scores)) 


[0.805, 0.8075, 0.8025, 0.8075, 0.785] 0.8015000000000001


## sklearn MultinomialNB

In [78]:
mnb = MultinomialNB()
scores = cross_validation(mnb, X, y)
print(scores, np.mean(scores)) 


[0.805, 0.8075, 0.8025, 0.8075, 0.785] 0.8015000000000001


# Use sklearn and pipeline

In [110]:
def evaluate_pipeline(clf_name, clf, texts, y, analyzer):
    mnb_pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english', # only applied if analyzer = 'word'
                                 analyzer=analyzer, 
                                 ngram_range=(1, 2))), # allow bigrams
        ('clf', clf),
    ])
    scores = cross_validation(mnb_pipeline, texts, y)
    print(clf_name + ":", scores,"- mean accuracy:", np.mean(scores))

In [111]:
def evaluate(analyzer, texts, y):
    
    nb = NB()
    evaluate_pipeline("My NB", nb, texts, y, analyzer)

    mnb = MultinomialNB()
    evaluate_pipeline("Multinomial NB", mnb, texts, y, analyzer)

    lsvc = LinearSVC(max_iter=500)
    evaluate_pipeline("Linear SVC", lsvc, texts, y, analyzer)

    reg = LogisticRegression(solver='lbfgs')
    evaluate_pipeline("Logistic Regression", reg, texts, y, analyzer)

    rf = RandomForestClassifier(n_estimators=20)
    evaluate_pipeline("Random forest", rf, texts, y, analyzer)


## word

In [113]:
evaluate('word', texts, y)

My NB: [0.8225, 0.815, 0.7775, 0.81, 0.7825] - mean accuracy: 0.8015000000000001
Multinomial NB: [0.8225, 0.815, 0.7775, 0.81, 0.7825] - mean accuracy: 0.8015000000000001




Linear SVC: [0.8325, 0.8275, 0.8425, 0.8375, 0.8175] - mean accuracy: 0.8315000000000001
Logistic Regression: [0.83, 0.8275, 0.8375, 0.845, 0.8325] - mean accuracy: 0.8344999999999999
Random forest: [0.7175, 0.7325, 0.78, 0.6975, 0.725] - mean accuracy: 0.7305


## Character

In [114]:
evaluate('char', texts, y)

My NB: [0.68, 0.7175, 0.68, 0.6975, 0.6675] - mean accuracy: 0.6885000000000001
Multinomial NB: [0.68, 0.7175, 0.68, 0.6975, 0.6675] - mean accuracy: 0.6885000000000001




Linear SVC: [0.72, 0.62, 0.645, 0.69, 0.63] - mean accuracy: 0.6609999999999999




Logistic Regression: [0.7225, 0.71, 0.655, 0.675, 0.6775] - mean accuracy: 0.6880000000000001
Random forest: [0.575, 0.63, 0.615, 0.615, 0.57] - mean accuracy: 0.601


# Question 3: stemming

In [93]:
# import nltk
# nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [133]:
from nltk import SnowballStemmer

In [134]:
stemmer = SnowballStemmer(language='english')

In [135]:
texts, y = load_data()
texts, y = shuffle_data(np.array(texts), y)

new_texts = []
for i, text in enumerate(texts):
    tokens = tokenize(text)
    token_str = " ".join([stemmer.stem(token) for token in tokens if len(token) > 0 and token not in stop_words])
    new_texts.append(token_str)

Loading dataset
2000 documents


In [136]:
evaluate('word', new_texts, y)

My NB: [0.8025, 0.83, 0.7875, 0.8075, 0.765] - mean accuracy: 0.7985
Multinomial NB: [0.8025, 0.83, 0.7875, 0.8075, 0.765] - mean accuracy: 0.7985




Linear SVC: [0.8175, 0.8375, 0.8175, 0.8125, 0.8225] - mean accuracy: 0.8215




Logistic Regression: [0.835, 0.8425, 0.8075, 0.81, 0.84] - mean accuracy: 0.827
Random forest: [0.7375, 0.7375, 0.7525, 0.71, 0.6925] - mean accuracy: 0.726


# Question 4: POS tag

In [179]:
texts, y = load_data()
texts, y = shuffle_data(np.array(texts), y)

keeps = ["NN", "NNS", 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS', 'JJ', 'JJR', 'JJS']

new_texts = []
for i, text in enumerate(texts):
    tokens = tokenize(text)
    token_with_tags = nltk.pos_tag(tokens)
    filtered_token_with_tags = filter(lambda x: x[1] in keeps, token_with_tags) # keep only noun, adverbs, verb, adj
    filtered_tokens = map(lambda x: x[0], filtered_token_with_tags)
    new_texts.append(" ".join(filtered_tokens))

Loading dataset
2000 documents


# Comparison

In [181]:
# pos_tag - keep only noun, adverbs, verb, adj
evaluate('word', new_texts, y)

My NB: [0.845, 0.775, 0.785, 0.7775, 0.82] - mean accuracy: 0.8005000000000001
Multinomial NB: [0.845, 0.775, 0.785, 0.7775, 0.82] - mean accuracy: 0.8005000000000001




Linear SVC: [0.825, 0.83, 0.83, 0.825, 0.845] - mean accuracy: 0.8309999999999998
Logistic Regression: [0.835, 0.835, 0.8425, 0.81, 0.86] - mean accuracy: 0.8365
Random forest: [0.7475, 0.74, 0.74, 0.715, 0.7025] - mean accuracy: 0.729


In [182]:
# Without pos tag
evaluate('word', texts, y)

My NB: [0.8425, 0.7675, 0.7925, 0.7775, 0.82] - mean accuracy: 0.7999999999999999
Multinomial NB: [0.8425, 0.7675, 0.7925, 0.7775, 0.82] - mean accuracy: 0.7999999999999999




Linear SVC: [0.825, 0.8325, 0.8475, 0.82, 0.8525] - mean accuracy: 0.8355
Logistic Regression: [0.835, 0.8325, 0.8475, 0.8175, 0.86] - mean accuracy: 0.8385
Random forest: [0.725, 0.7325, 0.73, 0.7275, 0.735] - mean accuracy: 0.73
