In [3]:
import os.path as op
import numpy as np
from glob import glob
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk.help import upenn_tagset
from nltk import word_tokenize, pos_tag
from nltk.stem.snowball import SnowballStemmer
from multiprocessing import Pool
from normalizr import Normalizr

In [4]:
# Load data: list the review files of each polarity under <uri>/imdb1/{neg,pos}, sorted by path.
# NOTE(review): `uri` is an absolute, machine-specific path; adjust it before running elsewhere.
uri = '/home/axel/desktop/mastere/machine_learning/tpnote/data'
filenames_neg, filenames_pos = (
    sorted(glob(op.join(uri, 'imdb1', polarity, '*.txt')))
    for polarity in ('neg', 'pos')
)


def read_f(f):
    """Return the whole content of the text file at path `f`.

    The file is opened in text mode with the default encoding and is closed
    before returning, so reading many files does not leak file handles.

    Args:
        f: path of the file to read.

    Returns:
        The file content as a single string.
    """
    with open(f) as handle:
        return handle.read()

# Read both classes in parallel with a single worker pool; the context manager
# shuts the workers down once both (blocking) map calls have returned.
with Pool() as pool:
    texts_neg = pool.map(read_f, filenames_neg)
    texts_pos = pool.map(read_f, filenames_pos)
texts = texts_neg + texts_pos

# Labels aligned with `texts`: 0 for the negative reviews (which come first), 1 for the positive ones.
# The builtin `int` replaces the `np.int` alias, which no longer exists in recent NumPy releases.
y = np.ones(len(texts), dtype=int)
y[:len(texts_neg)] = 0

print("%d documents" % len(texts))

2000 documents


In [18]:
# Names of the clean-up steps handed to `normalizr.normalize` for every document.
normalizr_options = [
    'remove_accent_marks',
    'replace_hyphens',
    'replace_punctuation',
    'replace_symbols',
    'remove_extra_whitespaces',
]

# Single shared normalizer instance, created with language='en'.
normalizr = Normalizr(language='en')

def normalize(ts):
    """Normalize every text of `ts` with the shared `normalizr` instance.

    Args:
        ts: iterable of text strings.

    Returns:
        A list holding, in input order, the result of
        `normalizr.normalize(text, normalizr_options)` for each text.
    """
    cleaned = []
    for text in ts:
        cleaned.append(normalizr.normalize(text, normalizr_options))
    return cleaned

# Stop-word list, one entry per line of the file. It stays a list because it is
# also passed as `stop_words=sw` to the CountVectorizer instances of this notebook.
with open(op.join(uri, 'english.stop')) as stop_file:
    sw = stop_file.read().splitlines()

# Hashed copy of `sw` so each membership test is a constant-time lookup instead of
# a scan of the whole list. Built once here: rebuild it if `sw` is ever reassigned.
_sw_set = frozenset(sw)


def not_in_sw(w):
    """Return True when the word `w` is not one of the stop words loaded in `sw`."""
    return w not in _sw_set

In [27]:
def count_words(t, ignore_sw):
    """Build a bag-of-words count matrix for a collection of texts.

    Every text is first passed through `normalize`, then split on single
    spaces to obtain its tokens.

    Args:
        t: sequence of raw text strings.
        ignore_sw: when true, tokens rejected by `not_in_sw` are left out of
            both the vocabulary and the counts.

    Returns:
        A float array of shape (number of texts, vocabulary size) whose entry
        (i, j) is the number of occurrences of vocabulary word j in text i.
        Columns follow the sorted vocabulary, so the layout is identical from
        one run to the next.
    """
    # Tokenize (and filter) once, so the vocabulary and the counts are built
    # from exactly the same tokens.
    docs = [text.split(' ') for text in normalize(t)]
    if ignore_sw:
        docs = [[w for w in doc if not_in_sw(w)] for doc in docs]

    # sorted() pins the column order; iterating a raw set of strings can yield
    # a different order in every interpreter session.
    vocabulary = sorted({w for doc in docs for w in doc})
    column = {w: j for j, w in enumerate(vocabulary)}

    counts = np.zeros((len(docs), len(column)))
    for row, doc in enumerate(docs):
        for w in doc:
            counts[row, column[w]] += 1

    return counts

#2 Les classes positives et négatives ont été assignées à partir des notes données au film, avec une échelle différente selon le système de notation de la source.

In [28]:
class NB(ClassifierMixin, BaseEstimator):
    """Naive Bayes text classifier with add-one smoothing.

    `fit` estimates, for each class, its prior frequency and a smoothed word
    distribution from a document/term count matrix. `predict` scores a document
    by adding, for every word that occurs in it at least once, the log
    conditional probability of that word (the number of occurrences is not
    used at prediction time).

    The mixin is listed before BaseEstimator, as recommended by the
    scikit-learn developer guide.

    Attributes set by `fit`:
        classes_: sorted array of the distinct labels seen in `y`.
        prior: array (n_classes,) of class frequencies.
        condprobe: array (n_classes, n_words) of smoothed word probabilities.
    """

    def __init__(self):
        self.prior = None
        self.condprobe = None

    def fit(self, x, y):
        '''
        Estimate class priors and per-class word probabilities.

        Args:
            x: array (n_docs, n_words) with the number of occurrences of each
               term in each text.
            y: vector of length n_docs with the label of each text.

        Returns:
            self, so that calls can be chained.
        '''
        y = np.asarray(y)
        n_docs, n_words = x.shape

        # np.unique yields the labels in sorted order, which fixes the meaning
        # of each row of `prior` / `condprobe` independently of set iteration.
        self.classes_ = np.unique(y)
        p = len(self.classes_)

        self.prior = np.empty(p)
        self.condprobe = np.empty((p, n_words))

        for i, c in enumerate(self.classes_):
            in_class = y == c

            # over all the training data, frequency of label c
            self.prior[i] = np.count_nonzero(in_class) / n_docs

            # smoothed frequency of each word among the documents of class c
            t = np.sum(x[in_class], axis=0)
            self.condprobe[i] = (t + 1) / np.sum(t + 1)

        return self

    def predict(self, x):
        '''
        Predict the label of each row of a document/term count matrix.

        Args:
            x: array (n_docs, n_words) laid out like the matrix given to `fit`.

        Returns:
            Array of length n_docs with the highest-scoring label of each
            document, taken from `classes_`.
        '''
        # Work on local values only: the fitted attributes are left untouched,
        # so predict can be called any number of times.
        log_condprobe = np.log(self.condprobe)      # (n_classes, n_words)
        present = np.asarray(x) != 0                # only non-zero terms contribute

        # score[d, c] = log prior[c] + sum over the words present in d of log P(word | c)
        score = np.log(self.prior) + np.dot(present, log_condprobe.T)

        return self.classes_[np.argmax(score, axis=1)]

    def score(self, x, y):
        """Return the accuracy of `predict(x)` with respect to the labels `y`."""
        return np.mean(self.predict(x) == y)

In [29]:
# Count matrices of the whole corpus, first with stop words removed, then with
# every token kept (same computation order as two separate assignments).
X_ignore_true, X_ignore_false = (
    count_words(texts, ignore_sw=flag) for flag in (True, False)
)


def cross_val(naive_bayes, x, cv=5):
    """Return the mean cross-validation score of an estimator.

    Args:
        naive_bayes: estimator handed to `cross_val_score`.
        x: samples to evaluate on; the targets are the module-level labels `y`.
        cv: value forwarded as the `cv` argument of `cross_val_score`.

    Returns:
        The mean of the scores returned by `cross_val_score`.
    """
    fold_scores = cross_val_score(naive_bayes, x, y, cv=cv)
    return np.mean(fold_scores)

def eval_nb(clf):
    """Print the mean cross-validation score of a classifier on both count matrices.

    Args:
        clf: classifier class (not an instance); it is instantiated without arguments.
    """
    estimator = clf()
    print('\nScore using ' + clf.__name__)

    # Stop words kept.
    score_all_words = cross_val(estimator, X_ignore_false)
    print('\tignore_sw=False =>', score_all_words)

    # Stop words removed.
    score_filtered = cross_val(estimator, X_ignore_true)
    print('\tignore_sw=True =>', score_filtered)

In [30]:
eval_nb(NB)


Score using NB
	ignore_sw=False => 0.8195
	ignore_sw=True => 0.831


In [31]:
eval_nb(MultinomialNB)


Score using MultinomialNB
	ignore_sw=False => 0.8095
	ignore_sw=True => 0.806


In [14]:
# Shared vectorizer; eval_pipeline overrides its analyzer and n-gram range in place.
vect = CountVectorizer(stop_words=sw, lowercase=True)


def eval_pipeline(clf):
    """Cross-validate `clf` behind `vect` for several analyzer / n-gram settings.

    For each of the analyzers 'word', 'char' and 'char_wb', combined with the
    n-gram ranges (1, 1) and (1, 2), print the classifier name, the setting and
    the mean cross-validation score obtained on `texts`.

    Args:
        clf: classifier class (not an instance); it is instantiated without arguments.
    """
    pipeline = Pipeline([('vect', vect), ('clf', clf())])
    settings = [
        (analyzer, ngrams)
        for analyzer in ['word', 'char', 'char_wb']
        for ngrams in [(1, 1), (1, 2)]
    ]
    for analyzer, ngrams in settings:
        pipeline.set_params(vect__analyzer=analyzer, vect__ngram_range=ngrams)
        mean_score = np.mean(cross_val(pipeline, texts))
        print(clf.__name__, analyzer, ngrams, mean_score, '\n')

In [15]:
eval_pipeline(MultinomialNB)

MultinomialNB word (1, 1) 0.8 

MultinomialNB word (1, 2) 0.8025 

MultinomialNB char (1, 1) 0.6095 

MultinomialNB char (1, 2) 0.674 

MultinomialNB char_wb (1, 1) 0.6115 

MultinomialNB char_wb (1, 2) 0.6745 



In [16]:
eval_pipeline(LinearSVC)

LinearSVC word (1, 1) 0.8125 

LinearSVC word (1, 2) 0.828 

LinearSVC char (1, 1) 0.5635 

LinearSVC char (1, 2) 0.663 

LinearSVC char_wb (1, 1) 0.531 

LinearSVC char_wb (1, 2) 0.6055 



In [17]:
eval_pipeline(LogisticRegression)

LogisticRegression word (1, 1) 0.829 

LogisticRegression word (1, 2) 0.836 

LogisticRegression char (1, 1) 0.6375 

LogisticRegression char (1, 2) 0.7055 

LogisticRegression char_wb (1, 1) 0.6385 

LogisticRegression char_wb (1, 2) 0.7045 



In [18]:
stemmer = SnowballStemmer('english', ignore_stopwords=False)


def _stem_tokens(t):
    """Split `t` with `word_tokenize` and return the stem of every token."""
    return [stemmer.stem(token) for token in word_tokenize(t)]


# Word-level vectorizer over unigrams and bigrams of stemmed tokens.
vect_stem = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    tokenizer=_stem_tokens,
    stop_words=sw,
    lowercase=True,
)

def eval_stem(clf):
    """Print the mean cross-validation score of `clf` on `texts` using `vect_stem`.

    Args:
        clf: classifier class (not an instance); it is instantiated without arguments.
    """
    steps = [('vect', vect_stem), ('clf', clf())]
    pipeline = Pipeline(steps)
    mean_score = np.mean(cross_val(pipeline, texts))
    print(clf.__name__, ':', mean_score)

In [19]:
eval_stem(MultinomialNB)

MultinomialNB : 0.8115


In [20]:
eval_stem(LogisticRegression)

LogisticRegression : 0.838


In [21]:
eval_stem(LinearSVC)

LinearSVC : 0.8315


In [23]:
# Penn Treebank part-of-speech tags kept by `filter_words`.
# 'NNS' (plural common noun) is included so that the noun row covers all four
# noun tags of the tagset, alongside 'NN', 'NNP' and 'NNPS'.
ok_pos = [
    'JJ', 'JJR', 'JJS',                         # ADJECTIVES
    'NN', 'NNS', 'NNP', 'NNPS',                 # NOUNS
    'RB', 'RBR', 'RBS',                         # ADVERBS
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'     # VERBS
]


def filter_words(t):
    """Tokenize `t` and keep only the tokens whose POS tag is listed in `ok_pos`.

    Args:
        t: text string to tokenize with `word_tokenize` and tag with `pos_tag`.

    Returns:
        The list of kept tokens, in their order of appearance.
    """
    kept = []
    for word, tag in pos_tag(word_tokenize(t)):
        if tag in ok_pos:
            kept.append(word)
    return kept

# Word-level vectorizer over unigrams and bigrams of the tokens kept by `filter_words`.
# NOTE(review): with lowercase=True the text presumably reaches `filter_words` already
# lowercased, so `pos_tag` would not see capitalization; confirm this is intended.
vect_pos = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    tokenizer=filter_words,
    stop_words=sw,
    lowercase=True,
)

In [None]:
# Evaluate logistic regression on the POS-filtered bag of words.
pipeline = Pipeline(steps=[('vect', vect_pos), ('clf', LogisticRegression())])
pos_score = np.mean(cross_val(pipeline, texts))
print(LogisticRegression.__name__, ':', pos_score)