In [1]:
import os.path as op
import numpy as np
from glob import glob
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk.help import upenn_tagset
from nltk import word_tokenize, pos_tag
from nltk.stem.snowball import SnowballStemmer
from multiprocessing import Pool
from normalizr import Normalizr
from chronometer import Chronometer
import gc

In [2]:
gc.collect()
def read_f(f):
    """Return the full text content of the file at path `f`.

    The file is opened in text mode with the platform default encoding and
    is closed before returning, so the worker processes that call this
    through `Pool.map` do not accumulate open file descriptors.
    """
    with open(f) as handle:
        return handle.read()

# Load data
# NOTE(review): `uri` is an absolute, machine-specific path; point it at the
# local copy of the dataset before running. It is reused later to locate the
# stop-word file.
with Chronometer() as t:
    uri = '/home/axel/mastere/machine_learning/tpnote/data'
    filenames_neg = sorted(glob(op.join(uri, 'imdb1', 'neg', '*.txt')))
    filenames_pos = sorted(glob(op.join(uri, 'imdb1', 'pos', '*.txt')))
    # A single pool serves both classes and is shut down on exit, instead of
    # two anonymous pools whose worker processes were never released.
    with Pool() as pool:
        texts_neg = pool.map(read_f, filenames_neg)
        texts_pos = pool.map(read_f, filenames_pos)
    texts = texts_neg + texts_pos

print('Got data in {:.2f} s'.format(float(t)))

# Labels: negative reviews come first in `texts` and get 0, positives get 1.
# The builtin `int` replaces the `np.int` alias, which was removed from NumPy.
y = np.ones(len(texts), dtype=int)
y[:len(texts_neg)] = 0

print("%d documents" % len(texts))

Got data in 0.20 s
2000 documents


In [3]:
gc.collect()
# Shared text normalizer, configured for English.
normalizr = Normalizr(language='en')
# Names of the normalization steps handed to `normalizr.normalize` by
# `normalize` below.
normalizr_options = [
    'remove_accent_marks',
    'replace_hyphens',
    'replace_punctuation',
    'replace_symbols',
    'remove_extra_whitespaces'
]

def normalize(ts):
    """Normalize every text in `ts` and return the results as a new list.

    Each text is passed to the module-level `normalizr` together with
    `normalizr_options`; the order of the input is preserved.
    """
    cleaned = []
    for text in ts:
        cleaned.append(normalizr.normalize(text, normalizr_options))
    return cleaned

# Stop-word list, one word per line. It stays a list because it is also
# passed to CountVectorizer(stop_words=...) in later cells. The file is
# closed as soon as it has been read.
with open(uri + '/english.stop') as stop_file:
    sw = stop_file.read().splitlines()

# Hashed copy of `sw`: `not_in_sw` is called once per token, so a constant-time
# lookup replaces a linear scan of the list.
_sw_lookup = frozenset(sw)

def not_in_sw(w):
    """Return True when `w` is not one of the stop words loaded into `sw`."""
    return w not in _sw_lookup

In [4]:
def count_words_in_text(text, ignore_sw):
    """Count how often each vocabulary word occurs in one text.

    `text` is split on single spaces. When `ignore_sw` is true, tokens
    rejected by `not_in_sw` are dropped first. The result is a float array
    with one slot per entry of the module-level `dictionary` (word -> column
    index); a token missing from `dictionary` raises KeyError.
    """
    tokens = text.split(' ')
    if ignore_sw:
        tokens = [tok for tok in tokens if not_in_sw(tok)]

    counts = np.zeros(len(dictionary))
    for tok in tokens:
        column = dictionary[tok]
        counts[column] += 1

    return counts

def count_words(ts, ignore_sw):
    """Build the (n_texts, n_words) matrix of word counts for `ts`.

    The texts are normalized, the vocabulary is collected from them (minus
    stop words when `ignore_sw` is true), and every text is counted in
    parallel by `count_words_in_text`.

    Side effect: the module-level `dictionary` (word -> column index) is
    rebuilt on every call, because `count_words_in_text` reads it.
    """
    global dictionary

    # remove all special characters
    ts = normalize(ts)

    # retrieve the vocabulary of the texts
    all_words = set(' '.join(ts).split(' '))

    # remove stop words if need be
    d = set(filter(not_in_sw, all_words)) if ignore_sw else all_words

    # Sorting makes the column order reproducible: iteration order of a set
    # of strings changes from one interpreter run to the next.
    dictionary = {w: i for i, w in enumerate(sorted(d))}

    # The pool must be created only after `dictionary` is assigned, since the
    # workers read that global. The context manager shuts the workers down
    # instead of leaving eight processes behind per call.
    with Pool(8) as pool:
        rows = pool.starmap(count_words_in_text, [(t, ignore_sw) for t in ts])
    return np.array(rows)

In [5]:
gc.collect()
# Count matrix with stop words removed, timed.
with Chronometer() as t:
    X_ignore_true = count_words(texts, ignore_sw=True)
print('Got counts (ignore_sw=True) in {:.3f}s'.format(float(t)))

# Count matrix over the full vocabulary, timed. Each count_words call
# rebuilds the global `dictionary`, so after this cell it matches this matrix.
with Chronometer() as t:
    X_ignore_false = count_words(texts, ignore_sw=False)
print('Got counts (ignore_sw=False) in {:.3f}s'.format(float(t)))

Got counts (ignore_sw=True) in 9.637s
Got counts (ignore_sw=False) in 7.808s


#2 Les classes positives et négatives ont été assignées à partir des notes données au film, avec une échelle différente selon le système de notation de la source.

In [9]:
class NB(BaseEstimator, ClassifierMixin):
    '''
    Naive Bayes classifier over a dense (n_docs, n_words) term-count matrix.

    Training estimates, for every class, a prior (share of the training
    documents) and add-one smoothed term frequencies. Prediction scores a
    document with the log prior plus the log frequency of every term that
    occurs in it; a term contributes once per document, however many times
    it appears.
    '''
    def __init__(self):
        # Both are filled in by fit(); None means "not fitted yet".
        self.prior = None
        self.condprobe = None

    def fit(self, x, y):
        '''
        Given x, a matrix of number of apparition by term and text,
        and y, a vector containing the labels associated with each text,
        calculate the frequency for each word, by label.

        Sets `classes_` (sorted unique labels), `prior` (one value per class)
        and `condprobe` (one row of term frequencies per class), then
        returns self.
        '''
        self.x = np.asarray(x)
        self.y = np.asarray(y)
        n_docs = self.x.shape[0]

        # Sorted unique labels: row i of `prior` / `condprobe` describes
        # classes_[i], so the row order no longer depends on set iteration.
        self.classes_ = np.unique(self.y)

        # probability a priori
        self.prior = np.array([np.sum(self.y == c) / n_docs for c in self.classes_])

        # Computed in-process: sending a bound method to a worker pool
        # pickles the whole estimator, training matrix included, for each
        # class, and the pool was never closed.
        self.condprobe = np.array([self._calc_freq(c) for c in self.classes_])

        return self

    def _calc_freq(self, c):
        '''
        Return the add-one smoothed term frequencies of class c, computed
        from the training data stored by fit().
        '''
        # calculate the frequency of each word
        t = np.sum(self.x[self.y == c], axis=0)
        return (t + 1) / np.sum(t + 1)

    def predict(self, x):
        '''
        Calculates a score for each label for a new "apparition matrix"
        Return the higher scoring labels

        The fitted attributes are left untouched, so predict() can be called
        any number of times on the same fitted estimator.
        '''
        x = np.asarray(x)

        # 1.0 where a term occurs in a document, 0.0 elsewhere: only the
        # presence of a term matters, not its count.
        present = (x != 0).astype(float)

        # (n_docs, n_words) . (n_words, n_classes): for every document, the
        # sum of the log-frequencies of the terms it contains, per class.
        score = np.log(self.prior) + present.dot(np.log(self.condprobe).T)

        # Map the winning row index back to the actual class label.
        return self.classes_[np.argmax(score, axis=1)]

    def score(self, x, y):
        '''
        Return the accuracy of predict(x) against the labels y.
        '''
        return np.mean(self.predict(x) == y)

In [7]:
# Mean of cross validation scores
def cross_val(naive_bayes, x, cv=5):
    """Cross-validate an estimator on `x` and return its mean fold score.

    `naive_bayes` is any estimator accepted by `cross_val_score`; the labels
    are taken from the module-level `y`. The elapsed time is printed before
    the mean is returned.
    """
    with Chronometer() as t:
        fold_scores = cross_val_score(naive_bayes, x, y, cv=cv)
        res = np.mean(fold_scores)
    elapsed = float(t)
    print('\tEvaluation took {:.3f}s'.format(elapsed))
    return res

# Evaluate a classifier with ignore_sw true or false, consecutively
def eval_nb(clf):
    """Cross-validate classifier class `clf` on both count matrices.

    `clf` is instantiated once with no arguments and scored first on
    `X_ignore_false` (full vocabulary), then on `X_ignore_true` (stop words
    removed). Timing is printed by `cross_val`, so no extra timer is
    wrapped around the calls here.
    """
    nb = clf()
    print('Score using ' + clf.__name__)
    print('\tignore_sw=False =>', cross_val(nb, X_ignore_false))
    print('\tignore_sw=True =>', cross_val(nb, X_ignore_true))

In [10]:
# Cross-validate the hand-written NB classifier on both count matrices
# (with and without stop words), after a garbage-collection pass.
gc.collect()
eval_nb(NB)

Score using NB
	Evaluation took 26.531s
	ignore_sw=False => 0.8195
	Evaluation took 82.847s
	ignore_sw=True => 0.831


In [9]:
# Same evaluation with scikit-learn's MultinomialNB, for comparison with the
# hand-written classifier.
gc.collect()
eval_nb(MultinomialNB)

Score using MultinomialNB
	Evaluation took 2.029s
	ignore_sw=False => 0.8095
	Evaluation took 1.994s
	ignore_sw=True => 0.806


In [10]:
# Bag-of-words vectorizer used by eval_pipeline: lowercases the text and is
# given the stop-word list `sw`.
vect = CountVectorizer(lowercase=True, stop_words=sw)
def eval_pipeline(clf):
    """Cross-validate `vect` followed by a fresh `clf()` on the raw texts.

    `clf` is a classifier class; it is instantiated with no arguments. The
    class name and the mean score returned by `cross_val` are printed.
    """
    steps = [('vect', vect), ('clf', clf())]
    mean_score = cross_val(Pipeline(steps), texts)
    print('\t', clf.__name__, ':', mean_score)

In [11]:
# Bag-of-words pipeline on the raw texts, with MultinomialNB as classifier.
gc.collect()
eval_pipeline(MultinomialNB)

	Evaluation took 5.702s
	 MultinomialNB : 0.8


In [12]:
# Bag-of-words pipeline on the raw texts, with LinearSVC as classifier.
gc.collect()
eval_pipeline(LinearSVC)

	Evaluation took 7.324s
	 LinearSVC : 0.8125


In [13]:
# Bag-of-words pipeline on the raw texts, with LogisticRegression as classifier.
gc.collect()
eval_pipeline(LogisticRegression)

	Evaluation took 6.477s
	 LogisticRegression : 0.829


In [14]:
# English Snowball stemmer shared by the tokenizer below.
stemmer = SnowballStemmer('english', ignore_stopwords=False)


def _stem_tokens(t):
    """Split `t` with `word_tokenize` and return the stem of every token."""
    return [stemmer.stem(token) for token in word_tokenize(t)]


# Vectorizer over stemmed unigrams and bigrams.
# NOTE(review): `sw` holds unstemmed words while the tokenizer emits stems, so
# stop words whose stem differs from the listed form are presumably not
# filtered out - confirm this is intended.
vect_stem = CountVectorizer(
    lowercase=True,
    stop_words=sw,
    tokenizer=_stem_tokens,
    ngram_range=(1, 2),
    analyzer='word'
)

def eval_stem(clf):
    """Cross-validate `vect_stem` followed by a fresh `clf()` on the raw texts.

    `clf` is a classifier class; it is instantiated with no arguments. The
    class name and the mean score returned by `cross_val` are printed.
    """
    steps = [('vect', vect_stem), ('clf', clf())]
    mean_score = cross_val(Pipeline(steps), texts)
    print(clf.__name__, ':', mean_score)

In [15]:
# Stemmed unigram + bigram pipeline, with MultinomialNB as classifier.
gc.collect()
eval_stem(MultinomialNB)

	Evaluation took 230.276s
MultinomialNB : 0.8115


In [16]:
# Stemmed unigram + bigram pipeline, with LinearSVC as classifier.
gc.collect()
eval_stem(LinearSVC)

	Evaluation took 243.671s
LinearSVC : 0.8315


In [17]:
# Stemmed unigram + bigram pipeline, with LogisticRegression as classifier.
gc.collect()
eval_stem(LogisticRegression)

	Evaluation took 278.125s
LogisticRegression : 0.837


In [18]:
# Penn Treebank part-of-speech tags kept by filter_words; tokens carrying any
# other tag are dropped. NNS (plural common nouns) was missing from the noun
# group although every other tag family is listed in full.
ok_pos = [
    'JJ', 'JJR', 'JJS',                         # ADJECTIVES
    'NN', 'NNS', 'NNP', 'NNPS',                 # NOUNS
    'RB', 'RBR', 'RBS',                         # ADVERBS
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'     # VERBS
]


def filter_words(t):
    """Tokenize `t` and keep only the words whose POS tag is in `ok_pos`.

    The text is split with `word_tokenize` and tagged with `pos_tag`; the
    surviving words are returned in their original order.
    """
    kept = []
    for word, tag in pos_tag(word_tokenize(t)):
        if tag in ok_pos:
            kept.append(word)
    return kept

# Vectorizer over unigrams and bigrams of the words kept by filter_words
# (the POS-based filter), with the stop-word list `sw`.
vect_pos = CountVectorizer(
    lowercase=True,
    stop_words=sw,
    tokenizer=filter_words,
    ngram_range=(1, 2),
    analyzer='word'
)

In [19]:
# POS-filtered unigram + bigram pipeline, with LogisticRegression as classifier.
gc.collect()
pipeline = Pipeline([('vect', vect_pos), ('clf', LogisticRegression())])
# cross_val already returns the mean of the fold scores, so it is printed
# directly, as the eval_* helpers do.
print(LogisticRegression.__name__, ':', cross_val(pipeline, texts))

KeyboardInterrupt: 