In [2]:
import os.path as op
import numpy as np
from glob import glob
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import KFold

In [3]:
# Load data
filenames_neg = sorted(glob(op.join('data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('data', 'imdb1', 'pos', '*.txt')))

texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos

y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))

2000 documents


In [4]:
def count_words(texts, ignore_stop_words=True):
    words = set(' '.join(texts).split(' '))

    if ignore_stop_words:
        stop_words = open('data/english.stop').read()
        words = set(filter(lambda w: w not in stop_words, words))

    d = {w: i for i, w in enumerate(words)}

    counts = np.zeros((len(texts), len(words)))
    for ix_text, text in enumerate(texts):
        for word in text.split(' '):
            if not ignore_stop_words or word not in stop_words:
                counts[ix_text, d[word]] += 1

    return d, counts

#2 Les classes positives et négatives ont été assignées à partir des notes données au film, avec une échelle différente selon le système de notation de la source.

In [5]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.prior = None
        self.condprobe = None

    def fit(self, x, y):
        n_docs, n_words = x.shape
        classes = set(y)
        p = len(classes)
        self.prior = np.empty(p)
        self.condprobe = np.empty((p, n_words))

        for c in classes:
            self.prior[c] = len(y[y == c]) / n_docs
            t = np.sum(x[y == c], axis=0)
            self.condprobe[c] = (t + 1) / np.sum(t + 1)

        self.condprobe = self.condprobe.T

    def predict(self, x):
        score = np.empty((x.shape[0], len(self.prior)))
        score[:, :] = np.log(self.prior)

        for row, col in np.transpose(np.nonzero(x)):
            score[row] += np.log(self.condprobe[col])

        return np.argmax(score, axis=1)

    def score(self, x, y):
        return np.mean(self.predict(x) == y)

In [6]:
_, X = count_words(texts, ignore_stop_words=False)
for train_index, test_index in KFold(5).split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    nb = NB()
    nb.fit(X_train, y_train)
    print(nb.score(X_test, y_test))

0.8625
0.875
0.8275
0.7275
0.735


In [None]:
_, X = count_words(texts, ignore_stop_words=True)
for train_index, test_index in KFold(5).split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    nb = NB()
    nb.fit(X_train, y_train)
    print(nb.score(X_test, y_test))