In [68]:
import os
from glob import glob

# Download and unpack https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# and place the extracted 'aclImdb' folder in the directory this notebook is run
# from (the data path used below is relative to the current working directory).

def read_data(dataset, root='aclImdb'):
    """Load the raw review texts and sentiment labels for one dataset split.

    Parameters
    ----------
    dataset : str
        Name of the split sub-folder inside ``root`` (called with ``'train'``
        and ``'test'`` in this notebook).
    root : str, optional
        Path of the unpacked corpus folder. Defaults to ``'aclImdb'``, which
        is resolved relative to the current working directory.

    Returns
    -------
    texts : list of str
        Contents of every ``*.txt`` file found under ``root/dataset/pos`` and
        then ``root/dataset/neg``; within each label the files are in sorted
        path order.
    labels : list of str
        ``'pos'`` or ``'neg'`` for each entry of ``texts``, in the same order.
        Both lists are empty when no files match.
    """
    texts = []
    labels = []
    for label in ['pos', 'neg']:
        # glob() yields matches in arbitrary, filesystem-dependent order;
        # sorting makes the sample order reproducible across machines.
        for file in sorted(glob(os.path.join(root, dataset, label, '*.txt'))):
            # Decode explicitly as UTF-8: without it open() falls back to the
            # locale's default encoding, which can fail or mis-decode the
            # non-ASCII characters in the reviews (e.g. cp1252 on Windows).
            with open(file, encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

# Read both splits of the corpus: raw review strings plus their
# 'pos'/'neg' labels, kept in matching order.
X_train_fulltext, y_train = read_data('train')
X_test_fulltext, y_test = read_data('test')

In [67]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Bag-of-words features with English stop words removed. The vocabulary is
# learned from the training reviews only; the test reviews are then mapped
# onto that same vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_fulltext)
X_test = vectorizer.transform(X_test_fulltext)

# Train a multinomial Naive Bayes classifier on the word counts
# (fit() returns the estimator itself, so it can be chained).
nb = MultinomialNB().fit(X_train, y_train)

# Predict the held-out split, then print the confusion matrix followed by
# the per-class precision/recall/F1 report.
y_pred = nb.predict(X_test)

print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[10995  1505]
 [ 3003  9497]]
              precision    recall  f1-score   support

         neg       0.79      0.88      0.83     12500
         pos       0.86      0.76      0.81     12500

   micro avg       0.82      0.82      0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

