In [1]:
import os
from glob import glob

# unpack the dataset from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz and store the folder 'aclImdb' in the same folder as this script

def read_data(dataset, data_dir='aclImdb'):
    """Load the raw review texts and their sentiment labels for one split.

    Parameters
    ----------
    dataset : str
        Name of the split sub-folder below ``data_dir`` (this notebook uses
        'train' and 'test').
    data_dir : str, optional
        Root folder of the unpacked dataset. Defaults to 'aclImdb', resolved
        relative to the current working directory.

    Returns
    -------
    texts : list of str
        Full content of every ``*.txt`` file found in the 'pos' and 'neg'
        sub-folders of the split ('pos' files first, then 'neg').
    labels : list of str
        'pos' or 'neg' for each entry of ``texts``, in the same order.
    """
    texts = []
    labels = []
    for label in ['pos', 'neg']:
        # glob() yields files in arbitrary, OS-dependent order; sorting makes
        # the sample order (and e.g. "the first test review") reproducible.
        for file in sorted(glob(os.path.join(data_dir, dataset, label, '*.txt'))):
            # Explicit encoding: without it the platform default is used, which
            # makes the result (or a UnicodeDecodeError) machine-dependent.
            # NOTE(review): assumes the review files are UTF-8 - confirm.
            with open(file, encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

# Read both predefined splits in one go: each read_data() call yields the raw
# review texts together with their 'pos'/'neg' labels.
(X_train_fulltext, y_train), (X_test_fulltext, y_test) = (
    read_data(split) for split in ('train', 'test')
)

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Baseline: raw term counts (English stop words removed) fed into a
# Multinomial Naive Bayes classifier.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_fulltext)  # vocabulary is learned from the training texts
X_test = vectorizer.transform(X_test_fulltext)        # test texts are mapped onto that same vocabulary

nb = MultinomialNB().fit(X_train, y_train)  # fit() hands back the fitted estimator itself
y_pred = nb.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/f1 report.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

[[10995  1505]
 [ 3003  9497]]
              precision    recall  f1-score   support

         neg       0.79      0.88      0.83     12500
         pos       0.86      0.76      0.81     12500

   micro avg       0.82      0.82      0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [16]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

def short_classification_report (y_test, y_pred):
    """Print a compact precision/recall table for the 'pos' and 'neg' classes.

    Parameters
    ----------
    y_test : sequence of str
        True labels.
    y_pred : sequence of str
        Predicted labels, aligned with ``y_test``.

    Nothing is returned; the table (header plus one row per class, values
    rounded to two decimals) is written to stdout.
    """
    print("                \t precision \t recall")
    # One row per class: each class in turn is treated as the "positive" label.
    for row_title, class_label in (("positive reviews", 'pos'), ("negative reviews", 'neg')):
        precision = metrics.precision_score(y_test, y_pred, pos_label=class_label)
        recall = metrics.recall_score(y_test, y_pred, pos_label=class_label)
        print(f"{row_title}:\t {precision:0.2f} \t\t {recall:0.2f}")

# Four experiments: every combination of {count, tf-idf} features with
# {Multinomial Naive Bayes, logistic regression}. Each entry is
# (description, vectorizer, classifier). All vectorizers use min_df=5 and
# max_df=.5, i.e. terms occurring in fewer than 5 documents or in more than
# half of the documents are dropped.
configurations = [('NB with Count', CountVectorizer(min_df=5, max_df=.5), MultinomialNB()),
                 ('NB with TfIdf', TfidfVectorizer(min_df=5, max_df=.5), MultinomialNB()),
                 ('LogReg with Count', CountVectorizer(min_df=5, max_df=.5), LogisticRegression(solver='liblinear')),
                 ('LogReg with TfIdf', TfidfVectorizer(min_df=5, max_df=.5), LogisticRegression(solver='liblinear'))]

# Manual variant: fit the vectorizer, transform both splits, then fit and
# evaluate the classifier.
# NOTE(review): the loop rebinds the module-level names `vectorizer`, `X_train`,
# `X_test` and `y_pred` that the Naive Bayes baseline cell created; after this
# cell `vectorizer` and `classifier` refer to the last configuration. The
# save/load cell pickles `vectorizer` together with `nb`, so it only works if
# it runs before this loop - confirm the intended cell order.
for description, vectorizer, classifier in configurations:
    print(description)
    X_train = vectorizer.fit_transform(X_train_fulltext)  # learn vocabulary (and idf weights, for tf-idf) on the training texts
    X_test = vectorizer.transform(X_test_fulltext)        # apply the fitted vectorizer to the test texts
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    short_classification_report(y_test, y_pred)
    print('\n')

NB with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.77
negative reviews:	 0.79 		 0.88


NB with TfIdf
                	 precision 	 recall
positive reviews:	 0.87 		 0.78
negative reviews:	 0.80 		 0.88


LogReg with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.85
negative reviews:	 0.85 		 0.87


LogReg with TfIdf
                	 precision 	 recall
positive reviews:	 0.89 		 0.88
negative reviews:	 0.88 		 0.89




In [17]:
from sklearn.pipeline import make_pipeline

# Same four experiments as before, but each vectorizer/classifier pair is
# chained into a single Pipeline: raw texts go in directly and the explicit
# fit_transform/transform steps disappear.
for description, vectorizer, classifier in configurations:
    print(description)
    # fit() returns the pipeline itself, so building and fitting can be chained.
    pipe = make_pipeline(vectorizer, classifier).fit(X_train_fulltext, y_train)
    y_pred = pipe.predict(X_test_fulltext)
    short_classification_report(y_test, y_pred)
    print('\n')

NB with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.77
negative reviews:	 0.79 		 0.88


NB with TfIdf
                	 precision 	 recall
positive reviews:	 0.87 		 0.78
negative reviews:	 0.80 		 0.88


LogReg with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.85
negative reviews:	 0.85 		 0.87


LogReg with TfIdf
                	 precision 	 recall
positive reviews:	 0.89 		 0.88
negative reviews:	 0.88 		 0.89




In [129]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Tune the vectorizer and the classifier jointly with a cross-validated grid search.
# The template pipeline gets its own name: GridSearchCV works on clones and leaves
# the estimator passed in unfitted, so reusing the name `pipe` would replace the
# fitted pipeline from the make_pipeline loop (which the eli5.show_weights cell
# displays) with an unfitted one.
tuning_pipe = Pipeline(steps = [('vectorizer', TfidfVectorizer()), ('classifier', LogisticRegression(solver='liblinear'))])
grid = {
    'vectorizer__ngram_range' : [(1,1), (1,2)],   # unigrams only vs. unigrams + bigrams
    'vectorizer__max_df': [0.5, 1.0],
    # An integer min_df of 0 filters nothing more than 1 does (every vocabulary term
    # occurs in at least one document), and recent scikit-learn versions reject it.
    'vectorizer__min_df': [1, 5],
    'classifier__C': [0.01, 1, 100]
}

search = GridSearchCV(estimator=tuning_pipe,
                      param_grid=grid,
                      scoring='accuracy',   # all classes are balanced, let's just score on accuracy
                      cv=5,
                      n_jobs=-1,  # use all cpus
                      verbose=10)
search.fit(X_train_fulltext, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
# short_classification_report() prints its table and returns None; wrapping the
# call in print() produced a stray "None" line after the table.
short_classification_report(y_test, search.predict(X_test_fulltext))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 15.9min finished


Using these hyperparameters {'classifier__C': 100, 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 5, 'vectorizer__ngram_range': (1, 2)}, we get the best performance:
                	 precision 	 recall
positive reviews:	 0.90 		 0.90
negative reviews:	 0.90 		 0.90
None


In [143]:
from nltk.sentiment import vader

# Lexicon-based baseline without any training: label each test review by the
# sign of VADER's 'compound' score.
analyzer = vader.SentimentIntensityAnalyzer()
y_vader = [
    # strictly positive -> 'pos', strictly negative -> 'neg', anything else -> 'dont know'
    'pos' if compound > 0 else ('neg' if compound < 0 else 'dont know')
    for compound in (analyzer.polarity_scores(review)['compound'] for review in X_test_fulltext)
]
# 'dont know' is a third predicted label, so it shows up as an extra row/column
# in the confusion matrix and in the report.
print(confusion_matrix(y_test, y_vader), classification_report(y_test, y_vader), sep='\n')

[[    0     0     0]
 [    6  6696  5798]
 [    5  1751 10744]]
              precision    recall  f1-score   support

   dont know       0.00      0.00      0.00         0
         neg       0.79      0.54      0.64     12500
         pos       0.65      0.86      0.74     12500

   micro avg       0.70      0.70      0.70     25000
   macro avg       0.48      0.47      0.46     25000
weighted avg       0.72      0.70      0.69     25000



In [13]:
import pickle
try:
    import joblib  # standalone package; the copy bundled as sklearn.externals.joblib was removed from newer scikit-learn releases
except ImportError:
    from sklearn.externals import joblib  # fallback for old environments without a top-level joblib

# let's save the vectorizer and classifier from the first example
# NOTE(review): `vectorizer` must still be the CountVectorizer that was fitted together
# with `nb`. The configuration loops rebind `vectorizer`, so run this cell before them
# (or give the baseline vectorizer its own name) - confirm the intended cell order.
with open("myvectorizer.pkl", mode="wb") as vectorizer_file:  # context manager closes the file handle
    pickle.dump(vectorizer, vectorizer_file)
joblib.dump(nb, "myclassifier.pkl")

#Then, later on, instead of fitting a new vectorizer, you can simply load the old one and use it:

listwithnewdata = ['This is a great movie', 'I hated this one.', 'What an awful fail']

# Only load pickle/joblib files you created yourself: unpickling can execute arbitrary code.
with open("myvectorizer.pkl", mode="rb") as vectorizer_file:
    myvectorizer = pickle.load(vectorizer_file)
# Use the *reloaded* vectorizer - the point of this demo - not the in-memory one.
new_features = myvectorizer.transform(listwithnewdata)

myclassifier = joblib.load("myclassifier.pkl")
predictions = myclassifier.predict(new_features)

for review, label in zip(listwithnewdata, predictions):
    print(f"'{review}' probably belongs to class '{label}'.")


'This is a great movie' probably belongs to class 'pos'.
'I hated this one.' probably belongs to class 'neg'.
'What an awful fail' probably belongs to class 'neg'.


In [18]:
import eli5
# Show the 10 most influential features of a fitted pipeline.
# NOTE(review): relies on the module-level `pipe`; judging by the execution count (18)
# this presumably ran right after the make_pipeline loop (17), i.e. on the last
# configuration ('LogReg with TfIdf'). Make sure no other cell has rebound `pipe`
# to an unfitted estimator before running this.
eli5.show_weights(pipe, top = 10)

Weight?,Feature
+7.173,great
+6.101,excellent
+5.055,best
+4.791,perfect
… 13663 more positive …,… 13663 more positive …
… 13574 more negative …,… 13574 more negative …
-5.337,poor
-5.733,boring
-6.315,waste
-6.349,awful


In [30]:
# Explain the prediction for the first test review.
# NOTE(review): `classifier` and `vectorizer` are the loop variables left over from
# the configuration loops, i.e. the last entry of `configurations`
# ('LogReg with TfIdf') - confirm that this is the intended model.
eli5.show_prediction(classifier, X_test_fulltext[0], vec=vectorizer)

Contribution?,Feature
1.237,Highlighted in text (sum)
0.013,<BIAS>
