In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
import joblib
import eli5
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# ugly copy paste for now from dictionary_py as I did not manage to import from a .ipynb file
# (even though the modules imdb and/or nbimport should be able to do this)
from pathlib import Path
import tarfile
import bz2
import urllib.request
import re
import pickle
import requests

def get_review_data(filename = "reviewdata.pickle.bz2", url = "http://cssbook.net/d/aclImdb_v1.tar.gz"):
    '''
    Checks whether review dataset has already been downloaded.
    If not, downloads it.
    
    Parameters
    ----------
    filename : string
        name of cached file
    url : string
        url of IMDB dataset
    
    Returns
    -------
    tuple of lists of strings
        reviews_train, reviews_test, label_train, label_test
    '''

    if Path(filename).exists():
        print(f"Using cached file {filename}")
        with bz2.BZ2File(filename, 'r') as f:
            reviews_train, reviews_test, label_train, label_test = pickle.load(f)
    else:
        print(f"Downloading from {url}")
        fn, _headers = urllib.request.urlretrieve(url, filename=None)
        t = tarfile.open(fn, mode="r:gz")
        reviews_train, reviews_test, label_train, label_test = [], [], [], []
        for file in t.getmembers():
            try:
                _imdb, dataset, label, _fn = Path(file.name).parts
            except ValueError:
                # if the Path cannot be parsed, e.g. because it does not consist of exactly four parts, then it is not a part of the dataset but for instance a folder name. Let's skip it then
                continue
            if dataset == "train" and (label=='pos' or label=='neg'):
                reviews_train.append(t.extractfile(file).read().decode("utf-8"))
                label_train.append(label)
            elif dataset == "test" and (label=='pos' or label=='neg'):
                reviews_test.append(t.extractfile(file).read().decode("utf-8"))
                label_test.append(label)
        print(f"Saving {len(label_train)} training and {len(label_test)} test cases to {filename}")
        with bz2.BZ2File(filename, 'w') as f:
            pickle.dump((reviews_train, reviews_test, label_train, label_test), f)
    return reviews_train, reviews_test, label_train, label_test

reviews_train, reviews_test, y_train, y_test = get_review_data()

Using cached file reviewdata.pickle.bz2


In [3]:
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.79      0.88      0.83     12500
         pos       0.86      0.76      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [4]:
def short_classification_report (y_test, y_pred):
    print("                \t precision \t recall")
    print(f"positive reviews:\t {metrics.precision_score(y_test,y_pred, pos_label='pos'):0.2f} \t\t {metrics.recall_score(y_test,y_pred, pos_label='pos'):0.2f}")
    print(f"negative reviews:\t {metrics.precision_score(y_test,y_pred, pos_label='neg'):0.2f} \t\t {metrics.recall_score(y_test,y_pred, pos_label='neg'):0.2f}")

configurations = [('NB with Count', CountVectorizer(min_df=5, max_df=.5), MultinomialNB()),
                 ('NB with TfIdf', TfidfVectorizer(min_df=5, max_df=.5), MultinomialNB()),
                 ('LogReg with Count', CountVectorizer(min_df=5, max_df=.5), LogisticRegression(solver='liblinear')),
                 ('LogReg with TfIdf', TfidfVectorizer(min_df=5, max_df=.5), LogisticRegression(solver='liblinear'))]

for description, vectorizer, classifier in configurations:
    print(description)
    X_train = vectorizer.fit_transform(reviews_train)
    X_test = vectorizer.transform(reviews_test)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    short_classification_report(y_test, y_pred)
    print('\n')

NB with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.77
negative reviews:	 0.79 		 0.88


NB with TfIdf
                	 precision 	 recall
positive reviews:	 0.87 		 0.78
negative reviews:	 0.80 		 0.88


LogReg with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.85
negative reviews:	 0.85 		 0.87


LogReg with TfIdf
                	 precision 	 recall
positive reviews:	 0.89 		 0.88
negative reviews:	 0.88 		 0.89




In [5]:
for description, vectorizer, classifier in configurations:
    print(description)
    pipe = make_pipeline(vectorizer, classifier)
    pipe.fit(reviews_train, y_train)
    y_pred = pipe.predict(reviews_test)
    short_classification_report(y_test, y_pred)
    print('\n')

NB with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.77
negative reviews:	 0.79 		 0.88


NB with TfIdf
                	 precision 	 recall
positive reviews:	 0.87 		 0.78
negative reviews:	 0.80 		 0.88


LogReg with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.85
negative reviews:	 0.85 		 0.87


LogReg with TfIdf
                	 precision 	 recall
positive reviews:	 0.89 		 0.88
negative reviews:	 0.88 		 0.89




In [6]:
pipeline = Pipeline(steps = [('vectorizer', TfidfVectorizer()), ('classifier', LogisticRegression(solver='liblinear'))])
grid = {
    'vectorizer__ngram_range' : [(1,1), (1,2)],
    'vectorizer__max_df': [0.5, 1.0],
    'vectorizer__min_df': [0, 5],
    'classifier__C': [0.01, 1, 100]
}

search = GridSearchCV(estimator=pipeline,
                      param_grid=grid,
                      scoring='accuracy',   # all classes are balanced, let's just score on accuracy
                      cv=5,
                      n_jobs=-1,  # use all cpus
                      verbose=10)
search.fit(reviews_train, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
print(short_classification_report(y_test, search.predict(reviews_test)))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 11.3min finished


Using these hyperparameters {'classifier__C': 100, 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 0, 'vectorizer__ngram_range': (1, 2)}, we get the best performance:
                	 precision 	 recall
positive reviews:	 0.90 		 0.90
negative reviews:	 0.90 		 0.90
None


In [7]:
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
y_vader = []
for review in reviews_test:
    sentiment = analyzer.polarity_scores(review)
    if sentiment['compound']>0:
        y_vader.append('pos')
    elif sentiment['compound']<0:
        y_vader.append('neg')
    else:
        y_vader.append('dont know')
# For advanced readers: The whole for loop can be replaced by just two lines (at the expense of being much less readable)
# labels = ({1:'pos',-1:'neg',0:'dont know'})
# y_vader = [labels.get(np.sign(analyzer.polarity_scores(review)['compound'])) for review in test_texts]

print(metrics.confusion_matrix(y_test, y_vader))
print(metrics.classification_report(y_test, y_vader))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/damian/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


[[    0     0     0]
 [    6  6688  5806]
 [    5  1745 10750]]
              precision    recall  f1-score   support

   dont know       0.00      0.00      0.00         0
         neg       0.79      0.54      0.64     12500
         pos       0.65      0.86      0.74     12500

    accuracy                           0.70     25000
   macro avg       0.48      0.47      0.46     25000
weighted avg       0.72      0.70      0.69     25000



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# let's take a vectorizer and classifier...
vectorizer = TfidfVectorizer(min_df=5, max_df=.5)
classifier = LogisticRegression(solver='liblinear')
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
classifier.fit(X_train, y_train)
# .... and save it:
pickle.dump(vectorizer,open("myvectorizer.pkl",mode="wb"))
joblib.dump(nb, "myclassifier.pkl")

#Then, later on, instead of fitting a new vectorizer, you can simply load the old one and use it:

listwithnewdata = ['This is a great movie', 'I hated this one.', 'What an awful fail']

myvectorizer = pickle.load(open("myvectorizer.pkl",mode="rb"))
new_features = vectorizer.transform(listwithnewdata)

myclassifier = joblib.load("myclassifier.pkl")
predictions = myclassifier.predict(new_features)

for review, label in zip(listwithnewdata, predictions):
    print(f"'{review}' probably belongs to class '{label}'.")


'This is a great movie' probably belongs to class 'pos'.
'I hated this one.' probably belongs to class 'neg'.
'What an awful fail' probably belongs to class 'neg'.


In [20]:
eli5.show_weights(pipe, top = 10)

Weight?,Feature
+7.173,great
+6.101,excellent
+5.055,best
+4.791,perfect
… 13663 more positive …,… 13663 more positive …
… 13574 more negative …,… 13574 more negative …
-5.337,poor
-5.733,boring
-6.315,waste
-6.349,awful


In [25]:
eli5.show_prediction(classifier, reviews_test[0], vec=vectorizer, targets=['pos'])

Contribution?,Feature
0.013,<BIAS>
-1.83,Highlighted in text (sum)
