In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
import joblib
import eli5
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
import bz2, pickle
with bz2.BZ2File("reviewdata.pickle.bz2") as f:
    train_texts, test_texts, y_train, y_test = pickle.load(f)

In [15]:
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.79      0.88      0.83     12500
         pos       0.86      0.76      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [3]:
def short_classification_report (y_test, y_pred):
    print("                \t precision \t recall")
    print(f"positive reviews:\t {metrics.precision_score(y_test,y_pred, pos_label='pos'):0.2f} \t\t {metrics.recall_score(y_test,y_pred, pos_label='pos'):0.2f}")
    print(f"negative reviews:\t {metrics.precision_score(y_test,y_pred, pos_label='neg'):0.2f} \t\t {metrics.recall_score(y_test,y_pred, pos_label='neg'):0.2f}")

configurations = [('NB with Count', CountVectorizer(min_df=5, max_df=.5), MultinomialNB()),
                 ('NB with TfIdf', TfidfVectorizer(min_df=5, max_df=.5), MultinomialNB()),
                 ('LogReg with Count', CountVectorizer(min_df=5, max_df=.5), LogisticRegression(solver='liblinear')),
                 ('LogReg with TfIdf', TfidfVectorizer(min_df=5, max_df=.5), LogisticRegression(solver='liblinear'))]

for description, vectorizer, classifier in configurations:
    print(description)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    short_classification_report(y_test, y_pred)
    print('\n')

NB with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.77
negative reviews:	 0.79 		 0.88


NB with TfIdf
                	 precision 	 recall
positive reviews:	 0.87 		 0.78
negative reviews:	 0.80 		 0.88


LogReg with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.85
negative reviews:	 0.85 		 0.87


LogReg with TfIdf
                	 precision 	 recall
positive reviews:	 0.89 		 0.88
negative reviews:	 0.88 		 0.89




In [20]:
for description, vectorizer, classifier in configurations:
    print(description)
    pipe = make_pipeline(vectorizer, classifier)
    pipe.fit(train_texts, y_train)
    y_pred = pipe.predict(test_texts)
    short_classification_report(y_test, y_pred)
    print('\n')

NB with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.77
negative reviews:	 0.79 		 0.88


NB with TfIdf
                	 precision 	 recall
positive reviews:	 0.87 		 0.78
negative reviews:	 0.80 		 0.88


LogReg with Count
                	 precision 	 recall
positive reviews:	 0.87 		 0.85
negative reviews:	 0.85 		 0.87


LogReg with TfIdf
                	 precision 	 recall
positive reviews:	 0.89 		 0.88
negative reviews:	 0.88 		 0.89




In [5]:
pipeline = Pipeline(steps = [('vectorizer', TfidfVectorizer()), ('classifier', LogisticRegression(solver='liblinear'))])
grid = {
    'vectorizer__ngram_range' : [(1,1), (1,2)],
    'vectorizer__max_df': [0.5, 1.0],
    'vectorizer__min_df': [0, 5],
    'classifier__C': [0.01, 1, 100]
}

search = GridSearchCV(estimator=pipeline,
                      param_grid=grid,
                      scoring='accuracy',   # all classes are balanced, let's just score on accuracy
                      cv=5,
                      n_jobs=-1,  # use all cpus
                      verbose=10)
search.fit(train_texts, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
print(short_classification_report(y_test, search.predict(test_texts)))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   49.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 118 out of 120 | elapsed:  5.0min remaining:    5.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  5.1min finished


Using these hyperparameters {'classifier__C': 100, 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 0, 'vectorizer__ngram_range': (1, 2)}, we get the best performance:
                	 precision 	 recall
positive reviews:	 0.90 		 0.90
negative reviews:	 0.90 		 0.90
None


In [12]:
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
y_vader = []
for review in test_texts:
    sentiment = analyzer.polarity_scores(review)
    if sentiment['compound']>0:
        y_vader.append('pos')
    elif sentiment['compound']<0:
        y_vader.append('neg')
    else:
        y_vader.append('dont know')
print(metrics.confusion_matrix(y_test, y_vader))
print(metrics.classification_report(y_test, y_vader))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/wva/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


[[    0     0     0]
 [    6  6688  5806]
 [    5  1745 10750]]
              precision    recall  f1-score   support

   dont know       0.00      0.00      0.00         0
         neg       0.79      0.54      0.64     12500
         pos       0.65      0.86      0.74     12500

    accuracy                           0.70     25000
   macro avg       0.48      0.47      0.46     25000
weighted avg       0.72      0.70      0.69     25000



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# let's save the vectorizer and classifier from the first example
pickle.dump(vectorizer,open("myvectorizer.pkl",mode="wb"))
joblib.dump(nb, "myclassifier.pkl")

#Then, later on, instead of fitting a new vectorizer, you can simply load the old one and use it:

listwithnewdata = ['This is a great movie', 'I hated this one.', 'What an awful fail']

myvectorizer = pickle.load(open("myvectorizer.pkl",mode="rb"))
new_features = vectorizer.transform(listwithnewdata)

myclassifier = joblib.load("myclassifier.pkl")
predictions = myclassifier.predict(new_features)

for review, label in zip(listwithnewdata, predictions):
    print(f"'{review}' probably belongs to class '{label}'.")


'This is a great movie' probably belongs to class 'pos'.
'I hated this one.' probably belongs to class 'neg'.
'What an awful fail' probably belongs to class 'neg'.


In [21]:
eli5.show_weights(pipe, top = 10)

Weight?,Feature
+7.173,great
+6.101,excellent
+5.055,best
+4.791,perfect
… 13663 more positive …,… 13663 more positive …
… 13574 more negative …,… 13574 more negative …
-5.337,poor
-5.733,boring
-6.315,waste
-6.349,awful


In [22]:
eli5.show_prediction(classifier, test_texts[0], vec=vectorizer)

Contribution?,Feature
0.271,cheap
0.27,poorly
0.251,dull
0.167,predictable
0.154,wooden
0.135,painful
0.117,silly
0.115,rubbish
0.102,cardboard
0.089,painfully
