In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
import joblib
import eli5
import nltk
import bz2
import pickle

from nltk.sentiment.vader import SentimentIntensityAnalyzer

2021-07-06 22:43:48.142912: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-06 22:43:48.142927: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
filename = "reviewdata.pickle.bz2"
with bz2.BZ2File(filename, "r") as f:
    text_train, text_test, y_train, y_test = pickle.load(f)


In [3]:
vectorizer = CountVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

rep=metrics.classification_report(y_test, y_pred)
print(rep)

              precision    recall  f1-score   support

         neg       0.79      0.88      0.83     12500
         pos       0.86      0.76      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [4]:
def short_classification_report (y_test, y_pred):
  print("    \tPrecision\tRecall")
  for label in set(y_pred):
    pr = metrics.precision_score(y_test, y_pred, pos_label=label)
    re = metrics.recall_score(y_test,y_pred, pos_label=label)
    print(f"{label}:\t{pr:0.2f}\t\t{re:0.2f}")

configs = [
    ("NB with Count", CountVectorizer(min_df=5, max_df=.5), MultinomialNB()),
    ("NB with TfIdf", TfidfVectorizer(min_df=5, max_df=.5), MultinomialNB()),
    ("LogReg with Count", CountVectorizer(min_df=5, max_df=.5), 
     LogisticRegression(solver="liblinear")),
    ("LogReg with TfIdf", TfidfVectorizer(min_df=5, max_df=.5), 
     LogisticRegression(solver="liblinear"))]

for name, vectorizer, classifier in configs:
    print(name)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    short_classification_report(y_test, y_pred)
    print("\n")

NB with Count
    	Precision	Recall
neg:	0.79		0.88
pos:	0.87		0.77


NB with TfIdf
    	Precision	Recall
neg:	0.80		0.88
pos:	0.87		0.78


LogReg with Count
    	Precision	Recall
neg:	0.85		0.87
pos:	0.87		0.85


LogReg with TfIdf
    	Precision	Recall
neg:	0.88		0.89
pos:	0.89		0.88




In [5]:
for name, vectorizer, classifier in configs:
    print(name)
    pipe = make_pipeline(vectorizer, classifier)
    pipe.fit(text_train, y_train)
    y_pred = pipe.predict(text_test)
    short_classification_report(y_test, y_pred)
    print("\n")

NB with Count
    	Precision	Recall
neg:	0.79		0.88
pos:	0.87		0.77


NB with TfIdf
    	Precision	Recall
neg:	0.80		0.88
pos:	0.87		0.78


LogReg with Count
    	Precision	Recall
neg:	0.85		0.87
pos:	0.87		0.85


LogReg with TfIdf
    	Precision	Recall
neg:	0.88		0.89
pos:	0.89		0.88




In [6]:
pipeline = Pipeline(steps = [("vectorizer", TfidfVectorizer()), 
                             ("classifier", LogisticRegression(solver="liblinear"))])
grid = {"vectorizer__ngram_range" : [(1,1), (1,2)],
        "vectorizer__max_df": [0.5, 1.0],
        "vectorizer__min_df": [0, 5],
        "classifier__C": [0.01, 1, 100]
       }
search = GridSearchCV(estimator=pipeline, param_grid=grid, cv=5,
                      scoring="accuracy", n_jobs=-1, verbose=10)
search.fit(text_train, y_train)
print(f"Best parameters: {search.best_params_}")
pred = search.predict(text_test)
print(short_classification_report(y_test, pred))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters: {'classifier__C': 100, 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 0, 'vectorizer__ngram_range': (1, 2)}
    	Precision	Recall
neg:	0.90		0.90
pos:	0.90		0.90
None


In [7]:
nltk.download("vader_lexicon")
analyzer = SentimentIntensityAnalyzer()
pred = []
for review in text_test:
    sentiment = analyzer.polarity_scores(review)
    if sentiment["compound"]>0:
        pred.append("pos")
    elif sentiment["compound"]<0:
        pred.append("neg")
    else:
        pred.append("dont know")

print(metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test,pred))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/wva/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


[[    0     0     0]
 [    6  6706  5788]
 [    5  1748 10747]]
              precision    recall  f1-score   support

   dont know       0.00      0.00      0.00         0
         neg       0.79      0.54      0.64     12500
         pos       0.65      0.86      0.74     12500

    accuracy                           0.70     25000
   macro avg       0.48      0.47      0.46     25000
weighted avg       0.72      0.70      0.69     25000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Make a vectorizer and train a classifier
vectorizer=TfidfVectorizer(min_df=5, max_df=.5)
classifier=LogisticRegression(solver="liblinear")
X_train=vectorizer.fit_transform(text_train)
classifier.fit(X_train, y_train)

# Save them to disk
with open("myvectorizer.pkl",mode="wb") as f:
    pickle.dump(vectorizer, f)
with open("myclassifier.pkl",mode="wb") as f:
    joblib.dump(classifier, f)
  
# Later on, re-load this classifier and apply:
new_texts = ["This is a great movie", 
            "I hated this one.", 
            "What an awful fail"]

with open("myvectorizer.pkl",mode="rb") as f:
    myvectorizer = pickle.load(f)
with open("myclassifier.pkl",mode="rb") as f:
    myclassifier = joblib.load(f)
    
new_features = myvectorizer.transform(new_texts)
pred = myclassifier.predict(new_features)

for review, label in zip(new_texts, pred):
    print(f"'{review}' is probably '{label}'.")

'This is a great movie' is probably 'pos'.
'I hated this one.' is probably 'neg'.
'What an awful fail' is probably 'neg'.


In [10]:
pipe = make_pipeline(TfidfVectorizer(min_df=5, max_df=.5), 
                     LogisticRegression(solver="liblinear"))
pipe.fit(text_train, y_train)
eli5.show_weights(pipe, top = 10)

Weight?,Feature
+7.173,great
+6.101,excellent
+5.055,best
+4.791,perfect
… 13663 more positive …,… 13663 more positive …
… 13574 more negative …,… 13574 more negative …
-5.337,poor
-5.733,boring
-6.315,waste
-6.349,awful


In [None]:
eli5.show_prediction(classifier, text_test[0], vec=vectorizer, targets=["pos"])

[CV 2/5; 1/24] START classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df=0, vectorizer__ngram_range=(1, 1)
[CV 2/5; 1/24] END classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df=0, vectorizer__ngram_range=(1, 1);, score=0.826 total time=   3.2s
[CV 2/5; 3/24] START classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1)
[CV 2/5; 3/24] END classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1);, score=0.826 total time=   2.9s
[CV 4/5; 3/24] START classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1)
[CV 4/5; 3/24] END classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1);, score=0.809 total time=   3.0s
[CV 2/5; 4/24] START classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2)
[CV 2/5; 4/24] END classifier__C=0.01, vectorizer__max_df=0.5, vectorizer__min_df