In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import spacy
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## Bagging to BERT: Sentiment analysis three ways
This notebook accompanies the ODSC blog post introducing the Bagging to BERT workshop.  This will be expanded for the full workshop.

Up first is some preprocessing.  You'll either need to download the [imdb review data](https://ai.stanford.edu/~amaas/data/sentiment/) and save it to this directory OR download the processed data from [here](https://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharinghttps://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharing).

In [82]:
# # processing the original data into DataFrame
# # here for reference, don't need to run this if you're using reviews.pkl.gz
# source_path = Path('./aclImdb/')
# #neg_files = source_path.glob('./*/neg/*.txt')
# #pos_files = source_path.glob('./*/pos/*.txt')
# all_files = []
# for f in source_path.glob('./*/*/*.txt'):
#     filename = f.as_posix()
#     if 'unsup' not in filename:
#         # split up into useful components
#         _, split, sent, idx = filename.split('/')
#         idx = int(idx.split('_')[0])
#         all_files.append([idx, split, sent, f.read_text()])
# review_df = pd.DataFrame(all_files)
# review_df.columns = ['idx', 'split', 'label', 'text']
# # some minor html cruft is in here
# review_df['text'] = review_df['text'].str.replace('<br /><br />', '')
# review_df = review_df.to_pickle('reviews.pkl.gz')

In [199]:
# can skip here if you already have reviews.pkl.gz
review_df = pd.read_pickle('reviews.pkl.gz')

### Word counts
A very basic way to use a sanitized list of tokens is to do a word count. This unlocks a lot of insights right off and is an important step in exploratory data analysis in text.

In [83]:
# take a positive and negative review for examples
# we'll use Star Wars Episode VI since everyone likes a Star War
neg_review = review_df.loc[(review_df.label=='neg')].iloc[0]['text']
pos_review = review_df[(review_df.label=='pos')].iloc[0]['text']
print('Negative\n', neg_review, '\n')
print('Positive\n', pos_review)

Negative
 Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook. 

Positive
 Based on an actual story, John Boorman shows the struggle of an American doctor, whose husband and son were murdered and she was continually plagued with her loss. A holiday to Burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in Rangoon, she could not leave the country with

In [84]:
# base python word count - split on whitespace, use Counter object)
print(Counter(neg_review.split()))

Counter({'the': 4, 'it': 4, 'for': 3, 'and': 3, 'The': 3, 'performances': 2, 'with': 2, 'in': 2, 'worth': 2, 'Holbrook.': 2, 'movie': 2, 'that': 2, 'as': 2, 'is': 2, 'by': 2, 'a': 2, 'uninteresting': 2, 'Alan': 1, 'Rickman': 1, '&': 1, 'Emma': 1, 'Thompson': 1, 'give': 1, 'good': 1, 'southern/New': 1, 'Orleans': 1, 'accents': 1, 'this': 1, 'detective': 1, 'flick.': 1, "It's": 1, 'seeing': 1, 'their': 1, 'scenes-': 1, "Rickman's": 1, 'scene': 1, 'Hal': 1, 'These': 1, 'three': 1, 'actors': 1, 'mannage': 1, 'to': 1, 'entertain': 1, 'us': 1, 'no': 1, 'matter': 1, 'what': 1, 'movie,': 1, 'seems.': 1, 'plot': 1, 'shows': 1, 'potential,': 1, 'but': 1, 'one': 1, 'gets': 1, 'impression': 1, 'watching': 1, 'film': 1, 'was': 1, 'not': 1, 'pulled': 1, 'off': 1, 'well': 1, 'could': 1, 'have': 1, 'been.': 1, 'fact': 1, 'cluttered': 1, 'rather': 1, 'subplot': 1, 'mostly': 1, 'kidnappers': 1, 'really': 1, 'muddles': 1, 'things.': 1, 'view-': 1, 'if': 1, 'nothing': 1, 'more': 1, 'than': 1, 'entertainin

Already see some things that need to be considered; capitalization treats "The" and "the" differently, words like "the" and "it" dominate counts.

Luckily, scikit-learn's CountVectorizer allows for simple preprocessing like this.

In [85]:
# scikit-learn's countvectorizer
count = CountVectorizer()
neg_vec = count.fit_transform([neg_review])
neg_vec

<1x76 sparse matrix of type '<class 'numpy.int64'>'
	with 76 stored elements in Compressed Sparse Row format>

`CountVectorizer` outputs a sparse matrix by default.  We can convert that to a normal numpy array and stitch it together with the vocabulary from the `fit()` call.

In [86]:
print(
    dict(zip(count.get_feature_names_out(), 
             neg_vec.toarray().flatten())))

{'accents': 1, 'actors': 1, 'alan': 1, 'and': 3, 'as': 2, 'been': 1, 'but': 1, 'by': 2, 'cluttered': 1, 'could': 1, 'detective': 1, 'emma': 1, 'entertain': 1, 'entertaining': 1, 'fact': 1, 'film': 1, 'flick': 1, 'for': 3, 'gets': 1, 'give': 1, 'good': 1, 'hal': 1, 'have': 1, 'holbrook': 2, 'if': 1, 'impression': 1, 'in': 2, 'is': 2, 'it': 5, 'kidnappers': 1, 'mannage': 1, 'matter': 1, 'more': 1, 'mostly': 1, 'movie': 3, 'muddles': 1, 'new': 1, 'no': 1, 'not': 1, 'nothing': 1, 'off': 1, 'one': 1, 'orleans': 1, 'performances': 2, 'plot': 1, 'potential': 1, 'pulled': 1, 'rather': 1, 'really': 1, 'rickman': 3, 'scene': 1, 'scenes': 1, 'seeing': 1, 'seems': 1, 'shows': 1, 'southern': 1, 'subplot': 1, 'than': 1, 'that': 2, 'the': 7, 'their': 1, 'these': 1, 'things': 1, 'this': 1, 'thompson': 2, 'three': 1, 'to': 1, 'uninteresting': 2, 'us': 1, 'view': 1, 'was': 1, 'watching': 1, 'well': 1, 'what': 1, 'with': 2, 'worth': 2}


We can see the defaults have already done some amount of cleaning for us.

### Deterministic Approach

Let's try a deterministic approach, using word counts and a list of "positive" vs "negative" words.

In [87]:
pos_words = ["good", "great", "like", "loved"]
neg_words = ["bad", "awful", "dislike", "hated"]

# we're going to use this train/test split throughout
# we'll also use this seed for consistency
# NOTE: Usually you'll want to do a separate validation set when choosing models/featuresets!
seed = 37
np.random.seed(seed)
pct_train = 0.7
X_train, X_test, y_train, y_test = train_test_split(
    review_df['text'],
    review_df['label'], train_size=pct_train)

cv = CountVectorizer(stop_words='english')
train_vecs = cv.fit_transform(X_train)
feats = cv.get_feature_names_out()
pos_idxs = np.where(np.isin(feats, pos_words))[0]
neg_idxs = np.where(np.isin(feats, neg_words))[0]
train_det_score = train_vecs[:, pos_idxs].sum(1) - train_vecs[:, neg_idxs].sum(1)
# easier for group-level score
train_det_score = pd.Series(np.array(train_det_score).ravel(), 
                            index=X_train.index)

In [88]:
# our threshold - the average score for negative, that or below = negative
neg_thresh = train_det_score.groupby(review_df['label'].loc[X_train.index]).mean()['neg']
test_vecs = cv.transform(X_test)
test_det_score = test_vecs[:, pos_idxs].sum(1) - test_vecs[:, neg_idxs].sum(1)
det_pred = test_det_score>neg_thresh

In [89]:
print(
    classification_report(y_pred=det_pred,
                          y_true=y_test=='pos'))

              precision    recall  f1-score   support

       False       0.61      0.44      0.51      7522
        True       0.56      0.71      0.63      7478

    accuracy                           0.58     15000
   macro avg       0.58      0.58      0.57     15000
weighted avg       0.58      0.58      0.57     15000



### Count Vector + Logistic Regression
Here we try a count vector with Logistic Regression.  This alleviates the need for chosing an arbitrary set of terms and arbitrary threshold as above.

Here I use scikit-learn's [Pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) functionality.  I won't try and explain that here, the docs do a much better job than I can.


In [90]:
count = CountVectorizer(stop_words='english')

count_pipeline = Pipeline(
    steps=[("preprocessor", count),
          ('model', LogisticRegression(max_iter=500, solver='liblinear'))]
)

In [91]:
np.random.seed(seed)
count_pipeline.fit(X_train, y_train)
count_pipeline.score(X_test, y_test)

0.8813333333333333

In [92]:
print(
    classification_report(y_pred=count_pipeline.predict(X_test),
                          y_true=y_test))

              precision    recall  f1-score   support

         neg       0.89      0.88      0.88      7522
         pos       0.88      0.89      0.88      7478

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



This is actually really good! 90% of the time we're predicting the right class with this model.  But can we do...better?

### TF-IDF
One thing we notice with count vectors is that all words are being counted the same.  We might want to use a weighting scheme to ensure that words that are more informative about the content are flagged as more important.  One weighting scheme is Term Frequency - Inverse Document Frequency (TF-IDF).

Take as an example some kind of simplistic movie reviews.  We can already tell which words are most relevant to the specific content of each review (i.e. "good", "bad", "great").

In [93]:
docs = ['The movie was good',
        'The movie was bad',
        'The movie was great']

cv = CountVectorizer()
vecs = cv.fit_transform(docs).toarray()
# we'll use pandas DF for easier display
pd.DataFrame(vecs, columns=cv.get_feature_names_out())

Unnamed: 0,bad,good,great,movie,the,was
0,0,1,0,1,1,1
1,1,0,0,1,1,1
2,0,0,1,1,1,1


You'll notice that `vecs` contains the term frequencies.  If we use sklearn's `TfidfVectorizer`, it will calculate those term counts and then multiply them by the Inverse Document Frequency (IDF).

In [94]:
tfidf = TfidfVectorizer()
# we'll use pandas DF for easier display
tfidf_vecs = tfidf.fit_transform(docs).toarray()
tfidf_df = pd.DataFrame(tfidf_vecs, columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,bad,good,great,movie,the,was
0,0.0,0.69903,0.0,0.412859,0.412859,0.412859
1,0.69903,0.0,0.0,0.412859,0.412859,0.412859
2,0.0,0.0,0.69903,0.412859,0.412859,0.412859


You can see that the discriminative words have higher weight than the non-discriminative words.  

It's worth noting here - in terms of "separability", having 0 v 1 (count of "good" vs count of "bad") might actually be better.  But these are highly curated examples - you can imagine cases where good and bad descriptive terms are mixed in a review, you want to capture the words that describe better the "aboutness" of the review.  (Think: "This movie was not bad, it was good!")

Now let's fit our regression as above with TF-IDF vectors.

In [95]:
# we use binary here to handle longer reviews
tfidf = TfidfVectorizer(stop_words='english')

tfidf_pipeline = Pipeline(
    steps=[("preprocessor", tfidf),
          ('model', LogisticRegression(max_iter=500, solver='liblinear'))]
)

In [96]:
np.random.seed(seed)
tfidf_pipeline.fit(X_train, y_train)
tfidf_pipeline.score(X_test, y_test)

0.8912666666666667

In [97]:
print(
    classification_report(y_pred=tfidf_pipeline.predict(X_test),
                          y_true=y_test))

              precision    recall  f1-score   support

         neg       0.90      0.88      0.89      7522
         pos       0.88      0.90      0.89      7478

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [187]:
# looking at the coefficients on the LR for each model
word_feats = tfidf_pipeline['preprocessor'].get_feature_names_out()
# get the largest by magnitude, stitch together to compare
top = 10
top_tfidf = np.argsort(np.abs(tfidf_pipeline['model'].coef_.flatten()))[-top:]
top_count = np.argsort(np.abs(count_pipeline['model'].coef_.flatten()))[-top:]
# top
coef_df = pd.DataFrame([
    word_feats,
    tfidf_pipeline['model'].coef_.flatten(),
    count_pipeline['model'].coef_.flatten()],
    index=['word', 'tfidf', 'count']).T
# normalize result for compare
coef_df['tfidf'] = coef_df['tfidf'].rank()
coef_df['count'] = coef_df['count'].rank()
coef_df.loc[np.unique(np.concatenate([top_tfidf, top_count]))]

Unnamed: 0,word,tfidf,count
6408,awful,3.0,4.0
6697,bad,2.0,316.0
10324,boring,5.0,14.0
22357,disappointing,16.0,5.0
22360,disappointment,14.0,2.0
27335,excellent,89233.0,89227.0
28106,fails,17.0,9.0
34078,great,89234.0,89112.0
50143,mediocre,30.0,6.0
51096,mildly,96.0,10.0


In [188]:
# examples where there's disagreement
tfidf_pred = tfidf_pipeline.predict_proba(X_test)[:, 1]
count_pred = count_pipeline.predict_proba(X_test)[:, 1]

In [193]:
# most interesting are where there's the largest disagreement
top_disagree_idx = np.argsort(np.abs(tfidf_pred - count_pred))[-10:]

In [194]:
# assemble in df
compare_df = pd.DataFrame([tfidf_pred, count_pred, y_test, X_test],
            index=['tfidf_pred', 'count_pred', 'label', 'text']).T
# would like some shorter mv reviews here
compare_df['text'] = compare_df['text'].apply(lambda x: x[:2000])

In [195]:
compare_df['tfidf_right'] = ((compare_df['tfidf_pred']>=0.5)&(compare_df['label']=='pos'))|\
    ((compare_df['tfidf_pred']<0.5)&(compare_df['label']=='neg'))

In [196]:
compare_df[compare_df.tfidf_right].reindex(top_disagree_idx).values

array([[0.6621354380525597, 0.00953398674695978, 'pos',
        True],
       [nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan],
       [0.30912394320895137, 0.983967046134271, 'neg',
        "Some guy gets whacked. Right out in plain sight this other guy shoots him. He's got some bodyguards and they whack the killer, but a reporter gets interested. She goes to the hospital where they took the guy who got whacked. She walks in, and corners one bodyguard, but he doesn't feel like talking. I can't figure out why. It's not like anyone else is interested. She's the only reporter there. Anyway, her editor discourages her from working on this lame story. But hey, she does anyway. She goes to see the killer's sister & mom. A few minutes after she leaves they get whacked big time-- somebody blows up their trailer-- huge ball of fire. Then she searches out the bodyguard from the hospital. She finds him hungover on his boat, but a minute later they're both underwater sucking on a scuba

### Word embeddings
Our next approach is to include context in the word-level representations.  We'll be bringing SpaCy into the mix here, particularly their "medium" English web model, which uses GloVe embeddings.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import spacy

In [None]:
# only need to run this once
#!python -m spacy download en_core_web_md

In [None]:
nlp = spacy.load("en_core_web_md")

In [None]:

class GloveVectorizer(BaseEstimator, TransformerMixin):
    # this is a custom document transformer for use in the scikit-learn pipeline
    def __init__(self, vectorizer):
        self.vectorizer = vectorizer
        return
    
    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        vocab = self.vectorizer.vocabulary_
        self.vocab_glove = np.zeros(shape=(len(vocab), 300))
        for token, idx in vocab.items():
            self.vocab_glove[idx] = nlp(token).vector
        return self
    
    def transform(self, X, y=None):
        X_transformed = self.vectorizer.transform(X).toarray()
        sum_words = (X_transformed.sum(1)).reshape(-1, 1)
        glove_vecs = (X_transformed.dot(self.vocab_glove))/sum_words
        return glove_vecs

In [None]:
# we use binary here to handle longer reviews
count = CountVectorizer(stop_words='english', min_df=0.01, binary=False)
glove = GloveVectorizer(count)

glove_pipeline = Pipeline(
    steps=[("preprocessor", glove),
          ('model', LogisticRegression(max_iter=500, solver='liblinear'))]
)


In [None]:
np.random.seed(seed)
glove_pipeline.fit(X_train, y_train)
glove_pipeline.score(X_test, y_test)

In [88]:
print(
    classification_report(y_pred=glove_pipeline.predict(X_test),
                          y_true=y_test))

              precision    recall  f1-score   support

         neg       0.85      0.84      0.84      7522
         pos       0.84      0.85      0.84      7478

    accuracy                           0.84     15000
   macro avg       0.84      0.84      0.84     15000
weighted avg       0.84      0.84      0.84     15000

