In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import spacy
from collections import Counter

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## Bagging to BERT: Sentiment analysis three ways
This notebook accompanies the ODSC blog post introducing the Bagging to BERT workshop.  This will be expanded for the full workshop.

Up first is some preprocessing.  You'll either need to download the [imdb review data](https://ai.stanford.edu/~amaas/data/sentiment/) and save it to this directory OR download the [processed data](https://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharing).

In [None]:
# # processing the original data into DataFrame
# # here for reference, don't need to run this if you're using reviews.pkl.gz
# source_path = Path('./aclImdb/')
# #neg_files = source_path.glob('./*/neg/*.txt')
# #pos_files = source_path.glob('./*/pos/*.txt')
# all_files = []
# for f in source_path.glob('./*/*/*.txt'):
#     filename = f.as_posix()
#     if 'unsup' not in filename:
#         # split up into useful components
#         _, split, sent, idx = filename.split('/')
#         idx = int(idx.split('_')[0])
#         all_files.append([idx, split, sent, f.read_text()])
# review_df = pd.DataFrame(all_files)
# review_df.columns = ['idx', 'split', 'label', 'text']
# # some minor html cruft is in here
# review_df['text'] = review_df['text'].str.replace('<br /><br />', '')
# review_df.to_pickle('reviews.pkl.gz')  # to_pickle returns None, so don't assign its result back to review_df

In [None]:
# can skip to here if you already have reviews.pkl.gz
# NOTE: unpickling can execute arbitrary code - only load a reviews.pkl.gz
# that you built yourself or obtained from a source you trust
review_df = pd.read_pickle('reviews.pkl.gz')

### Word counts
A very basic way to use a sanitized list of tokens is to do a word count. This unlocks a lot of insights right off and is an important step in exploratory data analysis in text.

In [None]:
# pick the first negative and the first positive review as running examples
# NOTE: the original comment says these are Star Wars Episode VI reviews;
# which rows come first depends on the row order of reviews.pkl.gz
neg_review = review_df.loc[review_df['label'] == 'neg', 'text'].iloc[0]
pos_review = review_df.loc[review_df['label'] == 'pos', 'text'].iloc[0]
print('Negative\n', neg_review, '\n')
print('Positive\n', pos_review)

In [None]:
# plain-Python word count: split on whitespace, then tally with a Counter
neg_word_counts = Counter(neg_review.split())
print(neg_word_counts)

Already see some things that need to be considered; capitalization treats "The" and "the" differently, words like "the" and "it" dominate counts.

Luckily, scikit-learn's CountVectorizer allows for simple preprocessing like this.

In [None]:
# the same word count, this time via scikit-learn's CountVectorizer
# (it expects an iterable of documents, hence the one-element list)
count = CountVectorizer()
neg_docs = [neg_review]
neg_vec = count.fit_transform(neg_docs)
neg_vec

`CountVectorizer` outputs a sparse matrix by default.  We can convert that to a normal numpy array and stitch it together with the vocabulary from the `fit()` call.

In [None]:
# pair every vocabulary term with its count in the single example document
vocab_terms = count.get_feature_names_out()
term_counts = neg_vec.toarray().ravel()
print(dict(zip(vocab_terms, term_counts)))

We can see the defaults have already done some amount of cleaning for us.

### Deterministic Approach

Let's try a deterministic approach, using word counts and a list of "positive" vs "negative" words.

In [None]:
pos_words = ["good", "great", "like", "loved"]
neg_words = ["bad", "awful", "dislike", "hated"]

# we're going to use this train/test split throughout
# we'll also use this seed for consistency
# NOTE: Usually you'll want to do a separate validation set when choosing models/featuresets!
seed = 37
np.random.seed(seed)
pct_train = 0.7
# pass the seed explicitly: the split then no longer depends on how much of
# the global NumPy random stream was consumed before this call
X_train, X_test, y_train, y_test = train_test_split(
    review_df['text'],
    review_df['label'],
    train_size=pct_train,
    random_state=seed)

cv = CountVectorizer(stop_words='english')
train_vecs = cv.fit_transform(X_train)
feats = cv.get_feature_names_out()
# column positions of the hand-picked sentiment words in the vocabulary
pos_idxs = np.where(np.isin(feats, pos_words))[0]
neg_idxs = np.where(np.isin(feats, neg_words))[0]
# per-review score = (# positive-word hits) - (# negative-word hits)
train_det_score = train_vecs[:, pos_idxs].sum(1) - train_vecs[:, neg_idxs].sum(1)
# flatten to 1-D and index like X_train - easier for group-level score
train_det_score = pd.Series(np.asarray(train_det_score).ravel(),
                            index=X_train.index)

In [None]:
# our threshold - the mean training score of the negative reviews;
# a test review scoring at or below it is predicted negative
neg_thresh = train_det_score.groupby(y_train).mean()['neg']
test_vecs = cv.transform(X_test)
# sparse .sum(1) yields a 2-D (n, 1) result; flatten it to a plain 1-D array
# so the prediction has the shape scikit-learn's metrics expect
test_det_score = np.asarray(
    test_vecs[:, pos_idxs].sum(1) - test_vecs[:, neg_idxs].sum(1)).ravel()
det_pred = test_det_score > neg_thresh

In [None]:
# score the rule-based prediction; True stands for the 'pos' class
det_report = classification_report(y_true=(y_test == 'pos'), y_pred=det_pred)
print(det_report)

### Count Vector + Logistic Regression
Here we try a count vector with Logistic Regression.  This alleviates the need for choosing an arbitrary set of terms and an arbitrary threshold as above.

Here I use scikit-learn's [Pipelines](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) functionality.  I won't try and explain that here, the docs do a much better job than I can.


In [None]:
# bag-of-words counts (English stop words removed) feeding a logistic regression
count = CountVectorizer(stop_words='english')

count_pipeline = Pipeline(steps=[
    ('preprocessor', count),
    ('model', LogisticRegression(max_iter=500, solver='liblinear')),
])

In [None]:
# reseed, fit on the training split, then show the score on the test split
np.random.seed(seed)
count_pipeline.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# per-class precision / recall / F1 for the count-vector model
count_test_pred = count_pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=count_test_pred))

This is actually really good! 90% of the time we're predicting the right class with this model.  But can we do...better?

### TF-IDF
One thing we notice with count vectors is that all words are being counted the same.  We might want to use a weighting scheme to ensure that words that are more informative about the content are flagged as more important.  One weighting scheme is Term Frequency - Inverse Document Frequency (TF-IDF).

Take as an example some kind of simplistic movie reviews.  We can already tell which words are most relevant to the specific content of each review (i.e. "good", "bad", "great").

In [None]:
# three toy reviews that differ only in their final word
docs = [
    'The movie was good',
    'The movie was bad',
    'The movie was great',
]

cv = CountVectorizer()
# densify the sparse counts so they can be shown as a table
vecs = cv.fit_transform(docs).toarray()
# one row per document, one column per vocabulary term
pd.DataFrame(data=vecs, columns=cv.get_feature_names_out())

You'll notice that `vecs` contains the term frequencies.  If we use sklearn's `TfidfVectorizer`, it will calculate those term counts, multiply them by the Inverse Document Frequency (IDF) and, by default, L2-normalize each document's row.

In [None]:
# same toy corpus, now weighted with TF-IDF
tfidf = TfidfVectorizer()
tfidf_vecs = tfidf.fit_transform(docs).toarray()
# pandas DataFrame for easier display: documents as rows, terms as columns
tfidf_df = pd.DataFrame(data=tfidf_vecs,
                        columns=tfidf.get_feature_names_out())
tfidf_df

You can see that the discriminative words have higher weight than the non-discriminative words.  

It's worth noting here - in terms of "separability", having 0 v 1 (count of "good" vs count of "bad") might actually be better.  But these are highly curated examples - you can imagine cases where good and bad descriptive terms are mixed in a review, you want to capture the words that describe better the "aboutness" of the review.  (Think: "This movie was not bad, it was good!")

Now let's fit our regression as above with TF-IDF vectors.

In [None]:
# TF-IDF features (English stop words removed) feeding a logistic regression
# NOTE: `binary` is not set here, so the vectorizer keeps its default term counts
tfidf = TfidfVectorizer(stop_words='english')

tfidf_pipeline = Pipeline(steps=[
    ('preprocessor', tfidf),
    ('model', LogisticRegression(max_iter=500, solver='liblinear')),
])

In [None]:
# reseed, fit on the training split, then show the score on the test split
np.random.seed(seed)
tfidf_pipeline.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# per-class precision / recall / F1 for the TF-IDF model
tfidf_test_pred = tfidf_pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=tfidf_test_pred))

In [None]:
# looking at the coefficients on the LR for each model
word_feats = tfidf_pipeline['preprocessor'].get_feature_names_out()
count_feats = count_pipeline['preprocessor'].get_feature_names_out()
# the side-by-side table is only meaningful if coefficient i refers to the
# same word in both models, so fail loudly if the vocabularies ever diverge
assert np.array_equal(word_feats, count_feats), \
    "tf-idf and count pipelines learned different vocabularies"

tfidf_coefs = tfidf_pipeline['model'].coef_.ravel()
count_coefs = count_pipeline['model'].coef_.ravel()

# get the largest by magnitude, stitch together to compare
top = 10
top_tfidf = np.argsort(np.abs(tfidf_coefs))[-top:]
top_count = np.argsort(np.abs(count_coefs))[-top:]

# build column-wise so the coefficient columns keep a numeric dtype
# (stacking rows and transposing would make every column `object`)
coef_df = pd.DataFrame({
    'word': word_feats,
    'tfidf': tfidf_coefs,
    'count': count_coefs,
})
# normalize result for compare: replace raw coefficients by their rank
coef_df['tfidf'] = coef_df['tfidf'].rank()
coef_df['count'] = coef_df['count'].rank()
# show every word that is in the top-`top` of either model
coef_df.loc[np.unique(np.concatenate([top_tfidf, top_count]))]

In [None]:
# examples where there's disagreement: start from each model's predicted
# probability in column 1 of predict_proba (presumably the 'pos' class,
# the second entry of the model's classes_ - confirm)
tfidf_pred, count_pred = (
    pipe.predict_proba(X_test)[:, 1]
    for pipe in (tfidf_pipeline, count_pipeline)
)

In [None]:
# the most interesting reviews are those where the two models differ most:
# keep the positions of the 10 largest absolute probability gaps
disagreement = np.abs(tfidf_pred - count_pred)
top_disagree_idx = np.argsort(disagreement)[-10:]

In [None]:
# assemble in df: stack the four sequences as rows, then transpose so that
# each review becomes one row
compare_rows = [tfidf_pred, count_pred, y_test, X_test]
compare_cols = ['tfidf_pred', 'count_pred', 'label', 'text']
compare_df = pd.DataFrame(compare_rows, index=compare_cols).T
# would like some shorter mv reviews here - cap the text at 2000 characters
compare_df['text'] = compare_df['text'].str[:2000]

In [None]:
# flag the reviews the TF-IDF model classifies correctly at a 0.5 cut-off
said_pos_and_is_pos = (compare_df['tfidf_pred'] >= 0.5) & (compare_df['label'] == 'pos')
said_neg_and_is_neg = (compare_df['tfidf_pred'] < 0.5) & (compare_df['label'] == 'neg')
compare_df['tfidf_right'] = said_pos_and_is_pos | said_neg_and_is_neg

In [None]:
# take the top-disagreement rows first and only then keep the ones TF-IDF got
# right; filtering first and reindexing afterwards would insert an all-NaN
# row for every top-disagreement review that TF-IDF got wrong
top_disagree = compare_df.loc[top_disagree_idx]
top_disagree[top_disagree['tfidf_right'].astype(bool)].values

### Word embeddings
Our next approach is to include context in the word-level representations.  We'll be bringing SpaCy into the mix here, particularly their "medium" English web model, which uses GloVe embeddings.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import spacy

In [None]:
# only need to run this once
#!python -m spacy download en_core_web_md

In [None]:
# load spaCy's medium English pipeline; GloveVectorizer below reads word
# vectors from this module-level `nlp` object
nlp = spacy.load("en_core_web_md")

In [None]:

class GloveVectorizer(BaseEstimator, TransformerMixin):
    """Document transformer for scikit-learn pipelines based on word vectors.

    Wraps a bag-of-words vectorizer. ``fit`` learns the wrapped vectorizer's
    vocabulary and looks up one vector per vocabulary term using the
    module-level ``nlp`` spaCy pipeline; ``transform`` returns, for every
    document, the count-weighted average of the vectors of its in-vocabulary
    terms.

    Parameters
    ----------
    vectorizer : object
        Vectorizer exposing ``fit``, ``transform`` and, once fitted, a
        ``vocabulary_`` mapping of term -> column index
        (e.g. ``CountVectorizer``). It is fitted in place by ``fit``.
    """

    def __init__(self, vectorizer):
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on ``X`` and build the term-vector table.

        Stores ``vocab_glove``, an array with one row per vocabulary term
        (row ``i`` belongs to the term in column ``i`` of the count matrix).
        ``y`` is ignored. Returns ``self``.
        """
        self.vectorizer.fit(X)
        vocab = self.vectorizer.vocabulary_
        # order the terms by column index so the table rows line up with the
        # columns produced by the wrapped vectorizer
        ordered_terms = sorted(vocab, key=vocab.get)
        # the vector width is taken from the vectors themselves rather than
        # being hard-coded, so models with other dimensions work as well
        self.vocab_glove = np.array(
            [nlp(term).vector for term in ordered_terms], dtype=float)
        return self

    def transform(self, X, y=None):
        """Return one averaged word vector per document in ``X``.

        Documents without any in-vocabulary term get an all-zero vector
        instead of the NaNs a 0/0 division would produce. ``y`` is ignored.
        """
        counts = self.vectorizer.transform(X)
        # multiply the count matrix as-is; no need to densify it first
        weighted_sum = np.asarray(counts @ self.vocab_glove)
        n_words = np.asarray(counts.sum(axis=1), dtype=float).reshape(-1, 1)
        # avoid 0/0: the weighted sum of such rows is already all zeros
        n_words[n_words == 0] = 1.0
        return weighted_sum / n_words

In [None]:
# counts restricted to terms that appear in at least 1% of the training
# documents; binary=False keeps the raw per-document term counts
count = CountVectorizer(stop_words='english', min_df=0.01, binary=False)
glove = GloveVectorizer(count)

glove_pipeline = Pipeline(steps=[
    ('preprocessor', glove),
    ('model', LogisticRegression(max_iter=500, solver='liblinear')),
])


In [None]:
# reseed, fit on the training split, then show the score on the test split
np.random.seed(seed)
glove_pipeline.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# per-class precision / recall / F1 for the word-vector model
glove_test_pred = glove_pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=glove_test_pred))