In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import spacy
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## Bagging to BERT: A tour of applied NLP
### Part 2: Beyond bagging
### Table of Contents
* [LSTM](#lstm)
* [BERT](#bert)



### Data processing <a class="anchor" id="data"></a>

Copied from part 1

You'll either need to download the [imdb review data](https://ai.stanford.edu/~amaas/data/sentiment/) and save it to this directory OR download the [processed data](https://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharinghttps://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharing).

In [3]:
# # processing the original data into DataFrame
# # here for reference, don't need to run this if you're using reviews.pkl.gz
# source_path = Path('./aclImdb/')
# #neg_files = source_path.glob('./*/neg/*.txt')
# #pos_files = source_path.glob('./*/pos/*.txt')
# all_files = []
# for f in source_path.glob('./*/*/*.txt'):
#     filename = f.as_posix()
#     if 'unsup' not in filename:
#         # split up into useful components
#         _, split, sent, idx = filename.split('/')
#         idx = int(idx.split('_')[0])
#         all_files.append([idx, split, sent, f.read_text()])
# review_df = pd.DataFrame(all_files)
# review_df.columns = ['idx', 'split', 'label', 'text']
# # some minor html cruft is in here
# review_df['text'] = review_df['text'].str.replace('<br /><br />', '')
# review_df = review_df.to_pickle('reviews.pkl.gz')

In [4]:
# can skip here if you already have reviews.pkl.gz
review_df = pd.read_pickle('reviews.pkl.gz')

In [None]:
# copied from part 1: using the same train/test split
seed = 37
np.random.seed(seed)
pct_train = 0.7
X_train, X_test, y_train, y_test = train_test_split(
    review_df['text'],
    review_df['label'], train_size=pct_train)

### LSTM <a class="anchor" id="lstm"></a>


In [293]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable 
# this will set the device on which to train
device = torch.device("cpu")


In [294]:
class SentLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
        super(SentLSTM, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to sentiment space
        self.fc = nn.Linear(hidden_dim, output_size)

        # sigmoid activiation
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        batch_size = x.shape[0]
        embeds = self.word_embeddings(x)
        lstm_out, _ = self.lstm(embeds)
        pred_space = self.fc(lstm_out)
        out = self.sigmoid(pred_space)
        # reshape - want to get the last prediction
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out
    
def simple_tokenizer(doc, model=en):
    # a simple tokenizer for individual documents 
    tokenized_docs = []
    parsed = model(doc)
    return([t.lower_ for t in parsed if (t.is_alpha)&(not t.is_stop)])

def doc_to_index(docs, vocab, tokenizer=simple_tokenizer):
    # transform docs into series of indices
    docs_idxs = []
    for d in docs:
        w_idxs = []
        d_tokenized = simple_tokenizer(d)
        for w in d_tokenized:
            if w in vocab:
                w_idxs.append(vocab[w])
            else:
                # unknown token = 1
                w_idxs.append(1)
        docs_idxs.append(w_idxs)
    return(docs_idxs)

def pad_sequence(seqs, seq_len=200):
    # function for adding padding to ensure all seq same length
    features = np.zeros((len(seqs), seq_len),dtype=int)
    for i, seq in enumerate(seqs):
        if len(seq) != 0:
            features[i, -len(seq):] = np.array(seq)[:seq_len]
    return features

In [209]:
# need to adapt vocab, leave space for padding
tfidf = TfidfVectorizer(tokenizer=simple_tokenizer,
                       token_pattern=None,
                       min_df=0.01)
tfidf.fit(X_train)
vocab = tfidf.vocabulary_
vocab = dict([(v, vocab[v]+2) for v in vocab])
vocab['_UNK'] = 1
vocab['_PAD'] = 0
parsed_train = doc_to_index(X_train, vocab)
padded_train = pad_sequence(parsed_train)
parsed_test = doc_to_index(X_test, vocab)
padded_test = pad_sequence(parsed_test)


In [210]:
# construct datasets for loading by PyTorch
train_data = TensorDataset(torch.from_numpy(padded_train), 
                           torch.from_numpy(y_train.values))
test_data = TensorDataset(torch.from_numpy(padded_test), 
                          torch.from_numpy(y_test.values))

batch_size = 100

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) # this is to keep the size consistent
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,
                        drop_last=True)

In [226]:
model_params = {'output_size': 1,
               'hidden_dim': 512,
               'embedding_dim': 400,
               'vocab_size': vocab_size}
model = LSTMSimple(**model_params)
model.to(device)

LSTMSimple(
  (word_embeddings): Embedding(1534, 400)
  (lstm): LSTM(400, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [229]:
lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# increasing this will make the training take a while on CPU
# decrease to 5 if it's taking too long
epochs = 1
counter = 0
print_every = 5
clip = 5
valid_loss_min = np.Inf

model.train()
for i in range(epochs):
    for inputs, labels in train_loader:
        counter += 1
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output = model(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_losses = []
            model.eval()
            for inp, lab in test_loader:
                inp, lab = inp.to(device), lab.to(device)
                out = model(inp)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 1/1... Step: 5... Loss: 0.686843... Val Loss: 0.688408
Validation loss decreased (inf --> 0.688408).  Saving model ...
Epoch: 1/1... Step: 10... Loss: 0.710092... Val Loss: 0.662540
Validation loss decreased (0.688408 --> 0.662540).  Saving model ...
Epoch: 1/1... Step: 15... Loss: 0.666220... Val Loss: 0.669263
Epoch: 1/1... Step: 20... Loss: 0.659887... Val Loss: 0.609165
Validation loss decreased (0.662540 --> 0.609165).  Saving model ...
Epoch: 1/1... Step: 25... Loss: 0.653383... Val Loss: 0.589641
Validation loss decreased (0.609165 --> 0.589641).  Saving model ...
Epoch: 1/1... Step: 30... Loss: 0.649555... Val Loss: 0.624702
Epoch: 1/1... Step: 35... Loss: 0.636202... Val Loss: 0.640866


In [230]:
# pytorch LSTM model
model.load_state_dict(torch.load('./state_dict.pt'))
num_correct = 0
model.eval()
pred_collect = np.array([])
eval_loader = DataLoader(test_data, batch_size=batch_size)
with torch.no_grad():
    for inputs, labels in eval_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output = model(inputs)
        # takes output, rounds to 0/1
        pred = torch.round(output.squeeze())
        pred_collect = np.concatenate([
          pred_collect,
          pred.cpu().numpy()
        ])

In [231]:
print(f'accuracy: {np.where(pred_collect == y_test)[0].shape[0]/y_test.shape[0]}')
print(
    classification_report(y_pred=pred_collect,
                          y_true=y_test))

accuracy: 0.6816
              precision    recall  f1-score   support

       False       0.75      0.58      0.66       651
        True       0.63      0.79      0.70       599

    accuracy                           0.68      1250
   macro avg       0.69      0.69      0.68      1250
weighted avg       0.70      0.68      0.68      1250



### BERT <a class="anchor" id="bert"></a>
From [HF tutorials](https://huggingface.co/blog/sentiment-analysis-python).  The sentiment analysis pipeline packages together the tokenizer and the BERT model with a classification layer.  The default pipeline uses this [distilBERT model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english). 

In [298]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [299]:
# some manipulations for speed and to play nice with BERT
bert_pred = sentiment_pipeline(X_test.apply(lambda x: x[:512]).head(n=50).tolist())
bert_pred = [p['label']=='POSITIVE' for p in bert_pred]

In [300]:
np.where(bert_pred == y_test[:50])
print(f'accuracy: {np.where(bert_pred == y_test[:50])[0].shape[0]/50}')
print(
    classification_report(y_pred=bert_pred,
                          y_true=y_test[:50]))

accuracy: 0.86
              precision    recall  f1-score   support

       False       0.96      0.81      0.88        32
        True       0.74      0.94      0.83        18

    accuracy                           0.86        50
   macro avg       0.85      0.88      0.86        50
weighted avg       0.88      0.86      0.86        50



There it is - you've leveraged a cutting edge model to do sentiment analysis! This performance is pretty good, but our count vectors actually did a few points better.  Maybe there's an opportunity to fine-tune the BERT model specifically to the IMDB review dataset.  Let's try it.

NOTE: This takes some time to run, even with the Collab GPU.  You might want to experiment on subsets of the dataset