In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable 
# Select the device on which to train.
# NOTE: GPU is recommended for this part (e.g. a Colab GPU runtime), but the
# notebook falls back to the CPU when no CUDA device is available instead of
# failing later when tensors are moved to a missing "cuda:0".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


## Bagging to BERT: A tour of applied NLP
### Part 2: Beyond bagging
### Table of Contents
* [LSTM](#lstm)
* [BERT](#bert)



### Data processing <a class="anchor" id="data"></a>

Copied from part 1

You'll either need to download the [IMDb review data](https://ai.stanford.edu/~amaas/data/sentiment/) and save it to this directory OR download the [processed data](https://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharing).

In [None]:
# # processing the original data into DataFrame
# # here for reference, don't need to run this if you're using reviews.pkl.gz
# source_path = Path('./aclImdb/')
# #neg_files = source_path.glob('./*/neg/*.txt')
# #pos_files = source_path.glob('./*/pos/*.txt')
# all_files = []
# for f in source_path.glob('./*/*/*.txt'):
#     filename = f.as_posix()
#     if 'unsup' not in filename:
#         # split up into useful components
#         _, split, sent, idx = filename.split('/')
#         idx = int(idx.split('_')[0])
#         all_files.append([idx, split, sent, f.read_text()])
# review_df = pd.DataFrame(all_files)
# review_df.columns = ['idx', 'split', 'label', 'text']
# # some minor html cruft is in here
# review_df['text'] = review_df['text'].str.replace('<br /><br />', '')
# review_df.to_pickle('reviews.pkl.gz')  # to_pickle returns None, so don't reassign review_df

In [None]:
# Load the pre-processed reviews (reviews.pkl.gz must be in the working
# directory) and turn the 'pos'/'neg' string label into a boolean target.
# NOTE: only unpickle files you trust - pickle can execute arbitrary code.
review_df = pd.read_pickle('reviews.pkl.gz').assign(
    label=lambda frame: frame['label'] == 'pos')

In [None]:
# copied from part 1: using the same train/test split
seed = 37
np.random.seed(seed)
pct_train = 0.7
# Pass the seed explicitly so the split no longer depends on nothing else
# having consumed NumPy's global RNG between seeding and splitting. An integer
# random_state draws from the same stream as the freshly seeded global RNG, so
# the resulting partition should match the one produced before.
X_train, X_test, y_train, y_test = train_test_split(
    review_df['text'],
    review_df['label'],
    train_size=pct_train,
    random_state=seed)

### LSTM <a class="anchor" id="lstm"></a>


In [None]:
class SentLSTM(nn.Module):
    """Single-layer LSTM sentiment classifier.

    Adapted from
    https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

    Token indices are embedded, run through a batch-first LSTM, and the hidden
    state of the final time step is mapped through a linear layer and a
    sigmoid.

    Args:
        embedding_dim: size of each word-embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        output_size: number of outputs of the final linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # learned during training
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to sentiment space
        self.fc = nn.Linear(hidden_dim, output_size)

        # sigmoid activation
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid score per sequence.

        Args:
            x: integer tensor of token indices, indexed as (batch, sequence).

        Returns:
            Tensor of shape (batch,) holding the (last) sigmoid output computed
            from the final time step of each sequence.
        """
        batch_size = x.shape[0]
        embeds = self.word_embeddings(x)
        lstm_out, _ = self.lstm(embeds)
        # Only the prediction at the end of the sequence is used, so project
        # just the final time step instead of running the linear layer and the
        # sigmoid over every step and discarding all but the last.
        last_step = lstm_out[:, -1, :]
        out = self.sigmoid(self.fc(last_step))
        # flatten to (batch, output_size) and keep the last output, matching
        # the shape callers already rely on
        return out.view(batch_size, -1)[:, -1]

def doc_to_index(docs, vocab, tokenizer, unknown=1):
    """Convert documents into lists of vocabulary indices.

    Args:
        docs: iterable of raw document strings.
        vocab: mapping from token to integer index.
        tokenizer: callable that splits one document into tokens.
        unknown: index used for tokens that are not in `vocab`.

    Returns:
        A list with one list of indices per document, in input order.
    """
    return [
        [vocab[token] if token in vocab else unknown
         for token in tokenizer(doc)]
        for doc in docs
    ]

def pad_sequence(seqs, seq_len=200):
    """Left-pad (with zeros) or truncate index sequences to a fixed length.

    Args:
        seqs: sequence of index lists, one per document.
        seq_len: number of columns in the returned array.

    Returns:
        Integer array of shape (len(seqs), seq_len). Short sequences are
        right-aligned with zeros in front; long sequences keep only their
        first `seq_len` entries; empty sequences stay all zeros.
    """
    padded = np.zeros((len(seqs), seq_len), dtype=int)
    for row, tokens in enumerate(seqs):
        if not len(tokens):
            continue
        kept = np.array(tokens)[:seq_len]
        # right-align whatever survived truncation
        padded[row, seq_len - len(kept):] = kept
    return padded

In [None]:
# Build the LSTM vocabulary from a TF-IDF fit on the training text, leaving
# room at the bottom of the index range for the special tokens.
tfidf = TfidfVectorizer(stop_words='english', min_df=0.01)
tfidf.fit(X_train)
vocab = tfidf.vocabulary_

# The fitted vocabulary is built from preprocessed (lower-cased) text, while
# build_tokenizer() only splits the raw string. Apply the vectorizer's own
# preprocessing first, otherwise every capitalised occurrence of an
# in-vocabulary word would be mapped to _UNK.
_tfidf_preprocess = tfidf.build_preprocessor()
_tfidf_tokenize = tfidf.build_tokenizer()


def tokenizer(doc):
    """Preprocess and tokenize one document the way `tfidf` was fitted."""
    return _tfidf_tokenize(_tfidf_preprocess(doc))


# shift every index by 2 to make space for the "special" tokens
vocab = {word: idx + 2 for word, idx in vocab.items()}
vocab['_UNK'] = 1
vocab['_PAD'] = 0
# from documents to vocab index
parsed_train = doc_to_index(X_train, vocab, tokenizer)
padded_train = pad_sequence(parsed_train)
parsed_test = doc_to_index(X_test, vocab, tokenizer)
padded_test = pad_sequence(parsed_test)


In [None]:
# Wrap the padded index arrays and the labels as PyTorch datasets.
train_tensors = (torch.from_numpy(padded_train),
                 torch.from_numpy(y_train.values))
test_tensors = (torch.from_numpy(padded_test),
                torch.from_numpy(y_test.values))
train_data = TensorDataset(*train_tensors)
test_data = TensorDataset(*test_tensors)

batch_size = 100

# Both loaders shuffle and drop the final incomplete batch so that every
# batch has the same size.
loader_options = {'shuffle': True, 'batch_size': batch_size, 'drop_last': True}
train_loader = DataLoader(train_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

In [None]:
# Binary problem, so the network has a single output; the embedding table
# needs one row per vocabulary entry (special tokens included).
model_params = dict(
    output_size=1,
    hidden_dim=512,
    embedding_dim=400,
    vocab_size=len(vocab),
)
model = SentLSTM(**model_params)
# move the parameters to the training device (and display the architecture)
model.to(device)

In [None]:
# learning rate
lr = 0.005
# binary cross-entropy loss (the model already applies the sigmoid)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# increasing this will make the training take a while on CPU
epochs = 1
counter = 0
print_every = 5
# gradient clipping: https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html
clip = 5
# np.Inf was removed in NumPy 2.0; np.inf works on every NumPy version
valid_loss_min = np.inf

model.train()
for i in range(epochs):
    for inputs, labels in train_loader:
        counter += 1
        inputs, labels = inputs.to(device), labels.to(device)
        # sets gradients to zero for each batch
        model.zero_grad()
        output = model(inputs)
        # pred vs actual
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # periodically report progress and checkpoint the best model so far
        if counter % print_every == 0:
            val_losses = []
            model.eval()
            # no gradients are needed for validation; skipping the autograd
            # graph saves memory and time on every pass over test_loader
            with torch.no_grad():
                for inp, lab in test_loader:
                    inp, lab = inp.to(device), lab.to(device)
                    out = model(inp)
                    val_loss = criterion(out.squeeze(), lab.float())
                    val_losses.append(val_loss.item())
            model.train()

            mean_val_loss = np.mean(val_losses)
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(mean_val_loss))
            # NOTE(review): the checkpoint is selected on test_loader, i.e. the
            # same data later used for the final evaluation.
            if mean_val_loss <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, mean_val_loss))
                valid_loss_min = mean_val_loss

In [None]:
# Reload the best checkpoint written during training and score the test set
# in its original order (no shuffling) so predictions line up with y_test.
# map_location keeps the load working if the checkpoint was written on a
# different device than the one in use now.
model.load_state_dict(torch.load('./state_dict.pt', map_location=device))
model.eval()
eval_loader = DataLoader(test_data, batch_size=batch_size)
pred_batches = []
with torch.no_grad():
    for inputs, labels in eval_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output = model(inputs)
        # Round the sigmoid output to 0/1. reshape(-1) keeps the result 1-D
        # even for a final batch with a single example, where squeeze() would
        # give a 0-d tensor that cannot be concatenated.
        pred = torch.round(output.reshape(-1))
        pred_batches.append(pred.cpu().numpy())
# concatenate once instead of re-copying the growing array on every batch
pred_collect = np.concatenate(pred_batches) if pred_batches else np.array([])

In [None]:
# Fraction of test reviews whose rounded LSTM prediction matches the label,
# followed by the per-class precision/recall breakdown.
lstm_correct = int(np.count_nonzero(pred_collect == y_test))
print(f'accuracy: {lstm_correct / y_test.shape[0]}')
lstm_report = classification_report(y_pred=pred_collect, y_true=y_test)
print(lstm_report)

### BERT <a class="anchor" id="bert"></a>
From [HF tutorials](https://huggingface.co/blog/sentiment-analysis-python).  The sentiment analysis pipeline packages together the tokenizer and the BERT model with a classification layer.  The default pipeline uses this [distilBERT model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english). 

In [None]:
# this will need to be run if you don't already have this package
#!pip install transformers

In [None]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the library default for
# "sentiment-analysis": this is the distilBERT SST-2 model referenced in the
# text, and pinning it keeps the results stable if the default ever changes.
# Long reviews are truncated to the model's maximum length and batches padded.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
    padding=True)

In [None]:
# Score only the first 50 test reviews to keep this fast, and hand the
# pipeline a plain list of strings.
bert_inputs = X_test.head(n=50).tolist()
# map the pipeline's string label onto the boolean target used elsewhere
bert_pred = [result['label'] == 'POSITIVE'
             for result in sentiment_pipeline(bert_inputs)]

In [None]:
# Compare against exactly as many labels as there are predictions, so this
# cell stays correct if the number of scored reviews is changed above.
n_scored = len(bert_pred)
y_scored = y_test.iloc[:n_scored]
print(f'accuracy: {np.where(bert_pred == y_scored)[0].shape[0]/n_scored}')
print(
    classification_report(y_pred=bert_pred,
                          y_true=y_scored))

There it is - you've leveraged a cutting edge model to do sentiment analysis! This performance is pretty good, but our count vectors actually did a few points better.  Maybe there's an opportunity to fine-tune the BERT model specifically to the IMDB review dataset.  Let's try it.

**IN PROGRESS**

There are still some unresolved issues with this section - performance is conspicuously low.  Use with caution.

NOTE: This takes some time to run, even with the Colab GPU.  You might want to experiment on subsets of the dataset.

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
# Load the pretrained distilBERT tokenizer and a sequence-classification model
# to fine-tune on the IMDB reviews.
# NOTE: this rebinds `tokenizer` and `model`, which the LSTM section above uses
# for the TF-IDF tokenizer and the SentLSTM network - re-running LSTM cells
# after this one will pick up these objects instead.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# WIP - "num_labels" seems like it should be 1, have seen 2 (pos/neg)
# NOTE(review): Hugging Face sequence-classification heads appear to treat
# num_labels=1 as a regression problem (MSE loss) - confirm. Switching to
# num_labels=2 would also require integer labels in IMDbDataset and reading
# the predicted label (not a score threshold) at prediction time.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",  num_labels=1)


In [None]:
def preprocess_function(example):
    """Tokenize `example` with the notebook-level `tokenizer`, with truncation enabled."""
    encoded = tokenizer(example, truncation=True)
    return encoded

class IMDbDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with float labels.

    Comes from https://huggingface.co/transformers/v3.4.0/custom_datasets.html

    Args:
        encodings: mapping from field name to a per-example indexable of values.
        labels: array of labels; stored after casting to float.
    """

    def __init__(self, encodings, labels):
        self.labels = labels.astype(float)
        self.encodings = encodings

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the label under 'labels'."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Fine-tune on a subsample only - it makes this much faster.
n = 1000
train_texts = X_train.tolist()[:n]
test_texts = X_test.tolist()[:n]

# truncate long reviews and pad each set to a common length
tokenized_train = tokenizer(train_texts, truncation=True, padding=True)
tokenized_test = tokenizer(test_texts, truncation=True, padding=True)

# pair the encodings with the matching first n labels
train_dataset = IMDbDataset(tokenized_train, y_train.values[:n])
test_dataset = IMDbDataset(tokenized_test, y_test.values[:n])

In [None]:
# using HF's trainer
# again, this comes from https://huggingface.co/transformers/v3.4.0/custom_datasets.html

# Hyperparameters for fine-tuning: 5 epochs, batches of 16 for both training
# and evaluation, learning rate 2e-5 with weight decay 0.01.
# NOTE(review): with output_dir='./' and save_strategy="epoch" the checkpoints
# presumably land in the current working directory once per epoch - confirm
# and consider a dedicated output folder.
training_args = TrainingArguments(
   output_dir='./',
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=5,
   weight_decay=0.01,
   save_strategy="epoch",
)
 
# Bundle the model, the training arguments and the subsampled train/test
# datasets; test_dataset is used as the evaluation set.
# NOTE(review): newer transformers releases may deprecate `tokenizer=` in
# favor of `processing_class=` - confirm against the installed version.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=train_dataset,
   eval_dataset=test_dataset,
   tokenizer=tokenizer
)

In [None]:
# Run the fine-tuning loop configured above (slow without a GPU).
trainer.train()

In [None]:
# Hacky: bring the fine-tuned model back to the CPU, where the rest of the
# data lives. HF suggests using a "prediction" Trainer instead.
cpu_device = torch.device('cpu')
m = model.to(cpu_device)

In [None]:
# Wrap the fine-tuned model and its tokenizer in a text-classification
# pipeline, using the same truncation/padding settings as during training.
pipeline_options = {
    'task': "text-classification",
    'model': m,
    'tokenizer': tokenizer,
    'truncation': True,
    'padding': True,
}
classifier_pipeline = pipeline(**pipeline_options)

In [None]:
# Pred outputs are slightly different from the sentiment pipeline: the model
# has a single output, so we threshold its score instead of reading a label.
# Ask the pipeline for the raw model output (function_to_apply="none"): if a
# sigmoid is applied to the single output first, a model trained against 0/1
# targets yields scores that are almost always above 0.5, which would make
# nearly every review look positive.
raw_outputs = classifier_pipeline(X_test.tolist(), function_to_apply="none")
bert_pred = [p['score'] > 0.5 for p in raw_outputs]

In [None]:
# performance is pretty bad, even after 5 epochs
finetuned_correct = int(np.count_nonzero(bert_pred == y_test))
print(f'accuracy: {finetuned_correct / len(y_test)}')
finetuned_report = classification_report(y_pred=bert_pred, y_true=y_test)
print(finetuned_report)