In [1]:
%%capture
!pip install -Uqq datasets pytorch-lightning tokenizers gensim

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset

import tokenizers
from tokenizers import pre_tokenizers, processors, normalizers

import pytorch_lightning as pl
import torchmetrics as metrics

from IPython.display import YouTubeVideo, display
SIZE = {'width':1000, 'height': 600}

In [16]:
dataset = load_dataset("SetFit/emotion")

# Build the id -> name mapping from the dataset's own (label, label_text) pairs.
# A plain set of names has no defined order, so enumerating it would attach
# arbitrary (and run-dependent) ids to the class names.
train_split = dataset['train']
class_lookup = dict(sorted(set(zip(train_split['label'], train_split['label_text']))))

# Class names ordered by label id; len(class_names) is still the number of classes.
class_names = list(class_lookup.values())
class_lookup

In [17]:
# Fit on the dataset's "train" split and validate on its "test" split
train, valid = dataset['train'], dataset['test']
len(train), len(valid)

In [7]:
# What does the data look like?
train[0]

In [18]:
# Word-level tokenizer: every distinct word is one token, unknown words map to '[UNK]'
tokenizer = tokenizers.Tokenizer(
    tokenizers.models.WordLevel(unk_token='[UNK]')
)
# Normalise text before splitting: Unicode-decompose, lowercase, then strip accents
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [19]:
def train_text_gen(data=None, column='text'):
    """Yield the text of every example, one string at a time.

    Args:
        data: Iterable of dict-like examples. When omitted, the notebook-level
            `train` split is used, so existing `train_text_gen()` calls behave
            exactly as before.
        column: Key under which each example stores its text.

    Yields:
        The value stored under `column` for each example, in iteration order.
    """
    if data is None:
        # Looked up when the generator is first advanced, not at definition time
        data = train
    for item in data:
        yield item[column]

In [20]:
# Trainer for the word-level vocabulary: caps it at 30,000 entries and registers
# '[PAD]' and '[UNK]' as special tokens.
# NOTE(review): the name `trainer` is rebound to a pl.Trainer further down the
# notebook; if the tokenizer-training cell is re-run after that point, re-run
# this cell first so it receives a tokenizers trainer again.
trainer = tokenizers.trainers.WordLevelTrainer(vocab_size=30000, special_tokens=['[PAD]', '[UNK]'])

In [21]:
tokenizer.train_from_iterator(train_text_gen(), trainer)

In [22]:
tokenizer.enable_padding()

In [23]:
# Encode two training sentences and stack their (padded) token ids into one tensor
text = train[18:20]['text']
encodings = tokenizer.encode_batch(text)
ids = torch.tensor([encoding.ids for encoding in encodings])

print(f'Shape of ids: {ids.shape}')

# Show each sentence next to the ids it was mapped to
for sentence, token_ids in zip(text, ids):
    print(f'''
Text: {sentence}
IDs:  {token_ids}
''')

In [25]:
# Embedding table with one 128-dimensional vector per vocabulary entry.
# Index 0 is declared as the padding index (presumably the id of '[PAD]', the
# first special token handed to the tokenizer trainer — confirm).
emb_layer = nn.Embedding(num_embeddings=tokenizer.get_vocab_size(), embedding_dim=128, padding_idx=0)

In [26]:
emb_layer(ids).shape

In [27]:
emb_layer(ids)[0][0]

In [28]:
import gensim.downloader

In [29]:
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

In [30]:
glove_vectors.most_similar(positive=['king','woman'], negative=['man'])

# Exercise 15.1 - Bias in Language Models

<!-- startquestion -->

Language models are trained on text data that comes from different places.
One of the more convenient places to collect this text data is from the internet, where text content can contain information that is incorrect or even harmful.
This results in language models that learn from the context of data they're trained on.
Below, we see an example that examines the cosine distance between the words "white" and "black" and a number of other words with negative or positive connotations.
We can see that the distance from the word "white" is greater than the distance from the word "black" in the negative cases, yet smaller in the positive case.
The downstream effect is that a model using these embeddings might make decisions differently based on the race, gender, sexual orientation, or another aspect of a person.

In [32]:
# Compare how far each probe word sits from the two reference words
word1, word2 = 'white', 'black'
for word3 in ('poor', 'uneducated', 'gang', 'leader'):
    dist1 = glove_vectors.distance(word3, word1)
    dist2 = glove_vectors.distance(word3, word2)
    print(f'Word: {word3}, Distance from "{word1}": {dist1:.04f}, Distance from "{word2}": {dist2:.04f}')

In [None]:
word1 = 'man'
word2 = 'woman'
for word3 in []: # <-- Enter your words here <--
    print(f'Word: {word3}, Distance from "{word1}": {glove_vectors.distance(word3, word1):.04f}, Distance from "{word2}": {glove_vectors.distance(word3, word2):.04f}')

# Exercise 15.2

In this exercise, watch the video below and see the [related blog post](https://towardsdatascience.com/illustrated-guide-to-recurrent-neural-networks-79e5eb8049c9) (Michael Phi, 2018).
Then, please complete the quiz following this lesson.

<!-- startquestion -->

In [None]:
vid = YouTubeVideo('LHXXI4-IEns', **SIZE)
display(vid)

In [33]:
text

In [34]:
ids

In [35]:
word_embs = emb_layer(ids)
word_embs.shape

In [36]:
lin = nn.Linear(128, 128)

In [37]:
lin_outputs = lin(word_embs)
lin_outputs.shape, word_embs.shape

In [40]:
rnn = nn.RNN(input_size=128, hidden_size=128, num_layers=1, batch_first=True)

In [41]:
rnn_outputs, hidden_states = rnn(word_embs)
rnn_outputs.shape, hidden_states.shape

In [42]:
# The RNN was built with batch_first=True, so time is dim 1: the outputs at the
# final time step must equal the returned hidden state of the single layer.
assert torch.equal(rnn_outputs[:, -1, :], hidden_states[-1])

In [43]:
%%capture
rnn.weight_hh_l0.data.zero_() # set Whh to 0
rnn.bias_hh_l0.data.zero_() # set bhh to 0
rnn.weight_ih_l0.data.copy_(lin.weight.data) # set Wih to the weights of the linear layer
rnn.bias_ih_l0.data.copy_(lin.bias.data) # set bih to the bias of the linear layer

In [44]:
# obtain the rnn outputs and hidden state after modifying weights and biases
rnn_outputs, hidden_states = rnn(word_embs)

In [45]:
# check that lin_outputs.tanh() are the same as the rnn outputs
# The linear layer and the RNN reach the same values through different code
# paths, so compare with a small tolerance instead of exact float equality.
assert torch.allclose(lin_outputs.tanh(), rnn_outputs, atol=1e-6)

In [46]:
# Build RNNs with 1 to 5 stacked layers, run the same embeddings through each,
# and print the shape of the returned hidden state for every depth.
# Note: this rebinds the notebook-level names `rnn`, `rnn_outputs` and
# `hidden_states`; after the loop they refer to the 5-layer network.
for n_layers in range(1, 6):
    rnn = nn.RNN(input_size=128, hidden_size=128, num_layers=n_layers, batch_first=True)
    rnn_outputs, hidden_states = rnn(word_embs)
    print(f"Number of RNN layers: {n_layers}, hidden state shape: {hidden_states.shape}")

In [47]:
gru = nn.GRU(input_size=128, hidden_size=128, num_layers=1, batch_first=True)

In [48]:
gru_outputs, gru_hidden_states = gru(word_embs)
gru_outputs.shape, gru_hidden_states.shape

In [49]:
lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=1, batch_first=True)

In [50]:
lstm_outputs, (lstm_state, lstm_memory) = lstm(emb_layer(ids))
lstm_outputs.shape, lstm_state.shape, lstm_memory.shape

In [51]:
# For the single-layer LSTM, the outputs at the final time step must equal the
# returned hidden state.
assert torch.equal(lstm_outputs[:, -1, :], lstm_state[-1])

In [52]:
BATCH_SIZE = 64

In [53]:
class TokenizeCollate:
    """Collate callable that converts a list of dataset rows into id/label tensors."""

    def __init__(self, tokenizer):
        """Keep a reference to `tokenizer` and turn on its padding.

        Padding is enabled on the tokenizer object that is passed in, so every
        encoding produced for one batch has the same length.
        """
        tokenizer.enable_padding()
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Tokenize one batch.

        Args:
            batch: Sequence of rows, each providing a 'text' and a 'label' entry.

        Returns:
            A tuple `(inputs, labels)`: `inputs` is a tensor holding the token
            ids of every text (one row per example) and `labels` is a tensor of
            the corresponding labels.
        """
        sentences = [example['text'] for example in batch]
        encodings = self.tokenizer.encode_batch(sentences)
        token_ids = [encoding.ids for encoding in encodings]
        targets = [example['label'] for example in batch]
        return torch.tensor(token_ids), torch.tensor(targets)

collate_fn = TokenizeCollate(tokenizer)

# Exercise 15.3

<!-- startquestion -->

Create the train and validation dataloaders using the `train` and `valid` datasets, `BATCH_SIZE`, and `collate_fn`.

In [54]:
# Exercise placeholders: replace each `...` with a dataloader built from the
# matching dataset (`train` / `valid`), `BATCH_SIZE`, and `collate_fn`.
# The training cell further down passes both names to trainer.fit().
train_dl = ...
valid_dl = ...

In [74]:
class Model(pl.LightningModule):
    """Text classifier: embedding -> stacked GRU -> linear layer over the final hidden state.

    Args:
        dim_model: Size of the token embeddings and of the GRU hidden state.
        n_rnn_layers: Number of stacked GRU layers.
        n_outputs: Number of target classes (defaults to the number of class names).
        vocab_size: Number of rows in the embedding table (defaults to the tokenizer vocabulary size).
        pad_idx: Token id treated as padding by the embedding layer.
    """

    def __init__(self, dim_model, n_rnn_layers, n_outputs=len(class_names), vocab_size=tokenizer.get_vocab_size(), pad_idx=0):
        super().__init__()
        # Save parameters
        self.dim_model = dim_model
        self.n_rnn_layers = n_rnn_layers
        self.n_outputs = n_outputs
        self.vocab_size = vocab_size
        self.pad_idx = pad_idx

        # Set up loss
        self.loss = nn.CrossEntropyLoss()

        # Set up metrics (separate instances per stage so their state never mixes)
        self.train_acc = metrics.Accuracy(task="multiclass", num_classes=self.n_outputs)
        self.valid_acc = metrics.Accuracy(task="multiclass", num_classes=self.n_outputs)
        self.train_auroc = metrics.AUROC(task="multiclass", num_classes=self.n_outputs)
        self.valid_auroc = metrics.AUROC(task="multiclass", num_classes=self.n_outputs)

        # Build the model
        self.emb = nn.Embedding(self.vocab_size, self.dim_model, padding_idx=self.pad_idx)
        self.rnn = nn.GRU(self.dim_model, self.dim_model, num_layers=self.n_rnn_layers, batch_first=True)
        self.output = nn.Linear(self.dim_model, self.n_outputs)

        self.save_hyperparameters()

    def forward(self, inputs):
        """Return one row of class logits per input sequence.

        Args:
            inputs: Integer tensor of token ids, one row per sequence.

        Returns:
            Tensor of unnormalised class scores with one row per sequence.
        """
        _, hidden_state = self.rnn(self.emb(inputs))
        # The GRU's hidden state is stacked per layer along dim 0. Index the last
        # layer directly: squeezing first would drop the layer axis for a
        # one-layer GRU (so [-1] would pick the last *sample*) and would drop the
        # batch axis for a batch of one.
        return self.output(hidden_state[-1])

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log loss, accuracy and AUROC."""
        x, y = batch
        yhat = self(x)
        loss = self.loss(yhat, y)
        probas = yhat.softmax(dim=-1)
        preds = probas.argmax(dim=-1).int()
        # torchmetrics expects (preds, target)
        acc = self.train_acc(preds, y)
        auroc = self.train_auroc(probas, y)
        self.log('train_loss', loss, prog_bar=True, on_epoch=True, on_step=True)
        self.log('train_acc', acc, prog_bar=True, on_epoch=True, on_step=True)
        self.log('train_auroc', auroc, prog_bar=True, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch and log epoch-level loss, accuracy and AUROC."""
        x, y = batch
        yhat = self(x)
        loss = self.loss(yhat, y)
        probas = yhat.softmax(dim=-1)
        preds = probas.argmax(dim=-1).int()
        # torchmetrics expects (preds, target)
        acc = self.valid_acc(preds, y)
        auroc = self.valid_auroc(probas, y)
        self.log('valid_loss', loss, prog_bar=True, on_epoch=True, on_step=False)
        self.log('valid_acc', acc, prog_bar=True, on_epoch=True, on_step=False)
        self.log('valid_auroc', auroc, prog_bar=True, on_epoch=True, on_step=False)
        return loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 1e-3."""
        return optim.Adam(self.parameters(), lr=1e-3)

In [92]:
# Stop once the validation loss has failed to improve for 2 consecutive checks
cbs = [pl.callbacks.EarlyStopping(monitor='valid_loss', patience=2)]
# Metrics are written to ./GRU/version_0/metrics.csv, which the analysis cells read back
logger = pl.loggers.CSVLogger(save_dir='.', name='GRU', version=0)
# accelerator='auto' uses a GPU when one is available and falls back to CPU
# otherwise, instead of failing on machines without a GPU as a hard-coded
# `gpus=1` does. track_grad_norm=2 logs the per-parameter gradient 2-norms that
# the plotting cell reads.
trainer = pl.Trainer(accelerator='auto', devices=1, callbacks=cbs, logger=logger, log_every_n_steps=1, track_grad_norm=2)

In [93]:
model = Model(128, 2)

In [94]:
trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=valid_dl)

In [96]:
logs = pd.read_csv('GRU/version_0/metrics.csv')
# Show only the loss / accuracy / AUROC columns. The metrics are logged under
# '*_auroc', so the pattern must say 'auroc' ('auc' is not a substring of it).
logs.filter(regex='loss|acc|auroc')

In [97]:
# Relative frequency of every label id in the validation split, ordered by id
class_freq = pd.Series(valid['label']).value_counts(normalize=True).sort_index()

# Name each bar from the dataset's own (label, label_text) pairs so the tick
# labels are guaranteed to line up with the ids on the x-axis.
# Assumes the validation split carries a 'label_text' column like the train split.
label_names = dict(zip(valid['label'], valid['label_text']))

class_freq.plot.bar(figsize=(10,8))
plt.title('Class balance in the validation dataset')
plt.xticks(range(len(class_freq)), [label_names[label] for label in class_freq.index])
plt.ylabel('frequency')
plt.xlabel('label')
plt.show()

In [98]:
# Locate the row(s) with the lowest validation loss and report the metrics logged there
best_valid_loss = logs['valid_loss'].min()
rows_at_best = logs.loc[logs['valid_loss'] == best_valid_loss]
best_valid_acc = rows_at_best['valid_acc'].max()
best_valid_auroc = rows_at_best['valid_auroc'].max()
print(f"""
Best valid loss: {best_valid_loss:.04f}
Accuracy at best valid loss: {best_valid_acc:.04f}
ROC-AUC at best valid loss: {best_valid_auroc:.04f}
""")

In [99]:
def plot_metric(frame, column, ax, ylabel):
    """Plot one logged column against the frame's index, skipping rows where it was not logged."""
    frame[column].dropna().plot(ax=ax)
    ax.set_ylabel(ylabel)


# Index every curve by training step so all panels share the same x-axis meaning
logs_by_step = logs.set_index('step')
fig, axes = plt.subplots(4, 2, figsize=(16, 12))

# Left column: training metrics; right column: validation metrics
plot_metric(logs_by_step, 'train_loss_step', axes[0,0], 'Train Loss')
plot_metric(logs_by_step, 'valid_loss', axes[0,1], 'Valid Loss')
plot_metric(logs_by_step, 'train_acc_step', axes[1,0], 'Train Accuracy')
plot_metric(logs_by_step, 'valid_acc', axes[1,1], 'Valid Accuracy')
plot_metric(logs_by_step, 'train_auroc', axes[2,0], 'Train ROC-AUC')
plot_metric(logs_by_step, 'valid_auroc', axes[2,1], 'Valid ROC-AUC')

# Give each validation panel the y-range of its training counterpart so the
# two columns can be compared by eye
for row in range(3):
    axes[row,1].set_ylim(axes[row,0].get_ylim())

# Gradient norms are plotted against the step as well (not the CSV row number),
# so the 'step' axis label is accurate; rows without any gradient entry are dropped.
first_layer_cols = ['grad_2.0_norm/rnn.weight_hh_l0_step', 'grad_2.0_norm/rnn.weight_ih_l0_step']
logs_by_step[first_layer_cols].dropna(how='all').plot(ax=axes[3,0])
axes[3,0].set_ylabel('l-2 norm of first RNN layer gradients')
axes[3,0].set_xlabel('step')
if model.n_rnn_layers >= 2:
    second_layer_cols = ['grad_2.0_norm/rnn.weight_ih_l1_step', 'grad_2.0_norm/rnn.weight_hh_l1_step']
    logs_by_step[second_layer_cols].dropna(how='all').plot(ax=axes[3,1])
    axes[3,1].set_ylabel('l-2 norm of second RNN layer gradients')
    axes[3,1].set_xlabel('step')
else:
    # No second layer to show: hide the otherwise empty panel
    axes[3,1].axis('off')
fig.tight_layout()

# Exercise 15.4

<!-- startquestion -->

In the example above, we trained a GRU network that was able to successfully classify the emotion of tweets with over 90% accuracy on the validation dataset.
To complete this exercise, please train a vanilla RNN and an LSTM and report the best validation loss and accuracy.
What do you notice about the vanilla RNN during training?
What have you learned about vanilla RNNs that may explain these results?

In [None]:
# Your code here