NLP WORKSHOP

In [1]:
!nvidia-smi

Thu Sep 24 20:30:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers

In [37]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
from transformers import BertModel, BertConfig, BertTokenizerFast, get_linear_schedule_with_warmup, AdamW
from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm, trange
import numpy as np
import pandas as pd
from dataclasses import dataclass

In [4]:
class TextClassificationDataset(Dataset):
    def __init__(self, inputs, labels, tokenizer, max_len):
        super(TextClassificationDataset, self).__init__()

        encoded_inputs = tokenizer(inputs, max_length=max_len, padding="max_length", truncation=True)
        self.data = list(zip(encoded_inputs["input_ids"], encoded_inputs["attention_mask"], labels))

    def __getitem__(self, i):
        return self.data[i]

    def __len__(self):
        return len(self.data)

In [5]:
class BertTextClassificationModel(nn.Module):
    def __init__(self, bert_config, num_classes, dropout_prob=0.1):
        super(BertTextClassificationModel, self).__init__()

        self.bert = BertModel(bert_config)
        self.dropout = nn.Dropout(dropout_prob)
        self.classification_layer = nn.Linear(in_features=bert_config.hidden_size, out_features=num_classes)

    def forward(self, x):
        outputs = self.bert(input_ids=x["input_ids"], attention_mask=x["attention_mask"])
        cls = outputs[1]
        cls = self.dropout(cls)
        return self.classification_layer(cls)

    def load(self, path_to_dir):
        self.bert = BertModel.from_pretrained(path_to_dir)
        model_path = os.path.join(path_to_dir, "model.tar")
        if os.path.exists(model_path):
            checkpoint = torch.load(model_path)
            self.dropout.load_state_dict(checkpoint["dropout"])
            self.classification_layer.load_state_dict(checkpoint["cls"])
            print("No model.tar in provided directory, only loading bert model.")

    def save(self, path_to_dir):
        self.bert.save_pretrained(path_to_dir)
        torch.save(
            {"dropout": self.dropout.state_dict(), "cls": self.classification_layer.state_dict()},
            os.path.join(path_to_dir, "model.tar")
        )

In [45]:
class DistilBertTextClassificationModel(nn.Module):
    def __init__(self, bert_config, num_classes, dropout_prob=0.1):
        super(DistilBertTextClassificationModel, self).__init__()

        self.bert = DistilBertModel(bert_config)
        self.dropout = nn.Dropout(dropout_prob)
        self.classification_layer = nn.Linear(in_features=bert_config.hidden_size, out_features=num_classes)

    def forward(self, x):
        outputs = self.bert(input_ids=x["input_ids"], attention_mask=x["attention_mask"])[0]
        cls = outputs[:, 0, :]  # cls is the first token of the sequence
        cls = self.dropout(cls)
        return self.classification_layer(cls)

    def load(self, path_to_dir):
        self.bert = DistilBertModel.from_pretrained(path_to_dir)
        model_path = os.path.join(path_to_dir, "model.tar")
        if os.path.exists(model_path):
            checkpoint = torch.load(model_path)
            self.dropout.load_state_dict(checkpoint["dropout"])
            self.classification_layer.load_state_dict(checkpoint["cls"])
            print("No model.tar in provided directory, only loading bert model.")

    def save(self, path_to_dir):
        self.bert.save_pretrained(path_to_dir)
        torch.save(
            {"dropout": self.dropout.state_dict(), "cls": self.classification_layer.state_dict()},
            os.path.join(path_to_dir, "model.tar")
        )

In [6]:
def text_classification_collate(inputs):
    # groups selected inputs into a batch -> tuple (x, y)
    batch = (
        {"input_ids": torch.tensor([dat[0] for dat in inputs], dtype=torch.long),
         "attention_mask": torch.tensor([dat[1] for dat in inputs], dtype=torch.long)},
        torch.tensor([dat[2] for dat in inputs], dtype=torch.long))
    return batch

In [7]:
def train(model, train_dataset, val_dataset, loss_fn, device, run_config):
    if not os.path.isdir(run_config.output_dir):
        os.makedirs(run_config.output_dir)
    tb_writer = SummaryWriter(log_dir=os.path.join(run_config.output_dir, "tensorboard"))

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=run_config.batch_size,
                                  collate_fn=text_classification_collate)
    print(len(train_dataloader)) # todo delete

    optimizer = AdamW(model.parameters(), lr=run_config.learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=run_config.num_warmup_steps,
                                                num_training_steps=len(train_dataloader)*run_config.num_epochs)
    print("Training started:")
    print(f"\tNum examples = {len(train_dataset)}")
    print(f"\tNum Epochs = {run_config.num_epochs}",)

    train_iterator = trange(0, int(run_config.num_epochs), desc="Epoch")
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True)
        model.train()
        epoch_losses = []
        for step, (x, y) in enumerate(epoch_iterator):
            # move batch to GPU
            if isinstance(x, dict):
                for k, v in x.items():
                    x[k] = v.to(device)
            else:
                x = x.to(device)
            y = y.to(device)

            # forward pass to compute logits
            logits_y = model(x)

            loss = loss_fn(logits_y, y)
            epoch_losses.append(loss.item())

            # backward pass - backprop
            model.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            epoch_iterator.set_description(f"Training loss = {loss.item():.4f}")

        output_dir = os.path.join(run_config.output_dir, f"Epoch_{epoch + 1}")
        model.save(output_dir)  # we save after each epoch, perhaps an improvement is to save after n steps
        # evaluate and write to tensorboard
        test_ce, test_acc = evaluate(model, val_dataset, device, run_config)
        tb_writer.add_scalar("Avg Train CE", np.mean(epoch_losses), epoch + 1)
        tb_writer.add_scalar("Test CE", test_ce, epoch + 1)
        tb_writer.add_scalar("Test accuracy", test_acc, epoch + 1)
        print(f"After epoch {epoch + 1}: \n-train CE={np.mean(epoch_losses)}\n-test CE={test_ce}\n-test acc.={test_acc}")


In [8]:
def evaluate(model, test_dataset, device, run_config):
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset),
                                 batch_size=run_config.batch_size, collate_fn=text_classification_collate)
    model.eval()
    ce_losses, acc_losses = [], []
    with torch.no_grad():
      for batch in tqdm(test_dataloader, desc="Evaluating", position=0, leave=True):
          x, y = batch
          # move batch to GPU
          if isinstance(x, dict):
              for k, v in x.items():
                  x[k] = v.to(device)
          else:
              x = x.to(device)
          y = y.to(device)

          logits_y = model(x)
          ce_losses.append(nn.functional.cross_entropy(logits_y, y).item())
          pred_y = np.argmax(nn.functional.softmax(logits_y, dim=1).squeeze().detach().cpu().numpy(), axis=1)  # beautiful
          true_y = y.cpu().numpy()
          acc_losses.append(np.mean(pred_y == true_y))

    return np.mean(ce_losses), np.mean(acc_losses)

In [13]:
@dataclass
class RunConfig:
    learning_rate: float = 3e-5
    batch_size: int = 4
    num_epochs: int = 1
    num_warmup_steps: int = 10
    max_len: int = 512
    output_dir: str = "./model/"

In [11]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [67]:
run_config = RunConfig(
    batch_size = 32,
    num_epochs = 20,
    output_dir = "/content/drive/My Drive/NLP workshop/imdb/distill/"
)

In [16]:
path_to_train_csv = "/content/drive/My Drive/NLP workshop/imdb/imdb_train.csv"

sentiment_to_label = {"negative": 0, "positive": 1}
label_to_sentiment = {0: "negative", 1: "positive"}

df = pd.read_csv(path_to_train_csv)
df["label"] = df["sentiment"].map(sentiment_to_label)
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"])

In [68]:
# tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased", do_lower_case=True)

train_dataset = TextClassificationDataset(train_df["review"].tolist(), train_df["label"].tolist(), tokenizer, run_config.max_len)
val_dataset = TextClassificationDataset(val_df["review"].tolist(), val_df["label"].tolist(), tokenizer, run_config.max_len)

# model = BertTextClassificationModel(BertConfig.from_pretrained("bert-base-cased"), num_classes=2)
# model.load("bert-base-cased")

model = DistilBertTextClassificationModel(DistilBertConfig.from_pretrained("distilbert-base-uncased"), num_classes=2)
model.load("distilbert-base-uncased")

In [69]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [71]:
train(model, train_dataset, val_dataset, nn.CrossEntropyLoss().to(device), device, run_config)



Iteration:   0%|          | 0/1000 [00:00<?, ?it/s]

1000
Training started:
	Num examples = 32000
	Num Epochs = 20


Training loss CE = 0.2325: 100%|██████████| 1000/1000 [27:28<00:00,  1.64s/it]
[A

Training loss CE = 0.2325: 100%|██████████| 1000/1000 [27:28<00:00,  1.65s/it]
Evaluating: 100%|██████████| 250/250 [02:26<00:00,  1.70it/s]
Evaluating: 100%|██████████| 250/250 [02:26<00:00,  1.70it/s]
Iteration:   0%|          | 0/1000 [00:00<?, ?it/s]

After epoch 1: 
-train CE=0.2325395093653351
-test CE=0.21761479644477366
-test acc.=0.925375


Training loss CE = 0.1215: 100%|██████████| 1000/1000 [27:27<00:00,  1.65s/it]
Evaluating: 100%|██████████| 250/250 [02:27<00:00,  1.70it/s]
Iteration:   0%|          | 0/1000 [00:00<?, ?it/s]

After epoch 2: 
-train CE=0.12146003976743669
-test CE=0.19302185540646313
-test acc.=0.93275


Training loss CE = 0.0585: 100%|██████████| 1000/1000 [27:25<00:00,  1.65s/it]
Evaluating: 100%|██████████| 250/250 [02:26<00:00,  1.70it/s]
Iteration:   0%|          | 0/1000 [00:00<?, ?it/s]

After epoch 3: 
-train CE=0.05851310090336483
-test CE=0.22955984109267594
-test acc.=0.933375


Training loss CE = 0.0098:   0%|          | 4/1000 [00:06<27:16,  1.64s/it]

KeyboardInterrupt: ignored

In [64]:
model = None
train_dataloder = None
epoch_iterator = None
x, y = None, None
loss, optimizer = None, None
logits_y = None
scheduler = None
import gc
gc.collect()
torch.cuda.empty_cache()

In [70]:
!nvidia-smi

Thu Sep 24 23:14:44 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    34W /  70W |   1283MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [62]:
1/0

ZeroDivisionError: ignored

In [92]:
idx = 4
text = train_df["review"].tolist()[idx]
text = "I fell asleep six minutes into the movie."
text = "It sucked!"
text = "It was so good."
gt = train_df["label"].tolist()[idx]
enc = tokenizer(text)
input_to_model = {"input_ids": torch.tensor(enc["input_ids"], dtype=torch.long).unsqueeze(0).to(device),
                  "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.long).unsqueeze(0).to(device)}

In [93]:
model.eval()
with torch.no_grad():
  print(gt)
  print(model(input_to_model))
  print(nn.functional.softmax(model(input_to_model), dim=1))

1
tensor([[-1.8024,  2.1634]], device='cuda:0')
tensor([[0.0186, 0.9814]], device='cuda:0')


In [94]:
print(review_to_sentiment(model, tokenizer, label_to_sentiment, device, text))

positive


In [90]:
def review_to_sentiment(model, tokenizer, label_to_sentiment, device, review):
  x = tokenizer.encode_plus(text, return_tensors="pt").to(device)
  with torch.no_grad():
    pred = np.argmax(nn.functional.softmax(model(x), dim=1).cpu().numpy(), axis=1)[0]
  return label_to_sentiment.get(pred)

In [None]:
# todo above: evaluate on test set