In [3]:
# AutoTokenizer is imported here as well so that this cell runs on a fresh
# kernel: it sits above the notebook's main import cell.
from transformers import AutoTokenizer

# Tokenizer matching the pretrained Longformer checkpoint used in this notebook.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

In [4]:
# Sanity check: encode one short review; the resulting token ids are displayed
# as the cell output.
sample_review = "definitely not a 5 star resort"
tokenizer.encode(sample_review)

[0, 9232, 27941, 45, 10, 195, 999, 5753, 2]

In [1]:
from typing import Any, Callable, Optional, Union
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.modeling_longformer import LongformerModel, LongformerPreTrainedModel

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger

In [2]:
import wandb
# wandb.init(project="my-project")

In [3]:
class ReviewDataset(Dataset):
    """Map-style dataset backed by a pickled pandas DataFrame.

    Each item is a ``(text, targets)`` pair: the value in column 6 of the row
    is returned as-is, and columns 0-5 are returned as one float64 NumPy
    vector.
    """

    def __init__(self, df_path):
        """Load the DataFrame stored at ``df_path``.

        NOTE(review): ``pd.read_pickle`` can execute arbitrary code while
        unpickling, so only point this at files from a trusted source.
        """
        self.df = pd.read_pickle(df_path)

    def __len__(self):
        """Return the number of rows in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(column 6, columns 0-5 as a float64 array)`` for row ``idx``."""
        # `np.float` was only an alias of the builtin float (float64) and has
        # been removed from NumPy, so the dtype is named explicitly.
        targets = self.df.iloc[idx, 0:6].to_numpy().astype(np.float64)
        return (self.df.iloc[idx, 6], targets)

In [4]:
class LongformerBaseline(LongformerPreTrainedModel):
    """Longformer encoder followed by a ``BaselineClasHead`` classifier.

    ``forward`` returns the head's raw logits; with the ``num_aspect=6`` /
    ``num_rating=5`` head built in ``__init__`` they are shaped
    ``(batch, 6, 5)``.
    """

    # NOTE(review): presumably makes `from_pretrained` ignore the checkpoint's
    # pooler weights, which this model never creates -- confirm against the
    # installed transformers version.
    authorized_unexpected_keys = [r"pooler"]

    def __init__(self, config):
        """Build the encoder (without pooling layer) and the classification head."""
        super().__init__(config)
        self.longformer = LongformerModel(config, add_pooling_layer=False)
        self.classifier = BaselineClasHead(config, num_aspect=6, num_rating=5)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode the batch and return the classifier logits.

        Args:
            input_ids: token ids; may be ``None`` when ``inputs_embeds`` is given.
            attention_mask: forwarded unchanged to the encoder.
            global_attention_mask: when ``None``, a mask that puts global
                attention on the first token only is created.
            token_type_ids, position_ids, inputs_embeds, output_attentions,
            output_hidden_states, return_dict: forwarded to the encoder
                (``return_dict`` falls back to ``config.use_return_dict``).

        Returns:
            The logits produced by ``self.classifier``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if global_attention_mask is None:
            if input_ids is None and inputs_embeds is not None:
                # Embedding-only call: there are no ids to copy the shape from,
                # so build the (batch, seq_len) mask from the embeddings.
                global_attention_mask = torch.zeros(
                    inputs_embeds.shape[:-1],
                    dtype=torch.long,
                    device=inputs_embeds.device,
                )
            else:
                global_attention_mask = torch.zeros_like(input_ids)
            # global attention on cls token
            global_attention_mask[:, 0] = 1

        outputs = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Integer indexing works whether the encoder returned a tuple or an
        # output object; element 0 is handed to the head as the sequence output.
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        return logits


class BaselineClasHead(nn.Module):
    """Two-layer classification head applied to the first token's hidden state.

    Pipeline: LayerNorm -> Dropout(0.5) -> Linear(hidden_size, 400) -> tanh ->
    LayerNorm -> Dropout(0.4) -> Linear(400, num_aspect * num_rating), with the
    result reshaped to ``(batch, num_aspect, num_rating)``.
    """

    def __init__(self, config, num_aspect, num_rating):
        """Create the layers.

        Args:
            config: object exposing ``hidden_size`` (the encoder width).
            num_aspect: number of aspects predicted per example.
            num_rating: number of rating classes per aspect.
        """
        super().__init__()
        # Remembered so that forward() reshapes to whatever the head was built
        # for instead of a hard-coded (6, 5).
        self.num_aspect = num_aspect
        self.num_rating = num_rating

        self.ln1 = nn.LayerNorm(config.hidden_size)
        self.dp1 = nn.Dropout(0.5)
        self.dense1 = nn.Linear(config.hidden_size, 400)

        self.ln2 = nn.LayerNorm(400)
        self.dp2 = nn.Dropout(0.4)
        self.dense2 = nn.Linear(400, num_aspect * num_rating)

    def forward(self, hidden_states, **kwargs):
        """Map ``(batch, seq_len, hidden_size)`` states to per-aspect logits.

        Returns:
            Tensor of shape ``(batch, num_aspect, num_rating)``.
        """
        hidden_states = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])

        hidden_states = self.ln1(hidden_states)
        hidden_states = self.dp1(hidden_states)
        hidden_states = self.dense1(hidden_states)

        hidden_states = torch.tanh(hidden_states)

        hidden_states = self.ln2(hidden_states)
        hidden_states = self.dp2(hidden_states)
        hidden_states = self.dense2(hidden_states)

        return hidden_states.view(-1, self.num_aspect, self.num_rating)

In [5]:
class TokenizerCollate:
    """``collate_fn`` that tokenizes a batch of ``(text, targets)`` samples.

    Returns a ``(input_ids, attention_mask, targets)`` tuple of tensors, with
    the token sequences padded to the longest sequence in the batch.
    """

    def __init__(self):
        """Load the tokenizer of the pretrained Longformer checkpoint."""
        self.tkz = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

    def __call__(self, batch):
        """Tokenize the texts and stack the targets of ``batch``."""
        batch_split = list(zip(*batch))
        seqs, targs = batch_split[0], batch_split[1]
        # truncation=True caps every review at the tokenizer's maximum length,
        # so a single over-long text cannot exceed the range of positions the
        # model accepts.
        encode = self.tkz(seqs, padding="longest", truncation=True)
        # Stack the per-sample arrays first: building a tensor straight from a
        # tuple of NumPy arrays goes element by element and is very slow.
        targets = torch.tensor(np.stack(targs))
        return torch.tensor(encode["input_ids"]), torch.tensor(encode["attention_mask"]), targets
    
class MultiLabelCEL(nn.CrossEntropyLoss):
    """Cross-entropy loss summed over the aspect axis of a multi-aspect output."""

    def forward(self, input, target, nasp=6):
        """Return the sum of the per-aspect cross-entropy losses.

        Args:
            input: logits indexed as ``input[:, aspect, :]``.
            target: class indices indexed as ``target[:, aspect]``; cast to long.
            nasp: number of leading aspects to include in the sum.
        """
        labels = target.long()
        # Bind the parent's forward once; zero-argument super() is not usable
        # inside the generator expression below.
        aspect_loss = super().forward
        return sum(aspect_loss(input[:, k, :], labels[:, k]) for k in range(nasp))
    
class AspectACC(pl.metrics.metric.Metric):
    """Accuracy of the arg-max prediction for one aspect column.

    ``update`` accumulates the number of matching predictions and the number of
    samples seen; ``compute`` returns their ratio.
    """

    def __init__(self, aspect: int,
                compute_on_step: bool = True,
                ddp_sync_on_step: bool = False,
                process_group: Optional[Any] = None,):
        """Register the running counters for aspect index ``aspect``.

        The remaining arguments are passed through to the base ``Metric``.
        """
        super().__init__(
            compute_on_step=compute_on_step,
            ddp_sync_on_step=ddp_sync_on_step,
            process_group=process_group,)

        self.aspect = aspect
        # Keep the counters on the GPU when one exists, but fall back to the
        # CPU instead of crashing on a machine without CUDA.
        state_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.add_state("correct", default=torch.tensor(0, device=state_device), dist_reduce_fx="sum")
        self.add_state("total", default=torch.tensor(0, device=state_device), dist_reduce_fx="sum")

    def update(self, preds: torch.Tensor, target: torch.Tensor):
        """Accumulate hits and sample count for one batch.

        Args:
            preds: logits; the arg-max over dim 2 is taken as the prediction.
            target: labels with the same shape as the arg-maxed predictions.
        """
        preds = torch.argmax(preds, dim=2)
        assert preds.shape == target.shape

        target = target.contiguous().long()

        hits = torch.sum(preds[:, self.aspect] == target[:, self.aspect])
        # Add on the counter's own device so the batch may live elsewhere.
        self.correct += hits.to(self.correct.device)
        self.total += target[:, self.aspect].numel()

    def compute(self):
        """Return accumulated ``correct / total`` as a float tensor."""
        return self.correct.float() / self.total

In [6]:
class LightningLongformerBaseline(pl.LightningModule):
    """Lightning wrapper that fine-tunes ``LongformerBaseline`` on the review data.

    ``config`` is a dict; this class reads its ``"cache_dir"``,
    ``"learning_rate"`` and ``"batch_size"`` entries.
    """

    TRAIN_PICKLE = "../../data/hotel_balance_LengthFix1_3000per/df_train.pickle"
    VAL_PICKLE = "../../data/hotel_balance_LengthFix1_3000per/df_test.pickle"

    def __init__(self, config):
        """Load the pretrained model and create the loss and per-aspect metrics."""
        super().__init__()
        self.train_config = config
        self.longformer = LongformerBaseline.from_pretrained('allenai/longformer-base-4096',
                                                             cache_dir=self.train_config["cache_dir"],
                                                             gradient_checkpointing=True
                                                               )
        self.lossfunc = MultiLabelCEL()
        # A ModuleList (rather than a plain list) registers the metrics as
        # submodules, so they follow the model when it is moved between devices.
        self.metrics = nn.ModuleList([AspectACC(aspect=i) for i in range(6)])

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters."""
        return torch.optim.AdamW(self.parameters(), lr=self.train_config["learning_rate"])

    def _build_loader(self, dataset, shuffle):
        """Wrap ``dataset`` in a DataLoader configured from ``self.train_config``."""
        return DataLoader(dataset,
                          # Read the batch size from the config given to this
                          # instance, not from a notebook-level global.
                          batch_size=self.train_config["batch_size"],
                          collate_fn=TokenizerCollate(),
                          num_workers=2,
                          pin_memory=True, drop_last=False, shuffle=shuffle)

    def train_dataloader(self):
        """Build the training loader; samples are reshuffled every epoch."""
        self.dataset_train = ReviewDataset(self.TRAIN_PICKLE)
        self.loader_train = self._build_loader(self.dataset_train, shuffle=True)
        return self.loader_train

    def val_dataloader(self):
        """Build the validation loader.

        Order is fixed and the final partial batch is kept, so every validation
        sample contributes to the loss and the accuracies.
        """
        self.dataset_val = ReviewDataset(self.VAL_PICKLE)
        self.loader_val = self._build_loader(self.dataset_val, shuffle=False)
        return self.loader_val

    @staticmethod
    def _unpack_batch(batch):
        """Return ``(input_ids, attention_mask, labels)`` cast to int64."""
        return batch[0].type(torch.int64), batch[1].type(torch.int64), batch[2].type(torch.int64)

    def forward(self, input_ids, attention_mask, labels):
        """Run the model and return ``(loss, logits)``."""
        logits = self.longformer(input_ids=input_ids, attention_mask=attention_mask)
        loss = self.lossfunc(logits, labels)

        return (loss, logits)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        input_ids, mask, label = self._unpack_batch(batch)

        loss, logits = self(input_ids=input_ids, attention_mask=mask, labels=label)

        self.log("train_loss", loss)

        return {"loss":loss}

    def validation_step(self, batch, batch_idx):
        """Log the epoch-averaged validation loss and update every metric."""
        input_ids, mask, label = self._unpack_batch(batch)

        loss, logits = self(input_ids=input_ids, attention_mask=mask, labels=label)

        self.log('val_loss', loss, on_step=False, on_epoch=True, reduce_fx=torch.mean, prog_bar=False)
        for metric in self.metrics:
            metric(logits, label)  # update metric counters

        return loss

    def validation_epoch_end(self, validation_step_outputs):
        """Log each aspect's accumulated accuracy as ``acc0`` .. ``acc5``."""
        for i, m in enumerate(self.metrics):
            self.log('acc'+str(i), m.compute())

In [7]:
# Hyper-parameters for this run; read by the LightningModule, the Trainer
# arguments and the W&B logger.
train_config = {
    "cache_dir": "./cache/",
    "epochs": 6,
    "batch_size": 10,
    "accumulate_grad_batches": 8,
    "gradient_clip_val": 1.2,
    "learning_rate": 0.0005,
}

In [None]:
# wandb.init(project="saam_hotel_longformer")

In [8]:
# Weights & Biases logger for this run; the hyper-parameter dict is recorded
# with it right away.
wandb_logger = WandbLogger(
    project='saam_hotel_longformer',
    name='baseline_accumu',
)
wandb_logger.log_hyperparams(train_config)

[34m[1mwandb[0m: Currently logged in as: [33maeryen[0m (use `wandb login --relogin` to force relogin)


In [9]:
# Instantiate the Lightning module; this loads the pretrained checkpoint.
model = LightningLongformerBaseline(config=train_config)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerBaseline: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerBaseline from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing LongformerBaseline from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerBaseline were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.ln1.weight', 'classifier.ln1.bias', 'classifier.dense1.weight', 'classifier.dense1.bias', 'classifier.ln2.wei

In [10]:
# Single-GPU trainer; epoch count, gradient accumulation and gradient clipping
# all come from train_config.
trainer = pl.Trainer(
    gpus=1,
    num_nodes=1,
    logger=wandb_logger,
    log_every_n_steps=5,
    max_epochs=train_config["epochs"],
    accumulate_grad_batches=train_config["accumulate_grad_batches"],
    gradient_clip_val=train_config["gradient_clip_val"],
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [11]:
# Start training; the dataloaders come from the module's own
# train_dataloader / val_dataloader hooks.
trainer.fit(model=model)


  | Name       | Type               | Params
--------------------------------------------------
0 | longformer | LongformerBaseline | 148 M 
1 | lossfunc   | MultiLabelCEL      | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






1

Traceback (most recent call last):
  File "/home/aeryen/anaconda3/envs/saam/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/home/aeryen/anaconda3/envs/saam/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/aeryen/anaconda3/envs/saam/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/aeryen/anaconda3/envs/saam/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


In [11]:
# Shell escape: print the current GPU status table (memory use, utilisation).
!nvidia-smi

Sun Oct  4 01:45:54 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.95.01    Driver Version: 440.95.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN RTX           Off  | 00000000:01:00.0 Off |                  N/A |
| 41%   56C    P2    68W / 280W |   1733MiB / 24220MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:02:00.0  On |                  N/A |
|  0%   35C    P8    26W / 250W |    800MiB /  7959MiB |      2%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------