In [1]:
# Route HTTP and HTTPS traffic of this kernel through a local proxy on port 7890.
# NOTE(review): machine-specific setting; presumably only needed to download the
# pretrained tokenizer/model — confirm before running elsewhere.
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890

env: http_proxy=127.0.0.1:7890
env: https_proxy=127.0.0.1:7890


In [2]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset, builder
from pytorch_lightning import Trainer
from argparse import ArgumentParser
from src.dataset import NERDataset
from src.collator import NERDataCollator
from src.pl_module import LightningBiLSTMCRF
from src.variable import LABEL_TO_IDX, PAD_LABEL

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# --- Tokenizer ---------------------------------------------------------------
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# --- Model -------------------------------------------------------------------
print("Initializing model...")

# `char_level` is shared with the `tokenize` function defined further down.
char_level = False
# All three learning rates are zero here.
# NOTE(review): presumably intentional, since this notebook only inspects
# batches, loss and predictions — confirm.
model = LightningBiLSTMCRF(
    LABEL_TO_IDX,
    1,
    128,
    bert_lr=0,
    lstm_lr=0,
    crf_lr=0,
    char_level=char_level,
)

# --- Raw datasets ------------------------------------------------------------
print("Initializing dataset...")
# Train and validation both read the same "toy" dataset.
train_dataset_name = val_dataset_name = "toy"
train_dataset = NERDataset(train_dataset_name, LABEL_TO_IDX)
val_dataset = NERDataset(val_dataset_name, LABEL_TO_IDX)
def train_generator():
    """Yield every training example as a ``{"text", "labels"}`` record.

    Reads the notebook-level ``train_dataset`` and pairs ``text[i]`` with
    ``labels[i]`` for each index below ``len(train_dataset)``.
    """
    total = len(train_dataset)
    for idx in range(total):
        record = {
            "text": train_dataset.text[idx],
            "labels": train_dataset.labels[idx],
        }
        yield record
def val_generator():
    """Yield every validation example as a ``{"text", "labels"}`` record.

    Reads the notebook-level ``val_dataset`` and pairs ``text[i]`` with
    ``labels[i]`` for each index below ``len(val_dataset)``.
    """
    total = len(val_dataset)
    for idx in range(total):
        record = {
            "text": val_dataset.text[idx],
            "labels": val_dataset.labels[idx],
        }
        yield record
# Build `Dataset` objects from the generator functions.
# NOTE: these assignments rebind `train_dataset` / `val_dataset`, the very
# globals that `train_generator` / `val_generator` read. The generators are
# therefore only usable while those names still refer to the raw NERDataset
# objects (presumably `from_generator` consumes them before the rebinding
# happens — confirm).
train_dataset = Dataset.from_generator(train_generator)
val_dataset = Dataset.from_generator(val_generator)
print("Tokenizing dataset...")
def tokenize(example):
    """Tokenize a batch of pre-split examples and attach word ids and labels.

    `example` is a batch mapping with a "text" entry (one word list per row)
    and a "labels" entry (one label list per row). The returned encoding
    additionally carries:

    * "word_ids": for every row, the word index of each token as reported by
      the tokenizer (``None`` where the tokenizer reports no word).
    * "labels": the labels unchanged when the notebook-level `char_level`
      flag is false; otherwise one label per token, taken from the token's
      word, with 'O' for tokens that have no word.
    """
    batch_labels = example['labels']
    encoding = tokenizer(example["text"], is_split_into_words=True)

    word_ids_per_row = [encoding.word_ids(row) for row in range(len(batch_labels))]
    encoding['word_ids'] = word_ids_per_row

    if char_level:
        # Expand word-level labels to token level by following the word ids.
        encoding['labels'] = [
            ['O' if word_idx is None else row_labels[word_idx] for word_idx in row_word_ids]
            for row_labels, row_word_ids in zip(batch_labels, word_ids_per_row)
        ]
    else:
        encoding['labels'] = batch_labels
    return encoding
def _tokenized(split):
    """Apply `tokenize` to `split` in batches of 4 and drop the raw "text" column."""
    return split.map(tokenize, batched=True, batch_size=4, remove_columns=["text"])


def _batched_loader(split):
    """Wrap `split` in a DataLoader that collates 4 examples at a time with `collator`."""
    return DataLoader(split, collate_fn=collator, batch_size=4)


train_dataset = _tokenized(train_dataset)
val_dataset = _tokenized(val_dataset)
print("Training model...")
collator = NERDataCollator(tokenizer=tokenizer)
train_loader = _batched_loader(train_dataset)
val_loader = _batched_loader(val_dataset)

Initializing tokenizer...
Initializing model...
Initializing dataset...
Tokenizing dataset...


Map: 100%|██████████| 2/2 [00:00<00:00, 136.60 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 319.15 examples/s]

Training model...





In [7]:
from src.variable import IDX_TO_LABEL
# Draw the first collated training batch and print each of its fields.
# NOTE: `input` shadows the builtin of the same name, but the loss and
# prediction cells read the batch under this name, so it is kept.
input = next(iter(train_loader))
rule = "-" * 50

print("Input ids(Decoded):")
for token_ids in input["input_ids"]:
    print(tokenizer.decode(token_ids))
print(rule)

print("Attention mask:")
for mask_row in input["attention_mask"]:
    print(mask_row)
print(rule)

print("Labels:")
for label_row in input["labels"]:
    # Map every label index back to its tag; each tag is followed by a space.
    print("".join(f"{IDX_TO_LABEL[idx.item()]} " for idx in label_row))
print(rule)

print("Word ids:")
for word_id_row in input["word_ids"]:
    print(word_id_row)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Input ids(Decoded):
[CLS] 上 海 市 场 。 [SEP]
[CLS] 腾 讯 游 戏 [SEP] [PAD]
--------------------------------------------------
Attention mask:
tensor([1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 0])
--------------------------------------------------
Labels:
B-GPE O <PAD> 
B-ORG M-ORG E-GPE 
--------------------------------------------------
Word ids:
tensor([-1,  0,  0,  0,  0,  1, -1])
tensor([-1,  0,  1,  2,  2, -1, -1])


In [8]:
# Compute the loss of the wrapped model on the current contents of `input`,
# unpacking the batch mapping into keyword arguments.
# NOTE(review): relies on `input` still being the batch drawn from
# `train_loader`; a later cell rebinds `input` to a plain tensor.
model.model.calculate_loss(**input)

Embedding time: 0.03s
Convert time: 0.00s
CRF forward time: 0.00s


tensor(3.0255, grad_fn=<NegBackward0>)

In [9]:
# Run prediction on the same batch, passing only the input ids, attention mask
# and word ids (no labels). The recorded output is one list of label indices
# per example.
model.model.predict(input['input_ids'], input['attention_mask'], input['word_ids'])

Embedding time: 0.03s
Convert time: 0.00s
CRF decode time: 0.00s


[[15, 3], [11, 3, 3]]

In [None]:
# Confirm that every label sequence in `train_dataset` respects the BMES-O
# tagging constraints. Forbidden transitions:
#   S-xxx   -> M-* / E-*
#   B-xxx   -> B-* / M-yyy / E-yyy / O / <STOP>
#   M-xxx   -> B-* / M-yyy / E-yyy / O / <STOP>
#   E-*     -> M-* / E-*
#   O       -> M-* / E-*
#   <START> -> M-* / E-*
def find_invalid_transition(labels):
    """Return a description of the first forbidden transition, or None.

    `labels` is a list of string tags such as "B-ORG", "M-ORG", "E-ORG",
    "S-GPE" or "O". The sentence start and end are checked independently of
    the pairwise transitions, so single-label sequences are covered as well.
    An empty sequence is considered valid.
    """
    if not labels:
        return None

    first, last = labels[0], labels[-1]
    # A sentence cannot open in the middle or at the end of an entity.
    if first.startswith(('M-', 'E-')):
        return f"<START> -> {first}"

    for from_entity, to_entity in zip(labels, labels[1:]):
        to_is_continuation = to_entity.startswith(('M-', 'E-'))
        if from_entity.startswith(('B-', 'M-')):
            # An open entity may not be abandoned (O) or restarted (B-), and
            # its continuation must carry the same entity type.
            abandoned = to_entity == 'O' or to_entity.startswith('B-')
            mismatched = to_is_continuation and from_entity[2:] != to_entity[2:]
            if abandoned or mismatched:
                return f"{from_entity} -> {to_entity}"
        elif from_entity == 'O' or from_entity.startswith(('S-', 'E-')):
            # No entity is open here, so nothing can be continued.
            if to_is_continuation:
                return f"{from_entity} -> {to_entity}"

    # A sentence cannot end while an entity is still open (B-/M-); ending on
    # E- is valid.
    if last.startswith(('B-', 'M-')):
        return f"{last} -> <STOP>"
    return None


for encoding in train_dataset:
    violation = find_invalid_transition(encoding['labels'])
    # The decoded sentence is only computed when the assertion fails.
    assert violation is None, f"{violation}: {tokenizer.decode(encoding['input_ids'])}"


AssertionError: [CLS] 上 海 市 场 。 [SEP]

In [None]:
# lstm
from torch import nn
import torch

# Scratch LSTM for inspecting outputs: 2 input features, 10 hidden units per
# direction, 2 stacked layers, bidirectional, batch-first, no bias terms.
lstm = nn.LSTM(
    input_size=2,
    hidden_size=10,
    num_layers=2,
    bias=False,
    batch_first=True,
    bidirectional=True,
)

In [None]:
# Feed a tiny 2x2 float tensor through the scratch LSTM and display the result
# (output plus the final hidden and cell states).
# A dedicated name is used instead of `input`, so the sample batch that the
# loss/prediction cells read from `input` is not overwritten and the builtin
# `input` is not shadowed by a tensor.
lstm_input = torch.tensor([[1, 1],
                           [0, 0]], dtype=torch.float32)
lstm(lstm_input)

(tensor([[ 0.0090,  0.0097,  0.0124,  0.0273,  0.0010, -0.0052,  0.0005,  0.0159,
          -0.0103,  0.0107, -0.0036, -0.0376,  0.0099,  0.0062,  0.0067, -0.0009,
          -0.0074,  0.0110, -0.0308,  0.0083],
         [ 0.0037,  0.0124,  0.0101,  0.0176,  0.0008, -0.0032, -0.0060,  0.0124,
          -0.0057,  0.0100,  0.0050, -0.0124, -0.0004,  0.0034,  0.0033, -0.0007,
          -0.0051,  0.0049, -0.0048,  0.0021]], grad_fn=<SqueezeBackward1>),
 (tensor([[ 0.0752, -0.0223, -0.0282, -0.0005,  0.0200,  0.0265,  0.0407,  0.0272,
           -0.0415,  0.0149],
          [ 0.0646,  0.0357,  0.0567,  0.0488, -0.0922,  0.0379,  0.1003,  0.0322,
           -0.0094, -0.0836],
          [ 0.0037,  0.0124,  0.0101,  0.0176,  0.0008, -0.0032, -0.0060,  0.0124,
           -0.0057,  0.0100],
          [-0.0036, -0.0376,  0.0099,  0.0062,  0.0067, -0.0009, -0.0074,  0.0110,
           -0.0308,  0.0083]], grad_fn=<SqueezeBackward1>),
  tensor([[ 0.1501, -0.0443, -0.0553, -0.0010,  0.0393,  0.0527,  

In [None]:
import torch
# Integer-tensor indexing demo: every entry of `mask` picks one row of `a`,
# so the result has shape mask.shape + a.shape[1:], i.e. (4, 2, 2).
L, B = 4, 2
a = torch.arange(L * B).reshape(L, B)
mask = torch.tensor(
    [
        [2, 1],
        [1, 0],
        [1, 1],
        [0, 0],
    ]
)
a[mask]

tensor([[[4, 5],
         [4, 5]],

        [[2, 3],
         [0, 1]],

        [[2, 3],
         [2, 3]],

        [[0, 1],
         [0, 1]]])

In [3]:
from src.pl_module import LightningBiLSTMCRF
import torch
# Checkpoint file from an earlier training run (relative to the working directory).
ckpt = 'lightning_logs/version_38/checkpoints/epoch=28-step=11977.ckpt'
# NOTE(review): the recorded output of this cell is a TypeError: __init__() is
# missing ten required arguments (label_to_idx, lstm_layer_num, ...).
# Presumably the hyper-parameters were not stored in the checkpoint; they
# would have to be passed here as keyword arguments, or saved by the module
# via save_hyperparameters() — confirm against src/pl_module.py.
model = LightningBiLSTMCRF.load_from_checkpoint(ckpt)


  from .autonotebook import tqdm as notebook_tqdm


TypeError: __init__() missing 10 required positional arguments: 'label_to_idx', 'lstm_layer_num', 'lstm_state_dim', 'char_level', 'bert_lr', 'lstm_lr', 'crf_lr', 'optimizer', 'pretrained_model_name', and 'freeze_bert'

In [2]:
# Expose only GPU 5 to this kernel.
# NOTE(review): the execution count shows this cell ran before the checkpoint
# cell even though it appears after it; the variable presumably has to be set
# before CUDA is initialised, so consider moving this cell to the top.
%env CUDA_VISIBLE_DEVICES 5

env: CUDA_VISIBLE_DEVICES=5
