In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset, builder
from pytorch_lightning import Trainer
from argparse import ArgumentParser
from src.dataset import NERDataset
from src.collator import NERDataCollator
from src.pl_module import LightningBiLSTMCRF
from src.variable import LABEL_TO_IDX, PAD_LABEL
def _always_sufficient_disk_space(needed_bytes, directory="."):
    """Stand-in for the `datasets.builder` disk-space check; always answers True."""
    return True


# Swap the disk-space check in `datasets.builder` for the stub above.
builder.has_sufficient_disk_space = _always_sufficient_disk_space

In [None]:
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('ckiplab/bert-base-chinese-ner')
print("Initializing dataset...")
train_dataset_name = "toy"
train_dataset = NERDataset(train_dataset_name, LABEL_TO_IDX, upsample=False)
val_dataset_name = "dev"
val_dataset = NERDataset(val_dataset_name, LABEL_TO_IDX, upsample=False)
def train_generator():
    """Yield one ``{"text": ..., "labels": ...}`` record per index of the global ``train_dataset``."""
    total = len(train_dataset)
    index = 0
    while index < total:
        yield dict(text=train_dataset.text[index], labels=train_dataset.labels[index])
        index += 1
def val_generator():
    """Yield one ``{"text": ..., "labels": ...}`` record per index of the global ``val_dataset``."""
    total = len(val_dataset)
    index = 0
    while index < total:
        yield dict(text=val_dataset.text[index], labels=val_dataset.labels[index])
        index += 1
# Rebind both names: from here on `train_dataset` / `val_dataset` refer to the
# `Dataset` objects built from the generators, no longer to the NERDataset
# instances the generators read from.
# NOTE(review): the generators resolve `train_dataset` / `val_dataset` by name
# when they run, so re-executing these lines without re-creating the NERDataset
# objects first would hand them the rebound objects instead.
train_dataset = Dataset.from_generator(train_generator)
val_dataset = Dataset.from_generator(val_generator)
print("Tokenizing dataset...")
def tokenize(example):
    """Tokenize a batch of pre-split sentences and attach word ids and label indices.

    `example` is a batch with a "text" column (passed to the tokenizer with
    `is_split_into_words=True`) and a "labels" column of label names.
    The returned encoding gains two entries:
      * 'word_ids': per sentence, the tokenizer's word ids with `None` replaced by -1
      * 'labels':   per sentence, each label name looked up in `LABEL_TO_IDX`
    """
    encoding = tokenizer(example["text"], is_split_into_words=True)
    sentence_count = len(example['labels'])
    encoding['word_ids'] = [
        [-1 if word_id is None else word_id for word_id in encoding.word_ids(row)]
        for row in range(sentence_count)
    ]
    encoding['labels'] = [
        [LABEL_TO_IDX[tag] for tag in sentence_tags]
        for sentence_tags in example['labels']
    ]
    return encoding
import os

# The worker count was hard-coded to 47; cap it at the CPUs this host actually
# has so the loaders do not oversubscribe smaller machines. On hosts with 47 or
# more CPUs the value is unchanged.
NUM_WORKERS = min(47, os.cpu_count() or 1)

# Tokenize both splits in batches of 32 and drop the raw "text" column.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=32, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=32, remove_columns=["text"])

# One collator is shared by both loaders; only the training loader shuffles.
collator = NERDataCollator(tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, collate_fn=collator, batch_size=32, num_workers=NUM_WORKERS, shuffle=True)
val_loader = DataLoader(val_dataset, collate_fn=collator, batch_size=32, num_workers=NUM_WORKERS)

print("Initializing model...")
# NOTE(review): the positional arguments 1 and 256 are passed through as-is;
# their meaning is defined by LightningBiLSTMCRF (not visible here) — confirm.
# `steps_per_epoch` is the number of batches in the training loader.
model = LightningBiLSTMCRF(LABEL_TO_IDX, 1, 256,
                        bert_lr=0.0, lr=3e-5,
                        optimizer='adamw', scheduler='onecycle',
                        pretrained_model_name='ckiplab/bert-base-chinese-ner', freeze_bert=True,
                        epochs=20, steps_per_epoch=len(train_loader))

In [8]:
# Dump the tokenized training set one column at a time: first the decoded
# input ids, then the raw attention masks, word ids and label indices.
print("Input ids(Decoded):")
for data in train_dataset:
    print(tokenizer.decode(data['input_ids']))

for heading, column in (
    ("Attention mask:", 'attention_mask'),
    ("Word ids:", 'word_ids'),
    ("labels:", 'labels'),
):
    print(heading)
    for data in train_dataset:
        print(data[column])
    


Input ids(Decoded):
[CLS] 上 海 市 市 长 。 [SEP]
[CLS] 腾 讯 游 戏 [SEP]
Attention mask:
[1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1]
Word ids:
[-1, 0, 0, 0, 1, 1, 2, -1]
[-1, 0, 1, 2, 2, -1]
labels:
[1, 0, 0]
[3, 6, 4]


In [6]:
from src.variable import IDX_TO_LABEL
# Pull one collated batch from the training loader and print each of its fields.
# NOTE(review): `input` shadows the built-in of the same name; the name is kept
# because later cells read the batch through it.
input = next(iter(train_loader))
separator = "-" * 50

print("Input ids(Decoded):")
for input_id in input["input_ids"]:
    print(tokenizer.decode(input_id))
print(separator)

print("Attention mask:")
for attention_mask in input["attention_mask"]:
    print(attention_mask)
print(separator)

print("Labels:")
for labels in input["labels"]:
    # Each label name is followed by a single space; one line per sequence.
    print("".join(f"{IDX_TO_LABEL[label.item()]} " for label in labels))
print(separator)

print("Word ids:")
for word_ids in input["word_ids"]:
    print(word_ids)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Input ids(Decoded):
[CLS] 上 海 市 市 长 。 [SEP]
[CLS] 腾 讯 游 戏 [SEP] [PAD] [PAD]
--------------------------------------------------
Attention mask:
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 0, 0])
--------------------------------------------------
Labels:
S-GPE O O 
B-ORG M-ORG E-ORG 
--------------------------------------------------
Word ids:
tensor([-1,  0,  0,  0,  1,  1,  2, -1])
tensor([-1,  0,  1,  2,  2, -1, -1, -1])


In [7]:
# Compute the loss on the batch held in `input`, passing every batch field as a keyword argument.
model.model.calculate_loss(**input)

Embedding time: 0.04s
Convert time: 0.03s
CRF forward time: 0.03s


tensor(2.8476, grad_fn=<NegBackward0>)

In [None]:
# Run prediction on the same batch from its input ids, attention mask and word ids.
model.model.predict(input['input_ids'], input['attention_mask'], input['word_ids'])

In [None]:
# Sanity check: every training label sequence must be a legal BMES tagging.
# Confirms that there is
#   - no transition from S-xxx to M-*/E-*
#   - no transition from B-xxx to B-*/M-yyy/E-yyy/O/<STOP>
#   - no transition from M-xxx to B-*/M-yyy/E-yyy/O/<STOP>
#   - no transition from E-* to M-*/E-*
#   - no transition from O to M-*/E-*
#   - no transition from <START> to M-*/E-*
from src.variable import IDX_TO_LABEL


def _first_illegal_transition(labels):
    """Return a description of the first rule broken by `labels`, or None if all hold.

    `labels` is a list of string tags ('O', 'B-xxx', 'M-xxx', 'E-xxx', 'S-xxx').
    The sequence boundaries are treated as implicit <START> / <STOP> tags, so the
    first and last tag are checked even when the sequence has a single element.
    An empty sequence has nothing to check.
    """
    if not labels:
        return None

    open_prefixes = ('B-', 'M-')   # an entity is still open after these tags
    continuation = ('M-', 'E-')    # these may only continue an open entity

    if labels[0].startswith(continuation):
        return f"<START> -> {labels[0]}"

    for from_entity, to_entity in zip(labels, labels[1:]):
        if from_entity.startswith(open_prefixes):
            # An open entity may not be abandoned (O), restarted (B-*),
            # or continued with a different entity type.
            illegal = (
                to_entity == 'O'
                or to_entity.startswith('B-')
                or (to_entity.startswith(continuation) and from_entity[2:] != to_entity[2:])
            )
        else:
            # After S-*, E-* or O no entity is open, so M-*/E-* cannot follow.
            illegal = to_entity.startswith(continuation)
        if illegal:
            return f"{from_entity} -> {to_entity}"

    # Ending on B-*/M-* would leave an entity open; E-* is a legal final tag.
    if labels[-1].startswith(open_prefixes):
        return f"{labels[-1]} -> <STOP>"
    return None


for encoding in train_dataset:
    # The tokenized dataset stores label indices; map them back to tag names
    # (string tags are accepted unchanged).
    labels = [tag if isinstance(tag, str) else IDX_TO_LABEL[tag] for tag in encoding['labels']]
    violation = _first_illegal_transition(labels)
    # The message is only built (and the sentence only decoded) on failure.
    assert violation is None, f"{violation}: {tokenizer.decode(encoding['input_ids'])}"


In [23]:
from torch import nn
import torch
# Experiment: feed two sequences that share their first step but differ in how
# many steps follow, and compare the BiLSTM output at that first step.
# NOTE(review): the name says "nobias" but the layer is built with bias=True —
# confirm which of the two was intended.
lstm_nobias = nn.LSTM(
    input_size=2,
    hidden_size=10,
    num_layers=2,
    bias=True,
    batch_first=True,
    bidirectional=True,
)
pad_1_step = torch.tensor([[1.0, 1.0], [0.0, 0.0]], dtype=torch.float32)
pad_2_steps = torch.tensor([[1.0, 1.0], [0.0, 0.0], [1.0, 1.0]], dtype=torch.float32)

# the bilstm output of the 1st time step, for each of the two sequences
for sequence in (pad_1_step, pad_2_steps):
    print(lstm_nobias(sequence)[0][0])


tensor([-0.1021, -0.0886, -0.0439,  0.0088,  0.0806,  0.0882, -0.0667,  0.0619,
        -0.0218, -0.0558, -0.0643,  0.0889, -0.0395, -0.0266,  0.1135, -0.0393,
        -0.0112, -0.0581,  0.0005, -0.0513], grad_fn=<SelectBackward0>)
tensor([-0.1033, -0.0897, -0.0431,  0.0091,  0.0799,  0.0862, -0.0673,  0.0612,
        -0.0228, -0.0544, -0.0627,  0.1047, -0.0451, -0.0353,  0.1374, -0.0383,
        -0.0083, -0.0743,  0.0036, -0.0573], grad_fn=<SelectBackward0>)


In [None]:
import torch
# Toy check of indexing with an integer tensor: every entry of `mask` picks a
# whole row of `a`, so the result has shape mask.shape + a.shape[1:].
L = 4
B = 2
a = torch.arange(L * B).view(L, B)
mask = torch.tensor([
    [2, 1],
    [1, 0],
    [1, 1],
    [0, 0],
])
a[mask]

In [None]:
# Set the http_proxy / https_proxy environment variables for this kernel.
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890
import os
from src.dataset import NERDataset
from src.collator import NERDataCollator
from src.pl_module import LightningBiLSTMCRF
from src.variable import LABEL_TO_IDX
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset, builder
from pytorch_lightning import Trainer
def _skip_disk_space_check(needed_bytes, directory="."):
    """Stand-in for the `datasets.builder` disk-space check; always answers True."""
    return True


# Swap the disk-space check in `datasets.builder` for the stub above.
builder.has_sufficient_disk_space = _skip_disk_space_check

print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

print("Initializing dataset...")
# Raw test split, built without upsampling.
test_dataset = NERDataset('test', LABEL_TO_IDX, upsample=False)
def test_generator():
    """Yield one ``{"text": ...}`` record per index of the global ``test_dataset``."""
    total = len(test_dataset)
    index = 0
    while index < total:
        yield dict(text=test_dataset.text[index])
        index += 1
# Rebind `test_dataset`: from here on it refers to the `Dataset` built from the
# generator, no longer to the NERDataset instance the generator reads from.
test_dataset = Dataset.from_generator(test_generator)
print("Tokenizing dataset...")
def tokenize(example):
    """Tokenize a batch of pre-split test sentences and attach their word ids.

    `example` is a batch with a "text" column, passed to the tokenizer with
    `is_split_into_words=True`. The returned encoding gains a 'word_ids' entry:
    per sentence, the tokenizer's word ids with `None` replaced by -1.
    """
    encoding = tokenizer(example["text"], is_split_into_words=True)
    # The test records carry only "text" (no "labels" column), so the number of
    # sentences in the batch is taken from "text".
    encoding['word_ids'] = [
        [-1 if word_id is None else word_id for word_id in encoding.word_ids(row)]
        for row in range(len(example["text"]))
    ]
    return encoding
# Tokenize the test split in batches of 32 and drop the raw "text" column.
test_dataset = test_dataset.map(
    tokenize,
    batched=True,
    batch_size=32,
    remove_columns=["text"],
)

print("Initializing dataloader...")
collator = NERDataCollator(tokenizer)
# One example per batch, in dataset order.
test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=collator)

In [None]:
# Size of dimension 1 of the first test batch's `input_ids`
# (presumably the token length of that batch — confirm against the collator).
next(iter(test_dataloader))['input_ids'].shape[1]

In [None]:
from src.dataset import NERDataset
from src.variable import LABEL_TO_IDX
# Build the 'train' split with upsampling enabled (the other cells of this notebook pass upsample=False).
ds = NERDataset('train', LABEL_TO_IDX, upsample=True)

In [7]:
from src.pl_module import LightningBiLSTMCRF
from transformers import BertTokenizerFast
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890
ckpt = LightningBiLSTMCRF.load_from_checkpoint('best_models/order/epoch=2-val_loss=0.0340-val_f1=0.8879.ckpt').to('cpu')
tokenizer = BertTokenizerFast.from_pretrained('ckiplab/bert-base-chinese-ner')

env: http_proxy=127.0.0.1:7890
env: https_proxy=127.0.0.1:7890


Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese-ner and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import torch
from src.variable import IDX_TO_LABEL
# Tag a single raw sentence with the loaded checkpoint and print the label names.
text = "警方逮捕了这个'电车之狼'"
encoding = tokenizer([text], return_tensors='pt')

# Tokens without a word id (reported as None) are encoded as -1; the extra
# outer list gives the tensor a leading batch dimension of size 1.
word_ids = torch.tensor([[-1 if word_id is None else word_id for word_id in encoding.word_ids(0)]])

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    pred = ckpt.model.predict(encoding['input_ids'], encoding['attention_mask'], word_ids)[0]

# Map the predicted label indices of the (only) sentence back to label names.
pred = [IDX_TO_LABEL[label_idx] for label_idx in pred]
print(pred)

Embedding time: 0.03s
Convert time: 0.00s
CRF decode time: 0.00s
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [20]:
# Print every entry of the CRF transition matrix with readable label names.
# NOTE(review): this reads transitions[i, j] as "from i to j" — confirm against the CRF implementation.
label_count = len(IDX_TO_LABEL)
for start_label in range(label_count):
    source_name = IDX_TO_LABEL[start_label]
    for end_label in range(label_count):
        score = ckpt.model.crf.transitions[start_label, end_label].item()
        print(f"From {source_name} to {IDX_TO_LABEL[end_label]}: {score}")

From O to O: 0.1009884849190712
From O to S-GPE: 0.09808619320392609
From O to S-PER: 0.13781671226024628
From O to B-ORG: 0.37097394466400146
From O to E-ORG: -2.478883743286133
From O to S-ORG: 0.23651066422462463
From O to M-ORG: -2.2648699283599854
From O to S-LOC: 0.2411877065896988
From O to E-GPE: -1.6969536542892456
From O to B-GPE: 0.08536899089813232
From O to B-LOC: 0.1812945008277893
From O to E-LOC: -1.3395737409591675
From O to M-LOC: -1.0918552875518799
From O to M-GPE: -1.4719953536987305
From O to B-PER: 0.16019275784492493
From O to E-PER: -1.585919976234436
From O to M-PER: -0.7482620477676392
From S-GPE to O: 0.12119190394878387
From S-GPE to S-GPE: 0.28014716506004333
From S-GPE to S-PER: -0.20965144038200378
From S-GPE to B-ORG: 0.6756178140640259
From S-GPE to E-ORG: -0.6303654313087463
From S-GPE to S-ORG: 0.26704293489456177
From S-GPE to M-ORG: -0.5888674855232239
From S-GPE to S-LOC: 0.3229258954524994
From S-GPE to E-GPE: -1.3272525072097778
From S-GPE to B-