In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset, builder
from pytorch_lightning import Trainer
from argparse import ArgumentParser
from src.dataset import NERDataset
from src.collator import NERDataCollator
from src.pl_module import LightningBiLSTMCRF
from src.variable import LABEL_TO_IDX, PAD_LABEL
builder.has_sufficient_disk_space = lambda needed_bytes, directory=".": True

In [None]:
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('ckiplab/bert-base-chinese-ner')
print("Initializing dataset...")
train_dataset_name = "toy"
train_dataset = NERDataset(train_dataset_name, LABEL_TO_IDX, upsample=False)
val_dataset_name = "dev"
val_dataset = NERDataset(val_dataset_name, LABEL_TO_IDX, upsample=False)
def train_generator():
    for i in range(len(train_dataset)):
        yield {"text": train_dataset.text[i], "labels": train_dataset.labels[i]}
def val_generator():
    for i in range(len(val_dataset)):
        yield {"text": val_dataset.text[i], "labels": val_dataset.labels[i]}
train_dataset = Dataset.from_generator(train_generator)
val_dataset = Dataset.from_generator(val_generator)
print("Tokenizing dataset...")
def tokenize(example):
    encoding = tokenizer(example["text"], is_split_into_words=True)
    encoding['word_ids'] = [encoding.word_ids(b) for b in range(len(example['labels']))]
    encoding['word_ids'] = [list(map(lambda x: -1 if x is None else x, word_id)) for word_id in encoding['word_ids']]
    encoding['labels'] = [[LABEL_TO_IDX[y] for y in b] for b in example['labels']]
    return encoding
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=32, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=32, remove_columns=["text"])
collator = NERDataCollator(tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, collate_fn=collator, batch_size=32, num_workers=47, shuffle=True)
val_loader = DataLoader(val_dataset, collate_fn=collator, batch_size=32, num_workers=47)
print("Initializing model...")
model = LightningBiLSTMCRF(LABEL_TO_IDX, 1, 256, 
                        bert_lr=0.0, lr=3e-5,
                        optimizer='adamw', scheduler='onecycle',
                        pretrained_model_name='ckiplab/bert-base-chinese-ner', freeze_bert=True,
                        epochs=20, steps_per_epoch=len(train_loader))

In [8]:
print("Input ids(Decoded):")
for data in train_dataset:
    print(tokenizer.decode(data['input_ids']))
print("Attention mask:")
for data in train_dataset:
    print(data['attention_mask'])
print("Word ids:")
for data in train_dataset:
    print(data['word_ids'])
print("labels:")
for data in train_dataset:
    print(data['labels'])
    


Input ids(Decoded):
[CLS] 上 海 市 市 长 。 [SEP]
[CLS] 腾 讯 游 戏 [SEP]
Attention mask:
[1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1]
Word ids:
[-1, 0, 0, 0, 1, 1, 2, -1]
[-1, 0, 1, 2, 2, -1]
labels:
[1, 0, 0]
[3, 6, 4]


In [6]:
from src.variable import IDX_TO_LABEL
input = next(iter(train_loader))
print("Input ids(Decoded):")
for input_id in input["input_ids"]:
    print(tokenizer.decode(input_id))
print("-"*50)
print("Attention mask:")
for attention_mask in input["attention_mask"]:
    print(attention_mask)
print("-"*50)
print("Labels:")
for labels in input["labels"]:
    for label in labels:
        print(IDX_TO_LABEL[label.item()], end=' ')
    print()
print("-"*50)
print("Word ids:")
for word_ids in input["word_ids"]:
    print(word_ids)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Input ids(Decoded):
[CLS] 上 海 市 市 长 。 [SEP]
[CLS] 腾 讯 游 戏 [SEP] [PAD] [PAD]
--------------------------------------------------
Attention mask:
tensor([1, 1, 1, 1, 1, 1, 1, 1])
tensor([1, 1, 1, 1, 1, 1, 0, 0])
--------------------------------------------------
Labels:
S-GPE O O 
B-ORG M-ORG E-ORG 
--------------------------------------------------
Word ids:
tensor([-1,  0,  0,  0,  1,  1,  2, -1])
tensor([-1,  0,  1,  2,  2, -1, -1, -1])


In [7]:
model.model.calculate_loss(**input)

Embedding time: 0.04s
Convert time: 0.03s
CRF forward time: 0.03s


tensor(2.8476, grad_fn=<NegBackward0>)

In [None]:
model.model.predict(input['input_ids'], input['attention_mask'], input['word_ids'])

In [None]:
# confirm that no from S-xxx to M-*/E-*
# no from B-xxx to B-*/M-yyy/E-yyy/O/<STOP>
# no from M-xxx to B-*/M-yyy/E-yyy/O/<STOP>
# no from E-* to M-*/E-*/
# no from O to M-*/E-*
# no from START_LABEL to M-*/E-*
# no from B-*/M-* to STOP_LABEL
for encoding in train_dataset:
    labels = encoding['labels']
    for t in range(len(labels) - 1):
        from_entity = labels[t]
        to_entity = labels[t + 1]
        assert not (from_entity.startswith('S-') and (to_entity.startswith('M-') or to_entity.startswith('E-'))), tokenizer.decode(encoding['input_ids'])
        assert not (from_entity.startswith('B-') and to_entity.startswith('B-')), tokenizer.decode(encoding['input_ids'])
        if from_entity.startswith('B-') and (to_entity.startswith('M-') or to_entity.startswith('E-')):
            assert from_entity[2:] == to_entity[2:], tokenizer.decode(encoding['input_ids'])
        assert not (from_entity.startswith('M-') and to_entity.startswith('B-')), tokenizer.decode(encoding['input_ids'])
        if from_entity.startswith('M-') and (to_entity.startswith('M-') or to_entity.startswith('E-')):
            assert from_entity[2:] == to_entity[2:], tokenizer.decode(encoding['input_ids'])
        assert not (from_entity.startswith('E-') and (to_entity.startswith('M-') or to_entity.startswith('E-'))), tokenizer.decode(encoding['input_ids'])
        assert not (from_entity == 'O' and (to_entity.startswith('M-') or to_entity.startswith('E-'))), tokenizer.decode(encoding['input_ids'])
        if t == 0: 
            assert not (from_entity.startswith('M-') or from_entity.startswith('E-'))
        if t == len(labels) - 2:
            assert not (to_entity.startswith('E-') or to_entity.startswith('M-'))


In [None]:
# lstm
from torch import nn
import torch

lstm = nn.LSTM(2, 10, 2, bidirectional=True, batch_first=True, bias=False)

In [None]:
input = torch.tensor([[1, 1],
                    [0, 0]], dtype=torch.float32)
lstm(input)

In [None]:
import torch
L, B = 4, 2
a = torch.arange(8).view(4, 2)
mask = torch.tensor([[2, 1],
                    [1, 0],
                    [1, 1],
                    [0, 0]])
a[mask]

In [None]:
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890
import os
from src.dataset import NERDataset
from src.collator import NERDataCollator
from src.pl_module import LightningBiLSTMCRF
from src.variable import LABEL_TO_IDX
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset, builder
from pytorch_lightning import Trainer
builder.has_sufficient_disk_space = lambda needed_bytes, directory=".": True
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(f"bert-base-chinese")
print("Initializing dataset...")
test_dataset = NERDataset('test', LABEL_TO_IDX, upsample=False)
def test_generator():
    for i in range(len(test_dataset)):
        yield {"text": test_dataset.text[i]}
test_dataset = Dataset.from_generator(test_generator)
print("Tokenizing dataset...")
def tokenize(example):
    encoding = tokenizer(example["text"], is_split_into_words=True)
    encoding['word_ids'] = [encoding.word_ids(b) for b in range(len(example['labels']))]
    encoding['word_ids'] = [list(map(lambda x: -1 if x is None else x, word_id)) for word_id in encoding['word_ids']]
    return encoding
test_dataset = test_dataset.map(tokenize, batched=True, remove_columns=["text"], batch_size=32)
print("Initializing dataloader...")
collator = NERDataCollator(tokenizer)
test_dataloader = DataLoader(test_dataset, collate_fn=collator, batch_size=1)

In [None]:
next(iter(test_dataloader))['input_ids'].shape[1]

In [None]:
from src.dataset import NERDataset
from src.variable import LABEL_TO_IDX
ds = NERDataset('train', LABEL_TO_IDX, upsample=True)

In [None]:
ds[2]

In [None]:
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890
from transformers import (
  BertTokenizerFast,
  AutoModel,
)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
model = AutoModel.from_pretrained('ckiplab/bert-base-chinese-ner')


In [None]:
model(**tokenizer('我叫沃尔夫冈，我住在柏林。', return_tensors='pt')).last_hidden_state.shape

In [None]:
for input_id in tokenizer('shanghai').input_ids:
    print(tokenizer.decode(input_id))

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
input_ids = tokenizer(["Héllò hôw", "are", "ü?"], is_split_into_words=True).input_ids
# input_ids = tokenizer(["Héllò hôw are ü?"]).input_ids
for input_id in input_ids:
    print(tokenizer.decode(input_id))