In [2]:
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890

env: http_proxy=127.0.0.1:7890
env: https_proxy=127.0.0.1:7890


In [3]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset, builder
from pytorch_lightning import Trainer
from argparse import ArgumentParser
from src.dataset import NERDataset
from src.collator import NERDataCollator
from src.pl_module import LightningBiLSTMCRF
from src.variable import LABEL_TO_IDX, PAD_LABEL

KeyboardInterrupt: 

In [None]:
# --- Tokenizer ---------------------------------------------------------------
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# --- Model -------------------------------------------------------------------
print("Initializing model...")

# Read by tokenize() further down: when False the per-word labels are kept
# unchanged, when True they are expanded along the tokenizer's word ids.
char_level = False

# NOTE(review): the positional 1 and 128 are forwarded as-is; their meaning is
# defined by LightningBiLSTMCRF and is not visible here — confirm.
# All three learning rates are set to zero in this notebook.
model = LightningBiLSTMCRF(
    LABEL_TO_IDX, 1, 128,
    bert_lr=0,
    lstm_lr=0,
    crf_lr=0,
    char_level=char_level,
)

# --- Raw datasets ------------------------------------------------------------
print("Initializing dataset...")
# Both splits point at the same "toy" data.
train_dataset_name = val_dataset_name = "toy"
train_dataset, val_dataset = (
    NERDataset(split_name, LABEL_TO_IDX)
    for split_name in (train_dataset_name, val_dataset_name)
)
def train_generator():
    """Yield one ``{"text", "labels"}`` record per example of the global ``train_dataset``."""
    yield from (
        {"text": train_dataset.text[idx], "labels": train_dataset.labels[idx]}
        for idx in range(len(train_dataset))
    )
def val_generator():
    """Yield one ``{"text", "labels"}`` record per example of the global ``val_dataset``."""
    yield from (
        {"text": val_dataset.text[idx], "labels": val_dataset.labels[idx]}
        for idx in range(len(val_dataset))
    )
# Materialise both generators into Hugging Face Datasets. This rebinds
# `train_dataset` / `val_dataset`, the same globals the generators read from.
train_dataset, val_dataset = (
    Dataset.from_generator(train_generator),
    Dataset.from_generator(val_generator),
)
print("Tokenizing dataset...")
def tokenize(example):
    """Tokenize a batch of pre-split sentences and attach word ids and labels.

    Args:
        example: batch mapping with a "text" entry (one list of words per
            sentence) and a "labels" entry (one list of labels per sentence).

    Returns:
        The tokenizer's encoding, extended with:
          * "word_ids": for each sentence, the word index of every token as
            reported by ``encoding.word_ids``.
          * "labels": the input labels unchanged when the global ``char_level``
            is falsy; otherwise one label per token, copied from the token's
            word, with 'O' for tokens that have no word id.
    """
    sentence_labels = example['labels']
    encoding = tokenizer(example["text"], is_split_into_words=True)
    encoding['word_ids'] = [
        encoding.word_ids(row) for row, _ in enumerate(sentence_labels)
    ]

    if not char_level:
        encoding['labels'] = sentence_labels
        return encoding

    # Expand the per-word labels to one label per token.
    encoding['labels'] = [
        ['O' if word_idx is None else word_labels[word_idx] for word_idx in token_word_ids]
        for word_labels, token_word_ids in zip(sentence_labels, encoding['word_ids'])
    ]
    return encoding
# Tokenize both splits in batches of 4, dropping the raw "text" column.
train_dataset, val_dataset = (
    split.map(tokenize, batched=True, batch_size=4, remove_columns=["text"])
    for split in (train_dataset, val_dataset)
)
print("Training model...")
# One collator instance is shared by both loaders.
collator = NERDataCollator(tokenizer=tokenizer)
train_loader, val_loader = (
    DataLoader(split, collate_fn=collator, batch_size=4)
    for split in (train_dataset, val_dataset)
)

In [None]:
from src.variable import IDX_TO_LABEL
# Pull one collated batch from the training loader and print each of its fields.
# NOTE: `input` shadows the builtin, but later cells read this exact name.
input = next(iter(train_loader))

print("Input ids(Decoded):")
for token_ids in input["input_ids"]:
    print(tokenizer.decode(token_ids))
print("-" * 50)

print("Attention mask:")
for mask_row in input["attention_mask"]:
    print(mask_row)
print("-" * 50)

print("Labels:")
for label_row in input["labels"]:
    # One line per sequence; every label is followed by a single space.
    print("".join(str(IDX_TO_LABEL[label_idx.item()]) + ' ' for label_idx in label_row))
print("-" * 50)

print("Word ids:")
for word_id_row in input["word_ids"]:
    print(word_id_row)

In [None]:
# Compute the loss on the sample batch: every entry of `input` is forwarded to
# calculate_loss as a keyword argument (presumably input_ids, attention_mask,
# labels and word_ids, the fields printed by the inspection cell — confirm
# against NERDataCollator).
model.model.calculate_loss(**input)

In [None]:
# Run prediction on the sample batch; the three fields are passed positionally
# in the order input_ids, attention_mask, word_ids.
model.model.predict(
    *(input[field] for field in ('input_ids', 'attention_mask', 'word_ids'))
)

In [None]:
# Validate every training label sequence against the BMES tagging constraints:
#   * <START>, O, S-xxx and E-xxx may not be followed by M-* or E-*
#     (an entity can only be continued after a B-/M- tag).
#   * B-xxx and M-xxx must be followed by M-xxx or E-xxx of the SAME entity
#     type, so they can never precede B-*, S-*, O, another type, or <STOP>.
def find_bmes_violation(labels):
    """Return a message describing the first illegal BMES transition, or None.

    Args:
        labels: sequence of string tags such as 'B-PER', 'M-PER', 'E-PER',
            'S-LOC' or 'O'.

    Returns:
        None when the sequence is well formed (an empty sequence is trivially
        valid); otherwise a short human-readable description of the problem.
    """
    if not labels:
        return None

    continues_entity = ('M-', 'E-')  # tags that need an entity to be open already
    leaves_entity_open = ('B-', 'M-')  # tags after which the entity must go on

    # Implicit <START> -> first label: nothing is open yet. Checked outside the
    # pairwise loop so single-label sequences are validated too.
    if labels[0].startswith(continues_entity):
        return f"sequence starts with {labels[0]!r}"

    for position, (current, following) in enumerate(zip(labels, labels[1:])):
        if current.startswith(leaves_entity_open):
            # Must continue with M-/E- of the same entity type.
            same_entity = (
                following.startswith(continues_entity)
                and current[2:] == following[2:]
            )
            if not same_entity:
                return f"illegal transition {current!r} -> {following!r} at position {position}"
        elif following.startswith(continues_entity):
            # O / S-* / E-* cannot be followed by a continuation tag.
            return f"illegal transition {current!r} -> {following!r} at position {position}"

    # Last label -> implicit <STOP>: an entity must not be left unfinished.
    # (Ending on E-* or S-* is perfectly valid.)
    if labels[-1].startswith(leaves_entity_open):
        return f"sequence ends inside an entity with {labels[-1]!r}"
    return None


for encoding in train_dataset:
    labels = encoding['labels']
    violation = find_bmes_violation(labels)
    # Include the decoded sentence so the offending example is easy to locate.
    assert violation is None, f"{violation}: {tokenizer.decode(encoding['input_ids'])}"


In [None]:
# lstm
from torch import nn
import torch

# Two-layer bidirectional LSTM without bias terms: 2 input features, hidden size 10.
lstm = nn.LSTM(
    input_size=2,
    hidden_size=10,
    num_layers=2,
    bias=False,
    batch_first=True,
    bidirectional=True,
)

In [None]:
# Feed a 2x2 float32 tensor through the LSTM defined in the previous cell.
# NOTE(review): the input is 2-D, so it is presumably treated as a single
# unbatched sequence — confirm against the installed torch version.
input = torch.tensor([[1.0, 1.0], [0.0, 0.0]], dtype=torch.float32)
lstm(input)

In [None]:
import torch
# Integer-tensor indexing demo: every entry of `mask` picks a row of `a`,
# so the result has shape (L, B, B) = (4, 2, 2).
L, B = 4, 2
a = torch.arange(L * B).view(L, B)
mask = torch.tensor([
    [2, 1],
    [1, 0],
    [1, 1],
    [0, 0],
])
a[mask]

In [None]:
%env http_proxy 127.0.0.1:7890
%env https_proxy 127.0.0.1:7890
import os
from src.dataset import NERDataset
from src.collator import NERDataCollator
from src.pl_module import LightningBiLSTMCRF
from src.variable import LABEL_TO_IDX
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import Dataset, builder
from pytorch_lightning import Trainer
def _always_enough_disk_space(needed_bytes, directory="."):
    """Replacement for ``datasets.builder.has_sufficient_disk_space`` that always returns True."""
    return True


# Monkeypatch the disk-space check of `datasets` so it never reports a shortage.
builder.has_sufficient_disk_space = _always_enough_disk_space

print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

print("Initializing dataset...")
test_dataset = NERDataset('test', LABEL_TO_IDX)
def test_generator():
    """Yield one ``{"text"}`` record per example of the global ``test_dataset``."""
    yield from (
        {"text": test_dataset.text[idx]}
        for idx in range(len(test_dataset))
    )
# Materialise the generator into a Hugging Face Dataset. Note that this rebinds
# `test_dataset`, the same global that test_generator reads from.
test_dataset = Dataset.from_generator(test_generator)
print("Tokenizing dataset...")
def tokenize(example):
    """Tokenize a batch of pre-split sentences and attach per-token word ids.

    Args:
        example: batch mapping with a "text" entry (one list of words per
            sentence).

    Returns:
        The tokenizer's encoding with an extra "word_ids" entry holding, for
        each sentence, the word index of every token as reported by
        ``encoding.word_ids``.
    """
    sentences = example["text"]
    encoding = tokenizer(sentences, is_split_into_words=True)
    encoding['word_ids'] = list(map(encoding.word_ids, range(len(sentences))))
    return encoding
# Tokenize the test split in batches of 32, dropping the raw "text" column.
test_dataset = test_dataset.map(
    tokenize,
    batched=True,
    batch_size=32,
    remove_columns=["text"],
)
print("Initializing dataloader...")
collator = NERDataCollator(tokenizer)
# batch_size=1: one example per batch.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    collate_fn=collator,
)

env: http_proxy=127.0.0.1:7890
env: https_proxy=127.0.0.1:7890


  from .autonotebook import tqdm as notebook_tqdm


Initializing tokenizer...
Initializing dataset...
> [0;32m/root/nlp_pj/ner/src/dataset.py[0m(77)[0;36mload_test_data[0;34m()[0m
[0;32m     75 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     76 [0;31m        [0;31m# split on '。' and '.' and '!' and '?' and '？' and '！'[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 77 [0;31m        [0msplitter[0m [0;34m=[0m [0;34m[[0m[0;34m'。'[0m[0;34m,[0m [0;34m'.'[0m[0;34m,[0m[0;34m'．'[0m[0;34m,[0m [0;34m'!'[0m[0;34m,[0m [0;34m'！'[0m[0;34m,[0m [0;34m'?'[0m[0;34m,[0m [0;34m'？'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     78 [0;31m        [0msentences[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     79 [0;31m        [0mstart[0m [0;34m=[0m [0;36m0[0m[0;34m[0m[0;34m[0m[0m
[0m
> [0;32m/root/nlp_pj/ner/src/dataset.py[0m(78)[0;36ml

In [None]:
# Size of the second dimension of `input_ids` in the first test batch
# (presumably the padded sequence length — confirm against NERDataCollator).
next(iter(test_dataloader))['input_ids'].shape[1]

In [1]:
from src.dataset import NERDataset
from src.variable import LABEL_TO_IDX
# Build the 'train' split with upsample=True.
# NOTE(review): the recorded output lists per-label counts with a multiplier,
# then a second set of counts — presumably before/after upsampling; confirm in
# src/dataset.py.
ds = NERDataset('train', LABEL_TO_IDX, upsample=True)

  from .autonotebook import tqdm as notebook_tqdm


S-GPE: 4755 (2x)
O: 297155 (0x)
S-PER: 4305 (2x)
B-PER: 87 (114x)
E-PER: 87 (114x)
S-LOC: 701 (14x)
S-ORG: 1665 (6x)
B-ORG: 1425 (7x)
M-ORG: 1014 (9x)
E-ORG: 1425 (7x)
B-GPE: 207 (48x)
E-GPE: 207 (48x)
B-LOC: 227 (44x)
E-LOC: 227 (44x)
M-PER: 39 (256x)
M-LOC: 86 (116x)
M-GPE: 36 (277x)


13214it [00:00, 112387.22it/s]

S-GPE: 52122
S-PER: 35862
B-PER: 10005
E-PER: 10005
S-LOC: 12711
S-ORG: 18618
B-ORG: 16079
M-ORG: 11868
E-ORG: 16079
B-GPE: 10143
E-GPE: 10143
B-LOC: 10227
E-LOC: 10227
M-PER: 4485
M-LOC: 3882
M-GPE: 1764
O: 297155





In [4]:
# Inspect the third example; per the recorded output it is a (words, labels) pair.
ds[2]

(['菲律宾', '埃斯特拉达', '马尼拉', '阿卜', '沙耶夫', '非国', '和落岛', '菲律宾'],
 ['S-GPE', 'S-PER', 'S-GPE', 'B-PER', 'E-PER', 'S-GPE', 'S-LOC', 'S-GPE'])