In [8]:
import torch
from torch.utils.data import DataLoader
from dataset import PinyinMaskedLMDataset
from transformers import BertTokenizer
from model import PinyinBertForMaskedLM

device = torch.device("cuda:7" if torch.cuda.is_available else "cpu")

In [9]:
tokenizer = BertTokenizer.from_pretrained("../chinese-bert-wwm/")
# model = BertForMaskedLM.from_pretrained("../chinese-bert-wwm/")
model = PinyinBertForMaskedLM.from_pretrained("../chinese-bert-wwm/").to(device)
train_dataset = PinyinMaskedLMDataset("../SIGHAN/train.txt")
train_dataloader = DataLoader(train_dataset, batch_size=32, num_workers=4)

Some weights of the model checkpoint at ../chinese-bert-wwm/ were not used when initializing PinyinBertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing PinyinBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PinyinBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of PinyinBertForMaskedLM were not initialized from the model checkpoint at ../chinese-bert-wwm/ and are newly initialized: ['bert.embeddings.pinyin_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 804: forward compatibility was attempted on non supported HW

In [3]:
model.train()
optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()))

In [7]:
import torch
import json
from tqdm import tqdm
from torch import nn
loss_fn = nn.CrossEntropyLoss(ignore_index=0).to(device)

with open("../chinese-bert-wwm/pinyin_map.json") as f:
    pinyin_map = json.load(f)

total = len(train_dataloader)
with tqdm(total=total) as progress:
    for i, (inputs, labels) in enumerate(train_dataloader):
        inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, max_length=64).to(device)
        labels = tokenizer(labels, return_tensors="pt", padding=True, truncation=True, max_length=64)["input_ids"].to(device)
        pinyin_ids = labels.clone()
        for i in range(pinyin_ids.size(0)):
            for j in range(pinyin_ids.size(1)):
                pinyin_ids[i][j] = pinyin_map[str(pinyin_ids[i][j].item())]
        output = model(**inputs, pinyin_ids=pinyin_ids)
        loss = loss_fn(output[0].permute(0, 2, 1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        progress.set_postfix(loss=loss.item())
        progress.update(1)

100%|██████████| 7870/7870 [23:32<00:00,  5.57it/s, loss=6.38]


In [20]:
text = "明天是周[MASK]"
label = ""
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64).to(device)
    pinyin_ids = inputs["input_ids"].clone()
    for i in range(pinyin_ids.size(0)):
        for j in range(pinyin_ids.size(1)):
            pinyin_ids[i][j] = pinyin_map[str(pinyin_ids[i][j].item())]

tensor([[94201382860928,           0,           0,  ...,           0,
                  45, 193273528320],
        [          0, 140122620008784, 94201295023200,  ..., 94201265439360,
         94201265316416, 94201363046208],
        [          0,          64,           0,  ..., 140119013064704,
         7365418617031190068, 3617566086534488877],
        ...,
        [7002937342835032064, 7307199746910528123, 2318330554153050739,  ...,          64,
                 113, 94201383018048],
        [140122620447344, 140119817332976, 140122620447344,  ..., 3616445622929465956,
         6067528668143496499, 3976742471101722929],
        [          0,         145, 140122627488736,  ..., 94201382936960,
         94201356442512, 140116257499856]], device='cuda:3')

In [49]:
# print(inputs['input_ids'].size(), pinyin_ids.size(), labels.size(), i)
for s, l ,p in zip(*list(train_dataloader)[i]):
    print(len(tokenizer(s)["input_ids"]), len(pinyin_tokenizer(p)["input_ids"]), s)

56 56 白宫发[MASK]人麦克雷兰表示，南韩国会上周压倒性通过派遣三千多名军队赴伊拉克后，[MASK]希曾打电话给卢武铉表达感激之意。
47 47 利比亚扬[MASK]核生化[MASK]器发展计画后，获得美国解除对的黎波里当[MASK]实施的大部分制裁措施，作为回报。
31 31 自从一九九九年十二月以来，公司在海外已投资六千五百亿日圆。
60 60 贝兹在陈述时说：[MASK]可以清楚的看见一个年轻中国男子的尸体，可能是个[MASK]年。我记得里面非常热，[MASK]运水果的冷冻货柜通常很[MASK]。
53 53 柯利巴里与其他七名嫌犯今天将出庭，接受法国最高反恐怖活动法官[MASK]审讯，届时他们料将被交付正式的司法调查。
40 40 另一方面，[MASK]赛[MASK]天抵达德黑兰，这是他就任阿富汗临时政府总统之后首次[MASK]问伊朗。
32 32 来自世界各地的数千通电话，纷纷对罹[MASK][MASK]的家属表示慰问[MASK]支持。
17 17 市民提供的相关合同记者兰洋摄。
14 14 [MASK]地金服总[MASK]杨晓冬透露。
48 48 国务院发言人埃雷利在形容两人[MASK]电话交谈时说：他们[MASK][MASK]了海地的[MASK]展，以及上周末事[MASK]发[MASK]的经过。
13 13 以便政府解锁加密通讯。
36 36 美国官员说，中国政府今天驱逐在本月十四日被判间谍罪的美国学者李少民。
55 55 国防部说，根据代号-3[MASK]海军建军计划，第一艘神盾级驱逐舰将在二八年底前完成，[MASK]二艘二一年，第三[MASK]二一二年。
26 26 同时华盛[MASK]呼吁[MASK]西[MASK]二五年以前缔结泛美贸易条约。
15 15 无人机的关注度便迅速蹿升。
23 23 也有不少人对为[MASK]女读书购房后的生活很满意。
33 33 李登辉在一九九四年访问东南亚时，中国曾向相关国家提出强烈抗议。
33 33 在政治压力下皮萨无法再从事广播工作，不过他以为纪录片配音为生。
30 30 [MASK]十年代对学术自由[MASK]重新强调并提供了各抒己见的文化环境。
46 46 阿丹表示，[MASK]批顾问可望于一月十五日抵达，为一九五一年两国共同防御条约中综合条款的一部份。
47 47 工人将不分昼夜的先清理十天来积压

In [4]:
masked_lm_logits = output[0]
predicted_token_ids = torch.argmax(masked_lm_logits, dim=-1)

# 将预测的标记转换回文本
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids[0])

# 输出最佳预测
print("Masked tokens:", "".join(input_tokens[1:-1]))
print("Predicted tokens:", "".join(predicted_tokens[1:-1]))

Masked tokens: 星期[MASK]我要上课
Predicted tokens: 星期六我要上课


In [1]:
from transformers.models.bert.modeling_bert import BertConfig, BertEmbeddings
from model import PinyinBertEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
with open("../chinese-bert-wwm/config.json") as f:
    config = json.load(f)
config = BertConfig(**config)
embedding = PinyinBertEmbeddings(config)

In [11]:
loss

tensor(0.7991, grad_fn=<NllLoss2DBackward0>)

In [5]:
output

(tensor(7.8109, grad_fn=<NllLossBackward0>),
 tensor([[[ -9.6381,  -9.3832,  -9.6146,  ...,  -8.2691,  -7.9518,  -8.2145],
          [-15.1690, -14.6613, -15.3457,  ..., -13.3527, -16.9765,  -9.8997],
          [-17.4650, -18.3399, -17.1545,  ..., -11.7036, -13.2146,  -7.1388],
          ...,
          [ -8.7113,  -8.6327,  -8.7318,  ...,  -6.4853,  -5.6840,  -3.5734],
          [ -8.8180,  -8.9244,  -8.9428,  ...,  -7.5209,  -8.9219,  -4.4119],
          [ -8.5750,  -8.7258,  -8.6462,  ...,  -6.6508,  -7.3145,  -3.6594]],
 
         [[-10.0095,  -9.6976, -10.0291,  ...,  -8.7146,  -8.2553,  -8.9444],
          [-14.4107, -13.9038, -13.8943,  ..., -13.5255, -13.0657, -10.8762],
          [-14.1578, -13.4532, -13.3644,  ..., -13.8097,  -9.4016, -11.7767],
          ...,
          [ -9.8684, -10.0710,  -9.4653,  ...,  -7.2605,  -5.4559,  -3.9632],
          [ -9.6306,  -9.8551,  -9.0933,  ...,  -7.1194,  -5.7564,  -4.2419],
          [ -8.8739,  -8.7286,  -8.4401,  ...,  -7.0411,  -7.832