In [4]:
import pandas as pd
import numpy as np

In [5]:
import os
print(os.path.dirname(os.path.abspath('main.py')))

D:\python_learning\CS224ND\NER_bert


In [6]:
file_dir = 'D:\python_learning\CS224ND\\NER_bert'
for dirname, _, files in os.walk(file_dir):
    print(files)

['ner.csv', 'NER.ipynb', 'ner_dataset.csv']
['NER-checkpoint.ipynb']


In [7]:
"""读取数据"""
data = pd.read_csv("ner_dataset.csv", encoding='latin1').fillna(method="ffill")

In [8]:
"""展示最后十笔data"""
print(data.tail(10))

              Sentence #       Word  POS    Tag
1048565  Sentence: 47958     impact   NN      O
1048566  Sentence: 47958          .    .      O
1048567  Sentence: 47959     Indian   JJ  B-gpe
1048568  Sentence: 47959     forces  NNS      O
1048569  Sentence: 47959       said  VBD      O
1048570  Sentence: 47959       they  PRP      O
1048571  Sentence: 47959  responded  VBD      O
1048572  Sentence: 47959         to   TO      O
1048573  Sentence: 47959        the   DT      O
1048574  Sentence: 47959     attack   NN      O


In [9]:
"""构建SetenceGetter"""
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),s["POS"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in  self.grouped]
        
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            self.empty = True
            return None

getter = SentenceGetter(data)

In [10]:
sentences = [" ".join(word[0] for word in sentence) for sentence in getter.sentences]
print(sentences[0])

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .


In [11]:
labels = [[s[2] for s in sentence] for sentence in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [12]:
"""构建tag词典"""
tags_vals = list(set(data['Tag'].values))
print(tags_vals)

['B-org', 'B-nat', 'I-per', 'I-tim', 'O', 'I-eve', 'B-art', 'I-art', 'I-nat', 'B-tim', 'I-org', 'B-gpe', 'I-gpe', 'B-eve', 'B-per', 'I-geo', 'B-geo']


In [13]:
tag2idx = {t: i for i, t in enumerate(tags_vals)}
print(tag2idx)

{'B-org': 0, 'B-nat': 1, 'I-per': 2, 'I-tim': 3, 'O': 4, 'I-eve': 5, 'B-art': 6, 'I-art': 7, 'I-nat': 8, 'B-tim': 9, 'I-org': 10, 'B-gpe': 11, 'I-gpe': 12, 'B-eve': 13, 'B-per': 14, 'I-geo': 15, 'B-geo': 16}


In [14]:
"""导入相关库"""
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

In [15]:
"""设置基本参数"""

max_len = 60
batch_size = 32

In [16]:
"""设置device"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce GTX 1650'

In [18]:
"""tokenize处理"""

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[1])

['iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'wednesday', ',', 'after', 'an', 'ia', '##ea', 'surveillance', 'system', 'begins', 'functioning', '.']


In [19]:
"""将输入转化为id 并且 截长补短"""

input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=max_len, value=tag2idx['O'], padding="post",
                     dtype="long", truncating="post")
print(tags[0])

[ 5190  1997 28337  2031  9847  2083  2414  2000  6186  1996  2162  1999
  5712  1998  5157  1996 10534  1997  2329  3629  2013  2008  2406  1012
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0]
[ 4  4  4  4  4  4 16  4  4  4  4  4 16  4  4  4  4  4 11  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4]


In [20]:
"""准备mask_attention"""

attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [21]:
"""将数据进行划分"""

tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, random_state=2019, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2019, test_size=0.1)

In [22]:
"""将数据转化为tensor的形式"""

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [23]:
"""定义dataloader,在训练阶段shuffle数据，预测阶段不需要shuffle"""

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data) #预测阶段需要shuffle
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)  #测试阶段不需要shuffle
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [24]:
"""**开始训练过程**"""

model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
      

In [25]:
"""定义optimizer(分为是否调整全部参数两种情况)"""

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta'] # 不需要正则化的参数
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = BertAdam(optimizer_grouped_parameters, lr=3e-5)

t_total value of -1 results in schedule not being applied


In [26]:
"""定义评估accuracy的函数"""
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [27]:
epochs = 5
max_grad_norm = 1.0

In [28]:
for _ in range(epochs): # trange有可视化功能
    # 训练过程
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for step, batch in enumerate(train_dataloader):
        # 将batch设置为gpu模式
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # 前向过程
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # 后向过程
        loss.backward()
        # 损失
        tr_loss += loss.item()
        nb_tr_steps += 1
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # 更新参数
        optimizer.step()
        model.zero_grad()
    #打印每个epoch的损失
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # 验证过程
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()#detach的方法，将variable参数从网络中隔离开，不参与参数更新
        label_ids = b_labels.to('cpu').numpy()

        # print("label_ids", label_ids)
        # print("np.argmax(logits, axis=2)", np.argmax(logits, axis=2))

        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        # 计算accuracy 和 loss
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    # 打印信息
    print("Validation loss: {}".format(eval_loss/nb_eval_steps))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))#传入的是具体的tag

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 2.37 GiB already allocated; 18.20 MiB free; 2.57 GiB reserved in total by PyTorch)