In [2]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
import os
import json

import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams

import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig
from examples.extract_features import *

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
class Args:
    def __init__(self):
        pass
    
args = Args()
args.no_cuda = True

CONFIG_NAME = 'bert_config.json'
BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'
config_file = os.path.join(BERT_DIR, CONFIG_NAME)
config = BertConfig.from_json_file(config_file)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')#do_lower_case：在标记化时将文本转换为小写。默认= True
model = BertForPreTraining.from_pretrained(BERT_DIR)
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
_ = model.to(device)
_ = model.eval()

04/16/2019 09:11:27 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xd/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
04/16/2019 09:11:27 - INFO - pytorch_pretrained_bert.modeling -   loading archive file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/
04/16/2019 09:11:27 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [29]:
print(tokenizer.vocab['doubts'])
print(tokenizer.tokenize("I arive home."))

13579
['i', 'ari', '##ve', 'home', '.']


BertForPreTraining：
Outputs:
        if `masked_lm_labels` and `next_sentence_label` are not `None`:
            Outputs the total_loss which is the sum of the masked language modeling loss and the next
            sentence classification loss.
        if `masked_lm_labels` or `next_sentence_label` is `None`:
            Outputs a tuple comprising
            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
            - the next sentence classification logits of shape [batch_size, 2].

from_pretrained：
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.

In [15]:
import re
def convert_text_to_examples(text): #把每一行的句子变成一个实例，一个实例中包含text_a,text_b(text_b目前是没用的)
    examples = []
    unique_id = 0
    if True:
        for line in text:
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line) #想要匹配这样的字符串'You are my sunshine. ||| I love you.'
            
            if m is None:
                text_a = line
            else:
                text_a = m.group(1) #匹配的第一句,比如You are my sunshine,my only sunshine.
                text_b = m.group(2) #匹配的第二句，比如I love you.
            
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
            unique_id += 1
    return examples
#疑问，当text是一行的时候，line是一个个字母 -> text是["***"]的形式
#print(convert_text_to_examples({"I love you.","hello everybody."})[0].text_a)

def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):
    #把实例变成一个特征
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a) #tokenizer的作用是
        #print(example.unique_id) #*****************************
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        tokens = []
        input_type_ids = [] #segment embedding
        if append_special_tokens: #输入参数中默认为true
            tokens.append("[CLS]")
            input_type_ids.append(0)
        for token in tokens_a:
            if replace_mask and token == '_':  # XD
                token = "[MASK]"
            tokens.append(token)
            input_type_ids.append(0)
        if append_special_tokens:
            tokens.append("[SEP]")
            input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                if replace_mask and token == '_':  # XD
                    token = "[MASK]"
                tokens.append(token)
                input_type_ids.append(1)
            if append_special_tokens:
                tokens.append("[SEP]")
                input_type_ids.append(1)
        #print(tokens) #*******************************
        input_ids = tokenizer.convert_tokens_to_ids(tokens) #把原来句子中的词语编成在字典中的编号
        input_mask = [1] * len(input_ids) 
        #print(input_ids)#***********************************
        if ex_index < 5:
#             logger.info("*** Example ***")
#             logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
#             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
#             logger.info(
#                 "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
            
        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,#字符串中的每个单词在词典中的index序列
                input_mask=input_mask, #一堆1
                input_type_ids=input_type_ids)) #第0类和第1类，对text_a,text_b的区分，本代码中全都是零
    return features
                


def copy_and_mask_feature(feature, step, masked_tokens=None): #step参数用来表示每多少个单词mask一次
    import copy
    tokens = feature.tokens
    len_token = len(tokens)
    if len_token<step:
        batches = range(0,len(tokens))
    else:
        batches = range(0,step)
    
    assert len_token > 0
    masked_feature_copies = []
    for i in batches: #用[mask]依次掩盖每一个位置
        feature_copy = copy.deepcopy(feature)
        masked_pos = i
        while masked_pos < len_token:
            feature_copy.input_ids[masked_pos] = tokenizer.vocab["[MASK]"]
            masked_pos = masked_pos + step
        masked_feature_copies.append(feature_copy)
    return masked_feature_copies, batches

#examples = convert_text_to_examples({"I love you.Hello everybody."})
#features = convert_examples_to_features(examples, tokenizer, print_info=False)
#masked_feature_copies, batches = copy_and_mask_feature(features[0],3)
#for i in range(0,5):
#    print(masked_feature_copies[i].input_ids) #结果[101, 1045, 2293, 103, 102]


04/16/2019 09:34:51 - INFO - examples.extract_features -   tokens: [CLS] i love you . hello everybody . [SEP]


[103, 1045, 2293, 103, 1012, 7592, 103, 1012, 102]
[101, 103, 2293, 2017, 103, 7592, 7955, 103, 102]
[101, 1045, 103, 2017, 1012, 103, 7955, 1012, 103]


IndexError: list index out of range

In [5]:
import nltk
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG

def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20): #输出结果的函数，要最高概率topk个输出
    def print_pair(token, prob, end_str='', hit_mark=' '):
        if i < firstk:
            # token = token.replace('</w>', '').replace('\n', '/n')
            print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)
    
    ret = None
    for i in range(len(tokens)):
        ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]
        prob_ = probs[i][ind_].item() #这个probs是该字符串第i个位置上填上词典上各个词的概率，prob_是词典上原来天的这个词的概率
        print_pair(tokens[i], prob_, end_str='\t')
        values, indices = probs[i].topk(topk)
        #print(values, indices)
        #print("****************************************************************************************************")
        top_pairs = []
        for j in range(topk):
            ind, prob = indices[j].item(), values[j].item()
            hit_mark = '*' if ind == ind_ else ' '
            token = tokenizer.ids_to_tokens[ind]
            print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\n')
            top_pairs.append((token, prob))
        if tokens[i] == "[MASK]":
            ret = top_pairs
    return ret #返回的这是个啥

In [6]:
import colored
from colored import stylize

def show_abnormals(tokens, probs, show_suggestions=False):
    def gap2color(gap):
        if gap <= 5:
            return 'yellow_1'
        elif gap <= 10:
            return 'orange_1'
        else:
            return 'red_1'
        
    def print_token(token, suggestion, gap):
        if gap == 0:
            print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')
        else:
            print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')
            if show_suggestions and gap > 5:
                print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')
            else:
                print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='')
                # print('/' + suggestion, end=' ')
            # print('%.2f' % gap, end=' ')
        
    avg_gap = 0.
    for i in range(1, len(tokens) - 1):  # skip first [CLS] and last [SEP]
        ind_ = tokenizer.vocab[tokens[i]]
        prob_ = probs[i][ind_].item()
        top_prob = probs[i].max().item()
        top_ind = probs[i].argmax().item()
        gap = math.log(top_prob) - math.log(prob_) #计算两个词之间的差距
        suggestion = tokenizer.ids_to_tokens[top_ind]
        print_token(tokens[i], suggestion, gap)
        avg_gap += gap
    avg_gap /= (len(tokens) - 2)
    print()
    print('平均gap:'+ str(avg_gap))

In [7]:
analyzed_cache = {}
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG
#print (lemma('gave'))
#print (lexeme('production'))
#print (conjugate(verb='give',tense=PRESENT,number=SG))
def process_text(text): 
#处理输入文本，包括将文本按句子分成若干token，得出原来text中index位置的单词在x句子的y位置，还得出各个句子类别码
    token =[]
    token0 = tokenizer.tokenize(text)
    token.append('[CLS]')
    for i in token0:
        token.append(i)
    token.append('[SEP]')
    print(token)
    in_sentence = [[0,0]] 
    sentence_n = 0
    index = 1
    for i in range(1,len(token)-1):
        in_sentence.append([sentence_n,index])  #每个token中的词在所在句中的位置表示出来，以及该位置在哪一句中
        index = index + 1                           #比如，位置i这个词在第sentence句的index位置上
        if token[i] == '.':
            sentence_n = sentence_n + 1
            index = 1
    sentences = text.split(".")
    sentences.remove('')

    sen_token = []
    input_ids_sen = []
    input_type_ids_sen = []
    for i,sentence in enumerate(sentences):
        sentence = sentence + '.'
        sentences[i] = sentences[i] + '.'
        token = []
        input_type_ids = []
        tokens = tokenizer.tokenize(sentence)
        token.append('[CLS]')
        input_type_ids.append(0) 
        for i in tokens:
            token.append(i)
            input_type_ids.append(0)        
        token.append('[SEP]')        
        input_type_ids.append(0)
        input_ids_sen.append(tokenizer.convert_tokens_to_ids(token))
        input_type_ids_sen.append(input_type_ids)
    #input_ids_sen = torch.tensor(input_ids_sen)
    #input_type_ids_sen = torch.tensor(input_type_ids_sen)
    return input_ids_sen,input_type_ids_sen,in_sentence,sentences
text = "Last week I went to the theatre. I had a very good seat. The play was very interesting."
input_ids_sen,input_type_ids_sen,in_sentence,sentences = process_text(text)
print(input_ids_sen)
print(in_sentence)
print(input_type_ids_sen)
print(sentences)

['[CLS]', 'last', 'week', 'i', 'went', 'to', 'the', 'theatre', '.', 'i', 'had', 'a', 'very', 'good', 'seat', '.', 'the', 'play', 'was', 'very', 'interesting', '.', '[SEP]']
[[101, 2197, 2733, 1045, 2253, 2000, 1996, 3004, 1012, 102], [101, 1045, 2018, 1037, 2200, 2204, 2835, 1012, 102], [101, 1996, 2377, 2001, 2200, 5875, 1012, 102]]
[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7], [0, 8], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]]
['Last week I went to the theatre.', ' I had a very good seat.', ' The play was very interesting.']


这个函数是在该位置上的单词可能性很低时才使用，不会把原来就较为合理的面目全非

In [8]:
import copy
import nltk
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE

def analyse_V(index):
#这是一个处理动词语法问题的函数，输入为问题词在text的token中的下标index

#******************************************初始数据处理**************************************************************************
    need_to = 0 #表示是否需要变为不定式形式，0表示不需要，1表示需要
    need_be = 0 #表示是否需要变为被动语态0表示不需要，1表示需要
    
    sentence_id = in_sentence[index][0]
    id_in_sen = in_sentence[index][1]
    wordV = input_ids_sen[sentence_id][id_in_sen]
    wordV = tokenizer.ids_to_tokens[wordV]
    
    input_ids = copy.deepcopy(input_ids_sen[sentence_id])
    input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])
#*****************************************判断语法应不应该是不定式抑或是被动语态**************************************************************
    '''
    input_ids1 = copy.deepcopy(input_ids)
    input_ids1.insert(id_in_sen,tokenizer.vocab["[MASK]"])
    input_type_ids1 = copy.deepcopy(input_type_ids)
    input_type_ids1.append(0)
    
    T_input_ids1 = torch.tensor([input_ids1], dtype=torch.long) #把input_ids增加了一个维度
    T_input_ids1 = T_input_ids1.to(device) #拿去GPU

    T_input_type_ids1 = torch.tensor([input_type_ids1], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
    T_input_type_ids1 = T_input_type_ids1.to(device)    
    
    mlm_logits1, _ = model(T_input_ids1, T_input_type_ids1)
    mlm_probs1 = F.softmax(mlm_logits1, dim=-1)
    reduced_mlm_probs1 = mlm_probs1[0][id_in_sen]
    '''
#**************************************判断是不是不定式*********************  
    input_ids1 = copy.deepcopy(input_ids)
    input_ids1.insert(id_in_sen,tokenizer.vocab["[MASK]"])
    input_ids1[id_in_sen + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,person = 1)]
    input_type_ids1 = copy.deepcopy(input_type_ids)
    input_type_ids1.append(0)
    
    T_input_ids1 = torch.tensor([input_ids1], dtype=torch.long) #把input_ids增加了一个维度
    T_input_ids1 = T_input_ids1.to(device) #拿去GPU

    T_input_type_ids1 = torch.tensor([input_type_ids1], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
    T_input_type_ids1 = T_input_type_ids1.to(device)    
    
    mlm_logits1, _ = model(T_input_ids1, T_input_type_ids1)
    mlm_probs1 = F.softmax(mlm_logits1, dim=-1)
    reduced_mlm_probs1 = mlm_probs1[0][id_in_sen]
    
    prob_to = float(reduced_mlm_probs1[tokenizer.vocab["to"]])
    top_prob1 = reduced_mlm_probs1.max().item()
    print("是否用不定式：")
    print("用to的可能性"+str(prob_to))
    print("可能性最大的词概率"+str(top_prob1))
    gap1 = math.log(top_prob1) - math.log(prob_to)
    if gap1 < 1:
        need_to = 1 
#**************************************判断是不是被动语态或者进行时*******************   
    print("是否用被动或进行时：")
    input_ids3 = copy.deepcopy(input_ids)
    input_ids3.insert(id_in_sen,tokenizer.vocab["[MASK]"])
    input_ids3_ = copy.deepcopy(input_ids3)
    input_ids3[id_in_sen + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PAST,aspect=PROGRESSIVE)]
    input_ids3_[id_in_sen + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]
    input_type_ids3 = copy.deepcopy(input_type_ids)
    input_type_ids3.append(0)
    
    T_input_ids3 = torch.tensor([input_ids3], dtype=torch.long) #把input_ids增加了一个维度
    T_input_ids3 = T_input_ids3.to(device) #拿去GPU
    T_input_ids3_ = torch.tensor([input_ids3_], dtype=torch.long)
    T_input_ids3_ = T_input_ids3_.to(device)

    T_input_type_ids3 = torch.tensor([input_type_ids3], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
    T_input_type_ids3 = T_input_type_ids3.to(device)    
    
    mlm_logits3, _ = model(T_input_ids3, T_input_type_ids3)
    mlm_logits3_,_ = model(T_input_ids3_, T_input_type_ids3)
    mlm_probs3 = F.softmax(mlm_logits3, dim=-1)
    reduced_mlm_probs3 = mlm_probs3[0][id_in_sen]
    mlm_probs3_= F.softmax(mlm_logits3_, dim=-1)
    reduced_mlm_probs3_ = mlm_probs3_[0][id_in_sen]
    
    list_be = lexeme('be')
    list_be = lexeme('be')[:8]

    list_be_id = tokenizer.convert_tokens_to_ids(list_be)
    list_be_prob = {}
    for word,word_id in zip(list_be,list_be_id):
        list_be_prob.update({word:float(reduced_mlm_probs3[word_id].data)})
    prob_ord3 = sorted(list_be_prob.items(),key = lambda x:x[1],reverse = True)
    print(prob_ord3)
    top_ind3 = reduced_mlm_probs3.argmax().item()
    top_prob3 = reduced_mlm_probs3.max().item()
    print(tokenizer.ids_to_tokens[top_ind3],top_prob3)
    print(prob_ord3[0][0],prob_ord3[0][1])
    top_prob_be = prob_ord3[0][1]
    gap3 = math.log(top_prob3) - math.log(top_prob_be)
    if gap3 < 1:
        need_be = 1 
        be_ = prob_ord3[0][0]
    else:
        print('不是被动')
#*******************************************是不是现在分词********************************        
    list_be_prob = {}
    for word,word_id in zip(list_be,list_be_id):
        list_be_prob.update({word:float(reduced_mlm_probs3_[word_id].data)})
    prob_ord3 = sorted(list_be_prob.items(),key = lambda x:x[1],reverse = True)
    print(prob_ord3)
    top_ind3 = reduced_mlm_probs3_.argmax().item()
    top_prob3 = reduced_mlm_probs3_.max().item()
    print(tokenizer.ids_to_tokens[top_ind3],top_prob3)
    print(prob_ord3[0][0],prob_ord3[0][1])
    top_prob_be = prob_ord3[0][1]
    gap3 = math.log(top_prob3) - math.log(top_prob_be)
    if gap3 < 1:
        need_be = 1 
        be_ = prob_ord3[0][0]    
#*************************************************判断其他语法******************************************************************
    print("判断其他语法：")
    if need_to == 0 and need_be == 0:
        input_ids[id_in_sen] = tokenizer.vocab["[MASK]"]
        input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])

        T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度
        T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
        T_input_ids = T_input_ids.to(device) #拿去GPU
        T_input_type_ids = T_input_type_ids.to(device)

        mlm_logits, _ = model(T_input_ids, T_input_type_ids)
        mlm_probs = F.softmax(mlm_logits, dim=-1)
        reduced_mlm_probs = mlm_probs[0][id_in_sen]

        list_word = lexeme(wordV)
        #list_word = [word]

        list_word_id = tokenizer.convert_tokens_to_ids(list_word)
        print(list_word)
        print(list_word_id)    
        list_word_prob = {}
        for word,word_id in zip(list_word,list_word_id):
            list_word_prob.update({word:float(reduced_mlm_probs[word_id].data)})
        print(list_word_prob)
        prob_ord = sorted(list_word_prob.items(),key = lambda x:x[1],reverse = True)

        top_ind = reduced_mlm_probs.argmax().item()
        top_prob = reduced_mlm_probs.max().item()
        top_prob_thisV = prob_ord[0][1]
        gap = math.log(top_prob) - math.log(top_prob_thisV)
        
        suggestion = tokenizer.ids_to_tokens[top_ind]
        sentence = copy.deepcopy(sentences[sentence_id])
        sentence = tokenizer.tokenize(sentence)
        sentence[id_in_sen - 1] = suggestion
        sentence_tag = nltk.pos_tag(sentence)
        
        suggestion_tag = sentence_tag[id_in_sen - 1][1]
        #print(sentence_tag[id_in_sen - 1][0])
        print(suggestion_tag)
        
        if gap < 5 or suggestion_tag.find("V")==-1:
            suggestion = prob_ord[0][0]
        
            

        """”values, indices = reduced_mlm_probs.topk(topk)
        for j in range(topk):
            ind, prob = indices[j].item(), values[j].item()
            
            token = tokenizer.ids_to_tokens[ind]
            print(token,prob)"""
    elif need_to == 1:
        input_ids2 = copy.deepcopy(input_ids)
        input_ids2.insert(id_in_sen,tokenizer.vocab["to"])
        input_ids2[id_in_sen + 1] = tokenizer.vocab["[MASK]"]
        T_input_ids2 = torch.tensor([input_ids2], dtype=torch.long) #把input_ids增加了一个维度
        T_input_ids2 = T_input_ids2.to(device) #拿去GPU
        
        input_type_ids2 = copy.deepcopy(input_type_ids1)
        T_input_type_ids2 = torch.tensor([input_type_ids2], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
        T_input_type_ids2 = T_input_type_ids2.to(device)   
        mlm_logits2, _ = model(T_input_ids2, T_input_type_ids2)
        mlm_probs2 = F.softmax(mlm_logits2, dim=-1)
        reduced_mlm_probs2 = mlm_probs2[0][id_in_sen + 1]
        
        thisV = conjugate(verb = wordV,tense=PRESENT,person = 1)
        print(thisV)
        #list_word = [wordV]
        thisV_id = tokenizer.vocab[thisV]
   
        top_ind2 = reduced_mlm_probs2.argmax().item()
        top_prob2 = reduced_mlm_probs2.max().item()
        prob_thisV2 = reduced_mlm_probs2[thisV_id]
        gap = math.log(top_prob2) - math.log(prob_thisV2)
        
        suggestion = tokenizer.ids_to_tokens[top_ind2]
        sentence = copy.deepcopy(sentences[sentence_id])
        sentence = tokenizer.tokenize(sentence)
        sentence.insert(id_in_sen - 1,'to')
        sentence[id_in_sen] = suggestion
        print("sentence是：",sentence)
        sentence_tag = nltk.pos_tag(sentence)
        
        suggestion_tag = sentence_tag[id_in_sen][1]
        if gap < 5 or suggestion_tag.find("V")== -1:
            suggestion = 'to '+ thisV
        else:
            suggestion = 'to '+ tokenizer.ids_to_tokens[top_ind2]
    elif need_be == 1:#********************************处理需要be动词的时态*****************
        print("need_be == 1")
        input_ids3 = copy.deepcopy(input_ids1)
        input_ids3[id_in_sen] = tokenizer.vocab[be_]
        input_ids3[id_in_sen + 1] = tokenizer.vocab["[MASK]"]
        T_input_ids3 = torch.tensor([input_ids3], dtype=torch.long) #把input_ids增加了一个维度
        T_input_ids3 = T_input_ids3.to(device) #拿去GPU
        
        input_type_ids3 = copy.deepcopy(input_type_ids1)
        T_input_type_ids3 = torch.tensor([input_type_ids3], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
        T_input_type_ids3 = T_input_type_ids3.to(device)
        mlm_logits3, _ = model(T_input_ids3, T_input_type_ids3)
        mlm_probs3 = F.softmax(mlm_logits3, dim=-1)
        reduced_mlm_probs3 = mlm_probs3[0][id_in_sen + 1]
        
        list_word3 = lexeme(wordV)
        #list_word = [wordV]
        list_word_id3 = tokenizer.convert_tokens_to_ids(list_word3)
        print(list_word3)
        print(list_word_id3)    
        list_word_prob3 = {}
        for word,word_id in zip(list_word3,list_word_id3):
            list_word_prob3.update({word:float(reduced_mlm_probs3[word_id].data)})
        print(list_word_prob3)
        prob_ord3 = sorted(list_word_prob3.items(),key = lambda x:x[1],reverse = True)

        top_ind3 = reduced_mlm_probs3.argmax().item()
        top_prob3 = reduced_mlm_probs3.max().item()
        top_prob_thisV3 = prob_ord3[0][1]
        gap = math.log(top_prob3) - math.log(top_prob_thisV3)
        print(tokenizer.ids_to_tokens[top_ind3])
        
        suggestion = tokenizer.ids_to_tokens[top_ind3]
        sentence = copy.deepcopy(sentences[sentence_id])
        sentence = tokenizer.tokenize(sentence)
        sentence.insert(id_in_sen -1,be_)
        sentence[id_in_sen] = suggestion
        #print("sentence是：",sentence)
        sentence_tag = nltk.pos_tag(sentence)
        
        suggestion_tag = sentence_tag[id_in_sen][1]
        if gap < 5 or suggestion_tag.find("VB")== -1:
            suggestion = be_ + ' ' + prob_ord3[0][0]
        else:
            suggestion = be_ + ' ' + tokenizer.ids_to_tokens[top_ind3]
    print(suggestion)
    return suggestion
    
analyse_V(4)

是否用不定式：
用to的可能性0.00036304089007899165
可能性最大的词概率0.23709583282470703
是否用被动或进行时：
[('was', 0.0002793713065329939), ('am', 4.049863855470903e-05), ('were', 1.306664398725843e-05), ('been', 4.840642304770881e-06), ('be', 1.453689151276194e-06), ('are', 7.996850399649702e-07), ('is', 5.958298174846277e-07), ('being', 9.706550230248467e-09)]
had 0.8573063611984253
was 0.0002793713065329939
不是被动
[('was', 0.9590925574302673), ('am', 0.006898669525980949), ('were', 0.0016424404457211494), ('been', 0.0004373548727016896), ('is', 0.00035717932041734457), ('be', 3.4134478482883424e-05), ('are', 2.2988733689999208e-05), ('being', 3.1775894626662193e-07)]
was 0.9590925574302673
was 0.9590925574302673
判断其他语法：
need_be == 1
['go', 'goes', 'going', 'went', 'gone']
[2175, 3632, 2183, 2253, 2908]
{'go': 0.00043932811240665615, 'goes': 0.00012179886834928766, 'going': 0.6597349047660828, 'went': 0.00122930109500885, 'gone': 0.002755501540377736}
going
was going


'was going'

In [17]:
from pattern.en import article,referenced,pluralize, singularize
def analyse_N(index):
#******************************************初始数据处理**************************************************************************
    need_DT = 0 #表示是否需要在前面加冠词
    prob_N = 0 #表示这个名词的单复数中最高的概率    
    sentence_id = in_sentence[index][0]
    id_in_sen = in_sentence[index][1]
    wordN = input_ids_sen[sentence_id][id_in_sen]
    wordN = tokenizer.ids_to_tokens[wordN]
    
    input_ids = copy.deepcopy(input_ids_sen[sentence_id])
    input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])
#*****************************************若一个词有问题*************************************************************************    
    input_ids[id_in_sen] = tokenizer.vocab["[MASK]"]
    input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])

    T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度
    T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
    T_input_ids = T_input_ids.to(device) #拿去GPU
    T_input_type_ids = T_input_type_ids.to(device)

    mlm_logits, _ = model(T_input_ids, T_input_type_ids)
    mlm_probs = F.softmax(mlm_logits, dim=-1)
    reduced_mlm_probs = mlm_probs[0][id_in_sen]
    
    N_ = singularize(wordN)
    N_s= pluralize(wordN)
    N_id = tokenizer.vocab[N_]
    N_s_id = tokenizer.vocab[N_s]
    if(reduced_mlm_probs[N_id] > reduced_mlm_probs[N_s_id]):
        suggestion = N_
        prob_N = reduced_mlm_probs[N_id]
    else:
        suggestion = N_s
        prob_N = reduced_mlm_probs[N_s_id]
    
    top_ind = reduced_mlm_probs.argmax().item()
    top_prob = reduced_mlm_probs.max().item()
    
    gap = math.log(top_prob)- math.log(prob_N)
    if gap > 6.5: #我觉得代词的阈值应该回比名词小一点
        need_DT = 1 #不见棺材不落泪，认为缺冠词 
        
    
    input_ids.insert(id_in_sen,tokenizer.vocab["[MASK]"])
    input_ids.insert[id_in_sen + 1] = tokenizer.vocab[suggestion]
    input_type_ids.append(0)
    
        T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度
        T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
        T_input_ids = T_input_ids.to(device) #拿去GPU
        T_input_type_ids = T_input_type_ids.to(device)

        mlm_logits, _ = model(T_input_ids, T_input_type_ids)
        mlm_probs = F.softmax(mlm_logits, dim=-1)
        reduced_mlm_probs = mlm_probs[0][id_in_sen]

IndentationError: unexpected indent (<ipython-input-17-ed6919504531>, line 49)

In [18]:
analyzed_cache = {}

def analyze_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=20):
    step = 7
    if text[0] in analyzed_cache: #分析过的缓存
        features, mlm_probs = analyzed_cache[text[0]]
        given_mask = "[MASK]" in features[0].tokens
        tokens = features[0].tokens 
    else:
        examples = convert_text_to_examples(text)
        features = convert_examples_to_features(examples, tokenizer, print_info=False)
        given_mask = "[MASK]" in features[0].tokens
        if not given_mask or masked_tokens is not None:
            assert len(features) == 1
            features, batches = copy_and_mask_feature(features[0],step, masked_tokens=masked_tokens)
            #print(len(features))

        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) #把input_ids增加了一个维度，变成[n_features,sequence_len]
        #这里的n_features实际上是句子有多少批训练
        
        input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
        input_ids = input_ids.to(device) #拿去GPU
        input_type_ids = input_type_ids.to(device)

        mlm_logits, _ = model(input_ids, input_type_ids)
        mlm_probs = F.softmax(mlm_logits, dim=-1) #最后一维，也就是vocab 换算成概率和为百分之百
        #print(mlm_probs.size())#这里实验的是torch.Size([5, 5, 30522])
        tokens = features[0].tokens #为了输出，[mask]在input_ids里面表示出来，features的token都一样
        #print(tokens)
        if not given_mask or masked_tokens is not None:
            bsz, seq_len, vocab_size = mlm_probs.size() #三个维度分别是batch_size, sequence_length, vocab_size
            assert bsz == len(batches)
            # reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size)
            # for i in range(seq_len):
            #    reduced_mlm_probs[0, i] = mlm_probs[i, i]
            reduced_mlm_probs = torch.Tensor(1, len(tokens), vocab_size)
            for i in batches:
                pos = i
                while pos < len(tokens):
                    reduced_mlm_probs[0, pos] = mlm_probs[i, pos]
                    pos = pos + step
            mlm_probs = reduced_mlm_probs #压缩一下大小，节约不必要浪费的空间（只需要第i个batch里面[mask]位置的词汇表概率即可）
            #tokens = [tokens[i] for i in masked_positions]
        
        analyzed_cache[text[0]] = (features, mlm_probs)
        
    top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs) #传入的probs是二维的
    #print(top_pairs) #******************************
    if not given_mask:
        show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)
    #return top_pairs


In [16]:
# text = ["Who was Jim Henson? Jim Henson _ a puppeteer."]
# text = ["Last week I went to the theatre. I had a very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"]
# text = ["After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented."]
# text = ["Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence."]
import time
time_start=time.time()
text = ["I hate you."]
#text =["Last week I go to the zoo. I had a very good seat. The play was very interesting.But I didn't enjoy it. A young man and a young woman were sitting behind me.They were talking loudly. I got very angry."]#因为外面有中括号，所以是二维的
analyze_text(text, show_firstk_probs=100)
#print(analyzed_cache)
time_end=time.time()
print('time cost',time_end-time_start,'s')

NameError: name 'analyze_text' is not defined

In [11]:
text = ["The trophy doesn't fit into the brown suitcase because the _ is too large."]
# text = ["Mary beat John in the match because _ was very strong."]
features = convert_examples_to_features(convert_text_to_examples(text), tokenizer, print_info=False)
input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)
input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long).to(device)
mlm_logits, _ = model(input_ids, input_type_ids)
mlm_probs = F.softmax(mlm_logits, dim=-1)
tokens = features[0].tokens
top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=100)

01/03/2019 17:10:45 - INFO - examples.extract_features -   tokens: [CLS] the trophy doesn ' t fit into the brown suitcase because the [MASK] is too large . [SEP]


   0 | [CLS]       	   2 | .              1 | )              1 | the            1 | ,              1 | "           
 100 | the         	*100 | the            0 | his            0 | a              0 | its            0 | her         
  97 | trophy      	* 97 | trophy         0 | cup            0 | prize          0 | trophies       0 | competition 
 100 | doesn       	*100 | doesn          0 | can            0 | does           0 | won            0 | didn        
 100 | '           	*100 | '              0 | t              0 | "              0 | =              0 | `           
 100 | t           	*100 | t              0 | not            0 | s              0 | n              0 | to          
 100 | fit         	*100 | fit            0 | fits           0 | sit            0 | get            0 | fitting     
 100 | into        	*100 | into           0 | in             0 | inside         0 | onto           0 | within      
 100 | the         	*100 | the            0 | her            0 | his    

In [19]:
text = [
    # same / different
    "Tom has black hair. Mary has black hair. John has yellow hair. _  and Mary have the same hair color.",
    "Tom has black hair. Mary has black hair. John has yellow hair. _  and Mary have different hair colors.",
    "Tom has yellow hair. Mary has black hair. John has black hair. Mary and _ have the same hair color.",
    # because / although
    "John is taller/shorter than Mary because/although _ is older/younger.",
    "The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.",
    "Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.",
    "The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.",
    "John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.",
    # reverse
    "John came then Mary came. They left in reverse order. _ left then _ left.",
    "John came after Mary. They left in reverse order. _ left after _ .",
    "John came first, then came Mary. They left in reverse order: _ left first, then left _ .",
    # compare
    "Though John is tall, Tom is taller than John. So John is _ than Tom.",
    "Tom is taller than John. So _ is shorter than _.",
    # WSC-style: before /after
    "Mary came before/after John. _ was late/early .",
    # yes / no
    "Was Tom taller than Susan? Yes, _ was taller.",
    # right / wrong, epistemic modality
    "John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong.",
    
    "The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.",
    "John thanked Mary because  _ had given help to _ . ",
    "John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.",
    "John couldn't see the stage with Mary in front of him because _ is so short/tall.",
    "Although they ran at about the same speed, John beat Sally because _ had such a bad start.",
    "The fish ate the worm. The _ was hungry/tasty.",
    
    "John beat Mary. _ won the game/e winner.",
]
text

['Tom has black hair. Mary has black hair. John has yellow hair. _  and Mary have the same hair color.',
 'Tom has black hair. Mary has black hair. John has yellow hair. _  and Mary have different hair colors.']

In [None]:
config

In [22]:
with open('WSC_switched_label.json') as f:
    examples = json.load(f)

In [9]:
with open('WSC_child_problem.json') as f:
    cexamples = json.load(f)

In [89]:
for ce in cexamples:
    for s in ce['sentences']:
        for a in s['answer0'] + s['answer1']:
            a = a.lower()
            if a not in tokenizer.vocab:
                ce
                print(a, 'not in vocab!!!')

In [23]:
for ce in cexamples:
    if len(ce['sentences']) > 0:
        e = examples[ce['index']]
        assert ce['index'] == e['index']
        e['score'] = all([s['score'] for s in ce['sentences']])
        assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'
        e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']

In [24]:
from collections import defaultdict

groups = defaultdict(list)
for e in examples:
    if 'score' in e:
        index = e['index']
        if index < 252:
            if index % 2 == 1:
                index -= 1
        elif index in [252, 253, 254]:
            index = 252
        else:
            if index % 2 == 0:
                index -= 1
        groups[index].append(e)

In [62]:
def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):
    return {k: d[k] for k in d if k in keys}

# ([[filter_dict(e) for e in eg] for eg in groups.values() if eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])
[(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']
# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])
# for e in examples:
#     if e['index'] % 2 == 0:
#         print(e['sentence'])

[(2, 'fit into:large/small', False),
 (4, 'thank:receive/give', False),
 (6, 'call:successful available', True),
 (8, 'ask:repeat answer', False),
 (10, 'zoom by:fast/slow', False),
 (12, 'vindicated/crushed:be the winner', False),
 (14, 'lift:weak heavy', False),
 (16, 'crash through:[hard]/[soft]', False),
 (18, '[block]:short/tall', False),
 (20, 'down to:top/bottom', False),
 (22, 'beat:good/bad', False),
 (24, 'roll off:anchored level', False),
 (26, 'above/below', False),
 (28, 'better/worse:study hard', False),
 (30, 'after/before:far away', False),
 (32, 'be upset with:buy from not work/sell not work', True),
 (34, '?yell at comfort:upset', False),
 (36, 'above/below:moved first', False),
 (38, 'although/because', False),
 (40, 'bully:punish rescue', False),
 (42, 'pour:empty/full', False),
 (44, 'know:nosy indiscreet', False),
 (46, 'explain:convince/understand', True),
 (48, '?know tell:so/because', True),
 (50, 'beat:younger/older', False),
 (56, 'clog:cleaned removed', True

In [51]:
sum(['because' in e['sentence'] for e in examples]) + \
sum(['so ' in e['sentence'] for e in examples]) + \
sum(['but ' in e['sentence'] for e in examples]) + \
sum(['though' in e['sentence'] for e in examples])

179

In [73]:
# with open('WSC_switched_label.json', 'w') as f:
#     json.dump(examples, f)

In [19]:
vis_attn_topk = 3

def has_chinese_label(labels):
    labels = [label.split('->')[0].strip() for label in labels]
    r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1)
    return 0 < r < 0.5  # r == 0 means empty query labels used in self attention

def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):
    assert len(query_labels) == attn.size(0)
    assert len(key_labels) == attn.size(1)

    ax1.set_xlim([-1, 1])
    ax1.set_xticks([])
    ax2 = ax1.twinx()
    nlabels = max(len(key_labels), len(query_labels))
    pos = range(nlabels)
    
    if 'self' in attn_name and col < ncols - 1:
        query_labels = ['' for _ in query_labels]

    for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:
        ax.set_yticks(pos)
        if has_chinese_label(labels):
            ax.set_yticklabels(labels, fontproperties=zhfont)
        else:
            ax.set_yticklabels(labels)
        ax.set_ylim([nlabels - 1, 0])
        ax.tick_params(width=0, labelsize='xx-large')

        for spine in ax.spines.values():
            spine.set_visible(False)

#     mask, attn = filter_attn(attn)
    for qi in range(attn.size(0)):
#         if not mask[qi]:
#             continue
#         for ki in range(attn.size(1)):
        for ki in attn[qi].topk(vis_attn_topk)[1]:
            a = attn[qi, ki]
            ax1.plot((-1, 1), (ki, qi), color, alpha=a)
#     print(attn.mean(dim=0).topk(5)[0])
#     ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())

def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, heads=None):
    hypo, nheads, labels_dict = result_tuple
    key_labels, query_labels = labels_dict[attn_name]
    if heads is None:
        heads = range(nheads)
    else:
        nheads = len(heads)
    
    stride = 2 if attn_name == 'dec_enc_attns' else 1
    nlabels = max(len(key_labels), len(query_labels))
    rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))
    
    rows = nheads // ncols * stride
    fig, axes = plt.subplots(rows, ncols)
    
    # for head in range(nheads):
    for head_i, head in enumerate(heads):
        row, col = head_i * stride // ncols, head_i * stride % ncols
        ax1 = axes[row, col]
        attn = hypo[attn_name][layer][head]
        _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)
        if attn_name == 'dec_enc_attns':
            col = col + 1
            axes[row, col].axis('off')  # next subfig acts as blank place holder
    # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)
    plt.show()  
            
ncols = 4

In [31]:
attn_name = 'enc_self_attns'
hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}
key_labels = query_labels = tokens
labels_dict = {attn_name: (key_labels, query_labels)}
result_tuple = (hypo, config.num_attention_heads, labels_dict)
plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)

AttributeError: 'BertSelfAttention' object has no attribute 'attention_probs'