In [1]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import os
import json
import nltk
import numpy as np
import math
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams

import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig
from examples.extract_features import *

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
class Args:
    def __init__(self):
        pass
    
args = Args()
args.no_cuda = True

CONFIG_NAME = 'bert_config.json'
BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'
config_file = os.path.join(BERT_DIR, CONFIG_NAME)
config = BertConfig.from_json_file(config_file)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')#do_lower_case：在标记化时将文本转换为小写。默认= True
#tokenizer.tokenize = nltk.word_tokenize
model = BertForPreTraining.from_pretrained(BERT_DIR)
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
_ = model.to(device)
_ = model.eval()

03/21/2019 18:04:54 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xd/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
03/21/2019 18:04:54 - INFO - pytorch_pretrained_bert.modeling -   loading archive file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/
03/21/2019 18:04:54 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



BertForPreTraining：
Outputs:
        if `masked_lm_labels` and `next_sentence_label` are not `None`:
            Outputs the total_loss which is the sum of the masked language modeling loss and the next
            sentence classification loss.
        if `masked_lm_labels` or `next_sentence_label` is `None`:
            Outputs a tuple comprising
            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
            - the next sentence classification logits of shape [batch_size, 2].

from_pretrained：
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.

In [4]:
import re
def convert_text_to_examples(text): 
    '''功能：
            把输入的文本变成一个实例，一个实例中包含text_a,text_b(text_b用于是否为上下句的任务，该任务不使用此功能)
       输入：
            text：一个列表结构，列表中包含原始文本字符串，由于仅完成mlm任务，所以text列表中仅包含一个字符串，就是待检查的字符串
       输出：
            example：实例，其中包含：
                unique_id：此任务仅用到0
                text_a：text列表内的字符串
                text_b：此任务下该变量为None
    '''
    examples = []
    unique_id = 0
    if True:
        for line in text:
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line) #想要匹配这样的字符串'You are my sunshine. ||| I love you.'
            
            if m is None:
                text_a = line
            else:
                text_a = m.group(1) #匹配的第一句,比如You are my sunshine,my only sunshine.
                text_b = m.group(2) #匹配的第二句，比如I love you.
            
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
            unique_id += 1
    return examples
#print(convert_text_to_examples(['I love you. The cat is so cute.'])[0].text_a)

def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):
    '''功能：
            把实例变成一个特征列表
       输入：
            examples：实例，convert_text_to_examples()函数的输出
            tokenizer：BERT的tokenizer，用于将文本进行各种处理，它可以把一个text转变成tokens，把tokens变成每个token在词典中的编号以及逆运算
            append_special_tokens：是否允许在生成的tokens中加入特殊符号，也就是[CLS]、[MASK]和[SEP]，默认为True
            replace_mask：不明
            print_info：不明
       输出：
            features：每一个feature包含：
                unique_id：编号，目前实现的功能features里面仅有一个feature
                tokens=tokens,tokens：是形如['i','love','you','.']的一个列表
                input_ids=input_ids：字符串中的每个单词在词典中的index序列
                input_mask=input_mask：一堆1
                input_type_ids=input_type_ids))：对text_a,text_b的区分，用于上下句任务，对于本任务，该参数为一个列表，其中包含token长度个的0
    '''
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a) #tokenize的作用是把"i love you."变成['i','love','you','.']
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        tokens = []
        input_type_ids = [] #segment embedding
        if append_special_tokens: #输入参数中默认为true
            tokens.append("[CLS]")
            input_type_ids.append(0)
        for token in tokens_a:
            if replace_mask and token == '_':  # XD
                token = "[MASK]"
            tokens.append(token)
            input_type_ids.append(0)
        if append_special_tokens:
            tokens.append("[SEP]")
            input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                if replace_mask and token == '_':  # XD
                    token = "[MASK]"
                tokens.append(token)
                input_type_ids.append(1)
            if append_special_tokens:
                tokens.append("[SEP]")
                input_type_ids.append(1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens) #把原来句子中的词语编成在字典中的编号
        input_mask = [1] * len(input_ids) 
        
        if ex_index < 5:
#             logger.info("*** Example ***")
#             logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
#             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
#             logger.info(
#                 "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
            
        features.append(
            InputFeatures(
                unique_id=example.unique_id,#编号，目前实现的功能features里面仅有一个feature
                tokens=tokens,#形如['i','love','you','.']的一个列表
                input_ids=input_ids,#字符串中的每个单词在词典中的index序列
                input_mask=input_mask, #一堆1
                input_type_ids=input_type_ids)) #第0类和第1类，对text_a,text_b的区分，本代码中全都是零
    return features            

def copy_and_mask_feature(feature, step, masked_tokens=None): 
    '''
        功能：
            输入feature生成训练的批次数以及mask好的训练素材
        输入：
            feature：convert_examples_to_features函数的输出
            step：两个[mask]位置的步长
            masked_tokens：默认为None，在程序中没有使用
    '''
    import copy
    tokens = feature.tokens
    len_token = len(tokens)
    if len_token<step:
        batches = range(0,len(tokens))
    else:
        batches = range(0,step)
    
    assert len_token > 0
    masked_feature_copies = []
    for i in batches: #用[mask]依次掩盖每一个位置
        feature_copy = copy.deepcopy(feature)
        masked_pos = i
        while masked_pos < len_token:
            feature_copy.input_ids[masked_pos] = tokenizer.vocab["[MASK]"]
            masked_pos = masked_pos + step
        masked_feature_copies.append(feature_copy)
    return masked_feature_copies, batches

#masked_feature_copies, batches = copy_and_mask_feature(features[0],3)
#print(masked_feature_copies[0].input_ids) #结果[101, 1045, 2293, 103, 102]
#print(batches) #结果是一个range(0,5)

In [17]:
analyzed_cache = {}
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG
#print (lemma('gave'))
#print (lexeme('production'))
#print (conjugate(verb='give',tense=PRESENT,number=SG))
def process_text(text): 
    '''
        功能：
            处理输入文本，将文本按句子分成若干token，得出原来text中index位置的单词在x句子的y位置，还得出各个句子类别码
        输入：
            text：文本字符串，注意区别
        输出：
            input_ids_sen：二维列表，第一维列表的元素是每个句子的input_ids列表
            input_type_ids_sen：二维列表，第一维列表的元素是每个句子的input_type_ids列表
            in_sentence：通过这个二维数组可以很方便的通过在完整text中的下标找到这个下标所在的句子和在句子中的下标
            sentences：字符串列表，列表中每一个元素是一个句子字符串
            entire_ids：整个text的input_ids
            entire_type_ids：整个text的input_type_ids
    '''
    token =[]
    entire_type_ids = []
    token0 = tokenizer.tokenize(text)
    token.append('[CLS]')
    entire_type_ids.append(0)
    for i in token0:
        token.append(i)
        entire_type_ids.append(0)
    token.append('[SEP]')
    entire_type_ids.append(0)
    
    entire_ids = tokenizer.convert_tokens_to_ids(token)
    in_sentence = [[0,0]] 
    sentence_n = 0
    index = 1
    for i in range(1,len(token)-1):
        in_sentence.append([sentence_n,index])  #每个token中的词在所在句中的位置表示出来，以及该位置在哪一句中
        index = index + 1                           #比如，位置i这个词在第sentence句的index位置上
        if token[i] == '.':
            sentence_n = sentence_n + 1
            index = 1
    sentences = text.split(".")
    
    sen_token = []
    input_ids_sen = []
    input_type_ids_sen = []
    for i,sentence in enumerate(sentences):
        sentence = sentence + '.'
        sentences[i] = sentences[i] + '.'
        token = []
        input_type_ids = []
        tokens = tokenizer.tokenize(sentence)
        token.append('[CLS]')
        input_type_ids.append(0) 
        for i in tokens:
            token.append(i)
            input_type_ids.append(0)        
        token.append('[SEP]')        
        input_type_ids.append(0)
        input_ids_sen.append(tokenizer.convert_tokens_to_ids(token))
        input_type_ids_sen.append(input_type_ids)
    return input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids
#text = ["Last week I went to the theatre. I had an very good a seat.The play were very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"]
#text = ["Last week I went to the theatre. I had very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"]
#text = ["The question is more easy than that one."]
text = ["Last week I went to the theater. There are many person . Luckily I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"]

input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = process_text(text[0])
'''print(input_ids_sen)
print(in_sentence)
print(input_type_ids_sen)
print(sentences)
print(entire_ids)
print(entire_type_ids)
#input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = None'''

'print(input_ids_sen)\nprint(in_sentence)\nprint(input_type_ids_sen)\nprint(sentences)\nprint(entire_ids)\nprint(entire_type_ids)\n#input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = None'

In [18]:
def get_word(index):
    '''
        输入：
            index：在完整text中的位置
        输出
            word:该位置上的单词
    '''
    word_id = entire_ids[index]
    word = tokenizer.ids_to_tokens[word_id]
    return word


In [19]:
import copy
import nltk
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE

def give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold):
    '''
        功能：
            给出指定文本指定位置的推荐用词
        输入：
            input_ids_：要分析的文本的input_ids
            input_type_ids_：要分析的文本的的input_type_ids
            id_in_sen：要分析的文本中[MASK]的位置下标，也就是需要给出建议用词的位置
            alternative_word：推荐的备选词范围
            threshold：阈值
        输出：
            suggestion：推荐
            need：推荐的是否是备选词中的词
            suggestion_prob：推荐词填在id_in_sen位置的概率
            top_of_alternative:备选词中最值得推荐的词
    '''
    input_ids = copy.deepcopy(input_ids_)
    input_type_ids = copy.deepcopy(input_type_ids_)
    word0 = input_ids[id_in_sen]
    word0 = tokenizer.ids_to_tokens[word0]
    list_word_id = []
    
    input_ids[id_in_sen] = tokenizer.vocab["[MASK]"]
    T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度
    T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
    T_input_ids = T_input_ids.to(device) #拿去GPU
    T_input_type_ids = T_input_type_ids.to(device)

    mlm_logits, _ = model(T_input_ids, T_input_type_ids)
    mlm_probs = F.softmax(mlm_logits, dim=-1)
    reduced_mlm_probs = mlm_probs[0][id_in_sen]

    top_ind = reduced_mlm_probs.argmax().item()
    top_prob = reduced_mlm_probs.max().item() 
    
    list_word = []
    
    top_of_alternative = None
    if len(alternative_word)>0:
        list_word_prob = {}
        for word in alternative_word:
            try:
                list_word_id.append(tokenizer.vocab[word])
                list_word.append(word)
            except KeyError:
                pass
        #print(list_word_id)    
        #print(list_word)
        for word,word_id in zip(list_word,list_word_id):
            list_word_prob.update({word:float(reduced_mlm_probs[word_id].data)})
        prob_ord = sorted(list_word_prob.items(),key = lambda x:x[1],reverse = True)
        #print(prob_ord)
        #print(tokenizer.ids_to_tokens[top_ind],top_prob)
        #print(prob_ord[0][0],prob_ord[0][1])
        top_prob_word = prob_ord[0][1]
        top_of_alternative = prob_ord[0][0]
        gap = math.log(top_prob) - math.log(top_prob_word)
        if gap < threshold:
            suggestion = prob_ord[0][0]
            suggestion_prob = prob_ord[0][1]
            need = 1
        else:
            suggestion = tokenizer.ids_to_tokens[top_ind]
            suggestion_prob = top_prob
            need = 0
        #print("gap = " + str(gap))
        #print(prob_ord)
    else:
        suggestion = tokenizer.ids_to_tokens[top_ind]
        suggestion_prob = top_prob
        need = 0
        
    return suggestion,need,suggestion_prob,top_of_alternative 

#返回变量5
#suggestion -> 最值得推荐的词
#need -> 是否需要可选词中的一个
#suggestion_prob ->最值得推荐的词的概率
#top_of_alternative -> 可选词中最值得推荐的
#suggestion,need,suggestion_prob,top_of_alternative = give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold)

In [20]:
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
from pattern.en import comparative, superlative
from pattern.en import suggest
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
import enchant
d = enchant.Dict("en_US")


In [21]:
stemmers=[]
stemmers.append(LancasterStemmer()) 
stemmers.append(SnowballStemmer("english"))
stemmers.append(PorterStemmer())
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
#分情况讨论，如果新词比旧的词长，或者是短
def word_convert(word,new_word,Stemmer):
    '''
        功能：
            根据提供的word和可能的变形new_word,得到正确的变形，例如给出basic，basicly得到basically
        输入：
            word：需要变形的词
            new_word:猜想的变形
        输出：
            suggest_word:推荐的正确变形
    '''
    suggest_word = None
    word_stem = Stemmer().stem(word)
    suggest_ = new_word
    
    suggest_list = suggest(suggest_)

    if len(word)<len(new_word):
        flag = 0
    else:
        flag = 1
    word_stem = word_stem[:-1]
    suggestion_word_stem = Stemmer().stem(suggest_)
    
    for word_ in suggest_list:
        if word == word_[0]:
            continue
        if (word_[0] == new_word and word_[1] > 0.95):# or word_[1] > 0.95 :
            suggest_word = word_[0]
            break           
        if word_[1] < 0.001:
            break
        stem_list = []
        for stemmer in stemmers:
            suggest_stem = stemmer.stem(word_[0])
            if flag == 1 and suggest_stem[:-1] in word_stem and word_stem[:3] in suggest_stem[:3]: #一般是去后缀
                suggest_word = word_[0]
                break
            elif flag == 0 and word_stem in suggest_stem and word_[0][-1:] in suggest_[-1:]: #一般是加后缀，后缀一定要一样
                suggest_word = word_[0]
                break
                
        if suggest_word != None:
            break
    return suggest_word 

import time
time_start=time.time()
for i in range(1):
    print(word_convert("dark","darkment",PorterStemmer))
time_end=time.time()
print('totally time cost',time_end-time_start,'s')


None
totally time cost 0.2184145450592041 s


In [22]:
stemmers=[]
stemmers.append(LancasterStemmer()) 
stemmers.append(SnowballStemmer("english"))
stemmers.append(PorterStemmer())
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
def word_convert(word,new_word,Stemmer):
    '''
        说明;
            与上面的区别是使用的拼写改错算法不同，上面那个平均速度慢，但更符合我的要求，这个平均速度更快
        功能：
            根据提供的word和可能的变形new_word,得到正确的变形，例如给出basic，basicly得到basically
        输入：
            word：需要变形的词
            new_word:猜想的变形
            Stemmer:词根提取器
        输出：
            suggest_word:推荐的正确变形
    '''
    if d.check(new_word)==True: #如果发现new_word拼写正确，则直接返回
        return new_word
    else:
        suggest_word = None
        word_stem = Stemmer().stem(word)
        suggest_ = new_word
        suggest_list = d.suggest(suggest_) #可能的正确单词列表

        if len(word)<len(new_word): #一般都是加后缀
            flag = 0
        else: #一般都是去后缀
            flag = 1
        word_stem = word_stem[:-1] #这样效果更好一点，防止某些去e加后缀或者y变i的变形被忽略
        suggestion_word_stem = Stemmer().stem(suggest_)
        for word_ in suggest_list:
            if word == word_: #如果变形和原型一样，就跳过这个词
                continue
            if (word_ == new_word): #如果推荐的和new_word一样，直接把该词作为结果
                suggest_word = word_
                break
            if ' ' in word_ or '-' in word_: #enchant.Dict模型特有的问题，一个拼写错误的词可能会给你返回一个带连字符词的或者是两个词
                continue
            stem_list = []
            for stemmer in stemmers:
                suggest_stem = stemmer.stem(word_)
                if flag == 1 and suggest_stem in word_stem and word_stem[:3] in suggest_stem[:3]: #一般是去后缀
                    suggest_word = word_
                    break
                elif flag == 0 and word_stem in suggest_stem and word_[-1:] in suggest_[-1:]: #一般是加后缀，后缀一定要一样
                    suggest_word = word_
                    break

            if suggest_word != None:
                break
        return suggest_word 
import time
time_start=time.time()
for i in range(1):
    print(word_convert("beautiful","beauti",PorterStemmer))
time_end=time.time()
print('totally time cost',time_end-time_start,'s')


beaus
totally time cost 0.0006299018859863281 s


In [23]:

def adj_to_adv(word):
    suggest_word = None
    if(word == "good"):
        return "well"
    else:
        suggest_ = word + 'ly'
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        return suggest_word
#如果形容词副词同形，那么他会返回none，但是不影响计算，因为形容词副词同形啊
print(adj_to_adv("successful"))

def adv_to_adj(word):
    suggest_word = None
    if(word == "well"):
        return "good"    
    elif word[-2:] == 'ly':
        suggest_ = word[:-2]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
    return suggest_word
print(adv_to_adj("basically"))



successfully
basic


In [24]:
def adj_to_anything(word):#形容词变成其他词性
    suggest_word = None
    suggest_list = []
    if word[-1:] == 'y': #举例 healthy->health
        suggest_ = word[:-1]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-3:] == 'ful':#举例 successful->success
        suggest_ = word[:-3]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-3:] == 'ive': #举例 active -> act
        suggest_ = word[:-4]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-2:] == 'ed': #举例 interested->interest->interesting
        suggest_ = word[:-2]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)     
        suggest_ = suggest_ + 'ing'
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)      
            
    elif word[-3:] == 'ing':#举例 interesting->interest->interested
        suggest_ = word[:-3]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
        suggest_ = suggest_ + 'ed'
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)  
            
    elif word[-4:] == 'less': #举例 careless -> care
        suggest_ = word[:-4]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-2:] == 'ly':  #举例： friendly -> friend , lovely -> love
        suggest_ = word[:-2]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
 
    elif word[-1:] == 't': #举例 different -> different
        suggest_ = word[:-1]
        suggest_ = suggest_ + 'ce'
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-3:] == 'ous': #举例 dangerous -> danger
        suggest_ = word[:-3]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-2:] == 'al': #举例 original -> origin
        suggest_ = word[:-2]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-4:] == 'able':
        suggest_ = word[:-4]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-2:] == 'en': #举例 woolen -> wool
        suggest_ = word[:-2]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-2:] == 'ic': 
        suggest_ = word[:-2]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)        
    elif word[-3:] == 'ish':
        suggest_ = word[:-3]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word == None:
            suggest_ = word[:-3]
            suggest_ = suggest_ + 'and'
            suggest_word = word_convert(word,suggest_,PorterStemmer) 
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-3:] == 'ese':
        suggest_ = word[:-3]
        suggest_ = suggest_ + 'a'
        suggest_word = word_convert(word,suggest_,PorterStemmer)  
        if suggest_word != None:
            suggest_list.append(suggest_word)
    elif word[-3:] == 'ian':
        suggest_ = word[:-1]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word == None:
            suggest_ = word[:-3]
            suggest_ = suggest_ + 'y'
            suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    if suggest_word == None:
        HouZhui_list = ['ment','ness','tion','ture','sion','ty','y','tive','sive']
        for HouZhui in HouZhui_list:
            suggest_ = word + HouZhui
            new_word = word_convert(word,suggest_,PorterStemmer)
            if new_word != None:
                suggest_word = new_word
                suggest_list.append(suggest_word)
    suggest_list = list(set(suggest_list))      
    return suggest_list

print(adj_to_anything('interesting'))



['interested', 'interest']


In [25]:
def N_to_anything(word):#名词变成其他词性
    suggest_list = []
    list_HouZhui = ['y','ful','tive','sive','ed','ing','less','ly','ous','al','able','en','tic','ish','ance','er','or']
    list_QianZhui = ['a']
    if word[-4:] in ['ment','ness','tion','ture','sion','tive','sive']:
        suggest_ = word[:-4]
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)
    else:
        for HouZhui in list_HouZhui:
            suggest_ = word + HouZhui
            suggest_word = word_convert(word,suggest_,PorterStemmer)
            if suggest_word != None:
                suggest_list.append(suggest_word)
        for QianZhui in list_QianZhui:
            suggest_ = QianZhui + word
            suggest_word = word_convert(word,suggest_,PorterStemmer)
            if suggest_word != None:
                suggest_list.append(suggest_word)
        if word[-2:] == 'ce':
            suggest_ = word[:-2]
            suggest_ = syggest_ + 't'
            suggest_word = word_convert(word,suggest_,PorterStemmer)
            if suggest_word != None:
                suggest_list.append(suggest_word)        
        elif word[-4:] == 'land':
            suggest_ = word[:-4]
            suggest_word = word_convert(word,suggest_,PorterStemmer)
            if suggest_word == None:
                suggest_ = suggest_ + 'lish'
                suggest_word = word_convert(word,suggest_,PorterStemmer)
            if suggest_word != None:
                suggest_list.append(suggest_word)  
        #print(suggest_list)
    suggest_list = list(set(suggest_list))
    return suggest_list
'''import time
time_start=time.time()
print(N_to_anything("success"))
time_end=time.time()
print('time cost',time_end-time_start,'s')'''

'import time\ntime_start=time.time()\nprint(N_to_anything("success"))\ntime_end=time.time()\nprint(\'time cost\',time_end-time_start,\'s\')'

In [26]:
def V_to_anything(word):#动词变成其他词性
    suggest_word = None
    suggest_list = []

    HouZhui_list = ['ful','tive','sive','ed','less','ly','ous','al','able','en','tic','ish','ance','tion','sion','ment','er','or','ee']
    for HouZhui in HouZhui_list:
        suggest_ = word + HouZhui
        suggest_word = word_convert(word,suggest_,PorterStemmer)
        if suggest_word != None:
            suggest_list.append(suggest_word)

    suggest_list = list(set(suggest_list))
    return suggest_list

time_start=time.time()
print(V_to_anything('succeed'))
time_end=time.time()
print('time cost',time_end-time_start,'s')  

['succeeder', 'succeeds', 'succeeded']
time cost 0.654491662979126 s


In [27]:
'''
    功能：
        生成形容词，副词关联词表
    输入：
        word：形容词/副词
    输出：
        list_word：为没有添加词的其他形式，包括三音节以下词的比较级最高级
        list_word2：为三音节及以上的词的比较级最高级，如果输入形容词比较级最高级没有more/most，该列表为空
    说明：
        由于三音节形容词/副词的比较级，最高级为more/most+原形容词/副词，所以特别把形容词/副词和其他词性变形区分出来
'''

def build_like_word_adj(word): #创建类似形容词列表
    list_word = []
    list_word2 = [] #把比较级最高级带more的放在这里
    lemmas = lemmatizer(word, u'adj')
    #print(lemmas)
    for i in lemmas:
        list_word.append(i)
        word_er = comparative(i)
        if "more" in word_er:  #把比较级带more，most的词放在另一个列表list_word2
            list_word2.append(word_er)
        else:
            list_word.append(word_er)
        word_est = superlative(i)
        if "most" in word_est:
            list_word2.append(word_est)
        else:
            list_word.append(word_est)
        word_adv = adj_to_adv(i)
        if word_adv != None:
            list_word.append(word_adv)
    list_N = adj_to_anything(word)
    for N in list_N:
        list_word.append(N)
    
    list_word = list(set(list_word))
    return list_word,list_word2

def build_like_word_adv(word): #创建类似形容词列表
    list_word = []
    list_word2 = []
    list_special = ['however','seldom','often','never','otherwise']
    if word in list_special:
        list_word = [word]
        list_word2 = []
    else:
        lemmas = lemmatizer(word, u'adj')
        #print(lemmas)
        for i in lemmas:
            list_word.append(i)
            word_er = comparative(i)
            if "more" in word_er:
                list_word2.append(word_er)
            else:
                list_word.append(word_er)
            word_est = superlative(i)
            if "most" in word_est:
                list_word2.append(word_est)
            else:
                list_word.append(word_est)
            word_adv = adv_to_adj(i)
            if word_adv != None:
                list_word.append(word_adv)
    list_word = list(set(list_word))
    return list_word,list_word2



print(build_like_word_adj("difficult"))
print(build_like_word_adv("early"))




'\n    功能：\n        生成形容词，副词关联词表\n    输入：\n        word：形容词/副词\n    输出：\n        list_word：为没有添加词的其他形式，包括三音节以下词的比较级最高级\n        list_word2：为三音节及以上的词的比较级最高级，如果输入形容词比较级最高级没有more/most，该列表为空\n    说明：\n        由于三音节形容词/副词的比较级，最高级为more/most+原形容词/副词，所以特别把形容词/副词和其他词性变形区分出来\n'

(['difficult', 'difficulty', 'difficultly'], ['more difficult', 'most difficult'])
(['early', 'ear', 'earliest', 'earlier'], [])


In [28]:
'''
    功能：
        根据检查的位置整理出放入BERT模型的input_ids,input_type_ids以及检查位置在input_ids中的下标位置
        pre_training_input_in_sentence得到检查位置所在句子的信息
        pre_training_input_entire得到检查位置所在句子的信息
    输入：
        index：在完整text中的位置
    输出：
        input_ids：
        input_type_ids：
        id_in_sen：检查位置在句子中的下标
        index：检查位置在完整text中的下标，其实就是输入的下标
'''
def pre_training_input_in_sentence(index): 
    sentence_id = in_sentence[index][0]
    id_in_sen = in_sentence[index][1]
    word = input_ids_sen[sentence_id][id_in_sen]
    word = tokenizer.ids_to_tokens[word]
    input_ids = copy.deepcopy(input_ids_sen[sentence_id])
    input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])

    return word,input_ids,input_type_ids,id_in_sen

def pre_training_input_entire(index): 
    word = entire_ids[index]
    word = tokenizer.ids_to_tokens[word]
    input_ids = copy.deepcopy(entire_ids)
    input_type_ids = copy.deepcopy(entire_type_ids)

    return word,input_ids,input_type_ids,index

word,input_ids,input_type_ids,index = pre_training_input_in_sentence(6)
print(word)
print(input_ids)
print(input_type_ids)
print(index)
#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 102]
#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 1045, 2018, 1037, 2200, 2204, 2835, 1012, 1996, 2377, 2001, 2200, 5875, 1012, 102]

'\n    功能：\n        根据检查的位置整理出放入BERT模型的input_ids,input_type_ids以及检查位置在input_ids中的下标位置\n        pre_training_input_in_sentence得到检查位置所在句子的信息\n        pre_training_input_entire得到检查位置所在句子的信息\n    输入：\n        index：在完整text中的位置\n    输出：\n        input_ids：\n        input_type_ids：\n        id_in_sen：检查位置在句子中的下标\n        index：检查位置在完整text中的下标，其实就是输入的下标\n'

the
[101, 2197, 2733, 1045, 2253, 2000, 1996, 4258, 1012, 102]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
6


In [29]:
import copy
import nltk
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE

def analyse_V(index):
#这是一个处理动词语法问题的函数，输入为问题词在text的token中的下标index
    need_to_will = need_be = 0
    list_be = lexeme('be')
    list_be = lexeme('be')[:8]
    #**************************************判断是不是动词其他形式************************
    wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)
    if wordV in list_be:
        list_word = list_be
    else:
        list_word = lexeme(wordV)
        list_others = V_to_anything(conjugate(verb=wordV,tense=PRESENT,person = 1))
        for other in list_others:
            list_word.append(other)
    #print("list_word = ",list_word)
    #print(tokenizer.convert_ids_to_tokens(input_ids))
    suggestion0,need,_,_= give_suggestion(input_ids,input_type_ids,index,list_word,5)
    if need == 1 and suggestion0 != wordV:
        return suggestion0
    
    else:#**************************************判断是不是缺介词***************************
        wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)
        input_ids.insert(id_in_sen + 1,tokenizer.vocab['at'])#就随便插入一个东西，占位子
        input_type_ids.append(0)
        list_IN = ["at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above","of"]
        suggestion4,need_IN,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_IN,1)
        if need_IN == 1:
            input_ids[id_in_sen + 1] = tokenizer.vocab[suggestion4]
            list_word = lexeme(wordV)
            suggestion44,need,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,3)
            if need == 1:
                suggestion = suggestion44 + ' ' +suggestion4
                return suggestion
        #**************************************判断是不是不定式或者将来时***************************    
        #print("是否用不定式或将来时")
        wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)
        input_ids.insert(id_in_sen,tokenizer.vocab['to'])#就随便插入一个东西，占位子
        input_type_ids.append(0)
        input_ids[id_in_sen + 1] = tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,person = 1)]
        #print(tokenizer.convert_ids_to_tokens(input_ids))
        suggestion_to_will,need_to_will,prob0,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,["to","will"],1)
        if need_to_will == 1:
            list_word = [conjugate(verb=wordV,tense=PRESENT,person = 1),conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]
            suggestion,need0,_,prob00= give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_word,5) 
            
        #**********************************判断是不是被动语态或者进行时*******************   

        #********************是不是被动语态****************   
        #print("是不是被动语态")
        wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)
        input_ids.insert(index,tokenizer.vocab['be'])#就随便插入一个东西，占位子
        try:
            input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PAST,aspect=PROGRESSIVE)]
            input_type_ids.append(0)
            #print(tokenizer.convert_ids_to_tokens(input_ids))
            suggestion1,need_be1,prob1,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1)
        except KeyError:
            need_be1 = 0
        #********************是不是现在分词****************   
        #print("是不是进行时")
        try:
            input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]
            suggestion2,need_be2,prob2,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1)
            #print(tokenizer.convert_ids_to_tokens(input_ids))
        except KeyError:
            need_be2 = 0
            
        #if need_be1 == 1 or need_be2 == 1:
            #print("需要be")
        #***************************选择是不定式还是被动语态还是进行时****************************
        prob_max = 0
        if need_to_will == 1:
            prob_max = max(prob_max,prob0)
        if need_be1 == 1:
            prob_max = max(prob_max,prob1)
        if need_be2 == 1:
            prob_max = max(prob_max,prob2)

        if need_to_will == 1 and prob_max == prob0:
            need_be = 0
        if need_be1 == 1 and prob_max == prob1:
            need_to_will = 0
            need_be = 1
            be_ = suggestion1
        if need_be2 == 1 and prob_max == prob2:
            need_to_will = 0
            need_be = 1
            be_ = suggestion2
        #*************************************************处理各种语法******************************************************************
        if need_to_will == 1:
            wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)
            input_ids.insert(index,tokenizer.vocab[suggestion_to_will])
            input_type_ids.append(0)
            list_word = [conjugate(verb=wordV,tense=PRESENT,person = 1),conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]
            suggestion,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5)
            return 'to ' + suggestion

        elif need_be == 1:
            #********************************被动语态或者进行时*****************
            wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)
            input_ids.insert(index,tokenizer.vocab[be_])
            input_type_ids.append(0)
            list_word = lexeme(wordV)
            suggestion,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5)
            suggestion = be_ + ' '+ suggestion
        else:
            #*****************************************判断该位置是不是动词的其他时态**************************************************************
            suggestion = suggestion0

        return suggestion
    
analyse_V(2)



'week'

In [55]:
def analyse_adj(index):
    #这是一个处理形容词语法问题的函数，输入为问题词在text的token中的下标index
    wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_entire(index)  
    list_word,list_word2 = build_like_word_adj(wordADJ)
    #print(list_word)
    suggestion0,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,5)
    
    if need_adj == 1 and suggestion0 != wordADJ:#判断是不是形容词其他变形
        return suggestion0
    elif get_word(index - 1) in ['more','most'] and len(list_word2) == 0:
        #判断是不是比较级使用错误,如果该形容词比较级/最高级不需要加more/most，但是前面有more/most
        wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) 
        del input_ids[id_in_sen - 1]
        del input_type_ids[0]
        suggestion3,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen - 1,list_word,6)
        return '去掉前面 ' + get_word(index - 1)+ ' 原位置改成 ' + suggestion3
    elif get_word(index + 1) in ['##er','##est','##r','##st'] and len(list_word2) != 0:
        #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most，但是错写成形容词+er/est
        wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) 
        input_ids[id_in_sen + 1] = tokenizer.vocab[wordADJ]
        suggestion4,need_bijiao,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,['more','most'],2)
        if need_bijiao == 1:
            input_ids[id_in_sen] = tokenizer.vocab[suggestion4]
            suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen+1,list_word,6)
            return '去掉后面 '+ get_word(index + 1) + ' 原位置改成 '+ suggestion4 + ' ' + suggestion5  
    else:#检查形容词前面是否需要加冠词或者是需要more，most的比较级，最高级抑或是be动词
        #print("缺冠词或者没用比较级")
        wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) 
        input_ids.insert(id_in_sen,tokenizer.vocab["[MASK]"])
        input_type_ids.append(0)
        #print(tokenizer.convert_ids_to_tokens(input_ids))
        list_DT = ['the','a','an','this','that','these','those','some','any','all','more','most','am','is','are','was','were'] 
        suggestion,need_DT,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_DT,1)
        if need_DT == 1:
            wordADJ,input_ids,input_type_ids,index = pre_training_input_entire(index)
            input_ids.insert(index,tokenizer.vocab[suggestion])
            input_type_ids.append(0)
            #print(tokenizer.convert_ids_to_tokens(input_ids))
            suggestion2,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,6)     
            return suggestion + ' ' + suggestion2
        else:
            return suggestion0
print(analyse_adj(78))

was unwilling


In [56]:
def analyse_adv(index):
    #这是一个处理形容词语法问题的函数，输入为问题词在text的token中的下标index
    need_DT = 0
    need_douhao = 0
    wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_entire(index)
    list_word,list_word2 = build_like_word_adv(wordADV)
    suggestion0,need_adv,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,3.5)
    if need_adv == 1 and suggestion0 != wordADV:
        return suggestion0
    elif get_word(index - 1) in ['more','most'] and len(list_word2) == 0:
        #判断是不是比较级使用错误,如果该形容词比较级/最高级不需要加more/most，但是前面有more/most
        wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) 
        del input_ids[id_in_sen - 1]
        del input_type_ids[0]
        suggestion3,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen - 1,list_word,5)
        return '去掉前面 ' + get_word(index - 1)+ ' 原位置改成 ' + suggestion3
    elif get_word(index + 1) in ['##er','##est','##r','##st'] and len(list_word2) != 0:
        #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most，但是错写成形容词+er/est
        wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) 
        input_ids[id_in_sen + 1] = tokenizer.vocab[wordADV]
        suggestion4,need_bijiao,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,['more','most'],2)
        if need_bijiao == 1:
            input_ids[id_in_sen] = tokenizer.vocab[suggestion4]
            suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen+1,list_word,5)
            return '去掉后面 '+ get_word(index + 1) + ' 原位置改成 '+ suggestion4 + ' ' + suggestion5  
    else:#检查形容词前面是否需要加冠词或者是需要more，most的比较级，最高级，be动词
        wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)
        input_ids.insert(id_in_sen,tokenizer.vocab["[MASK]"])
        input_type_ids.append(0)
        #print(tokenizer.convert_ids_to_tokens(input_ids))
        list_DT = ['the','a','an','this','that','these','those','some','any','all','more','most','am','is','are','was','were'] 
        suggestion,need_DT,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_DT,1)
        if need_DT == 1:
            #print("需要冠词")
            wordADV,input_ids,input_type_ids,index = pre_training_input_entire(index)
            input_ids.insert(index,tokenizer.vocab[suggestion])
            input_type_ids.append(0)
            #print(tokenizer.convert_ids_to_tokens(input_ids))
            suggestion2,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,3)     
            return suggestion + ' ' + suggestion2
        else:
            #副词后面可能缺少逗号，比如 Luckily,I won the game.
            wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)
            input_ids.insert(id_in_sen + 1,tokenizer.vocab[","])
            input_type_ids.append(0)
            suggestion3,need_douhao,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,2)
            if need_douhao == 1:
                return suggestion3 + ' ,'
            else:
                return suggestion0
print(analyse_adv(5))

,


In [57]:
from pattern.en import article,referenced,pluralize, singularize
import nltk
def analyse_N(index):
    #这是一个处理名词语法问题的函数，输入为问题词在text的token中的下标index
#******************************************初始数据处理**************************************************************************
    need_DT = 0 #表示是否需要在前面加冠词 
    wordN,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)
    word_tag = nltk.pos_tag([wordN])
    if word_tag[0][1] == "NN":
        N_ = wordN
        N_s= pluralize(wordN)
    else:
        N_ = singularize(wordN)
        N_s= wordN
    list_word = [N_,N_s]
    list_others = N_to_anything(N_)
    for other in list_others:
        list_word.append(other)
    #print(list_word)
#*****************************************判断是否需要冠词或者代词************************************************************************   
    
    input_ids.insert(id_in_sen,tokenizer.vocab["[MASK]"])
    input_type_ids.append(0)
    #print(tokenizer.convert_ids_to_tokens(input_ids))
    list_DT = ['the','a','an','this','that','these','those','some','any','all']
    suggestion,need_DT,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_DT,1)
    if need_DT == 0:#不需要冠词
        #print("不需要冠词")
        wordN,input_ids,input_type_ids,index = pre_training_input_entire(index)
        suggestion,need_DT,_,top_of_list_word = give_suggestion(input_ids,input_type_ids,index,list_word,7)
        return suggestion
    elif need_DT == 1:
        wordN,input_ids,input_type_ids,index = pre_training_input_entire(index)
        input_ids.insert(index,tokenizer.vocab[suggestion])
        input_type_ids.append(0)
        suggestion2,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,7)
        return suggestion + ' ' + suggestion2

print(analyse_N(78))


wanted


In [58]:
'''
    这是一个相关代词的词典，容易混淆的词放在一个列表中

'''
like_he = ['he','his','him','himself','who', 'whom', 'whose']
like_she = ['she','her','herself','hers','who', 'whom', 'whose']
like_it = ['it','its','itself','who', 'whom', 'whose']
like_i = ['i','me','my','myself','mine']
like_you = ['you','your','yourself','yourselves']
like_we = ['we','us','our','ours','ourselves']
like_they = ['they','them','their','theirs']

like_this = ['this', 'these'] 
like_that = ['that','those'] 
pronoun_Question = ['who', 'whom', 'whose', 'which', 'what', 'whoever', 'whichever', 'whatever'] #疑问代词
pronoun_relation =  ['that', 'which', 'who', 'whom', 'whose', 'as'] #关系代词
like_some = ['some','any']
like_few = ['few','little']
like_many = ['many','much']
like_other = ['another','other']

pronoun = [like_he,like_she,like_it,like_i,like_you,like_we,like_they,like_this,like_that,pronoun_Question,pronoun_relation,like_some,like_few,like_many,like_other]
pronoun_dictionary = {}

for list_word in pronoun:
    for word in list_word:
        pronoun_dictionary.update({word:list_word})
print(pronoun_dictionary)

'\n    这是一个相关代词的词典，容易混淆的词放在一个列表中\n\n'

{'he': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'his': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'him': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'himself': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'who': ['that', 'which', 'who', 'whom', 'whose', 'as'], 'whom': ['that', 'which', 'who', 'whom', 'whose', 'as'], 'whose': ['that', 'which', 'who', 'whom', 'whose', 'as'], 'she': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 'her': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 'herself': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 'hers': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 'it': ['it', 'its', 'itself', 'who', 'whom', 'whose'], 'its': ['it', 'its', 'itself', 'who', 'whom', 'whose'], 'itself': ['it', 'its', 'itself', 'who', 'whom', 'whose'], 'i': ['i', 'me', 'my', 'myself', 'mine'], 'me': ['i', 'me', 'my', 'myself', 'mine'], 'my': ['i', 'me', 'my', 'myself', 'mine'], 'm

In [59]:
def analyse_pronoun(index):
    #这是一个处理代词语法问题的函数，输入为问题词在text的token中的下标index
    wordPROP,input_ids,input_type_ids,index = pre_training_input_entire(index)
    list_word = pronoun_dictionary[wordPROP]
    suggestion,_,_,_= give_suggestion(input_ids,input_type_ids,index,list_word,3)
    return suggestion
print(analyse_pronoun(14))

KeyError: 'night'

解释一下，有很多副词确实也不是ly形式结尾的,比如用在三音节形容词前面的比较级more，most，还有频度副词often，seldom，never这种。因为这些词比较不容易用错，先暂时不考虑

In [60]:
def analyse_DT(index):
    #检查冠词，检查是不是用别的冠词，或者是去掉会不会更好
    wordDT,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)  
    if wordDT in ['all',"every",'per']:
        return wordDT
    else:
        if wordDT in ['some','any']:
            list_word = ['some','any']
        elif wordDT in ['this','that','these','those']:
            list_word = ['this','that','these','those']
        elif wordDT in ['the','a','an']:
            list_word = ['the','a','an']
        elif wordDT in ['another','other']:
            list_word = ['another','other']
        else:
            list_word = []
        suggestion0,need_DT,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,1)
        if wordDT in ['some','any','this','that','these','those','another','other','the','a','an']:
            if need_DT == 1:
                return suggestion0
            else:
                return "去掉 " + get_word(index)
        else:
            return wordDT
    
print(analyse_DT(25))

all


In [61]:
def analyse_IN(index):
    #检查介词是否需要去掉
    wordIN,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) 
    list_word = ["at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above","of",'to']
    suggestion0,need_IN,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,3)
    if need_IN == 1:
        return suggestion0
    else:
        if wordIN in list_word:
            return "去掉 " + get_word(index)
        else:
            return suggestion0
    
print(analyse_IN(76))

but


In [62]:
import nltk
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG
'''
    这是一个输出BERT模型训练结果的函数，方便查看调试
'''
def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20): #输出结果的函数，要最高概率topk个输出
    def print_pair(token, prob, end_str='', hit_mark=' '):
        if i < firstk:
            # token = token.replace('</w>', '').replace('\n', '/n')
            print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)
    
    ret = None
    for i in range(len(tokens)):
        ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]
        prob_ = probs[i][ind_].item() #这个probs是该字符串第i个位置上填上词典上各个词的概率，prob_是词典上原来天的这个词的概率
        print_pair(tokens[i], prob_, end_str='\t')
        values, indices = probs[i].topk(topk)
        #print(values, indices)
        #print("****************************************************************************************************")
        top_pairs = []
        for j in range(topk):
            ind, prob = indices[j].item(), values[j].item()
            hit_mark = '*' if ind == ind_ else ' '
            token = tokenizer.ids_to_tokens[ind]
            print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\n')
            top_pairs.append((token, prob))
        if tokens[i] == "[MASK]":
            ret = top_pairs
    return ret #返回的这是个啥

'\n    这是一个输出BERT模型训练结果的函数，方便查看调试\n'

In [63]:
import math
from pattern import en
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE
'''
    功能：
        judge_and_suggestion系列函数，这个系列函数是在analyse之前做的一个预先判断处理，判断的是该位置原来词的相关词中有没有可以代替它的词
        当相关词中有词的可能性和原词的可能性的差距大于阈值，则认为原词是错的
    输入：
        prob：该位置可能性列表
        original：该位置原先的词
        list_word：该位置相关词表
        threhold：门槛，也就是阈值
    输出：
        judge：判断原来的词是否正确，0表示需要换词，1表示不需要换词或者说相关词里面没一个合适的
        suggestion：相关词中最好的推荐
'''
def judge_and_suggestion(prob,original,list_word,threhold):
    top_prob = 0
    original_prob = prob[tokenizer.vocab[original]]
    best = None
    suggestion = None
    for word in list_word:
        try:
            word_id = tokenizer.vocab[word]
            prob_word = prob[word_id]
            if prob_word > top_prob:
                top_prob = prob_word
                best_word = word
        except KeyError:
            pass
    #print(best_word,top_prob)
    #print(original,original_prob)
    gap = math.log(top_prob) - math.log(original_prob)
    #print(gap)
    if gap > threhold:
        suggestion = best_word
        return 0,suggestion
    else:
        return 1,suggestion
def judge_CC_and_suggestion(prob,original_CC):
    list_CC = ["but","yet","still","however","although","for","so","thus","and","or","too","again","another","either","or","neither","nor","when","while","as","whenever","since","until","till"]
    judge,suggestion = judge_and_suggestion(prob,original_CC,list_CC,2)
    return judge,suggestion
def judge_V_and_suggestion(prob,original_V):
    list_V = lexeme(original_V)
    judge,suggestion = judge_and_suggestion(prob,original_V,list_V,2)
    #print("检查点",judge,suggestion)
    return judge,suggestion
    
def judge_IN_and_suggestion(prob,original_IN):
    list_IN = ["at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above","of",'to']
    judge,suggestion = judge_and_suggestion(prob,original_IN,list_IN,1)
    return judge,suggestion
def judge_DT_and_suggestion(prob,original_DT):
    if original_DT in ['some','any']:
        list_word = ['some','any']
    elif original_DT in ['this','that','these','those']:
        list_word = ['this','that','these','those']
    elif original_DT in ['the','a','an']:
        list_word = ['the','a','an']
    elif original_DT in ['another','other']:
        list_word = ['another','other']
    judge,suggestion = judge_and_suggestion(prob,original_DT,list_DT,1)
    return judge,suggestion

def judge_MD_and_suggestion(prob,original_MD):
    if original_MD in ['can','could']:
        list_MD = ['can','could']
    elif original_MD in ['may','might']:
        list_MD = ['may','might']
    elif original_MD in ['shall','should']:
        list_MD = ['shall','should']   
    elif original_MD in ['will','would']:
        list_MD = ['will','would']  
    elif original_MD in ['dare','dared']:
        list_MD = ['dare','dared']  
    else:
        list_MD = []
    judge,suggestion = judge_and_suggestion(prob,original_MD,list_MD,1)
    if original_MD not in ['can','could','may','might','shall','should','will','would'] :
        return judge,suggestion
    else:
        return 1,None
    
def judge_N_and_suggestion(prob,original_N):
    word_tag = nltk.pos_tag([original_N])
    if word_tag[0][1] == "NN":
        N_ = original_N
        N_s= pluralize(original_N)
    else:
        N_ = singularize(original_N)
        N_s= original_N
    list_N = [N_,N_s]
    list_others = N_to_anything(N_)
    for other in list_others:
        list_N.append(other)
    judge,suggestion = judge_and_suggestion(prob,original_N,list_N,0.5)
    return judge,suggestion



'\n    功能：\n        judge_and_suggestion系列函数，这个系列函数是在analyse之前做的一个预先判断处理，判断的是该位置原来词的相关词中有没有可以代替它的词\n        当相关词中有词的可能性和原词的可能性的差距大于阈值，则认为原词是错的\n    输入：\n        prob：该位置可能性列表\n        original：该位置原先的词\n        list_word：该位置相关词表\n        threhold：门槛，也就是阈值\n    输出：\n        judge：判断原来的词是否正确，0表示需要换词，1表示不需要换词或者说相关词里面没一个合适的\n        suggestion：相关词中最好的推荐\n'

In [64]:
import colored
from colored import stylize
import spacy
nlp = spacy.load('en')

suggestions = {} #
def show_abnormals(tokens, probs, show_suggestions=False):
    global suggestions
    suggestions = {} 
    def gap2color(gap):
        if gap <= 5:
            return 'yellow_1'
        elif gap <= 10:
            return 'orange_1'
        else:
            return 'red_1'
        
    def print_token(token, suggestion, gap):
        if gap == 0:
            print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')
        else:
            print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')
            if show_suggestions and gap > 5:
                print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')
            else:
                print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='')
                # print('/' + suggestion, end=' ')
            # print('%.2f' % gap, end=' ')
        
    avg_gap = 0.
    tokens_tag = nltk.pos_tag(tokens)
    #print(tokens_tag)
    for i in range(1, len(tokens) - 1):  # skip first [CLS] and last [SEP]
        ind_ = tokenizer.vocab[tokens[i]]
        prob_ = probs[i][ind_].item()
        top_prob = probs[i].max().item()
        top_ind = probs[i].argmax().item()
        top_word = tokenizer.ids_to_tokens[top_ind]
        gap = math.log(top_prob) - math.log(prob_) #计算两个词之间的差距
        print()
        print("*******************************************************************************************************************")
        print(i)
        print(gap)
        avg_gap += gap
        #suggestion = tokenizer.ids_to_tokens[top_ind]
        suggestion = None
        #tag = tokens_tag[i][1]
        #doc = nlp(tokens[i])
        #tag = doc[0].tag_
        tag = tokens_tag[i][1]
        #print(tokens_tag[i])
        print(tag)
        if 'VB' in tag:
            if gap>3 and top_word in ["at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above","of",'to']:
                suggestion = analyse_V(i)   #如果推荐的是介词，说明这个位置可能需要补充什么 
            elif gap > 7.5:
                suggestion = analyse_V(i)
            elif gap < 7.5 and gap > 3:
                judge,suggestion = judge_V_and_suggestion(probs[i],tokens[i])
                if judge == 0 :
                    gap = 6
                else:
                    gap = 3
        elif "DT" == tag and gap > 3:
            suggestion = analyse_DT(i)
        elif "JJ" in tag :
            if gap > 6:
                suggestion = analyse_adj(i)
            else:
                gap = 3
        elif "RB" in tag and gap > 5:
            suggestion = analyse_adv(i)
            
        elif "PRP" in tag and gap >5:
            suggestion = analyse_pronoun(i)
        elif "NN" in tag:
            if gap > 4 and tokens[i][:2]=="##" and suggestions.__contains__(i-1)==False:
                #如果gap>4并且该位置是后缀，并且前一个位置被建议修改，说明该位置需要去掉
                suggestion = '去掉' + ' ' + tokens[i]
            elif gap > 7.5:
                suggestion = analyse_N(i)
            elif gap < 7.5 and gap > 2:
                judge,suggestion = judge_N_and_suggestion(probs[i],tokens[i])
                if judge == 0 :
                    gap = 6
                else:
                    gap = 3
        elif "CC" in tag and gap > 2 :
            judge,suggestion = judge_CC_and_suggestion(probs[i],tokens[i])
            if judge == 1 :
                gap = 3

        elif ("IN" == tag or 'TO' == tag) and gap > 2:
            suggestion = analyse_IN(i)
            
        elif 'MD' in tag and gap > 5:
            print("检查点1*****************************************************")
            judge,suggestion = judge_MD_and_suggestion(probs[i],tokens[i])
            if judge == 1:
                gap = 3
                
        elif "CD" in tag:
            gap = 0  
            
        elif "WDT" == tag and gap > 2: #who，which，that那些
            suggestion = top_word
            
        elif gap > 5:
            suggestion = top_word
            
        if suggestion != tokens[i] and suggestion != None:
            suggestions.update({i:suggestion})
            gap = max(gap,6)
        else:
            gap = min(gap,3)
        print_token(tokens[i], suggestion, gap)
        
    avg_gap /= (len(tokens) - 2)
    print()
    print('平均gap:'+ str(avg_gap))
    

In [65]:
input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids

def analyze_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=20):
    step = 15
    #print(text[0])
    global input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids
    input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = process_text(text[0])

    examples = convert_text_to_examples(text)
    features = convert_examples_to_features(examples, tokenizer, print_info=False)
    given_mask = "[MASK]" in features[0].tokens
    if not given_mask or masked_tokens is not None:
        assert len(features) == 1
        features, batches = copy_and_mask_feature(features[0],step, masked_tokens=masked_tokens)
        #print(len(features))

    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) #把input_ids增加了一个维度，变成[n_features,sequence_len]
    #这里的n_features实际上是句子有多少批训练

    input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long) #把input_type_ids增加了一个维度，其实每一行都一样
    input_ids = input_ids.to(device) #拿去GPU
    input_type_ids = input_type_ids.to(device)

    mlm_logits, _ = model(input_ids, input_type_ids)
    mlm_probs = F.softmax(mlm_logits, dim=-1) #最后一维，也就是vocab 换算成概率和为百分之百
    #print(mlm_probs.size())#这里实验的是torch.Size([5, 5, 30522])
    tokens = features[0].tokens #为了输出，[mask]在input_ids里面表示出来，features的token都一样
    #print(tokens)
    if not given_mask or masked_tokens is not None:
        bsz, seq_len, vocab_size = mlm_probs.size() #三个维度分别是batch_size, sequence_length, vocab_size
        assert bsz == len(batches)
        # reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size)
        # for i in range(seq_len):
        #    reduced_mlm_probs[0, i] = mlm_probs[i, i]
        reduced_mlm_probs = torch.Tensor(1, len(tokens), vocab_size)
        for i in batches:
            pos = i
            while pos < len(tokens):
                reduced_mlm_probs[0, pos] = mlm_probs[i, pos]
                pos = pos + step
        mlm_probs = reduced_mlm_probs #压缩一下大小，节约不必要浪费的空间（只需要第i个batch里面[mask]位置的词汇表概率即可）
        #tokens = [tokens[i] for i in masked_positions]
    top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs) #传入的probs是二维的
    #print(top_pairs) #******************************
    if not given_mask:
        show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)
    #return top_pairs


([[101,
   2043,
   1045,
   2001,
   2210,
   1010,
   5958,
   1005,
   1055,
   2305,
   2001,
   2256,
   2155,
   2208,
   2305,
   1012,
   102],
  [101,
   2044,
   15264,
   1010,
   2057,
   2052,
   2377,
   4003,
   2399,
   1997,
   2035,
   4066,
   1999,
   1996,
   3564,
   2282,
   1012,
   102],
  [101,
   2004,
   1996,
   4845,
   1010,
   1045,
   3866,
   2000,
   3422,
   13941,
   1989,
   2021,
   2053,
   3043,
   2129,
   2116,
   2335,
   1045,
   2356,
   2000,
   3666,
   2068,
   1989,
   2026,
   3008,
   2052,
   2025,
   2000,
   2292,
   2033,
   1012,
   102],
  [101,
   2027,
   2052,
   2360,
   2000,
   2149,
   2008,
   2652,
   4003,
   2399,
   2052,
   2393,
   2026,
   4167,
   1012,
   102],
  [101,
   2145,
   1045,
   15175,
   2000,
   2377,
   1996,
   2399,
   2005,
   2068,
   2823,
   1012,
   102],
  [101,
   1045,
   2134,
   1005,
   1056,
   5382,
   2129,
   2157,
   2026,
   3008,
   2024,
   2127,
   1045,
   3133,
   2152,
   2

In [66]:
import time
# text = ["Who was Jim Henson? Jim Henson _ a puppeteer."]
# text = ["Last week I went to the theater. There are many person . Luckily , I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"]

#text = ["Last week I went to the theater. I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"]
# text = ["After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented."]
# text = ["Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence."]
#text = ["The journey was long and tired. We left London at five o'clock in the evening and spend eight hours in the train. We had been travelled for 3 hours after someone appeared selling food and drinks. It was darkness all the time we were crossing Wales, but we could see nothing through the windows. When we finally arrived Holyhead nearly , everyone was slept. As soon as the train stopped, everybody come to life, grabbing their suitcases and rushing onto the platform."]
text = ["When I was little, Friday's night was our family game night. After supper, we would play card games of all sort in the sitting room. As the kid, I loved to watch cartoons，but no matter how many times I asked to watching them， my parents would not to let me. They would say to us that playing card games would help my brain. Still I unwilling to play the games for them sometimes. I didn't realize how right my parents are until I entered high school. The games my parents taught me where I was a child turned out to be very useful later in my life."]
#text = ["Mr. and Mrs.Zhang all work in our school. They live far from the school, and it takes them about a hour and a half to go to work every day. In their spare time, they are interesting in planting vegetables in their garden, that is on the rooftop of their house. They often get up earlier and water the vegetables together. They have also bought in some gardening tools.beside, they often get some useful informations from the internet. When summer came, they will invite their students pick the vegetables！"]
#text = ['The question is more easy than that.']
#text = ["Last week I go to the zoo. I had a very good seat. The play was very interesting."]
#text =["Last week I went to the theater. I had very good seat. The play was very interesting.But I didn't enjoy it. A young man and a young woman were sitting behind me.They were talking loudly. I got very angry."]#因为外面有中括号，所以是二维的
time_start=time.time()
analyze_text(text, show_firstk_probs=200)
time_end=time.time()
print('time cost',time_end-time_start,'s')

03/21/2019 18:19:51 - INFO - examples.extract_features -   tokens: [CLS] when i was little , friday ' s night was our family game night . after supper , we would play card games of all sort in the sitting room . as the kid , i loved to watch cartoons ， but no matter how many times i asked to watching them ， my parents would not to let me . they would say to us that playing card games would help my brain . still i unwilling to play the games for them sometimes . i didn ' t realize how right my parents are until i entered high school . the games my parents taught me where i was a child turned out to be very useful later in my life . [SEP]


   0 | [CLS]       	   2 | .              1 | the            1 | )              1 | "              1 | ,           
  97 | when        	* 97 | when           2 | since          1 | until          0 | while          0 | before      
  99 | i           	* 99 | i              0 | she            0 | he             0 | we             0 | me          
 100 | was         	*100 | was            0 | were           0 | got            0 | turned         0 | is          
  11 | little      	  19 | younger     * 11 | little         8 | eight          7 | young          7 | twelve      
  51 | ,           	* 51 | ,             24 | .              4 | and            1 | of             1 | the         
   0 | friday      	  33 | valentine     16 | mother         7 | children       7 | father         5 | grandma     
 100 | '           	*100 | '              0 | `              0 | ′              0 | "              0 | *           
 100 | s           	*100 | s              0 | til            0 | n      

[38;5;226m[48;5;0mfamily[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
13
1.2859363936756965
NN
[38;5;226m[48;5;0mgame[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
14
0.0
NN
[38;5;15m[48;5;0mnight [0m
*******************************************************************************************************************
15
0.0
.
[38;5;15m[48;5;0m. [0m
*******************************************************************************************************************
16
0.0
IN
[38;5;15m[48;5;0mafter [0m
*******************************************************************************************************************
17
3.864973993379616
NN
[38;5;226m[48;5;0msupper[0m[38;5;226m[48;5;0m [0m
******************************************************************************************

[38;5;226m[48;5;0mbrain[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
75
0.0
.
[38;5;15m[48;5;0m. [0m
*******************************************************************************************************************
76
5.90809919263306
RB
[38;5;214m[48;5;0mstill[0m[38;5;6m[48;5;0m/still , [0m
*******************************************************************************************************************
77
2.2313680234481628
JJ
[38;5;226m[48;5;0mi[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
78
7.241924210620825
NN
[38;5;226m[48;5;0munwilling[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
79
0.033503519227476186
TO
[38;5;226m[48;5;0mto[0m[38;5;226m[48;5;0m [0m
********

In [438]:
#print(suggestions)
def display_suggestion():
    print("**********************************display_suggestions********************************************************")
    print("| {:50} : {}".format("suggestion","position in text"))
    print("---------------------------------------------------------------------------------------")
    for key in suggestions:
        print("| {:<50} : {}".format(suggestions[key] ,key))
    print("*************************************************************************************************************")
display_suggestion()

def modify_text(index):
    #entire_ids,entire_type_ids
    entire_ids_copy = copy.deepcopy(entire_ids)
    new_text = ""
    suggestion = suggestions[index]
    if suggestion[0:2] == '##':
        suggestion = tokenizer.ids_to_tokens[entire_ids_copy[index - 1]] + suggestion[2:]
        del entire_ids_copy[index]
        index = index - 1
    #print(suggestion)
    suggestion_tokens = suggestion.split(" ")
    print(suggestion_tokens)
    if '去掉前面' == suggestion_tokens[0]:
        del entire_ids_copy[index - 1]
        del suggestion_tokens[0]
        del suggestion_tokens[0]
        index = index - 1
    elif '去掉后面' == suggestion_tokens[0]:
        del entire_ids_copy[index + 1]
        del suggestion_tokens[0]
        del suggestion_tokens[0]
    elif '去掉' == suggestion_tokens[0]:
        del entire_ids_copy[index]
        del suggestion_tokens[0]
        del suggestion_tokens[0]
    if '原位置改成' in suggestion_tokens:
        del suggestion_tokens[0]
        
    len_suggest = len(suggestion_tokens)
    if len_suggest == 1:
        entire_ids_copy[index] = tokenizer.vocab[suggestion_tokens[0]]
    elif len_suggest == 2:
        entire_ids_copy.insert(index,tokenizer.vocab[suggestion_tokens[0]])
        entire_ids_copy[index + 1] = tokenizer.vocab[suggestion_tokens[1]]
        
    for i in range(1,len(entire_ids_copy)-1):
        word = tokenizer.ids_to_tokens[entire_ids_copy[i]]
        if word[0:2] == "##":
            new_text = new_text + word[2:]
        else:
            new_text = new_text + ' ' + tokenizer.ids_to_tokens[entire_ids_copy[i]]
    return new_text

print(modify_text(5))

**********************************display_suggestions********************************************************
| suggestion                                         : position in text
---------------------------------------------------------------------------------------
| 去掉前面 more 原位置改成 easier                             : 5
*************************************************************************************************************
['去掉前面', 'more', '原位置改成', 'easier']
 the question is easier than that .


In [283]:
import os
#text = ["Last week I went to the theater. There are many person . Luckily I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"]
#text = ["After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented."]
#text = ["me love yours."]
#text = ["Mr. and Mrs.Zhang all work in our school. They live far from the school, and it takes them about a hour and a half to go to work every day. In their spare time, they are interesting in planting vegetables in their garden, that is on the rooftop of their house. They often get up earlier and water the vegetables together. They have also bought in some gardening tools.beside, they often get some useful informations from the internet. When summer came, they will invite their students pick the vegetables！"]
text = ["When I was little, Friday's night was our family game night. After supper, we would play card games of all sort in the sitting room. As the kid, I loved to watch cartoons，but no matter how many times I asked to watching them， my parents would not to let me. They would say to us that playing card games would help my brain. Still I unwilling to play the games for them sometimes. I didn't realize how right my parents are until I entered high school. The games my parents taught me where I was a child turned out to be very useful later in my life."]
def analyse_and_modify_and_review():
    global text
    analyze_text(text, show_firstk_probs=200)
    while len(suggestions)>0:
        display_suggestion()
        print('建议的数量是',len(suggestions))
        if len(suggestions) == 0:
            break
        else:
            index = input("Please input the position you want to modify：")
            index = int(index)
            text[0] = modify_text(index)
            analyze_text(text, show_firstk_probs=200)
            
analyse_and_modify_and_review()

03/20/2019 15:48:16 - INFO - examples.extract_features -   tokens: [CLS] when i was little , friday ' s night was our family game night . after supper , we would play card games of all sort in the sitting room . as the kid , i loved to watch cartoons ， but no matter how many times i asked to watching them ， my parents would not to let me . they would say to us that playing card games would help my brain . still i unwilling to play the games for them sometimes . i didn ' t realize how right my parents are until i entered high school . the games my parents taught me where i was a child turned out to be very useful later in my life . [SEP]


['[CLS]', 'when', 'i', 'was', 'little', ',', 'friday', "'", 's', 'night', 'was', 'our', 'family', 'game', 'night', '.', 'after', 'supper', ',', 'we', 'would', 'play', 'card', 'games', 'of', 'all', 'sort', 'in', 'the', 'sitting', 'room', '.', 'as', 'the', 'kid', ',', 'i', 'loved', 'to', 'watch', 'cartoons', '，', 'but', 'no', 'matter', 'how', 'many', 'times', 'i', 'asked', 'to', 'watching', 'them', '，', 'my', 'parents', 'would', 'not', 'to', 'let', 'me', '.', 'they', 'would', 'say', 'to', 'us', 'that', 'playing', 'card', 'games', 'would', 'help', 'my', 'brain', '.', 'still', 'i', 'unwilling', 'to', 'play', 'the', 'games', 'for', 'them', 'sometimes', '.', 'i', 'didn', "'", 't', 'realize', 'how', 'right', 'my', 'parents', 'are', 'until', 'i', 'entered', 'high', 'school', '.', 'the', 'games', 'my', 'parents', 'taught', 'me', 'where', 'i', 'was', 'a', 'child', 'turned', 'out', 'to', 'be', 'very', 'useful', 'later', 'in', 'my', 'life', '.', '[SEP]']
*******************************************

[38;5;226m[48;5;0mfamily[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
13
1.2859363936756965
NN
[38;5;226m[48;5;0mgame[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
14
0.0
NN
[38;5;15m[48;5;0mnight [0m
*******************************************************************************************************************
15
0.0
.
[38;5;15m[48;5;0m. [0m
*******************************************************************************************************************
16
0.0
IN
[38;5;15m[48;5;0mafter [0m
*******************************************************************************************************************
17
3.864973993379616
NN
[38;5;226m[48;5;0msupper[0m[38;5;226m[48;5;0m [0m
******************************************************************************************

[38;5;226m[48;5;0mbrain[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
75
0.0
.
[38;5;15m[48;5;0m. [0m
*******************************************************************************************************************
76
5.90809919263306
RB
[38;5;214m[48;5;0mstill[0m[38;5;6m[48;5;0m/still , [0m
*******************************************************************************************************************
77
2.2313680234481628
JJ
[38;5;226m[48;5;0mi[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
78
7.241924210620825
NN
[38;5;226m[48;5;0munwilling[0m[38;5;226m[48;5;0m [0m
*******************************************************************************************************************
79
0.033503519227476186
TO
[38;5;226m[48;5;0mto[0m[38;5;226m[48;5;0m [0m
********

KeyboardInterrupt: 

In [None]:
text = ["The trophy doesn't fit into the brown suitcase because the _ is too large."]
# text = ["Mary beat John in the match because _ was very strong."]
features = convert_examples_to_features(convert_text_to_examples(text), tokenizer, print_info=False)
input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)
input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long).to(device)
mlm_logits, _ = model(input_ids, input_type_ids)
mlm_probs = F.softmax(mlm_logits, dim=-1)
tokens = features[0].tokens
top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=100)

In [None]:
text = [
    # same / different
    "Tom has black hair. Mary has black hair. John has yellow hair. _  and Mary have the same hair color.",
    "Tom has black hair. Mary has black hair. John has yellow hair. _  and Mary have different hair colors.",
    "Tom has yellow hair. Mary has black hair. John has black hair. Mary and _ have the same hair color.",
    # because / although
    "John is taller/shorter than Mary because/although _ is older/younger.",
    "The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.",
    "Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.",
    "The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.",
    "John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.",
    # reverse
    "John came then Mary came. They left in reverse order. _ left then _ left.",
    "John came after Mary. They left in reverse order. _ left after _ .",
    "John came first, then came Mary. They left in reverse order: _ left first, then left _ .",
    # compare
    "Though John is tall, Tom is taller than John. So John is _ than Tom.",
    "Tom is taller than John. So _ is shorter than _.",
    # WSC-style: before /after
    "Mary came before/after John. _ was late/early .",
    # yes / no
    "Was Tom taller than Susan? Yes, _ was taller.",
    # right / wrong, epistemic modality
    "John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong.",
    
    "The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.",
    "John thanked Mary because  _ had given help to _ . ",
    "John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.",
    "John couldn't see the stage with Mary in front of him because _ is so short/tall.",
    "Although they ran at about the same speed, John beat Sally because _ had such a bad start.",
    "The fish ate the worm. The _ was hungry/tasty.",
    
    "John beat Mary. _ won the game/e winner.",
]
text

In [1345]:
config

{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [1346]:
with open('WSC_switched_label.json') as f:
    examples = json.load(f)

In [9]:
with open('WSC_child_problem.json') as f:
    cexamples = json.load(f)

In [89]:
for ce in cexamples:
    for s in ce['sentences']:
        for a in s['answer0'] + s['answer1']:
            a = a.lower()
            if a not in tokenizer.vocab:
                ce
                print(a, 'not in vocab!!!')

In [23]:
for ce in cexamples:
    if len(ce['sentences']) > 0:
        e = examples[ce['index']]
        assert ce['index'] == e['index']
        e['score'] = all([s['score'] for s in ce['sentences']])
        assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'
        e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']

In [24]:
from collections import defaultdict

groups = defaultdict(list)
for e in examples:
    if 'score' in e:
        index = e['index']
        if index < 252:
            if index % 2 == 1:
                index -= 1
        elif index in [252, 253, 254]:
            index = 252
        else:
            if index % 2 == 0:
                index -= 1
        groups[index].append(e)

In [62]:
def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):
    return {k: d[k] for k in d if k in keys}

# ([[filter_dict(e) for e in eg] for eg in groups.values() if eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])
[(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']
# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])
# for e in examples:
#     if e['index'] % 2 == 0:
#         print(e['sentence'])

[(2, 'fit into:large/small', False),
 (4, 'thank:receive/give', False),
 (6, 'call:successful available', True),
 (8, 'ask:repeat answer', False),
 (10, 'zoom by:fast/slow', False),
 (12, 'vindicated/crushed:be the winner', False),
 (14, 'lift:weak heavy', False),
 (16, 'crash through:[hard]/[soft]', False),
 (18, '[block]:short/tall', False),
 (20, 'down to:top/bottom', False),
 (22, 'beat:good/bad', False),
 (24, 'roll off:anchored level', False),
 (26, 'above/below', False),
 (28, 'better/worse:study hard', False),
 (30, 'after/before:far away', False),
 (32, 'be upset with:buy from not work/sell not work', True),
 (34, '?yell at comfort:upset', False),
 (36, 'above/below:moved first', False),
 (38, 'although/because', False),
 (40, 'bully:punish rescue', False),
 (42, 'pour:empty/full', False),
 (44, 'know:nosy indiscreet', False),
 (46, 'explain:convince/understand', True),
 (48, '?know tell:so/because', True),
 (50, 'beat:younger/older', False),
 (56, 'clog:cleaned removed', True

In [51]:
sum(['because' in e['sentence'] for e in examples]) + \
sum(['so ' in e['sentence'] for e in examples]) + \
sum(['but ' in e['sentence'] for e in examples]) + \
sum(['though' in e['sentence'] for e in examples])

179

In [73]:
# with open('WSC_switched_label.json', 'w') as f:
#     json.dump(examples, f)

In [19]:
vis_attn_topk = 3

def has_chinese_label(labels):
    labels = [label.split('->')[0].strip() for label in labels]
    r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1)
    return 0 < r < 0.5  # r == 0 means empty query labels used in self attention

def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):
    assert len(query_labels) == attn.size(0)
    assert len(key_labels) == attn.size(1)

    ax1.set_xlim([-1, 1])
    ax1.set_xticks([])
    ax2 = ax1.twinx()
    nlabels = max(len(key_labels), len(query_labels))
    pos = range(nlabels)
    
    if 'self' in attn_name and col < ncols - 1:
        query_labels = ['' for _ in query_labels]

    for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:
        ax.set_yticks(pos)
        if has_chinese_label(labels):
            ax.set_yticklabels(labels, fontproperties=zhfont)
        else:
            ax.set_yticklabels(labels)
        ax.set_ylim([nlabels - 1, 0])
        ax.tick_params(width=0, labelsize='xx-large')

        for spine in ax.spines.values():
            spine.set_visible(False)

#     mask, attn = filter_attn(attn)
    for qi in range(attn.size(0)):
#         if not mask[qi]:
#             continue
#         for ki in range(attn.size(1)):
        for ki in attn[qi].topk(vis_attn_topk)[1]:
            a = attn[qi, ki]
            ax1.plot((-1, 1), (ki, qi), color, alpha=a)
#     print(attn.mean(dim=0).topk(5)[0])
#     ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())

def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, heads=None):
    hypo, nheads, labels_dict = result_tuple
    key_labels, query_labels = labels_dict[attn_name]
    if heads is None:
        heads = range(nheads)
    else:
        nheads = len(heads)
    
    stride = 2 if attn_name == 'dec_enc_attns' else 1
    nlabels = max(len(key_labels), len(query_labels))
    rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))
    
    rows = nheads // ncols * stride
    fig, axes = plt.subplots(rows, ncols)
    
    # for head in range(nheads):
    for head_i, head in enumerate(heads):
        row, col = head_i * stride // ncols, head_i * stride % ncols
        ax1 = axes[row, col]
        attn = hypo[attn_name][layer][head]
        _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)
        if attn_name == 'dec_enc_attns':
            col = col + 1
            axes[row, col].axis('off')  # next subfig acts as blank place holder
    # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)
    plt.show()  
            
ncols = 4

In [31]:
attn_name = 'enc_self_attns'
hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}
key_labels = query_labels = tokens
labels_dict = {attn_name: (key_labels, query_labels)}
result_tuple = (hypo, config.num_attention_heads, labels_dict)
plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)

AttributeError: 'BertSelfAttention' object has no attribute 'attention_probs'