In [1]:
import torch
from transformers import pipeline
import nltk
from lemminflect import getAllInflections, getAllLemmas
from nltk.corpus import wordnet as wn
import numpy as np
import pandas as pd
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

# Prepare step

In [2]:
# load data
unmasker = pipeline('fill-mask', model='bert-base-uncased', top_k=10)
clear_output()

In [57]:
# test
sentense = "Letters whose sole [MASK] is to make a political point will not be published."
candidate = unmasker(sentense)
result = []
for i in range(len(candidate)):
    result.append((candidate[i]['score'], candidate[i]['token_str']))
    
result

[(0.8293458223342896, 'purpose'),
 (0.06537234783172607, 'aim'),
 (0.02632279321551323, 'goal'),
 (0.013286354951560497, 'object'),
 (0.011476273648440838, 'function'),
 (0.00935242511332035, 'objective'),
 (0.00922190211713314, 'intention'),
 (0.007544944994151592, 'intent'),
 (0.004589710384607315, 'task'),
 (0.004187312442809343, 'use')]

In [4]:
# load corpus 
with open('dataset/BAWE.txt', 'r', encoding='utf-8') as f:
    BAME_corpus = f.read().strip().split('. ')
with open('dataset/big.txt', 'r', encoding='utf-8') as f:
    big_corpus = f.read().strip().split('. ')
with open('dataset/paper.txt', 'r', encoding='utf-8') as f:
    paper_corpus = f.read().strip().split('. ')
with open('dataset/party_test.txt', 'r', encoding='utf-8') as f:
    party_test_corpus = f.read().strip().split('\n')
with open('dataset/party_train.txt', 'r', encoding='utf-8') as f:
    party_train_corpus = f.read().strip().split('\n')
    
corpuses = [BAME_corpus, big_corpus, paper_corpus, party_test_corpus, party_train_corpus]
cor_names = ["BAME_corpus", "big_corpus", "paper_corpus", "party_test_corpus", "party_train_corpus"]
c_len = len(cor_names)
for i in  range(c_len):
    print(cor_names[i], "len:", len(corpuses[i]))
    
corpus_combine = BAME_corpus + big_corpus + paper_corpus + party_test_corpus + party_train_corpus

BAME_corpus len: 244506
big_corpus len: 31564
paper_corpus len: 123656
party_test_corpus len: 70
party_train_corpus len: 637


In [5]:
# load AKL words
with open("data/noun.txt", 'r', encoding="utf-8") as f:
    noun = f.read().strip().split(', ')
with open("data/adj.txt", 'r', encoding="utf-8") as f:
    adj = f.read().strip().split(', ')
with open("data/adv.txt", 'r', encoding="utf-8") as f:
    adv = f.read().strip().split(', ')
with open("data/verb.txt", 'r', encoding="utf-8") as f:
    verb = f.read().strip().split(', ')
with open("data/others.txt", 'r', encoding="utf-8") as f:
    others = f.read().strip().split(', ')
    
AKL_words = [noun, adj, adv, verb, others]
AKL_merge = noun + adj + adv + verb + others
types = ["noun", "adj", "adv", "verb", "others"]

In [6]:
a_len = len(AKL_words)
for i in  range(a_len):
    print(types[i], "words:", len(AKL_words[i]))

noun words: 353
adj words: 180
adv words: 86
verb words: 233
others words: 75


In [7]:
# Preprocess the sentences
def preprocess(text):
    """
    input: a string
    output: a list
    - transform to lower case
    - remove the punctuation
    - seperate the words by blank
    """
    text = text.lower()
    punc = '!()-[]{};:"\,<">./?@#$%^&*_~1234567890'
    for p in punc: 
        text = text.replace(p, "")
    return text

corpus = []
for cor in corpus_combine:
    sentence = preprocess(cor)
    corpus.append(sentence)

# Step 1: get possible candidate words

In [68]:
base_word = "ability" 

In [85]:
getAllInflections(base_word)

{'NNS': ('abilities', 'ability'), 'NN': ('ability',)}

In [70]:
def check_word_exist(st, base_word):
    """"
    若st 中有base_word的任何變形，回傳True
    """
    tokens = st.split(' ')
    vairation = getAllInflections(base_word)
    var_list = set()
    for types in vairation:
        for item in vairation[types]:
            var_list.add(item)
        
    for item in var_list:
        if item in tokens:
            return True
    return False

In [87]:
def put_mask(sentense, base_word):
    """
    把 [MASK] 放到第一個出現的 `base_word`各種變形
    """
    tokens = sentense.split(' ')
    vairation = getAllInflections(base_word)
    var_list = set()
    for types in vairation:
        for item in vairation[types]:
            var_list.add(item)
            
    rep_tokens = []
    mask = 0 # Only put mask on the first appeared base word
    for token in tokens:
        add = 0
        for item in var_list:
            if token == item and mask== 0:
                rep_tokens.append("[MASK]")
                add = 1
                mask += 1
        if add == 0:
            rep_tokens.append(token)

    res_sent = " ".join(rep_tokens)
    return res_sent, var_list

In [101]:
def get_candidates(sentense, base_word):
    """
    所有`base_word`的變形都不會納入candidates
    """
    sentense, var_list = put_mask(sentense, base_word)
    candidate = unmasker(sentense)
    result = {}
    for i in range(len(candidate)):
        same = 0
        for item in var_list:
            if candidate[i]['token_str'] == item:
                same = 1
        if same == 0:
            result[candidate[i]['token_str']] = candidate[i]['score']
    return result

In [81]:
# get the sentense that contains base_word
filter_corpus = []
for cor in corpus: 
    if check_word_exist(cor, base_word): 
        filter_corpus.append(cor)
print("length of our base word sentense: ", len(filter_corpus))

length of our base word sentense:  2345


In [102]:
used_sentense = filter_corpus[0]
cand = get_candidates(used_sentense, base_word)
cand

{'capacity': 0.023000137880444527,
 'freedom': 0.006012896075844765,
 'willingness': 0.004808926954865456,
 'capability': 0.004298985470086336,
 'desire': 0.0022593154571950436,
 'opportunity': 0.001206710352562368,
 'skills': 0.0008579287678003311,
 'determination': 0.0008293994469568133}

# Step 2: Processing weight

In [109]:
def check_akl(word):
    if word in AKL_merge:
        return True
    return False

In [108]:
def get_POS(sentense, target_word):
    """
    回傳 `target_word` 在 `sentense`中的詞性
    詞性種類: https://www.guru99.com/pos-tagging-chunking-nltk.html
    """
    tokens = nltk.word_tokenize(sentense)
    tag = nltk.pos_tag(tokens)
    for tu in tag:
        if tu[0] == target_word:
            return tu[1]

In [107]:
def get_similarity_score(base_word, syn_word):
    """
    return mean similarity score of this two words
    compare all meaning
    """
    base_sets = wn.synsets(base_word)
    syn_sets = wn.synsets(syn_word)
    n = len(base_sets)
    m = len(syn_sets)
    score = 0
    for i in range(n):
        for j in range(m):
            try:
                score += base_sets[i].wup_similarity(syn_sets[j])
            except:
                pass
    score = score/ (n*m)
    return score

<font color="red">**[ TODO 3 ]**</font> wordnet 有好幾種算相似度的方法，哪個最適合?

- path_similarity

-  lch_similarity

-  wup_similarity

https://www.nltk.org/howto/wordnet.html#similarity

- path_similarity:
    
    Return a score denoting how similar two word senses are, based on the shortest path that connects the senses in the is-a (hypernym/hyponym) taxonomy.
    
    -> 檢查是否有上下位關係 

- lch_similarity: 

    based on the shortest path that connects the senses (as above) and the maximum depth of the taxonomy in which the senses occur. The relationship is given as -log(p/2d) where p is the shortest path length and d the taxonomy depth.
    
    -> 與上類似，算法不同

- wup_similarity: 
    based on the depth of the two senses in the taxonomy and that of their Least Common Subsumer (most specific ancestor node). 
    
    -> 回傳最接近的ancestor深度

In [110]:
def calculate_weight(cand, sentense, base_word):
    """
    input 1: the possible words dictionary
    input 2: the sentense used
    input 3: base word
    """
    data_items = cand.items()
    data_list = list(data_items)
    cand_df = pd.DataFrame(data_list, columns=['Words', 'Score'])
    
    # AKL part
    c_len = len(cand_df)
    for i in range(c_len):
        if check_akl(cand_df['Words'][i]):
            cand_df['Score'][i] = cand_df['Score'][i] *1.25
            print("in AKL")
            
    # POS-tagging part
    base_pos = get_POS(sentense, base_word)
    for i in range(c_len): 
        cand_pos = get_POS(sentense, cand_df['Words'][i])
        if cand_pos == base_pos:
            cand_df['Score'][i] = cand_df['Score'][i] *1.5
            print("Same type")
    
    # Wordnet Similarity
    for i in range(c_len):
        cand_df['Score'][i] += get_similarity_score(base_word, cand_df['Words'][i])
    
    cand_df = cand_df.sort_values(by=['Score'], ascending=False).reset_index(drop=True)
    return cand_df

<font color="red">**[ TODO 4 ]**</font> nltk 詞性分得太細了，怎麼降低標準 (詞性加權無法使用)

In [111]:
# input : cand, used_sentense, base_word
result_df = calculate_weight(cand, used_sentense, base_word)
result_df

in AKL
in AKL
in AKL


Unnamed: 0,Words,Score
0,skills,0.637222
1,capability,0.549754
2,freedom,0.464346
3,capacity,0.431046
4,determination,0.430224
5,willingness,0.421476
6,opportunity,0.418175
7,desire,0.211564


In [112]:
syn_final_word = result_df['Words'][0]

# Step 3: Find the closest meaning between two words

In [113]:
def find_sense_of_two_words(base_word, syn_word):
    base_word = wn.synsets(base_word) #可增加詞性 base_word = wn.synsets(base_word, pos=wn.VERB)  [VERB, NOUN, ADJ, ADV]
    syn_word = wn.synsets(syn_word) #可增加詞性 syn_word = wn.synsets(syn_word, pos=wn.VERB)  [VERB, NOUN, ADJ, ADV]
    
    path_similarity=[]
    path_similarity_dict={}
    for i in base_word:
        for j in syn_word:
            path_similarity.append(wn.path_similarity(i, j))
            path_similarity_dict[wn.path_similarity(i, j)]=[i,j]
            
    #找出相似度最大的值與sense    
    similarity = max(path_similarity)
    #propose sense編號 
    sense= path_similarity_dict[max(path_similarity)][0]
    #propose 字義
    definition = path_similarity_dict[max(path_similarity)][0].definition()
  
    return similarity, sense, definition  #propose和need相似度, propose和need相似度最接近的sense編號, 字義 

In [114]:
similarity, sense, definition = find_sense_of_two_words(base_word, syn_final_word)

# Result

In [115]:
print(f"""
Target Word：{base_word}

例句：{used_sentense}

--------------------

在此例句中 "{base_word}" 字義：{definition}
""")


Target Word：ability

例句：the teacher encouraged them to discuss their ideas and again  advocates this  the quality of pupils' mathematical thinking as well as their ability to express themselves are considerably enhanced by discussion

--------------------

在此例句中 "ability" 字義：possession of the qualities (especially mental qualities) required to do something or get something done



# Evaluate

http://man.hubwiz.com/docset/NLTK.docset/Contents/Resources/Documents/api/nltk.corpus.reader.html#module-nltk.corpus.reader.semcor

https://www.nltk.org/api/nltk.corpus.reader.semcor.html

https://www.nltk.org/_modules/nltk/corpus/reader/semcor.html

https://www.nltk.org/howto/corpus.html#chunked-corpora

In [2]:
import nltk

In [3]:
from nltk.corpus import semcor

In [4]:
# nltk.download('semcor')

[nltk_data] Downloading package semcor to
[nltk_data]     C:\Users\WangHongWen\AppData\Roaming\nltk_data...


True

In [4]:
len(semcor.sents()) # total number of sentences

37176

In [2]:
" ".join(semcor.sents()[0]) # sentences

"The Fulton County Grand Jury said Friday an investigation of Atlanta 's recent primary election produced `` no evidence '' that any irregularities took place ."

In [26]:
semcor.tagged_sents(tag='pos')[5] # POS-tagging

[Tree('PRP', ['It']),
 Tree('VB', ['recommended']),
 Tree('IN', ['that']),
 Tree('NNP', ['Fulton']),
 Tree('NN', ['legislators']),
 Tree('VB', ['act']),
 Tree(None, ['``']),
 Tree('TO', ['to']),
 Tree('VB', ['have']),
 Tree('DT', ['these']),
 Tree('NN', ['laws']),
 Tree('VB', ['studied']),
 Tree('CC', ['and']),
 Tree('VB', ['revised']),
 Tree('TO', ['to']),
 Tree('DT', ['the']),
 Tree('NN', ['end']),
 Tree('IN', ['of']),
 Tree('VB', ['modernizing']),
 Tree('CC', ['and']),
 Tree('VB', ['improving']),
 Tree('PRP', ['them']),
 Tree(None, ["''"]),
 Tree(None, ['.'])]

In [31]:
semcor.tagged_sents(tag='sem')[0] # wordnet sense-tagging

[['The'],
 Tree(Lemma('group.n.01.group'), [Tree('NE', ['Fulton', 'County', 'Grand', 'Jury'])]),
 Tree(Lemma('state.v.01.say'), ['said']),
 Tree(Lemma('friday.n.01.Friday'), ['Friday']),
 ['an'],
 Tree(Lemma('probe.n.01.investigation'), ['investigation']),
 ['of'],
 Tree(Lemma('atlanta.n.01.Atlanta'), ['Atlanta']),
 ["'s"],
 Tree(Lemma('late.s.03.recent'), ['recent']),
 Tree(Lemma('primary.n.01.primary_election'), ['primary', 'election']),
 Tree(Lemma('produce.v.04.produce'), ['produced']),
 ['``'],
 ['no'],
 Tree(Lemma('evidence.n.01.evidence'), ['evidence']),
 ["''"],
 ['that'],
 ['any'],
 Tree(Lemma('abnormality.n.04.irregularity'), ['irregularities']),
 Tree(Lemma('happen.v.01.take_place'), ['took', 'place']),
 ['.']]