In [195]:
import torch
from transformers import AutoTokenizer, AutoModel
from keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import wordnet as wn
import numpy as np
import pandas as pd
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

# Prepare step

In [196]:
# load data
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased', top_k=10)
clear_output()

In [3]:
# test
sentense = "Letters whose sole [MASK] is to make a political point will not be published."
candidate = unmasker(sentense)
result = []
for i in range(len(candidate)):
    result.append((candidate[i]['score'], candidate[i]['token_str']))
    
result

[(0.8293458223342896, 'purpose'),
 (0.06537234783172607, 'aim'),
 (0.02632279321551323, 'goal'),
 (0.013286354951560497, 'object'),
 (0.011476273648440838, 'function'),
 (0.00935242511332035, 'objective'),
 (0.00922190211713314, 'intention'),
 (0.007544944994151592, 'intent'),
 (0.004589710384607315, 'task'),
 (0.004187312442809343, 'use')]

In [7]:
# load corpus 
with open('dataset/BAWE.txt', 'r', encoding='utf-8') as f:
    BAME_corpus = f.read().strip().split('. ')
with open('dataset/big.txt', 'r', encoding='utf-8') as f:
    big_corpus = f.read().strip().split('. ')
with open('dataset/paper.txt', 'r', encoding='utf-8') as f:
    paper_corpus = f.read().strip().split('. ')
with open('dataset/party_test.txt', 'r', encoding='utf-8') as f:
    party_test_corpus = f.read().strip().split('\n')
with open('dataset/party_train.txt', 'r', encoding='utf-8') as f:
    party_train_corpus = f.read().strip().split('\n')
    
corpuses = [BAME_corpus, big_corpus, paper_corpus, party_test_corpus, party_train_corpus]
cor_names = ["BAME_corpus", "big_corpus", "paper_corpus", "party_test_corpus", "party_train_corpus"]
c_len = len(cor_names)
for i in  range(c_len):
    print(cor_names[i], "len:", len(corpuses[i]))
    
corpus_combine = BAME_corpus + big_corpus + paper_corpus + party_test_corpus + party_train_corpus

BAME_corpus len: 244506
big_corpus len: 31564
paper_corpus len: 123656
party_test_corpus len: 70
party_train_corpus len: 637


In [76]:
# load AKL words
with open("data/noun.txt", 'r', encoding="utf-8") as f:
    noun = f.read().strip().split(', ')
with open("data/adj.txt", 'r', encoding="utf-8") as f:
    adj = f.read().strip().split(', ')
with open("data/adv.txt", 'r', encoding="utf-8") as f:
    adv = f.read().strip().split(', ')
with open("data/verb.txt", 'r', encoding="utf-8") as f:
    verb = f.read().strip().split(', ')
with open("data/others.txt", 'r', encoding="utf-8") as f:
    others = f.read().strip().split(', ')
    
AKL_words = [noun, adj, adv, verb, others]
AKL_merge = noun + adj + adv + verb + others
types = ["noun", "adj", "adv", "verb", "others"]

In [15]:
a_len = len(AKL_words)
for i in  range(a_len):
    print(types[i], "words:", len(AKL_words[i]))

noun words: 353
adj words: 180
adv words: 86
verb words: 233
others words: 75


In [17]:
# Preprocess the sentences
def preprocess(text):
    """
    input: a string
    output: a list
    - transform to lower case
    - remove the punctuation
    - seperate the words by blank
    """
    text = text.lower()
    punc = '!()-[]{};:"\,<">./?@#$%^&*_~1234567890'
    for p in punc: 
        text = text.replace(p, "")
    return text

corpus = []
for cor in corpus_combine:
    sentence = preprocess(cor)
    corpus.append(sentence)

# Step 1: get possible candidate words

In [116]:
base_word = "ability" 

<font color="red">**[ TODO 1 ]**</font> 如何納入動詞、名詞的變化

In [117]:
def check_single_word(st, base_word):
    tokens = st.split(' ')
    if base_word in tokens:
        return True
    return False

In [119]:
def put_mask(sentense, base_word):
    """
    put mask on the target word
    """
    tokens = sentense.split(' ')
    rep_tokens = ["[MASK]" if word==base_word else word for word in tokens]
    
    res_sent = " ".join(rep_tokens)
    return res_sent

In [120]:
def get_candidates(sentense, base_word):
    sentense = put_mask(sentense, base_word)
    candidate = unmasker(sentense)
    result = {}
    for i in range(len(candidate)):
        if candidate[i]['token_str'] == base_word:
            continue
        result[candidate[i]['token_str']] = candidate[i]['score']
    return result

In [123]:
# get the sentense that contains base_word
filter_corpus = []
for cor in corpus: 
    if check_single_word(cor, base_word): 
        filter_corpus.append(cor)
print("length of our base word sentense: ", len(filter_corpus))

length of our base word sentense:  2071


<font color="red">**[ TODO 2 ]**</font> 套入其他例句

In [161]:
used_sentense = filter_corpus[0]
cand = get_candidates(used_sentense, base_word)
cand

{'capacity': 0.023000137880444527,
 'abilities': 0.007775991223752499,
 'freedom': 0.006012896075844765,
 'willingness': 0.004808926954865456,
 'capability': 0.004298985470086336,
 'desire': 0.0022593154571950436,
 'opportunity': 0.001206710352562368,
 'skills': 0.0008579287678003311,
 'determination': 0.0008293994469568133}

# Step 2: Processing weight

In [78]:
def check_akl(word):
    if word in AKL_merge:
        return True
    return False

In [105]:
def get_POS(sentense, target_word):
    """
    回傳 `target_word` 在 `sentense`中的詞性
    詞性種類: https://www.guru99.com/pos-tagging-chunking-nltk.html
    """
    tokens = nltk.word_tokenize(sentense)
    tag = nltk.pos_tag(tokens)
    for tu in tag:
        if tu[0] == target_word:
            return tu[1]

In [164]:
def get_similarity_score(base_word, syn_word):
    """
    return mean similarity score of this two words
    compare all meaning
    """
    base_sets = wn.synsets(base_word)
    syn_sets = wn.synsets(syn_word)
    n = len(base_sets)
    m = len(syn_sets)
    score = 0
    for i in range(n):
        for j in range(m):
            try:
                score += base_sets[i].path_similarity(syn_sets[j])
            except:
                pass
    score = score/ (n*m)
    return score

<font color="red">**[ TODO 3 ]**</font> wordnet 有好幾種算相似度的方法，哪個最適合?

- path_similarity

-  lch_similarity

-  wup_similarity

https://www.nltk.org/howto/wordnet.html

In [172]:
def calculate_weight(cand, sentense, base_word):
    """
    input 1: the possible words dictionary
    input 2: the sentense used
    input 3: base word
    """
    data_items = cand.items()
    data_list = list(data_items)
    cand_df = pd.DataFrame(data_list, columns=['Words', 'Score'])
    
    # AKL part
    c_len = len(cand_df)
    for i in range(c_len):
        if check_akl(cand_df['Words'][i]):
            cand_df['Score'][i] = cand_df['Score'][i] *1.25
            print("in AKL")
            
    # POS-tagging part
    base_pos = get_POS(sentense, base_word)
    for i in range(c_len): 
        cand_pos = get_POS(sentense, cand_df['Words'][i])
        if cand_pos == base_pos:
            cand_df['Score'][i] = cand_df['Score'][i] *1.5
            print("Same type")
    
    # Wordnet Similarity
    for i in range(c_len):
        cand_df['Score'][i] += get_similarity_score(base_word, cand_df['Words'][i])
    
    cand_df = cand_df.sort_values(by=['Score'], ascending=False).reset_index(drop=True)
    return cand_df

<font color="red">**[ TODO 4 ]**</font> nltk 詞性分得太細了，怎麼降低標準 (詞性加權無法使用)

In [176]:
# input : cand, used_sentense, base_word
result_df = calculate_weight(cand, used_sentense, base_word)
result_df

in AKL
in AKL
in AKL


Unnamed: 0,Words,Score
0,abilities,0.579205
1,skills,0.313358
2,capability,0.224868
3,capacity,0.175288
4,freedom,0.155219
5,willingness,0.131793
6,opportunity,0.128493
7,determination,0.128144
8,desire,0.066479


In [181]:
syn_final_word = result_df['Words'][1]

# Step 3: Find the closest meaning between two words

In [183]:
def find_sense_of_two_words(base_word, syn_word):
    base_word = wn.synsets(base_word) #可增加詞性 base_word = wn.synsets(base_word, pos=wn.VERB)  [VERB, NOUN, ADJ, ADV]
    syn_word = wn.synsets(syn_word) #可增加詞性 syn_word = wn.synsets(syn_word, pos=wn.VERB)  [VERB, NOUN, ADJ, ADV]
    
    path_similarity=[]
    path_similarity_dict={}
    for i in base_word:
        for j in syn_word:
            path_similarity.append(wn.path_similarity(i, j))
            path_similarity_dict[wn.path_similarity(i, j)]=[i,j]
            
    #找出相似度最大的值與sense    
    similarity = max(path_similarity)
    #propose sense編號 
    sense= path_similarity_dict[max(path_similarity)][0]
    #propose 字義
    definition = path_similarity_dict[max(path_similarity)][0].definition()
  
    return similarity, sense, definition  #propose和need相似度, propose和need相似度最接近的sense編號, 字義 

In [184]:
similarity, sense, definition = find_sense_of_two_words(base_word, syn_final_word)

# Result

In [194]:
print(f"""
Target Word：{base_word}

例句：{used_sentense}

--------------------

在此例句中 "{base_word}" 字義：{definition}
""")


Target Word：ability

例句：the teacher encouraged them to discuss their ideas and again  advocates this  the quality of pupils' mathematical thinking as well as their ability to express themselves are considerably enhanced by discussion

--------------------

在此例句中 "ability" 字義：possession of the qualities (especially mental qualities) required to do something or get something done

