In [31]:
import numpy as np
import os
from os.path import join as pjoin
from glob import iglob
from shutil import copyfile

import jsonlines
import pickle

In [11]:
root_path = '/data/ksb'
result_dir = './logs'
model_name = 'ext_bert_korean_step50000'

result_path = pjoin(result_dir, model_name)
result_path

'./logs/ext_bert_korean_step50000'

In [4]:
def mkdir_p(path):
    import errno
    try:
        os.makedirs(path)
    except OSError as err:
        if err.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
            
def del_folder(path):
    try:
        rmtree(path)
    except:
        pass

In [34]:
def get_mapping_list(path, mapping_list):
    
    article_list = []
    
    with jsonlines.open(path) as reader:
        for news in reader:
            content = news['article_original']
            idx = news['id']
            
            if not idx in mapping_list:
                continue
                
            abs_summ = news['abstractive']
            ext_summ = news['extractive']

            article = (idx, content, abs_summ, ext_summ)
            article_list.append(article)
        
    return article_list

### After Training koBERT

In [6]:
from numpy import dot 
from numpy.linalg import norm 
import numpy as np

In [7]:
import gensim
print(gensim.__version__)

from gensim.models import FastText
import fasttext.util
from gensim import models
from gensim.matutils import softcossim
from gensim import corpora
from gensim.utils import simple_preprocess
import gensim.downloader as api

from scipy import sparse, stats
from collections import deque


3.8.0


In [12]:
ko_model = models.fasttext.load_facebook_model(pjoin(root_path, 'cc.ko.300.bin'))

In [13]:
ko_model

<gensim.models.fasttext.FastText at 0x7f7f20d92760>

In [14]:
len(ko_model.wv.vocab)

2000000

In [15]:
from gensim import utils, matutils  
def get_similarity(w1, w2, model):
    return dot(matutils.unitvec(model[w1]), matutils.unitvec(model[w2]))

In [16]:
def similarity_matrix(model, dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=np.float32):
    
    matrix_order = len(dictionary)
    matrix_nonzero = [1] * matrix_order
    matrix = sparse.identity(matrix_order, dtype=dtype, format="dok")
    num_skipped = 0
    
    # Decide the order of rows.
    if tfidf is None:
        word_indices = deque(sorted(dictionary.keys()))
    else:
        assert max(tfidf.idfs) < matrix_order
        word_indices = deque([
            index for index, _
            in sorted(tfidf.idfs.items(), key=lambda x: (x[1], -x[0]), reverse=True)
        ])

    # Traverse rows.
    for row_number, w1_index in enumerate(list(word_indices)):
        word_indices.popleft()

        w1 = dictionary[w1_index]
        if w1 not in model.wv.vocab:
            num_skipped += 1
            continue  # A word from the dictionary is not present in the word2vec model.

        # Traverse upper triangle columns.
        if matrix_order <= nonzero_limit + 1:  # Traverse all columns.
            columns = (
                (w2_index, get_similarity(w1, dictionary[w2_index], model))
                for w2_index in word_indices
                if dictionary[w2_index] in model.wv.vocab)
        else:  # Traverse only columns corresponding to the embeddings closest to w1.
            num_nonzero = matrix_nonzero[w1_index] - 1
            columns = (
                (dictionary.token2id[w2], similarity)
                for _, (w2, similarity)
                in zip(
                    range(nonzero_limit - num_nonzero),
                    model.similar_by_word(w1, nonzero_limit - num_nonzero)
                )
                if w2 in dictionary.token2id
            )
            columns = sorted(columns, key=lambda x: x[0])

        for w2_index, sim in columns:

            if sim > threshold and matrix_nonzero[w2_index] <= nonzero_limit:
                element = sim**exponent
                matrix[w1_index, w2_index] = element
                matrix_nonzero[w1_index] += 1
                matrix[w2_index, w1_index] = element
                matrix_nonzero[w2_index] += 1

    return matrix.tocsc()

In [17]:
class DocSimilarity:
    
    def __init__(self, model, documents, tfidf=None, expo=2.0, threshold=0.0, nonzero_limit=100):
        self.model = model
        self.dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in documents])
        self.similarity_matrix = similarity_matrix(self.model, self.dictionary, tfidf=tfidf, 
                                                              threshold=threshold, exponent=expo, 
                                                              nonzero_limit=nonzero_limit)
    
    def get_similarity(self, src_doc, tgt_doc):
        src_sent = ' '.join(src_doc)
        tgt_sent = ' '.join(tgt_doc)

        src_bow = self.dictionary.doc2bow(simple_preprocess(src_sent))
        tgt_bow = self.dictionary.doc2bow(simple_preprocess(tgt_sent))
        
        sim = softcossim(src_bow, tgt_bow, self.similarity_matrix)
        return sim
        

In [47]:
def get_result_list(src_regex):
    gold, cand, cand_num, src = [], [], [], []

    for file_paths in iglob(src_regex, recursive=False):
        lines = []
        with open(file_paths) as f:
            lines = f.readlines()

        if file_paths.endswith('gold'):
            gold = lines
        elif file_paths.endswith('candidate_sent_num'):
            cand_num = lines
        elif file_paths.endswith('candidate'):
            cand = lines
        elif file_paths.endswith('src'):
            src = lines
        else:
            print(file_paths)
            raise ValueError
    
    return src, gold, cand, cand_num
    

In [49]:
src, gold, cand, cand_num = get_result_list(result_path+"*")

print("gold len : {}".format(len(gold)))
print("src len : {}".format(len(src)))
print("cand len : {}".format(len(cand)))
print("cand_num len : {}".format(len(cand_num)))

gold len : 5978
src len : 5978
cand len : 5978
cand_num len : 5978


In [59]:
def remove_sep(txt):
    txt = txt.replace('\n','')
    list_ = txt.split('<q>')
    return list_

In [62]:
remove_sep(r)

['6개 번호를 모두 맞힌 1등 당첨자는 1명 , 5개 번호와 보너스 번호를 맞힌 2등은 68명이며 1등 판매점은 서울 구로구의 복권방 등 서울만 4곳이다 .']

In [None]:
from rouge import Rouge
import pandas as pd

result_dataframe = pd.DataFrame(columns=['news_id', 'news', 'summary', 'news_token', 'summary_token', 'result', 'cos_sim'])

rouge = Rouge()
for s, g, c in zip(src, gold, cand):
    
    news = remove_sep(s)
    summ = remove_sep(g)
    gen_summary = remove_sep(c)
    
    similar = DocSimilarity(model=ko_model, documents=[' '.join(news)])
    
    rouge_score = rouge.get_scores(' '.join(summ), ' '.join(gen_summary))[0]
    cos_sim = similar.get_similarity(news, gen_summary)
    print(rouge_score, "\n")
    print(cos_sim, "\n\n")
    
    dic = {
           'news' : news,
           'summary' : summ,
           'result': gen_summary,
           'cos_sim': cos_sim,
           'rouge_1_R' : rouge_score['rouge-1']['r'],
           'rouge_1_P' : rouge_score['rouge-1']['p'],
           'rouge_1_F' : rouge_score['rouge-1']['f'],
           'rouge_2_R' : rouge_score['rouge-2']['r'],
           'rouge_2_P' : rouge_score['rouge-2']['p'],
           'rouge_2_F' : rouge_score['rouge-2']['f'],
           'rouge_L_R' : rouge_score['rouge-l']['r'],
           'rouge_L_P' : rouge_score['rouge-l']['p'],
           'rouge_L_F' : rouge_score['rouge-l']['f']
          }
    result_dataframe = result_dataframe.append(dic, ignore_index=True)


  return dot(matutils.unitvec(model[w1]), matutils.unitvec(model[w2]))
  sim = softcossim(src_bow, tgt_bow, self.similarity_matrix)
  model.similar_by_word(w1, nonzero_limit - num_nonzero)


{'rouge-1': {'r': 0.2916666666666667, 'p': 0.35, 'f': 0.3181818132231406}, 'rouge-2': {'r': 0.17857142857142858, 'p': 0.22727272727272727, 'f': 0.19999999507200011}, 'rouge-l': {'r': 0.2916666666666667, 'p': 0.35, 'f': 0.3181818132231406}} 

0.49471366958688406 


{'rouge-1': {'r': 0.39285714285714285, 'p': 0.6111111111111112, 'f': 0.47826086480151236}, 'rouge-2': {'r': 0.23214285714285715, 'p': 0.3611111111111111, 'f': 0.2826086908884689}, 'rouge-l': {'r': 0.32142857142857145, 'p': 0.5, 'f': 0.39130434306238193}} 

0.5932340536138127 


{'rouge-1': {'r': 0.3269230769230769, 'p': 0.5151515151515151, 'f': 0.39999999524982705}, 'rouge-2': {'r': 0.2, 'p': 0.3142857142857143, 'f': 0.24444443969135812}, 'rouge-l': {'r': 0.3076923076923077, 'p': 0.48484848484848486, 'f': 0.37647058348512114}} 

0.6423483047596285 


{'rouge-1': {'r': 0.10416666666666667, 'p': 0.2631578947368421, 'f': 0.14925372728001793}, 'rouge-2': {'r': 0.04, 'p': 0.10526315789473684, 'f': 0.05797101050199566}, 'rouge-l': 

{'rouge-1': {'r': 0.22727272727272727, 'p': 0.3448275862068966, 'f': 0.2739725979508351}, 'rouge-2': {'r': 0.14, 'p': 0.23333333333333334, 'f': 0.17499999531250013}, 'rouge-l': {'r': 0.22727272727272727, 'p': 0.3448275862068966, 'f': 0.2739725979508351}} 

0.6036561158574867 


{'rouge-1': {'r': 0.5087719298245614, 'p': 0.48333333333333334, 'f': 0.49572649072978303}, 'rouge-2': {'r': 0.39436619718309857, 'p': 0.3783783783783784, 'f': 0.38620689155386445}, 'rouge-l': {'r': 0.47368421052631576, 'p': 0.45, 'f': 0.46153845654174885}} 

0.7538468454918181 


{'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}} 

0.6558817319953043 


{'rouge-1': {'r': 0.2727272727272727, 'p': 0.5, 'f': 0.3529411719031142}, 'rouge-2': {'r': 0.18518518518518517, 'p': 0.43478260869565216, 'f': 0.25974025555068314}, 'rouge-l': {'r': 0.2727272727272727, 'p': 0.5, 'f': 0.3529411719031142}} 

0.7244417077800892 


{'rouge-1': {'r': 0.2656

{'rouge-1': {'r': 0.23076923076923078, 'p': 0.6, 'f': 0.33333332932098775}, 'rouge-2': {'r': 0.1388888888888889, 'p': 0.4, 'f': 0.20618556318418538}, 'rouge-l': {'r': 0.23076923076923078, 'p': 0.6, 'f': 0.33333332932098775}} 

0.7079317112097302 


{'rouge-1': {'r': 0.21951219512195122, 'p': 0.72, 'f': 0.33644859454974235}, 'rouge-2': {'r': 0.11827956989247312, 'p': 0.4583333333333333, 'f': 0.18803418477317557}, 'rouge-l': {'r': 0.21951219512195122, 'p': 0.72, 'f': 0.33644859454974235}} 

0.6797789176447987 


{'rouge-1': {'r': 0.15384615384615385, 'p': 0.36363636363636365, 'f': 0.216216212037984}, 'rouge-2': {'r': 0.125, 'p': 0.28, 'f': 0.17283950190519748}, 'rouge-l': {'r': 0.15384615384615385, 'p': 0.36363636363636365, 'f': 0.216216212037984}} 

0.5472780339970611 


{'rouge-1': {'r': 0.16176470588235295, 'p': 0.2894736842105263, 'f': 0.2075471652118193}, 'rouge-2': {'r': 0.08, 'p': 0.15789473684210525, 'f': 0.10619468580155081}, 'rouge-l': {'r': 0.1323529411764706, 'p': 0.236842105

{'rouge-1': {'r': 0.3333333333333333, 'p': 0.6428571428571429, 'f': 0.4390243857465794}, 'rouge-2': {'r': 0.3148148148148148, 'p': 0.6071428571428571, 'f': 0.41463414184414044}, 'rouge-l': {'r': 0.3333333333333333, 'p': 0.6428571428571429, 'f': 0.4390243857465794}} 

0.6353533932781559 


{'rouge-1': {'r': 0.06557377049180328, 'p': 0.11428571428571428, 'f': 0.08333332870008707}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.04918032786885246, 'p': 0.08571428571428572, 'f': 0.06249999536675382}} 

0.6105556356068113 


{'rouge-1': {'r': 0.34615384615384615, 'p': 0.5, 'f': 0.4090909042561984}, 'rouge-2': {'r': 0.3018867924528302, 'p': 0.41025641025641024, 'f': 0.34782608207230625}, 'rouge-l': {'r': 0.34615384615384615, 'p': 0.5, 'f': 0.4090909042561984}} 

0.6074285633956114 


{'rouge-1': {'r': 0.28846153846153844, 'p': 0.39473684210526316, 'f': 0.333333328454321}, 'rouge-2': {'r': 0.26666666666666666, 'p': 0.36363636363636365, 'f': 0.3076923028106509}, 'rouge-l': {'r': 

{'rouge-1': {'r': 0.22950819672131148, 'p': 0.3888888888888889, 'f': 0.2886597891465619}, 'rouge-2': {'r': 0.11475409836065574, 'p': 0.2, 'f': 0.14583332870008697}, 'rouge-l': {'r': 0.21311475409836064, 'p': 0.3611111111111111, 'f': 0.26804123244553096}} 

0.6165662793859636 


{'rouge-1': {'r': 0.15294117647058825, 'p': 0.48148148148148145, 'f': 0.23214285348373728}, 'rouge-2': {'r': 0.1, 'p': 0.38461538461538464, 'f': 0.15873015545477456}, 'rouge-l': {'r': 0.15294117647058825, 'p': 0.48148148148148145, 'f': 0.23214285348373728}} 

0.7510074464167402 


{'rouge-1': {'r': 0.2631578947368421, 'p': 0.5714285714285714, 'f': 0.3603603560425291}, 'rouge-2': {'r': 0.17582417582417584, 'p': 0.43243243243243246, 'f': 0.24999999588989263}, 'rouge-l': {'r': 0.25, 'p': 0.5428571428571428, 'f': 0.34234233802451103}} 

0.7031587540581741 


{'rouge-1': {'r': 0.59375, 'p': 0.76, 'f': 0.6666666617420746}, 'rouge-2': {'r': 0.5882352941176471, 'p': 0.7407407407407407, 'f': 0.6557376999838754}, 'rouge-l

{'rouge-1': {'r': 0.26153846153846155, 'p': 0.7727272727272727, 'f': 0.39080459392257905}, 'rouge-2': {'r': 0.2571428571428571, 'p': 0.75, 'f': 0.3829787196016297}, 'rouge-l': {'r': 0.26153846153846155, 'p': 0.7727272727272727, 'f': 0.39080459392257905}} 

0.5915384952101902 


{'rouge-1': {'r': 0.3333333333333333, 'p': 0.4, 'f': 0.36363635867768596}, 'rouge-2': {'r': 0.18181818181818182, 'p': 0.2222222222222222, 'f': 0.1999999950500001}, 'rouge-l': {'r': 0.30952380952380953, 'p': 0.37142857142857144, 'f': 0.33766233270366003}} 

0.5520709186277099 


{'rouge-1': {'r': 0.1794871794871795, 'p': 0.4666666666666667, 'f': 0.2592592552469136}, 'rouge-2': {'r': 0.1348314606741573, 'p': 0.41379310344827586, 'f': 0.20338982680120657}, 'rouge-l': {'r': 0.1794871794871795, 'p': 0.4666666666666667, 'f': 0.2592592552469136}} 

0.6625884883337491 


{'rouge-1': {'r': 0.03225806451612903, 'p': 0.07142857142857142, 'f': 0.04444444015802511}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r':

{'rouge-1': {'r': 0.3333333333333333, 'p': 0.7586206896551724, 'f': 0.46315789049529094}, 'rouge-2': {'r': 0.2535211267605634, 'p': 0.6206896551724138, 'f': 0.359999995882}, 'rouge-l': {'r': 0.3333333333333333, 'p': 0.7586206896551724, 'f': 0.46315789049529094}} 

0.778574199939562 


{'rouge-1': {'r': 0.25, 'p': 0.17647058823529413, 'f': 0.20689654687277062}, 'rouge-2': {'r': 0.03571428571428571, 'p': 0.029411764705882353, 'f': 0.03225805956295602}, 'rouge-l': {'r': 0.125, 'p': 0.08823529411764706, 'f': 0.10344827101070177}} 

0.6396869105526043 


{'rouge-1': {'r': 0.20408163265306123, 'p': 0.5882352941176471, 'f': 0.30303029920569335}, 'rouge-2': {'r': 0.11926605504587157, 'p': 0.3939393939393939, 'f': 0.18309858798155132}, 'rouge-l': {'r': 0.19387755102040816, 'p': 0.5588235294117647, 'f': 0.28787878405417816}} 

0.790264364129104 


{'rouge-1': {'r': 0.5434782608695652, 'p': 0.6097560975609756, 'f': 0.5747126386946757}, 'rouge-2': {'r': 0.42857142857142855, 'p': 0.4883720930232558

In [None]:
result_dataframe