In [1]:
# import pandas as pd
import csv
import numpy as np
import os
import copy
from os.path import join as pjoin
from glob import iglob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#!/usr/bin/env python
from __future__ import division

import argparse
import glob
import os
import random
import signal
import time

import torch

import distributed
from models import data_loader, model_builder
from models.data_loader import load_dataset
from models.model_builder import ExtSummarizer
from models.trainer_ext import build_trainer
from others.logging import logger, init_logger

model_flags = ['hidden_size', 'ff_size', 'heads', 'inter_layers', 'encoder', 'ff_actv', 'use_interval', 'rnn_size']


In [3]:
root_path = '/data/ksb/'
bert_root_path = pjoin(root_path, 'BertSum/PreSumm')
bert_model_dir = pjoin(bert_root_path, 'models')

data_dir = pjoin(root_path, 'cnn-dailymail/finished_files')

three_data_dir = pjoin(root_path, 'three-mat')
three_data_test = pjoin(three_data_dir, 'test')

#### Loss function 비교  

*Trained Model parameter 필요*


In [4]:
def get_cos_similarity(inputs, summaries):
    tfidf_vectorizer = TfidfVectorizer()

    cos_similarity_list = []
    for input_, summary_ in zip(inputs, summaries):
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform([input_, summary_])

            similarity = cosine_similarity(tfidf_matrix[0] , tfidf_matrix[1])[0][0]
        except ValueError:
            similarity = 0.0
            
        cos_similarity_list.append(similarity)

    return cos_similarity_list

In [5]:
import jsonlines
import json

data_list = []
for data_p in iglob(pjoin(three_data_test, '**.json'), recursive=False):
    
    with open(data_p,'r',encoding='utf-8') as f:
        data = json.load(f)
        data_list.append(data)

### Origin candidate set

In [6]:
from rouge import Rouge
rouge = Rouge()

In [8]:
doc_sim_list = []
ref_sim_list = []

min_doc_sim_list = []
max_doc_sim_list = []

doc_rouge_list = []
ref_rouge_list = []

max_ref_rouge_list = []
min_ref_rouge_list = []

for data in data_list:
    candidates = data['candidates']
    article = data['article']
    abstract = data["abstract"]
    
    sim_list = [np.mean(get_cos_similarity(article, cand[0])) for cand in candidates]
    doc_sim_list += sim_list
    min_doc_sim_list += [min(sim_list)]
    max_doc_sim_list += [max(sim_list)]
    
    ref_sim_list += [np.mean(get_cos_similarity(abstract, cand[0])) for cand in candidates]
    
    doc_rouge_list +=[rouge.get_scores('\n'.join(cand[0]), '\n'.join(article))[0]['rouge-l']['f'] for cand in candidates]
    
    rouge_list = [rouge.get_scores('\n'.join(cand[0]), '\n'.join(abstract))[0]['rouge-l']['f'] for cand in candidates]
    ref_rouge_list += rouge_list
    min_ref_rouge_list += [min(rouge_list)]
    max_ref_rouge_list += [max(rouge_list)]

KeyboardInterrupt: 

In [None]:
print("Mean cosine similarity between document and summaries : {}".format(round(np.mean(doc_sim_list), 4)))
print("Mean cosine similarity between reference and summaries : {}\n".format(round(np.mean(ref_sim_list), 4)))

print("Min cosine similarity between document and summaries : {}".format(round(np.mean(min_doc_sim_list), 4)))
print("Max cosine similarity between document and summaries : {}".format(round(np.mean(max_doc_sim_list), 4)))

In [None]:
print("Mean Rouge score between document and summaries : {}".format(round(np.mean(doc_rouge_list), 4)))
print("Mean Rouge score between reference and summaries : {}\n".format(round(np.mean(ref_rouge_list), 4)))

print("Min Rouge score between reference and summaries : {}".format(round(np.mean(min_ref_rouge_list), 4)))
print("Max Rouge score between reference and summaries : {}".format(round(np.mean(max_ref_rouge_list), 4)))

In [9]:
from transformers import BertTokenizer

def bert_encode(x, max_len=-1):
    tok = BertTokenizer.from_pretrained('bert-base-uncased', verbose=False)
    cls_token_id = tok.cls_token_id
    sep_token_id = tok.sep_token_id

    _ids = tok.encode(x, add_special_tokens=False)
    ids = [cls_token_id] # [CLS]
    if max_len > 0:
        ids.extend(_ids[:max_len - 2])
    else:
        ids.extend(_ids[:512 - 2])
    ids.append(sep_token_id) # [SEP], meaning end of sentence
    return ids

def bert_decode(x):
    tok = BertTokenizer.from_pretrained('bert-base-uncased', verbose=False)

    result = tok.decode(x, skip_special_tokens=True)
    return result

In [10]:
def detect_trigram(src, tgt):
    assert len(tgt) > 2 and len(src) > 2
        
    tgt_trigrams = [(tgt[i-1],tgt[i],tgt[i+1]) for i in range(1,len(tgt)-1)]
    src_trigrams = [(src[i-1],src[i],src[i+1]) for i in range(1,len(src)-1)]
    
    for src_tri in src_trigrams:
        if src_tri in tgt_trigrams:
            return True ## Detect trigram overlapped with target
        
    return False
    

In [11]:
def detect_4_gram(src, tgt):
    assert len(tgt) > 3 and len(src) > 3
    tgt_4_grams = [(tgt[i-2], tgt[i-1],tgt[i],tgt[i+1]) for i in range(2,len(tgt)-1)]
    src_4_grams = [(src[i-2], src[i-1],src[i],src[i+1]) for i in range(2,len(src)-1)]
    
    for src_gram in src_4_grams:
        if src_gram in tgt_4_grams:
            return True ## Detect 4-gram overlapped with target
        
    return False
    

In [12]:
def detect_5_gram(src, tgt):
    assert len(tgt) > 4 and len(src) > 4
    tgt_5_grams = [(tgt[i-2], tgt[i-1],tgt[i],tgt[i+1], tgt[i+2]) for i in range(2,len(tgt)-2)]
    src_5_grams = [(src[i-2], src[i-1],src[i],src[i+1], src[i+2]) for i in range(2,len(src)-2)]
    
    for src_gram in src_5_grams:
        if src_gram in tgt_5_grams:
            return True ## Detect 5-gram overlapped with target
        
    return False

In [13]:
def detect_ngram_list(src, tgt_list, n_gram='trigram'):
    
    if n_gram =='trigram':
        return sum([detect_trigram(src, tgt) for tgt in tgt_list]) > 0
    elif n_gram =='4-gram':
        return sum([detect_4_gram(src, tgt) for tgt in tgt_list]) > 0
    else :
        return sum([detect_4_gram(src, tgt) for tgt in tgt_list])>0

In [22]:
def get_candidate_set(sent_set, reference=None, n_gram='trigram'):
    
    assert n_gram in ['trigram', '4-gram','5-gram']
        
    if n_gram == 'trigram':
        detect_ngram = detect_trigram
    elif n_gram == '4-gram':
        detect_ngram = detect_4_gram
    else:
        detect_ngram = detect_5_gram
        
    
    possible_set_ids = []
    
    for sent_id, sent in sent_set:
        possible_2_sent_idx = []
        
#         print("Sentece ID ({}) Detect all possible combination\n".format(sent_id))
        
#         print("Detect all possible combination whose length is 2")
        # number of summary sentences = 2
        for tgt_sent_id, tgt_sent in sent_set[sent_id+1:]:
            
            # Detect n-gram (default= trigram) 
            if not detect_ngram(src=sent, tgt=tgt_sent):
                possible_2_sent_idx.append(set([sent_id, tgt_sent_id]))
                
#         print("Number of detected possible combination is {}\n".format(len(possible_2_sent_idx)))
        
#         print("Detect all possible combination whose length is 3")
        possible_3_sent_idx = copy.deepcopy(possible_2_sent_idx)
        
        # number of summary sentences = 3
        for tgt_sent_id, tgt_sent in sent_set[sent_id+1:]:
            for poss_sent_ids in possible_2_sent_idx:
                
                poss_sent = [sent_set[ids][1] for ids in poss_sent_ids]
                if not detect_ngram_list(src=tgt_sent, tgt_list=poss_sent, n_gram=n_gram):
                    poss_3_ids = copy.deepcopy(poss_sent_ids)
                    poss_3_ids.add(tgt_sent_id)
                    
                    possible_3_sent_idx.append(poss_3_ids)
                    
#         print("Number of detected possible combination is {}\n".format(len(possible_3_sent_idx)))

        possible_sent_idx = possible_2_sent_idx + possible_3_sent_idx
#         print("Total number of detected possible combination is {}\n".format(len(possible_sent_idx)))
        
        for ids in possible_sent_idx:
            if not ids in possible_set_ids:
                possible_set_ids.append(ids)

    return possible_set_ids
        

In [15]:
import pylcs

def compute_txt_redundancy_score(candidate_id):

    cand_num = len(candidate_id)
    
    score = torch.zeros([cand_num], dtype=torch.float64)
        
    def _compute_redundancy(cand):
        redundancy = 0.0
        
        for i, src_sen in enumerate(cand):
            for j, tgt_sen in enumerate(cand[i+1:]):
                if i != j:
                    lcs_val = pylcs.lcs(src_sen, tgt_sen)
                    redundancy += lcs_val 
        
        sents_len = sum([len(s) for sents in cand for s in sents])
        return redundancy / sents_len

    for i in range(cand_num):
        score[i] = np.mean(_compute_redundancy(candidate_id[i]))

    return score

In [19]:
new_data_path = pjoin(three_data_dir,'reconstructed_redun_cond.jsonl')

In [20]:
# Origin Candidate
origin_ref_rouges = []
origin_redun_scores = []

origin_doc_sims = []
origin_ref_sims = []


# Reconstructed Candidate
refine_ref_rouges = []
refine_redun_scores = []

refine_doc_sims = []
refine_ref_sims = []


In [26]:

with open(new_data_path, 'w', encoding='utf-8') as f:
    writer = jsonlines.Writer(f)
    
    for data in data_list:
        candidates = data['candidates']
        article = data['article']
        abstract = data["abstract"]


        summaries = [cand[0] for cand in candidates]    
        encoded_cand_set = [[bert_encode(s, 180) for s in cs] for cs in summaries]
        threshold = min([cand[1] for cand in candidates]) 

        sent_set = []

        for i, encoded_cand in enumerate(encoded_cand_set):
            for j, encoded_sent in enumerate(encoded_cand):

                sent_id = sum([len(prev) for prev in encoded_cand_set[:i]])+j
                sent_set.append((sent_id, encoded_sent))

        reduced_cand_ids = get_candidate_set(sent_set)
        reduced_cand_sents = [[sent_set[i][1] for i in ids] for ids in reduced_cand_ids]
        reduced_cand_set_dec = [[bert_decode(x) for x in cand] for cand in reduced_cand_sents]


        # Drop candidate which has lower score than threshold
        rouge_cands_set = []
        for c in reduced_cand_set_dec:
            scores = rouge.get_scores('\n'.join(abstract), '\n'.join(c))[0]
            score = scores['rouge-l']['f']

            rouge_cands_set.append((score, c))

        rouge_cands_set = sorted(rouge_cands_set, key=lambda x: x[0], reverse=True)
        fined_cands_set = [sc for sc in rouge_cands_set if sc[0] >= threshold]


        redun_cands_set = []
        cond_redun_score = compute_txt_redundancy_score(reduced_cand_set_dec)

        for redun_sc, c in zip(cond_redun_score, reduced_cand_set_dec):
            redun_score = redun_sc.item()
            redun_cands_set.append((redun_score, c))

        redun_cands_set = sorted(redun_cands_set, key=lambda x: -x[0], reverse=True)
        fined_cands_set = redun_cands_set[:20]


        ## Redundancy score of Original Candidates
        origin_redun = compute_txt_redundancy_score(summaries)
        origin_scores = []
        for re_sc, (ro_sc, sent) in zip(origin_redun, [(cs[1], '\n'.join(cs[0])) for cs in candidates]):
            origin_scores.append((re_sc.item(), ro_sc, sent))

        origin_scores = sorted(origin_scores, key=lambda x: -x[0]*0.1 +x[1], reverse=True)
        
        # Save Top-1 Score
        origin_redun_scores.append(origin_scores[0][0])
        rouge_sc = rouge.get_scores('\n'.join(abstract), '\n'.join(origin_scores[0][-1]))[0]['rouge-l']['f']
        rouge_sc = round(rouge_sc, 4)
        origin_ref_rouges.append(rouge_sc)
        
        origin_doc_sims.append(np.mean(get_cos_similarity(article, origin_scores[0][-1].split('\n'))))
        origin_ref_sims.append(np.mean(get_cos_similarity(abstract, origin_scores[0][-1].split('\n'))))


        ## Redundancy score of Reconstructed Summaries
        scores = fined_cands_set
        
        # Save Top-1 Score
        refine_redun_scores.append(scores[0][0])
        rouge_sc = rouge.get_scores('\n'.join(abstract), '\n'.join(scores[0][-1]))[0]['rouge-l']['f']
        rouge_sc = round(rouge_sc, 4)
        refine_ref_rouges.append(rouge_sc)
        
        refine_doc_sims.append(np.mean(get_cos_similarity(article, scores[0][-1])))
        refine_ref_sims.append(np.mean(get_cos_similarity(abstract, scores[0][-1])))
        
        ## Redundancy score of Reference Summary
        abstract_scores = compute_txt_redundancy_score([abstract])
        
        reconstructed_candidates = [s[:][-1] for s in scores[:3]]
        
        origin_cand_rouge = [round(rouge.get_scores('\n'.join(abstract), '\n'.join(cand[0]))[0]['rouge-l']['f'],4) for cand in candidates]
        new_cand_rouge = [round(rouge.get_scores('\n'.join(abstract), '\n'.join(cand))[0]['rouge-l']['f'],4) for cand in reconstructed_candidates]
        
        print("Origin ROUGE : {}".format(origin_cand_rouge))
        print("New Candidate ROUGE : {}\n".format(new_cand_rouge))
        
        new_data = {'article':article, 'candidates':candidates, 'abstract':abstract, 'reconstructed_candidates': reconstructed_candidates}
        writer.write(new_data)




Origin ROUGE : [0.4474, 0.5116, 0.4478]
New Candidate ROUGE : [0.3692, 0.1791, 0.3692]

Origin ROUGE : [0.4138, 0.3373, 0.375]
New Candidate ROUGE : [0.3778, 0.3059, 0.3908]

Origin ROUGE : [0.3208, 0.3368, 0.2936]
New Candidate ROUGE : [0.2857, 0.2118, 0.3846]



KeyboardInterrupt: 

In [None]:
print("Origin Redundancy score : {}".format(round(np.mean(origin_redun), 4)))
print("Origin cosine similarity between document and summaries : {}".format(round(np.mean(origin_doc_sims), 4)))
print("Origin ROUGE score between reference and summaries : {}".format(round(np.mean(origin_ref_rouges), 4)))

In [None]:
print("Origin Redundancy score : {}".format(round(np.mean(refine_redun), 4)))
print("Origin cosine similarity between document and summaries : {}".format(round(np.mean(refine_doc_sims), 4)))
print("Origin ROUGE score between reference and summaries : {}".format(round(np.mean(refine_ref_rouges), 4)))