In [61]:
# import pandas as pd
import csv
import numpy as np
import os
import copy
from os.path import join as pjoin
from glob import iglob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
#!/usr/bin/env python
from __future__ import division

import argparse
import glob
import os
import random
import signal
import time

import torch

import distributed
from models import data_loader, model_builder
from models.data_loader import load_dataset
from models.model_builder import ExtSummarizer
from models.trainer_ext import build_trainer
from others.logging import logger, init_logger

# Names of model hyper-parameter options. Not referenced anywhere else in the visible
# part of this notebook; presumably carried over from the PreSumm training script,
# where such flags are restored from a checkpoint — TODO confirm it is still needed.
model_flags = ['hidden_size', 'ff_size', 'heads', 'inter_layers', 'encoder', 'ff_actv', 'use_interval', 'rnn_size']


In [63]:
# Filesystem layout. Everything hangs off one absolute, machine-specific root directory.
root_path = '/data/ksb/'
# BertSum/PreSumm directory and its `models` sub-directory
# (neither is used in the visible part of this notebook).
bert_root_path = pjoin(root_path, 'BertSum/PreSumm')
bert_model_dir = pjoin(bert_root_path, 'models')

# Presumably the preprocessed CNN/DailyMail files — not read in the visible cells.
data_dir = pjoin(root_path, 'cnn-dailymail/finished_files')

# 'three-mat' data directory: the JSON records are read from its `test` sub-directory
# and the reconstructed output file is written next to it.
three_data_dir = pjoin(root_path, 'three-mat')
three_data_test = pjoin(three_data_dir, 'test')

#### Loss function comparison

*Requires trained model parameters*


In [64]:
def get_cos_similarity(inputs, summaries):
    """Return the TF-IDF cosine similarity of positionally paired texts.

    `inputs` and `summaries` are walked in lockstep with `zip`, so the i-th input is
    compared with the i-th summary and the longer sequence is truncated to the length
    of the shorter one. For every pair a TF-IDF model is fitted on just those two
    texts.

    Args:
        inputs: iterable of text strings.
        summaries: iterable of text strings, paired by position with `inputs`.

    Returns:
        list with one similarity value per pair. A pair for which vectorisation
        raises ValueError (e.g. no usable vocabulary) contributes 0.0.
    """
    vectorizer = TfidfVectorizer()

    def _pair_similarity(text, summary):
        # Fit on the two texts only; row 0 is the input, row 1 the summary.
        try:
            tfidf = vectorizer.fit_transform([text, summary])
            return cosine_similarity(tfidf[0], tfidf[1])[0][0]
        except ValueError:
            return 0.0

    return [_pair_similarity(text, summary) for text, summary in zip(inputs, summaries)]

In [65]:
import jsonlines
import json

# Load every JSON record found directly inside the test directory.
# NOTE(review): with recursive=False the '**' in the pattern behaves like a plain '*',
# so sub-directories are not searched; glob also yields files in arbitrary order.
# Later cells read the keys 'candidates', 'article' and 'abstract' from each record.
data_list = []
for data_p in iglob(pjoin(three_data_test, '**.json'), recursive=False):
    
    with open(data_p,'r',encoding='utf-8') as f:
        data = json.load(f)
        data_list.append(data)

### Original candidate set

In [66]:
from rouge import Rouge
# Single ROUGE scorer instance shared by the scoring cells and by get_rouge().
rouge = Rouge()

In [16]:
# Statistics of the original candidate summaries.
# *_sim_list  : TF-IDF cosine similarities (one entry per candidate)
# *_rouge_list: ROUGE-L F scores (one entry per candidate)
# min_* / max_*: per-document minimum / maximum (one entry per document)
doc_sim_list = []
ref_sim_list = []

min_doc_sim_list = []
max_doc_sim_list = []

doc_rouge_list = []
ref_rouge_list = []

max_ref_rouge_list = []
min_ref_rouge_list = []

for data in data_list:
    # Each candidate is indexed with [0] to obtain its sentence list; judging from the
    # printed records further down a candidate looks like [sentence_list, score]
    # — TODO confirm against the data files.
    candidates = data['candidates']
    article = data['article']
    abstract = data["abstract"]
    
    # NOTE(review): get_cos_similarity zips its two arguments, so the i-th element of
    # `article` is compared with the i-th candidate sentence and everything beyond the
    # shorter sequence is ignored. Confirm that this positional pairing is intended.
    sim_list = [np.mean(get_cos_similarity(article, cand[0])) for cand in candidates]
    doc_sim_list += sim_list
    min_doc_sim_list += [min(sim_list)]
    max_doc_sim_list += [max(sim_list)]
    
    # Same positional pairing, this time against the reference summary.
    ref_sim_list += [np.mean(get_cos_similarity(abstract, cand[0])) for cand in candidates]
    
    # ROUGE-L F of each candidate (first argument) against the full article.
    doc_rouge_list +=[rouge.get_scores('\n'.join(cand[0]), '\n'.join(article))[0]['rouge-l']['f'] for cand in candidates]
    
    # ROUGE-L F of each candidate against the reference summary.
    rouge_list = [rouge.get_scores('\n'.join(cand[0]), '\n'.join(abstract))[0]['rouge-l']['f'] for cand in candidates]
    ref_rouge_list += rouge_list
    min_ref_rouge_list += [min(rouge_list)]
    max_ref_rouge_list += [max(rouge_list)]

In [17]:
# Report the cosine-similarity statistics rounded to four decimals.
# The 'Min'/'Max' lines are means of the per-document minimum / maximum values.
print("Mean cosine similarity between document and summaries : {}".format(round(np.mean(doc_sim_list), 4)))
print("Mean cosine similarity between reference and summaries : {}\n".format(round(np.mean(ref_sim_list), 4)))

print("Min cosine similarity between document and summaries : {}".format(round(np.mean(min_doc_sim_list), 4)))
print("Max cosine similarity between document and summaries : {}".format(round(np.mean(max_doc_sim_list), 4)))

Mean cosine similarity between document and summaries : 0.2746
Mean cosine similarity between reference and summaries : 0.1834

Min cosine similarity between document and summaries : 0.1434
Max cosine similarity between document and summaries : 0.4457


In [18]:
# Report the ROUGE-L F statistics rounded to four decimals.
# The 'Min'/'Max' lines are means of the per-document minimum / maximum values.
print("Mean Rouge score between document and summaries : {}".format(round(np.mean(doc_rouge_list), 4)))
print("Mean Rouge score between reference and summaries : {}\n".format(round(np.mean(ref_rouge_list), 4)))

print("Min Rouge score between reference and summaries : {}".format(round(np.mean(min_ref_rouge_list), 4)))
print("Max Rouge score between reference and summaries : {}".format(round(np.mean(max_ref_rouge_list), 4)))

Mean Rouge score between document and summaries : 0.3288
Mean Rouge score between reference and summaries : 0.4156

Min Rouge score between reference and summaries : 0.358
Max Rouge score between reference and summaries : 0.4719


In [67]:
from transformers import BertTokenizer
# Tokenizer of the 'bert-base-uncased' checkpoint; used by bert_encode / bert_decode.
tok = BertTokenizer.from_pretrained('bert-base-uncased', verbose=False)

def bert_encode(x, max_len=-1):
    """Encode text as token ids wrapped in the tokenizer's [CLS] ... [SEP] ids.

    Args:
        x: text passed to the module-level tokenizer `tok`.
        max_len: length budget of the returned list including the two special
            tokens; any non-positive value selects a budget of 512.

    Returns:
        list of ids: the [CLS] id, at most `budget - 2` token ids, the [SEP] id.
    """
    budget = max_len if max_len > 0 else 512
    # Two slots of the budget are reserved for [CLS] and [SEP].
    body = tok.encode(x, add_special_tokens=False)[:budget - 2]
    return [tok.cls_token_id] + body + [tok.sep_token_id]

def bert_decode(x):
    """Decode token ids with the module-level tokenizer, skipping special tokens."""
    return tok.decode(x, skip_special_tokens=True)

In [68]:
def detect_trigram(src, tgt):
    """Tell whether `src` and `tgt` have at least one trigram in common.

    Args:
        src: sequence of tokens (anything supporting len() and slicing).
        tgt: sequence of tokens to compare against.

    Returns:
        True if some run of three consecutive tokens of `src` also occurs in `tgt`;
        False otherwise, including when either sequence has fewer than three tokens.
    """
    if min(len(src), len(tgt)) < 3:
        return False

    # Sliding windows of width three over the target.
    tgt_windows = list(zip(tgt, tgt[1:], tgt[2:]))
    return any(window in tgt_windows for window in zip(src, src[1:], src[2:]))
    

In [69]:
def detect_4_gram(src, tgt):
    """Tell whether `src` and `tgt` have at least one 4-gram in common.

    Args:
        src: sequence of tokens (anything supporting len() and slicing).
        tgt: sequence of tokens to compare against.

    Returns:
        True if some run of four consecutive tokens of `src` also occurs in `tgt`;
        False otherwise, including when either sequence has fewer than four tokens.
    """
    if min(len(src), len(tgt)) < 4:
        return False

    # Sliding windows of width four over the target.
    tgt_windows = list(zip(tgt, tgt[1:], tgt[2:], tgt[3:]))
    return any(window in tgt_windows
               for window in zip(src, src[1:], src[2:], src[3:]))
    

In [70]:
def detect_5_gram(src, tgt):
    """Tell whether `src` and `tgt` have at least one 5-gram in common.

    Args:
        src: sequence of tokens (anything supporting len() and slicing).
        tgt: sequence of tokens to compare against.

    Returns:
        True if some run of five consecutive tokens of `src` also occurs in `tgt`;
        False otherwise, including when either sequence has fewer than five tokens.
    """
    if min(len(src), len(tgt)) < 5:
        return False

    # Sliding windows of width five over the target.
    tgt_windows = list(zip(tgt, tgt[1:], tgt[2:], tgt[3:], tgt[4:]))
    return any(window in tgt_windows
               for window in zip(src, src[1:], src[2:], src[3:], src[4:]))

In [71]:
def detect_ngram_list(src, tgt_list, n_gram='trigram'):
    """Tell whether `src` shares an n-gram with any sequence in `tgt_list`.

    Args:
        src: token sequence to check.
        tgt_list: iterable of token sequences to compare `src` against.
        n_gram: 'trigram' or '4-gram'; any other value selects the 5-gram check,
            mirroring the fallback branch used by get_candidate_set.

    Returns:
        True if the selected detector reports an overlap between `src` and at least
        one element of `tgt_list`; False otherwise (also for an empty `tgt_list`).
    """
    if n_gram == 'trigram':
        detect = detect_trigram
    elif n_gram == '4-gram':
        detect = detect_4_gram
    else:
        # This branch previously called detect_4_gram, so a '5-gram' request was
        # silently checked with 4-grams.
        detect = detect_5_gram

    return any(detect(src, tgt) for tgt in tgt_list)

In [72]:
def get_candidate_set(sent_set, reference=None, n_gram='trigram'):
    """Enumerate sets of 2 or 3 sentence ids whose sentences share no n-gram.

    Args:
        sent_set: sequence of (sent_id, tokens, text) triples. The code slices and
            indexes `sent_set` with `sent_id`, so it assumes that every sent_id equals
            the position of its triple in `sent_set`.
        reference: not used by this function.
        n_gram: 'trigram', '4-gram' or '5-gram'; selects the overlap detector.

    Returns:
        list of `set` objects holding sentence ids, without duplicates. For every
        sentence the overlap-free pairs come first, followed by the triples built
        from them.

    Raises:
        AssertionError: if `n_gram` is not one of the three supported values.
    """
    
    assert n_gram in ['trigram', '4-gram','5-gram']
        
    if n_gram == 'trigram':
        detect_ngram = detect_trigram
    elif n_gram == '4-gram':
        detect_ngram = detect_4_gram
    else:
        detect_ngram = detect_5_gram
        
    
    possible_set_ids = []
    
    for sent_id, sent, txt_sent in sent_set:
        possible_2_sent_idx = []
        
        # Pairs: combine the current sentence with every later sentence that shares
        # no n-gram with it.
        for tgt_sent_id, tgt_sent, tgt_sent_txt in sent_set[sent_id+1:]:
            
            # Keep the pair only when no n-gram overlap is detected
            if not detect_ngram(src=sent, tgt=tgt_sent):
                possible_2_sent_idx.append(set([sent_id, tgt_sent_id]))
                
        # Start the triple list from a copy of the pairs; those copies are duplicates
        # of the pairs and are filtered out by the membership test at the end.
        possible_3_sent_idx = copy.deepcopy(possible_2_sent_idx)
        
        # Triples: extend each pair with a later sentence that overlaps with neither
        # member of the pair.
        for tgt_sent_id, tgt_sent, tgt_sent_txt in sent_set[sent_id+1:]:
            for poss_sent_ids in possible_2_sent_idx:
                
                # Token sequences of the sentences already in the pair
                poss_sent = [sent_set[ids][1] for ids in poss_sent_ids]
                # NOTE(review): tgt_sent_id may already belong to the pair. Then the
                # sentence is compared with itself; if it is too short for the detector
                # to report an overlap, the "triple" collapses into a copy of the pair.
                if not detect_ngram_list(src=tgt_sent, tgt_list=poss_sent, n_gram=n_gram):
                    poss_3_ids = copy.deepcopy(poss_sent_ids)
                    poss_3_ids.add(tgt_sent_id)
                    
                    possible_3_sent_idx.append(poss_3_ids)
                    
        possible_sent_idx = possible_2_sent_idx + possible_3_sent_idx
        
        # De-duplicate while keeping first-seen order (linear scan over a list of sets)
        for ids in possible_sent_idx:
            if not ids in possible_set_ids:
                possible_set_ids.append(ids)

    return possible_set_ids
        

In [73]:
import pylcs

def compute_txt_redundancy_score(candidate_id, lcs_fn=None):
    """Score how much the sentences inside each candidate summary repeat each other.

    For one candidate the score is the sum of the LCS values of every unordered pair
    of its sentences divided by the total length of its sentences.

    Args:
        candidate_id: sequence of candidates; each candidate is a sequence of sentence
            strings (despite the parameter name, these are texts, not ids).
        lcs_fn: optional callable ``lcs_fn(a, b) -> number`` used to measure the
            overlap of two sentences. Defaults to ``pylcs.lcs``.

    Returns:
        torch.float64 tensor of shape ``[len(candidate_id)]`` with one score per
        candidate. Candidates without any text (no sentences, or only empty
        sentences) score 0.0 instead of raising ZeroDivisionError.
    """
    if lcs_fn is None:
        lcs_fn = pylcs.lcs

    def _compute_redundancy(cand):
        total_len = sum(len(sentence) for sentence in cand)
        if total_len == 0:
            return 0.0

        overlap = 0.0
        for i, src_sen in enumerate(cand):
            # The slice already excludes the sentence itself, so every later sentence
            # is a valid partner. (Comparing the absolute index `i` with an index that
            # restarts at 0 inside the slice skipped pairs such as (0, 1), which made
            # every two-sentence candidate score 0.)
            for tgt_sen in cand[i + 1:]:
                overlap += lcs_fn(src_sen, tgt_sen)

        return overlap / total_len

    score = torch.zeros([len(candidate_id)], dtype=torch.float64)
    for idx, cand in enumerate(candidate_id):
        score[idx] = _compute_redundancy(cand)

    return score

In [74]:
def get_rouge(src, tgt, rouge_score='rouge-1', metric='f'):
    """Return one ROUGE value for two lists of sentences.

    Both lists are joined with single spaces and handed to the module-level `rouge`
    scorer, `src` as the first argument and `tgt` as the second. In the `rouge`
    package the first argument is the hypothesis and the second the reference, which
    matters when `metric` is 'p' or 'r'.

    Args:
        src: iterable of sentence strings.
        tgt: iterable of sentence strings.
        rouge_score: key of the ROUGE variant to read, e.g. 'rouge-1'.
        metric: key of the value to read from that variant, e.g. 'f'.

    Returns:
        The selected entry of the first score dict returned by the scorer.
    """
    first_text, second_text = ' '.join(src), ' '.join(tgt)
    all_scores = rouge.get_scores(first_text, second_text)
    return all_scores[0][rouge_score][metric]

In [75]:
# JSON Lines output file for the records extended with 'reconstructed_candidates'.
new_data_path = pjoin(three_data_dir,'reconstructed_test.jsonl')

In [76]:
# Per-document accumulators, filled by the reconstruction loop.
# NOTE(review): only the *_ref_rouges and *_redun_scores lists are appended to in the
# visible part of this notebook; the *_doc_sims / *_ref_sims lists stay empty there.

# Original candidates: ROUGE and redundancy of the top-ranked candidate per document
origin_ref_rouges = []
origin_redun_scores = []

origin_doc_sims = []
origin_ref_sims = []


# Reconstructed candidates: same statistics for the top-ranked reconstructed candidate
refine_ref_rouges = []
refine_redun_scores = []

refine_doc_sims = []
refine_ref_sims = []


In [60]:
# Build reconstructed candidates for every document and write them out as JSON Lines.
#
# Early stop used while inspecting results: the loop ends after this many documents
# whose best reconstructed candidate beats the best original one.
# Set to None to process (and write) the complete test set.
MAX_IMPROVED_EXAMPLES = 5
stop = 0  # number of improved documents seen so far

# Start from empty accumulators so that re-running this cell does not double-count.
for accumulator in (origin_redun_scores, origin_ref_rouges,
                    refine_redun_scores, refine_ref_rouges):
    accumulator.clear()

with open(new_data_path, 'w', encoding='utf-8') as f:
    writer = jsonlines.Writer(f)

    for data in data_list:
        candidates = data['candidates']
        article = data['article']
        abstract = data["abstract"]

        # Sentence lists of the original candidates and their BERT token ids
        summaries = [cand[0] for cand in candidates]
        encoded_cand_set = [[bert_encode(s, 180) for s in cs] for cs in summaries]

        # ROUGE of every original candidate, computed once and reused below.
        # The weakest original candidate defines the acceptance threshold.
        origin_rouges = [get_rouge(abstract, summary) for summary in summaries]
        threshold = min(origin_rouges)

        # Pool the sentences of all candidates; the id of a sentence is its position
        # in the pool, which is what get_candidate_set relies on.
        sent_set = []
        for i, encoded_cand in enumerate(encoded_cand_set):
            for j, encoded_sent in enumerate(encoded_cand):
                sent_id = len(sent_set)
                sent_set.append((sent_id, encoded_sent, summaries[i][j]))

        reduced_cand_ids = get_candidate_set(sent_set)
        reduced_cand_sents = [[sent_set[i][2] for i in ids] for ids in reduced_cand_ids]

        # Drop candidate which has lower score than threshold
        rouge_cands_set = [(get_rouge(abstract, c), c) for c in reduced_cand_sents]
        rouge_cands_set = sorted(rouge_cands_set, key=lambda x: x[0], reverse=True)
        fined_cands_set = [sc for sc in rouge_cands_set if sc[0] >= threshold]

        ## Redundancy score of Original Candidates
        origin_redun = compute_txt_redundancy_score(summaries)
        origin_scores = [
            (re_sc.item(), ro_sc, '\n'.join(summary))
            for re_sc, ro_sc, summary in zip(origin_redun, origin_rouges, summaries)
        ]
        # Rank by ROUGE minus a small redundancy penalty
        origin_scores = sorted(origin_scores, key=lambda x: -x[0]*0.1 + x[1], reverse=True)

        # Save Top-1 Score
        origin_redun_scores.append(origin_scores[0][0])
        origin_ref_rouges.append(origin_scores[0][1])

        ## Redundancy score of Reconstructed Summaries
        refined_cands_set_ = [cs[1] for cs in fined_cands_set]
        redun_score = compute_txt_redundancy_score(refined_cands_set_)
        scores = [
            (re_sc.item(), ro_sc, '\n'.join(sents))
            for re_sc, (ro_sc, sents) in zip(redun_score, fined_cands_set)
        ]
        scores = sorted(scores, key=lambda x: -x[0]*0.1 + x[1], reverse=True)

        ## Redundancy score of Reference Summary (not used further in this cell)
        abstract_scores = compute_txt_redundancy_score([abstract])

        # Keep the three best reconstructed candidates as lists of sentences
        reconstructed_candidates = [s[-1].split('\n') for s in scores[:3]]
        new_data = {'article':article, 'candidates':candidates, 'abstract':abstract, 'reconstructed_candidates': reconstructed_candidates}

        if not scores:
            # No sentence combination reached the threshold: there is no top-1 entry
            # to record or compare, but the record is still written (with an empty
            # list) so the output stays aligned with the input.
            print("No reconstructed candidate reaches the threshold\n")
            writer.write(new_data)
            continue

        # Save Top-1 Score
        refine_redun_scores.append(scores[0][0])
        refine_ref_rouges.append(scores[0][1])

        origin_cand_rouge = [round(sc, 4) for sc in origin_rouges]
        new_cand_rouge = [round(get_rouge(abstract, cand),4) for cand in reconstructed_candidates]

        print("Origin ROUGE : {}".format(origin_cand_rouge))
        print("New Candidate ROUGE : {}\n".format(new_cand_rouge))

        improved = max(new_cand_rouge) > max(origin_cand_rouge)
        if improved:
            max_id = np.argmax(new_cand_rouge)
            origin_max_id = np.argmax(origin_cand_rouge)
            print(reconstructed_candidates[max_id])
            print(candidates[origin_max_id],'\n')
            print(abstract)

            print(candidates,'\n')
            stop += 1

        # Write before checking the early stop, so the document that triggers the
        # stop is not dropped from the output file.
        writer.write(new_data)

        if improved and MAX_IMPROVED_EXAMPLES is not None and stop >= MAX_IMPROVED_EXAMPLES:
            break


Origin ROUGE : [0.4474, 0.5116, 0.4776]
New Candidate ROUGE : [0.6027, 0.6027, 0.6027]

['he was not booked by the referee but could face a heavy retrospective ban .', 'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .', "juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league ."]
[["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .", 'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .', 'he was not booked by the referee but could face a heavy retrospective ban .', 'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .'], 0.40383502768823876] 

['juan arango escaped punishment from the referee for biting jesus zavela .', 'he could face a retrospective punishment for the incident .', "arango had earlier scor

Origin ROUGE : [0.2388, 0.3023, 0.1724]
New Candidate ROUGE : [0.5306, 0.5246, 0.5079]

['ecb are to instigate a recruitment process involving head hunters sport recruitment international .', 'the main problem is that the job description has yet to be fixed .']
[['ecb are to instigate a recruitment process involving head hunters sport recruitment international .', 'the main problem is that the job description has yet to be fixed .', 'the severe cuts that sky have been forced to make since agreeing their # 11m-a-match premier league deal is apparent even in the caribbean .', 'roy hodgson is launching roses 2015 , the annual clash between york and lancaster universities over 50 sports and 132 fixtures .'], 0.2803513638465095] 

['the ecb were expected to appoint a former england captain for the role .', 'the job description for the new cricket role has yet to be fixed .', 'sport recruitment international expected to suggest overseas candidates .']
[[['alec stewart admitted he has had no 

In [56]:
cand = [[['the unusual format has been captured in a series of photographs by visual journalist anna erickson .', "meet bud dodson , 57 , and welcome to his home : an rv in seattle 's sodo where he watches over the parking lot in exchange for a spot", 'john worden , 52 , has been living in his vehicle for years since his apartment burned down and he was left homeless .'], 0.32618005880037965], [["around 30 drivers live in rvs in a parking lot in seattle 's sodo area .", 'john worden , 52 , has been living in his vehicle for years since his apartment burned down and he was left homeless .', 'bud dodson , 57 , is a maintenance man who watches over the parking lot .'], 0.2774552491533624], [['the unusual format has been captured in a series of photographs by visual journalist anna erickson .', 'she came across them when she stopped to ask a seemingly homeless man for directions .', 'john worden , 52 , has been living in his $ 200 vehicle for years since his apartment burned down and he was left homeless .', 'bud dodson , 57 , watches over the parking lot in exchange for a semi-permanent spot .'], 0.2726937669376694]] 

# Keep only the sentence list of each candidate, dropping the score stored next to it.
cands = [c[0] for c in cand]
cands

[['the unusual format has been captured in a series of photographs by visual journalist anna erickson .',
  "meet bud dodson , 57 , and welcome to his home : an rv in seattle 's sodo where he watches over the parking lot in exchange for a spot",
  'john worden , 52 , has been living in his vehicle for years since his apartment burned down and he was left homeless .'],
 ["around 30 drivers live in rvs in a parking lot in seattle 's sodo area .",
  'john worden , 52 , has been living in his vehicle for years since his apartment burned down and he was left homeless .',
  'bud dodson , 57 , is a maintenance man who watches over the parking lot .'],
 ['the unusual format has been captured in a series of photographs by visual journalist anna erickson .',
  'she came across them when she stopped to ask a seemingly homeless man for directions .',
  'john worden , 52 , has been living in his $ 200 vehicle for years since his apartment burned down and he was left homeless .',
  'bud dodson , 57 

In [57]:
# Hand-assembled summary mixing sentences labelled gsum / refactor (labels as in the
# trailing notes). Punctuation is attached to the words here ("erickson.", "seattle's").
new = ['the unusual format has been captured in a series of photographs by visual journalist anna erickson.', # gsum or bart
       "around 30 drivers live in rvs in a parking lot in seattle's sodo area.", # refactor
       'she came across them when she stopped to ask a seemingly homeless man for directions.'] # gsum

# The same three sentences with space-separated punctuation, as in the candidate data.
ri = ['the unusual format has been captured in a series of photographs by visual journalist anna erickson .',
      "around 30 drivers live in rvs in a parking lot in seattle 's sodo area .",
     'she came across them when she stopped to ask a seemingly homeless man for directions .']

# Candidate labelled bart
origin = ['the unusual format has been captured in a series of photographs by visual journalist anna erickson .',
          "meet bud dodson , 57 , and welcome to his home : an rv in seattle 's sodo where he watches over the parking lot in exchange for a spot",
          'john worden , 52 , has been living in his vehicle for years since his apartment burned down and he was left homeless .'] 

# Reference summary used for scoring in the cells below
real = ["around 30 people live a floating life in seattle 's sodo ( south of downtown ) area in their rvs .",
        'there is one parking lot in particular where the owner lets them act as watchmen in exchange for a spot to live .',
        'visual journalist anna erickson , who photographed the community , said they are just grateful to have a home .']

# `ri` and `new` differ only in how punctuation is spaced, so any difference between
# the two printed scores comes from tokenization alone.
print(get_rouge(real, ri))
print(get_rouge(real, new))


0.456521734215501
0.41758241268445845


In [88]:
# Score every sentence of every candidate on its own against the reference summary,
# collecting (score, sentence) tuples. Sentences shared by several candidates appear
# once per candidate.
tmp = []
for c in cands:
    for s in c:
        tmp.append((get_rouge(real, [s]), s))

In [91]:
# Rank the single sentences by their score, best first, and display the ranking.
tmp = sorted(tmp, key=lambda x : x[0], reverse=True)
tmp

[(0.37499999545,
  "meet bud dodson , 57 , and welcome to his home : an rv in seattle 's sodo where he watches over the parking lot in exchange for a spot"),
 (0.3692307660307693,
  "around 30 drivers live in rvs in a parking lot in seattle 's sodo area ."),
 (0.26865671294274895,
  'bud dodson , 57 , watches over the parking lot in exchange for a semi-permanent spot .'),
 (0.23529411404844294,
  'the unusual format has been captured in a series of photographs by visual journalist anna erickson .'),
 (0.23529411404844294,
  'the unusual format has been captured in a series of photographs by visual journalist anna erickson .'),
 (0.21212120877869609,
  'bud dodson , 57 , is a maintenance man who watches over the parking lot .'),
 (0.12121211786960524,
  'she came across them when she stopped to ask a seemingly homeless man for directions .'),
 (0.08219177672358811,
  'john worden , 52 , has been living in his vehicle for years since his apartment burned down and he was left homeless .')

In [92]:
# Candidate built from the three best-scoring single sentences of the ranking above,
# scored as a whole against the reference summary.
sent_level_cand = ["meet bud dodson , 57 , and welcome to his home : an rv in seattle 's sodo where he watches over the parking lot in exchange for a spot",
                  "around 30 drivers live in rvs in a parking lot in seattle 's sodo area .",
                  'bud dodson , 57 , watches over the parking lot in exchange for a semi-permanent spot .']

get_rouge(real, sent_level_cand)

0.4597701101334391

In [82]:
# The three sentences of the `origin` candidate, each scored on its own against the
# reference summary.
ext_1 = ['the unusual format has been captured in a series of photographs by visual journalist anna erickson .']
ext_2 = ["meet bud dodson , 57 , and welcome to his home : an rv in seattle 's sodo where he watches over the parking lot in exchange for a spot"]
ext_3 = ['john worden , 52 , has been living in his vehicle for years since his apartment burned down and he was left homeless .']

print(get_rouge(real, ext_1))
print(get_rouge(real, ext_2))
print(get_rouge(real, ext_3))

0.23529411404844294
0.37499999545
0.08219177672358811


In [84]:
# Score each sentence of `ri` on its own, then the three sentences together.
for se in ri:
    print(get_rouge(real, [se]))
    

print(get_rouge(real, ri))

0.23529411404844294
0.3692307660307693
0.12121211786960524
0.456521734215501


In [None]:
# Top-1 statistics of the original candidates, averaged over the processed documents.
# `origin_redun` only holds the redundancy tensor of the last processed document; the
# per-document top-1 values are accumulated in `origin_redun_scores`.
print("Origin Redundancy score : {}".format(round(np.mean(origin_redun_scores), 4)))
# NOTE(review): origin_doc_sims is not filled anywhere in the visible part of this
# notebook, so this line reports nan unless it is populated elsewhere.
print("Origin cosine similarity between document and summaries : {}".format(round(np.mean(origin_doc_sims), 4)))
print("Origin ROUGE score between reference and summaries : {}".format(round(np.mean(origin_ref_rouges), 4)))

In [None]:
# Top-1 statistics of the reconstructed candidates, averaged over the processed
# documents. The accumulated list is `refine_redun_scores`; the name `refine_redun`
# is not defined anywhere in the visible notebook. The labels now say "Reconstructed"
# so that this report can be told apart from the one for the original candidates.
print("Reconstructed Redundancy score : {}".format(round(np.mean(refine_redun_scores), 4)))
# NOTE(review): refine_doc_sims is not filled anywhere in the visible part of this
# notebook, so this line reports nan unless it is populated elsewhere.
print("Reconstructed cosine similarity between document and summaries : {}".format(round(np.mean(refine_doc_sims), 4)))
print("Reconstructed ROUGE score between reference and summaries : {}".format(round(np.mean(refine_ref_rouges), 4)))