In [1]:
# import pandas as pd
import csv
import numpy as np
import os
from os.path import join as pjoin
from glob import iglob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#!/usr/bin/env python
from __future__ import division

import argparse
import glob
import os
import random
import signal
import time

import torch

import distributed
from models import data_loader, model_builder
from models.data_loader import load_dataset
from models.model_builder import ExtSummarizer
from models.trainer_ext import build_trainer
from others.logging import logger, init_logger

model_flags = ['hidden_size', 'ff_size', 'heads', 'inter_layers', 'encoder', 'ff_actv', 'use_interval', 'rnn_size']


In [3]:
root_path = '/data/ksb/'
bert_root_path = pjoin(root_path, 'BertSum/PreSumm')
bert_model_dir = pjoin(bert_root_path, 'models')

data_dir = pjoin(root_path, 'cnn-dailymail/finished_files')

#### Loss function 비교  

*Trained Model parameter 필요*


In [4]:
def get_cos_similarity(inputs, summaries):
    tfidf_vectorizer = TfidfVectorizer()

    cos_similarity_list = []
    for input_, summary_ in zip(inputs, summaries):
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform([input_, summary_])

            similarity = cosine_similarity(tfidf_matrix[0] , tfidf_matrix[1])[0][0]
        except ValueError:
            similarity = 0.0
            
        cos_similarity_list.append(similarity)

    return cos_similarity_list

In [5]:
import jsonlines
import json

with open(pjoin(root_path, 'three-sample.jsonl'),'r',encoding='utf-8') as f:
    data = json.load(f)
 
    article = data["article"]
    candidate = data["candidates"]        
    abstract = data["abstract"]

### Origin candidate set

In [6]:
candidate

[[["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness'],
  0.40032206119162644],
 [["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .'],
  0.40383502768823876],
 [["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of

In [7]:
doc_sim_list = [round(np.mean(get_cos_similarity(article, cand[0])),3) for cand in candidate]
ref_sim_list = [round(np.mean(get_cos_similarity(abstract, cand[0])),3) for cand in candidate]

print("Cosine similarity between document and summaries : {}".format(doc_sim_list))
print("Cosine similarity between reference and summaries : {}".format(ref_sim_list))

Cosine similarity between document and summaries : [0.375, 0.593, 0.711]
Cosine similarity between reference and summaries : [0.171, 0.11, 0.105]


In [8]:
from rouge import Rouge 
rouge = Rouge()

In [9]:
doc_rouge_list = [round(rouge.get_scores('\n'.join(cand[0]), '\n'.join(article))[0]['rouge-l']['f'],3) for cand in candidate]
ref_rouge_list = [round(rouge.get_scores('\n'.join(cand[0]), '\n'.join(abstract))[0]['rouge-l']['f'],3) for cand in candidate]
rouge_list = [round(cand[1],3) for cand in candidate]

print("Rouge score between document and summaries : {}".format(doc_rouge_list))
print("Rouge score between reference and summaries : {}".format(ref_rouge_list))
print("Rouge score between reference and summaries(written) : {}".format(rouge_list))


Rouge score between document and summaries : [0.571, 0.618, 0.453]
Rouge score between reference and summaries : [0.447, 0.512, 0.478]
Rouge score between reference and summaries(written) : [0.4, 0.404, 0.387]


#### Get new candidate set

In [10]:
candidate_sets = [cand[0] for cand in candidate]
candidate_sets

[["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
  'he was not booked by the referee but could face a heavy retrospective ban .',
  'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness'],
 ["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
  'he was not booked by the referee but could face a heavy retrospective ban .',
  'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .'],
 ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness .',
  'he was not bo

In [11]:
from functools import reduce
candidate_sents = reduce(lambda x, y: x + y, candidate_sets)
candidate_sents

["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
 'he was not booked by the referee but could face a heavy retrospective ban .',
 'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness',
 "juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
 'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
 'he was not booked by the referee but could face a heavy retrospective ban .',
 'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .',
 "juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
 'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness .',
 'he was not booked by the 

In [32]:
def compute_redundancy_score(candidate_id):

    cand_num = len(candidate_id)
    
    score = torch.zeros([cand_num], dtype=torch.float64)
        
    def _lcs(X, Y, m, n):
        if m == 0 or n == 0:
            return 0
        elif X[m-1] == Y[n-1]:
            return 1 + _lcs(X, Y, m-1, n-1)
        else:
            return max(_lcs(X, Y, m, n-1), _lcs(X, Y, m-1, n))

    def _compute_redundancy(cand):
        redundancy = 0.0
        for i, src_sen in enumerate(cand):
            for j, tgt_sen in enumerate(cand):
                if i is not j:
                    redundancy += _lcs(src_sen, tgt_sen, len(src_sen), len(tgt_sen)) / len(src_sen)
        return redundancy

    for i in range(cand_num):
        score[i] = np.mean(_compute_redundancy(candidate_id[i]))

    return score

In [26]:
from transformers import BertTokenizer

def bert_encode(x, max_len=-1):
    tok = BertTokenizer.from_pretrained('bert-base-uncased', verbose=False)
    cls_token_id = tok.cls_token_id
    sep_token_id = tok.sep_token_id

    _ids = tok.encode(x, add_special_tokens=False)
    ids = [cls_token_id] # [CLS]
    if max_len > 0:
        ids.extend(_ids[:max_len - 2])
    else:
        ids.extend(_ids[:512 - 2])
    ids.append(sep_token_id) # [SEP], meaning end of sentence
    return ids

In [14]:
candidate_sents_list = []
abstract_txt = '\n'.join(abstract)
for sent in candidate_sents:
    rouge_L = rouge.get_scores(abstract_txt, sent)[0]['rouge-l']['r']
    candidate_sents_list.append((rouge_L, sent))

cand_dict = sorted(candidate_sents_list, key=lambda x: x[0], reverse=True)
cand_dict

[(0.5625,
  "juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league ."),
 (0.5,
  'he was not booked by the referee but could face a heavy retrospective ban .'),
 (0.5,
  "juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league ."),
 (0.5,
  'he was not booked by the referee but could face a heavy retrospective ban .'),
 (0.5,
  'he was not booked by the referee but could face a heavy retrospective ban .'),
 (0.45,
  'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .'),
 (0.375,
  'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness'),
 (0.32142857142857145,
  "club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring ."),
 (0.2,
  'the venezuelan icon sank his teeth into the shoulde

In [15]:
from itertools import combinations
import numpy as np

sent_id = np.arange(len(cand_dict))
indices = list(combinations(sent_id, 2))
indices += list(combinations(sent_id, 3))
indices

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (4, 5),
 (4, 6),
 (4, 7),
 (4, 8),
 (4, 9),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (6, 7),
 (6, 8),
 (6, 9),
 (7, 8),
 (7, 9),
 (8, 9),
 (0, 1, 2),
 (0, 1, 3),
 (0, 1, 4),
 (0, 1, 5),
 (0, 1, 6),
 (0, 1, 7),
 (0, 1, 8),
 (0, 1, 9),
 (0, 2, 3),
 (0, 2, 4),
 (0, 2, 5),
 (0, 2, 6),
 (0, 2, 7),
 (0, 2, 8),
 (0, 2, 9),
 (0, 3, 4),
 (0, 3, 5),
 (0, 3, 6),
 (0, 3, 7),
 (0, 3, 8),
 (0, 3, 9),
 (0, 4, 5),
 (0, 4, 6),
 (0, 4, 7),
 (0, 4, 8),
 (0, 4, 9),
 (0, 5, 6),
 (0, 5, 7),
 (0, 5, 8),
 (0, 5, 9),
 (0, 6, 7),
 (0, 6, 8),
 (0, 6, 9),
 (0, 7, 8),
 (0, 7, 9),
 (0, 8, 9),
 (1, 2, 3),
 (1, 2, 4),
 (1, 2, 5),
 (1, 2, 6),
 (1, 2, 7),
 (1, 2, 8),
 (1, 2, 9),
 (1, 3, 4),
 (1, 3, 5),
 (1, 3, 6),
 (1, 3, 7),
 (1, 3, 8),
 (1, 3, 9),
 (1, 4,

In [16]:
candidate_set = [[cand_dict[idx][1] for idx in idxs] for idxs in indices]
candidate_set

[["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'he was not booked by the referee but could face a heavy retrospective ban .'],
 ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  "juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league ."],
 ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'he was not booked by the referee but could face a heavy retrospective ban .'],
 ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'he was not booked by the referee but could face a heavy retrospective ban .'],
 ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down 

In [36]:
cand_set_list = []

c = [[bert_encode(sent,180) for sent in cs] for cs in candidate_set]

In [37]:
redundancy = compute_redundancy_score(c)
redundancy

KeyboardInterrupt: 

In [None]:
for i, cs in enumerate(candidate_set):
    rouge_L = rouge.get_scores(abstract_txt, '\n'.join(cs))[0]['rouge-l']['r']

    cand_set_list.append((rouge_L, redundancy[i], cs))
    
    
result = sorted(cand_set_list, key=lambda x: x[0], reverse=True)
result