In [1]:
# import pandas as pd
import csv
import numpy as np
import os
from os.path import join as pjoin
from glob import iglob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
root_path = '/data/ksb/'
data_dir = pjoin(root_path, 'cnn-dailymail/finished_files')

#### Loss function 비교  

*Trained Model parameter 필요*


In [None]:
def forward(self, text_id, candidate_id, summary_id):
        
    batch_size = text_id.size(0)
        
    # get document embedding
    input_mask = ~(text_id == 0)
    doc_emb = self.encoder(text_id, attention_mask=input_mask)[0] # last layer
    # compute weights
    input_mask = ~input_mask
    weights = self.transformer(doc_emb.transpose(0, 1), src_key_padding_mask=input_mask)  # [seq_len, bz, dm]
    global_encoding = weights[0] # [bz, dm]
    weights = weights.transpose(0, 1)  # [bz, seq_len, dm]
    weights = torch.matmul(doc_emb, global_encoding.unsqueeze(-1)).squeeze(-1)
    weights = F.softmax(weights / math.sqrt(self.hidden_size), dim=1)
        
    # get summary embedding
    input_mask = ~(summary_id == 0)
    summary_emb = self.encoder(summary_id, attention_mask=input_mask)[0] # last layer

    # get candidate embedding
    candidate_num = candidate_id.size(1)
    candidate_id = candidate_id.view(-1, candidate_id.size(-1))
    input_mask = ~(candidate_id == 0)
    candidate_emd = self.encoder(candidate_id, attention_mask=input_mask)[0]

    # scoring
    doc_emb = F.normalize(doc_emb, dim=2)
    summary_emb = F.normalize(summary_emb, dim=2)
    summary_score = self.compute_score(doc_emb, summary_emb, weights)
    doc_emb = torch.repeat_interleave(doc_emb, candidate_num, dim=0)
    weights = torch.repeat_interleave(weights, candidate_num, dim=0)
    candidate_emd = F.normalize(candidate_emd, dim=2)
    score = self.compute_score(doc_emb, candidate_emd, weights)
    score = score.view(batch_size, candidate_num)
        
    return {'score': score, 'summary_score': summary_score}

In [None]:
def RankingLoss(score, summary_score=None, margin=0, gold_margin=0, gold_weight=1, no_gold=False, no_cand=False):
    ones = torch.ones_like(score)
    loss_func = torch.nn.MarginRankingLoss(0.0)
    TotalLoss = loss_func(score, score, ones)
    # candidate loss
    n = score.size(1)
    if not no_cand:
        for i in range(1, n):
            pos_score = score[:, :-i]
            neg_score = score[:, i:]
            pos_score = pos_score.contiguous().view(-1)
            neg_score = neg_score.contiguous().view(-1)
            ones = torch.ones_like(pos_score)
            loss_func = torch.nn.MarginRankingLoss(margin * i)
            loss = loss_func(pos_score, neg_score, ones)
            TotalLoss += loss
    if no_gold:
        return TotalLoss
    # gold summary loss
    pos_score = summary_score.unsqueeze(-1).expand_as(score)
    neg_score = score
    pos_score = pos_score.contiguous().view(-1)
    neg_score = neg_score.contiguous().view(-1)
    ones = torch.ones_like(pos_score)
    loss_func = torch.nn.MarginRankingLoss(gold_margin)
    TotalLoss += gold_weight * loss_func(pos_score, neg_score, ones)
    
    return TotalLoss

In [4]:
def get_cos_similarity(inputs, summaries):
    tfidf_vectorizer = TfidfVectorizer()

    cos_similarity_list = []
    for input_, summary_ in zip(inputs, summaries):
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform([input_, summary_])

            similarity = cosine_similarity(tfidf_matrix[0] , tfidf_matrix[1])[0][0]
        except ValueError:
            similarity = 0.0
            
        cos_similarity_list.append(similarity)

    return cos_similarity_list

In [5]:
import jsonlines
import json

with open('three-sample.jsonl','r',encoding='utf-8') as f:
    data = json.load(f)
 
    article = data["article"]
    candidate = data["candidates"]        
    abstract = data["abstract"]

In [6]:
candidate[0][0]

["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
 'he was not booked by the referee but could face a heavy retrospective ban .',
 'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness']

In [7]:
doc_sim_list = [round(np.mean(get_cos_similarity(article, cand[0])),3) for cand in candidate]
ref_sim_list = [round(np.mean(get_cos_similarity(abstract, cand[0])),3) for cand in candidate]

print("Cosine similarity between document and summaries : {}".format(doc_sim_list))
print("Cosine similarity between reference and summaries : {}".format(ref_sim_list))

Cosine similarity between document and summaries : [0.375, 0.593, 0.711]
Cosine similarity between reference and summaries : [0.171, 0.11, 0.105]


In [9]:
from rouge import Rouge 
rouge = Rouge()

In [16]:
doc_rouge_list = [round(rouge.get_scores('\n'.join(cand[0]), '\n'.join(article))[0]['rouge-l']['f'],3) for cand in candidate]
ref_rouge_list = [round(rouge.get_scores('\n'.join(cand[0]), '\n'.join(abstract))[0]['rouge-l']['f'],3) for cand in candidate]
rouge_list = [round(cand[1],3) for cand in candidate]

print("Rouge score between document and summaries : {}".format(doc_rouge_list))
print("Rouge score between reference and summaries : {}".format(ref_rouge_list))
print("Rouge score between reference and summaries(written) : {}".format(rouge_list))


Rouge score between document and summaries : [0.571, 0.618, 0.453]
Rouge score between reference and summaries : [0.447, 0.512, 0.478]
Rouge score between reference and summaries(written) : [0.4, 0.404, 0.387]


In [20]:
n_gram = 3
scores = []
copy_summaries = [cand[0] for cand in candidate]

for i in range(len(copy_summaries)): # candidate summary set 개수
    for j in range(len(copy_summaries[i])): # sentence 개수
        ext_tok = copy_summaries[i][j].split()
        window = len(ext_tok) // n_gram

        for w in range(window):
            s = rouge.get_scores(' '.join(ext_tok[w:w+n_gram]), '\n'.join(abstract))[0]['rouge-l']['f']

            score = (' '.join(ext_tok[w:w+n_gram]), s)
            scores.append(score)
dscored = sorted(scores,key=lambda x: x[1], reverse=True)
dscored

[('in his team', 0.19354838534859523),
 ('jesus zavela in', 0.19354838534859523),
 ('zavela in a', 0.19354838534859523),
 ('arango had earlier', 0.19354838534859523),
 ('jesus zavela in', 0.19354838534859523),
 ('star juan arango', 0.12903225631633716),
 ('juan arango conjured', 0.12903225631633716),
 ('suarez in his', 0.12903225631633716),
 ('by the referee', 0.12903225631633716),
 ('juan arango (', 0.12903225631633716),
 ('juan arango bites', 0.12903225631633716),
 ('arango bites jesus', 0.12903225631633716),
 ('bites jesus zavela', 0.12903225631633716),
 ('in a moment', 0.12903225631633716),
 ('by the referee', 0.12903225631633716),
 ('had earlier curled', 0.12903225631633716),
 ('earlier curled in', 0.12903225631633716),
 ('curled in a', 0.12903225631633716),
 ('in a magnificent', 0.12903225631633716),
 ('a magnificent free', 0.12903225631633716),
 ('magnificent free kick', 0.12903225631633716),
 ('juan arango bites', 0.12903225631633716),
 ('arango bites jesus', 0.1290322563163371

In [21]:
def delete_(ext_sents, delete_ngram):
    return list(map(lambda sent : sent.replace(delete_ngram, ''),ext_sents))

In [27]:
dscored = [sc for sc in dscored if sc[1]==0.0]
for sc in dscored:
    pruned_summary = list(map(lambda sents : delete_(sents, sc[0]), copy_summaries))

In [28]:
pruned_summary

[["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
  'he was  the referee but could face a heavy retrospective ban .',
  'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness'],
 ["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
  'he was  the referee but could face a heavy retrospective ban .',
  'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .'],
 ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness .',
  'he was  the referee but could face a he

In [29]:
doc_rouge_list = [round(rouge.get_scores('\n'.join(cand), '\n'.join(article))[0]['rouge-l']['f'],3) for cand in pruned_summary]
ref_rouge_list = [round(rouge.get_scores('\n'.join(cand), '\n'.join(abstract))[0]['rouge-l']['f'],3) for cand in pruned_summary]

print("Rouge score between document and summaries : {}".format(doc_rouge_list))
print("Rouge score between reference and summaries : {}".format(ref_rouge_list))


Rouge score between document and summaries : [0.563, 0.602, 0.433]
Rouge score between reference and summaries : [0.453, 0.524, 0.492]


In [30]:
doc_sim_list = [round(np.mean(get_cos_similarity(article, cand)),3) for cand in pruned_summary]
ref_sim_list = [round(np.mean(get_cos_similarity(abstract, cand)),3) for cand in pruned_summary]

print("Cosine similarity between document and summaries : {}".format(doc_sim_list))
print("Cosine similarity between reference and summaries : {}".format(ref_sim_list))

Cosine similarity between document and summaries : [0.379, 0.541, 0.641]
Cosine similarity between reference and summaries : [0.192, 0.11, 0.105]


In [32]:
!cp loss_function_.ipynb new_candidate_set.ipynb