In [110]:
# import pandas as pd
import csv
import numpy as np
import os
import copy
from os.path import join as pjoin
from glob import iglob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [111]:
#!/usr/bin/env python
from __future__ import division

import argparse
import glob
import os
import random
import signal
import time

import torch

import distributed
from models import data_loader, model_builder
from models.data_loader import load_dataset
from models.model_builder import ExtSummarizer
from models.trainer_ext import build_trainer
from others.logging import logger, init_logger

model_flags = ['hidden_size', 'ff_size', 'heads', 'inter_layers', 'encoder', 'ff_actv', 'use_interval', 'rnn_size']


In [112]:
root_path = '/data/ksb/'
bert_root_path = pjoin(root_path, 'BertSum/PreSumm')
bert_model_dir = pjoin(bert_root_path, 'models')

data_dir = pjoin(root_path, 'cnn-dailymail/finished_files')

#### Loss function 비교  

*Trained Model parameter 필요*


In [113]:
def get_cos_similarity(inputs, summaries):
    tfidf_vectorizer = TfidfVectorizer()

    cos_similarity_list = []
    for input_, summary_ in zip(inputs, summaries):
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform([input_, summary_])

            similarity = cosine_similarity(tfidf_matrix[0] , tfidf_matrix[1])[0][0]
        except ValueError:
            similarity = 0.0
            
        cos_similarity_list.append(similarity)

    return cos_similarity_list

In [114]:
import jsonlines
import json

with open(pjoin(root_path, 'three-sample.jsonl'),'r',encoding='utf-8') as f:
    data = json.load(f)
 
    article = data["article"]
    candidate = data["candidates"]        
    abstract = data["abstract"]

### Origin candidate set

In [115]:
candidate

[[["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness'],
  0.40032206119162644],
 [["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .'],
  0.40383502768823876],
 [["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of

In [116]:
doc_sim_list = [round(np.mean(get_cos_similarity(article, cand[0])),3) for cand in candidate]
ref_sim_list = [round(np.mean(get_cos_similarity(abstract, cand[0])),3) for cand in candidate]

print("Cosine similarity between document and summaries : {}".format(doc_sim_list))
print("Cosine similarity between reference and summaries : {}".format(ref_sim_list))

Cosine similarity between document and summaries : [0.375, 0.593, 0.711]
Cosine similarity between reference and summaries : [0.171, 0.11, 0.105]


In [117]:
from rouge import Rouge 
rouge = Rouge()

In [118]:
doc_rouge_list = [round(rouge.get_scores('\n'.join(cand[0]), '\n'.join(article))[0]['rouge-l']['f'],3) for cand in candidate]
ref_rouge_list = [round(rouge.get_scores('\n'.join(cand[0]), '\n'.join(abstract))[0]['rouge-l']['f'],3) for cand in candidate]
rouge_list = [round(cand[1],3) for cand in candidate]

print("Rouge score between document and summaries : {}".format(doc_rouge_list))
print("Rouge score between reference and summaries : {}".format(ref_rouge_list))
print("Rouge score between reference and summaries(written) : {}".format(rouge_list))


Rouge score between document and summaries : [0.571, 0.618, 0.453]
Rouge score between reference and summaries : [0.447, 0.512, 0.478]
Rouge score between reference and summaries(written) : [0.4, 0.404, 0.387]


#### Get new candidate set

In [119]:
candidate_sets = [cand[0] for cand in candidate]
candidate_sets

[["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
  'he was not booked by the referee but could face a heavy retrospective ban .',
  'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness'],
 ["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
  'he was not booked by the referee but could face a heavy retrospective ban .',
  'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .'],
 ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
  'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness .',
  'he was not bo

In [120]:
from transformers import BertTokenizer

def bert_encode(x, max_len=-1):
    tok = BertTokenizer.from_pretrained('bert-base-uncased', verbose=False)
    cls_token_id = tok.cls_token_id
    sep_token_id = tok.sep_token_id

    _ids = tok.encode(x, add_special_tokens=False)
    ids = [cls_token_id] # [CLS]
    if max_len > 0:
        ids.extend(_ids[:max_len - 2])
    else:
        ids.extend(_ids[:512 - 2])
    ids.append(sep_token_id) # [SEP], meaning end of sentence
    return ids

def bert_decode(x):
    tok = BertTokenizer.from_pretrained('bert-base-uncased', verbose=False)

    result = tok.decode(x, skip_special_tokens=True)
    return result

In [121]:
def detect_trigram(src, tgt):
    assert len(tgt) > 2 and len(src) > 2
        
    tgt_trigrams = [(tgt[i-1],tgt[i],tgt[i+1]) for i in range(1,len(tgt)-1)]
    src_trigrams = [(src[i-1],src[i],src[i+1]) for i in range(1,len(src)-1)]
    
    for src_tri in src_trigrams:
        if src_tri in tgt_trigrams:
            return True ## Detect trigram overlapped with target
        
    return False
    

In [122]:
def detect_4_gram(src, tgt):
    assert len(tgt) > 3 and len(src) > 3
    tgt_4_grams = [(tgt[i-2], tgt[i-1],tgt[i],tgt[i+1]) for i in range(2,len(tgt)-1)]
    src_4_grams = [(src[i-2], src[i-1],src[i],src[i+1]) for i in range(2,len(src)-1)]
    
    for src_gram in src_4_grams:
        if src_gram in tgt_4_grams:
            return True ## Detect 4-gram overlapped with target
        
    return False
    

In [123]:
def detect_5_gram(src, tgt):
    assert len(tgt) > 4 and len(src) > 4
    tgt_5_grams = [(tgt[i-2], tgt[i-1],tgt[i],tgt[i+1], tgt[i+2]) for i in range(2,len(tgt)-2)]
    src_5_grams = [(src[i-2], src[i-1],src[i],src[i+1], src[i+2]) for i in range(2,len(src)-2)]
    
    for src_gram in src_5_grams:
        if src_gram in tgt_5_grams:
            return True ## Detect 5-gram overlapped with target
        
    return False

In [124]:
encoded_cand_set = [[bert_encode(s, 180) for s in cs] for cs in candidate_sets]

In [125]:
def _map_sent_id(cand_set):
    mapping_ = []
    for i, cand in enumerate(cand_set) :
        summ_ = []
        for j, s in enumerate(cand):
            idx = sum([len(prev) for prev in cand_set[:i]]) + j
            summ_.append((idx, s))
            
        mapping_.append(summ_)
            
    return mapping_
    
def _get_sent_fromId(cand_set, idx):
    dscored = [sent[1] for cand in cand_set for sent in cand if sent[0]==idx]
    return dscored[0]

In [126]:
def detect_ngram_list(src, tgt_list, n_gram='trigram'):
    
    if n_gram =='trigram':
        return sum([detect_trigram(src, tgt) for tgt in tgt_list]) > 0
    elif n_gram =='4-gram':
        return sum([detect_4_gram(src, tgt) for tgt in tgt_list]) > 0
    else :
        return sum([detect_4_gram(src, tgt) for tgt in tgt_list])>0

In [127]:
def map_sent_id(cand_set):
    mapping_ = []
    for i, cand in enumerate(cand_set) :
        summ_ = []
        for j, s in enumerate(cand):
            idx = sum([len(prev) for prev in cand_set[:i]]) + j
            summ_.append((idx, s))
            
        mapping_.append(summ_)
            
    return mapping_
    
def get_sent_fromId(cand_set, idx):
    dscored = [sent[1] for cand in cand_set for sent in cand if sent[0]==idx]
    return dscored[0]

In [128]:
def get_candidate_set(sent_set, reference=None, n_gram='trigram'):
    
    assert n_gram in ['trigram', '4-gram','5-gram']
        
    if n_gram == 'trigram':
        detect_ngram = detect_trigram
    elif n_gram == '4-gram':
        detect_ngram = detect_4_gram
    else:
        detect_ngram = detect_5_gram
        
    
    possible_set_ids = []
    
    for sent_id, sent in sent_set:
        possible_2_sent_idx = []
        
        print("Sentece ID ({}) Detect all possible combination\n".format(sent_id))
        
        print("Detect all possible combination whose length is 2")
        # number of summary sentences = 2
        for tgt_sent_id, tgt_sent in sent_set[sent_id+1:]:
            
            # Detect n-gram (default= trigram) 
            if not detect_ngram(src=sent, tgt=tgt_sent):
                possible_2_sent_idx.append(set([sent_id, tgt_sent_id]))
                
        print("Number of detected possible combination is {}\n".format(len(possible_2_sent_idx)))
        
        print("Detect all possible combination whose length is 3")
        possible_3_sent_idx = copy.deepcopy(possible_2_sent_idx)
        
        # number of summary sentences = 3
        for tgt_sent_id, tgt_sent in sent_set[sent_id+1:]:
            for poss_sent_ids in possible_2_sent_idx:
                
                poss_sent = [sent_set[ids][1] for ids in poss_sent_ids]
                if not detect_ngram_list(src=tgt_sent, tgt_list=poss_sent, n_gram=n_gram):
                    poss_3_ids = copy.deepcopy(poss_sent_ids)
                    poss_3_ids.add(tgt_sent_id)
                    
                    possible_3_sent_idx.append(poss_3_ids)
                    
        print("Number of detected possible combination is {}\n".format(len(possible_3_sent_idx)))

        possible_sent_idx = possible_2_sent_idx + possible_3_sent_idx
        print("Total number of detected possible combination is {}\n".format(len(possible_sent_idx)))
        
        for ids in possible_sent_idx:
            if not ids in possible_set_ids:
                possible_set_ids.append(ids)

    return possible_set_ids
        

In [129]:
sent_set = []

for i, encoded_cand in enumerate(encoded_cand_set):
    for j, encoded_sent in enumerate(encoded_cand):
        
        sent_id = sum([len(prev) for prev in encoded_cand_set[:i]])+j
        sent_set.append((sent_id, encoded_sent))

In [130]:
reduced_cand_ids = get_candidate_set(sent_set)

Sentece ID (0) Detect all possible combination

Detect all possible combination whose length is 2
Number of detected possible combination is 6

Detect all possible combination whose length is 3
Number of detected possible combination is 28

Total number of detected possible combination is 34

Sentece ID (1) Detect all possible combination

Detect all possible combination whose length is 2
Number of detected possible combination is 6

Detect all possible combination whose length is 3
Number of detected possible combination is 22

Total number of detected possible combination is 28

Sentece ID (2) Detect all possible combination

Detect all possible combination whose length is 2
Number of detected possible combination is 3

Detect all possible combination whose length is 3
Number of detected possible combination is 7

Total number of detected possible combination is 10

Sentece ID (3) Detect all possible combination

Detect all possible combination whose length is 2
Number of detected po

In [131]:
reduced_cand_ids

[{0, 1},
 {0, 4},
 {0, 5},
 {0, 6},
 {0, 8},
 {0, 9},
 {0, 1, 4},
 {0, 1, 6},
 {0, 1, 8},
 {0, 4, 5},
 {0, 4, 6},
 {0, 4, 9},
 {0, 5, 6},
 {0, 5, 8},
 {0, 6, 8},
 {0, 6, 9},
 {0, 8, 9},
 {1, 2},
 {1, 3},
 {1, 4},
 {1, 6},
 {1, 7},
 {1, 8},
 {1, 2, 6},
 {1, 3, 4},
 {1, 3, 6},
 {1, 4, 6},
 {1, 4, 7},
 {1, 6, 7},
 {1, 6, 8},
 {1, 7, 8},
 {2, 5},
 {2, 6},
 {2, 9},
 {2, 5, 6},
 {2, 6, 9},
 {3, 4},
 {3, 5},
 {3, 6},
 {3, 9},
 {3, 4, 5},
 {3, 4, 6},
 {3, 4, 9},
 {3, 5, 6},
 {3, 6, 9},
 {4, 5},
 {4, 6},
 {4, 7},
 {4, 9},
 {4, 5, 6},
 {4, 5, 7},
 {4, 6, 7},
 {4, 6, 9},
 {4, 7, 9},
 {5, 6},
 {5, 7},
 {5, 8},
 {5, 6, 7},
 {5, 6, 8},
 {5, 7, 8},
 {6, 7},
 {6, 8},
 {6, 9},
 {6, 7, 8},
 {6, 7, 9},
 {6, 8, 9},
 {7, 8},
 {7, 9},
 {7, 8, 9},
 {8, 9}]

In [132]:
len(reduced_cand_ids)

70

In [133]:
reduced_cand_sents = [[sent_set[i][1] for i in ids] for ids in reduced_cand_ids]
reduced_cand_sents

[[[101,
   2252,
   14841,
   9103,
   5162,
   2732,
   5348,
   19027,
   16656,
   9530,
   26949,
   5758,
   6446,
   22551,
   1999,
   2010,
   2136,
   1005,
   1055,
   1018,
   1011,
   1017,
   4154,
   2011,
   26843,
   1999,
   1996,
   4916,
   2223,
   1011,
   2021,
   2009,
   2001,
   2025,
   2083,
   4013,
   4305,
   11411,
   2271,
   4577,
   1012,
   102],
  [101,
   2002,
   2001,
   2025,
   17414,
   2011,
   1996,
   5330,
   2021,
   2071,
   2227,
   1037,
   3082,
   15354,
   7221,
   1012,
   102]],
 [[101,
   2252,
   14841,
   9103,
   5162,
   2732,
   5348,
   19027,
   16656,
   9530,
   26949,
   5758,
   6446,
   22551,
   1999,
   2010,
   2136,
   1005,
   1055,
   1018,
   1011,
   1017,
   4154,
   2011,
   26843,
   1999,
   1996,
   4916,
   2223,
   1011,
   2021,
   2009,
   2001,
   2025,
   2083,
   4013,
   4305,
   11411,
   2271,
   4577,
   1012,
   102],
  [101,
   1996,
   15332,
   12696,
   7569,
   2010,
   4091,
   2046,
   1

In [134]:
reduced_cand_set_dec = [[bert_decode(x) for x in cand] for cand in reduced_cand_sents]

In [135]:
reduced_cand_set_dec

[["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
  'he was not booked by the referee but could face a heavy retrospective ban.'],
 ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
  'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.'],
 ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
  'he was not booked by the referee but could face a heavy retrospective ban.'],
 ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
  'arango had earlier curled in a magni

In [257]:
rouge_cands_set = []
for c in reduced_cand_set_dec:
            
    scores = rouge.get_scores('\n'.join(article), '\n'.join(c))[0]['rouge-l']['f']
#     score = np.mean(scores)
    sims = np.mean(get_cos_similarity(article, c))

    print(sims + scores)

    rouge_cands_set.append((sims + scores, c))

rouge_cands_set = sorted(rouge_cands_set, key=lambda x: x[0], reverse=True)
fined_cands_set = rouge_cands_set # [:20]

0.9773630507059818
1.384731303417963
0.9773630507059818
1.0437617969397377
1.2730622936269413
0.9773630507059818
0.9366400373757537
0.9260464099908341
1.405851051312519
1.4903150272600552
1.2088380293964338
0.9366400373757537
0.9260464099908341
1.405851051312519
1.124979380325411
0.9260464099908341
0.9091160681756775
0.6232636947813281
0.5787242608792187
0.8874630437441401
0.5527758954408039
0.5393323662406573
0.4592611970842423
0.7021790312912723
0.6533294679760643
0.666621125543668
0.8783119305998137
0.8109292601771234
0.5999987743168007
0.5884187541398085
0.5196809813129463
0.4908352284877474
0.5438071634252322
0.6232636947813281
0.6138933870955519
0.7021790312912723
1.010324414419276
0.6226653311784551
0.6880403746594593
0.5787242608792187
1.2362985047265234
0.9455832420932269
0.6533294679760643
0.6959151724098256
0.666621125543668
0.520352977394061
0.5733249123315457
0.5598813831313991
0.8874630437441401
0.6335718863664276
0.5661892159437373
0.6136981189106286
0.8783119305998137
0

In [219]:
rouge_cands_set = []
for c in reduced_cand_set_dec:
    
    scores = [rouge.get_scores('\n'.join(cand[0]), '\n'.join(c))[0]['rouge-l']['r'] for cand in candidate]
    sims = [np.mean(get_cos_similarity(cand[0], c)) for cand in candidate]
    scores = [score * sim for score, sim in zip(scores, sims)]
    score = np.mean(scores)
    print(score)

    rouge_cands_set.append((score, c))

rouge_cands_set = sorted(rouge_cands_set, key=lambda x: x[0], reverse=True)
fined_cands_set = rouge_cands_set # [:20]

0.4167020188165737
0.3552684340698713
0.4167020188165737
0.17158254055894925
0.37907804353574054
0.4167020188165737
0.29104925217984806
0.21632606748910557
0.44345692821062777
0.4275640369628606
0.217389061866791
0.29104925217984806
0.21632606748910557
0.44345692821062777
0.2148936877007498
0.21632606748910557
0.31551577106570994
0.15660574993736218
0.1270060638338575
0.30762659501246997
0.043056640278563214
0.101607186166564
0.2533004236413667
0.08077265394882686
0.12253761399130368
0.06959131741047532
0.16912381194955575
0.22846929561731588
0.06213869905465617
0.12816231929437358
0.19641851543376956
0.3419030103403328
0.11672341811949223
0.15660574993736218
0.1728524146345279
0.08077265394882686
0.5350806087934673
0.5028933372593222
0.2541046621801388
0.1270060638338575
0.5750161245015933
0.3065468365176749
0.12253761399130368
0.26805292692923083
0.06959131741047532
0.22717661356005003
0.06758076142760204
0.12620369888274396
0.30762659501246997
0.11505701045299126
0.18578585805848646

In [249]:
fined_cands_set

[(0.954600745627402,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.',
   'he was not booked by the referee but could face a heavy retrospective ban.']),
 (0.9319011184411028,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.']),
 (0.8820415315846278,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.',
   'he was

In [107]:
# redun_cands_set = []
# redun_score = compute_txt_redundancy_score(reduced_cand_set_dec)

# for redun_sc, c in zip(redun_score, reduced_cand_set_dec):
#     redun_score = redun_sc.item()
#     redun_cands_set.append((redun_score, c))
    
# redun_cands_set = sorted(redun_cands_set, key=lambda x: -x[0], reverse=True)
# redun_cands_set

0.0
0.0
0.0
0.0
0.0
0.0
0.25290697674418605
0.26495726495726496
0.22590361445783133
0.22093023255813954
0.2755905511811024
0.25290697674418605
0.26495726495726496
0.22590361445783133
0.2845528455284553
0.26495726495726496
0.25301204819277107
0.0
0.0
0.0
0.0
0.0
0.0
0.26838235294117646
0.28523489932885904
0.2786885245901639
0.2768166089965398
0.2727272727272727
0.26595744680851063
0.2888086642599278
0.2737642585551331
0.0
0.0
0.0
0.26838235294117646
0.26838235294117646
0.0
0.0
0.0
0.0
0.23825503355704697
0.28955223880597014
0.28523489932885904
0.2786885245901639
0.2786885245901639
0.0
0.0
0.0
0.0
0.2768166089965398
0.2727272727272727
0.27564102564102566
0.2768166089965398
0.2727272727272727
0.0
0.0
0.0
0.26595744680851063
0.2888086642599278
0.2737642585551331
0.0
0.0
0.0
0.27666666666666667
0.26595744680851063
0.2888086642599278
0.0
0.0
0.2737642585551331
0.0


[(0.0,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'he was not booked by the referee but could face a heavy retrospective ban.']),
 (0.0,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.']),
 (0.0,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'he was not booked by the referee but could face a heavy retrospective ban.']),
 (0.0,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
 

In [43]:
# origin candidate cosine similarity 
threshold = min([np.mean(get_cos_similarity(article, cand[0])) for cand in candidate])
threshold

0.3749457697160617

In [81]:
# Drop candidate which has lower score than threshold
fined_cands_set = [sc for sc in cos_sim_cands_set if sc[0] >= threshold]

In [84]:
fined_cands_set = [sc for sc in cos_sim_cands_set[:20]]

In [85]:
fined_cands_set

[(-0.000411789336754273,
  ['the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.',
   'he was not booked by the referee but could face a heavy retrospective ban.',
   'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.']),
 (-0.000411789336754273,
  ['the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.',
   'he was not booked by the referee but could face a heavy retrospective ban.',
   'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.']),
 (-0.000411789336754273,
  ['the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.',
   'he was not booked by the referee but could face a heavy retrospective ban.',
   'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.']),
 (-0.005036555798719977,


In [211]:
len(fined_cands_set)

70

In [193]:
def get_lcs(X, Y, m, n):
    if m == 0 or n == 0:
        return 0
    elif X[m-1] == Y[n-1]:
        return 1 + get_lcs(X, Y, m-1, n-1)
    else:
        return max(get_lcs(X, Y, m, n-1), get_lcs(X, Y, m-1, n))

In [165]:
def compute_redundancy_score(candidate_id):

    cand_num = len(candidate_id)
    
    score = torch.zeros([cand_num], dtype=torch.float64)
        
    def _compute_redundancy(cand):
        redundancy = 0.0
        
        for i, src_sen in enumerate(cand):
            for j, tgt_sen in enumerate(cand[i+1:]):
                print(src_sen)
                print(tgt_sen)

                lcs_val = get_lcs(src_sen, tgt_sen, len(src_sen), len(tgt_sen))
                print(lcs_val)
                redundancy += lcs_val / len(src_sen)
                print(redundancy)
                
        return redundancy

    for i in range(cand_num):
        print(i)
        score[i] = np.mean(_compute_redundancy(candidate_id[i]))

    return score

In [166]:
import pylcs

def compute_txt_redundancy_score(candidate_id):

    cand_num = len(candidate_id)
    
    score = torch.zeros([cand_num], dtype=torch.float64)
        
    def _compute_redundancy(cand):
        redundancy = 0.0
        
        for i, src_sen in enumerate(cand):
            for j, tgt_sen in enumerate(cand[i+1:]):
                if i != j:
                    lcs_val = pylcs.lcs(src_sen, tgt_sen)
                    redundancy += lcs_val 
        
        sents_len = sum([len(s) for sents in cand for s in sents])
        return redundancy / sents_len

    for i in range(cand_num):
        score[i] = np.mean(_compute_redundancy(candidate_id[i]))

    return score

### origin redundancy & rouge

In [167]:
# cand_set_enc = [[bert_encode(s, 180) for s in cs] for cs in candidate_sets]


In [168]:
compute_txt_redundancy_score(candidate_sets)

tensor([0.2530, 0.3902, 0.2604], dtype=torch.float64)

In [172]:
origin_scores = []
origin_cands = [(rouge.get_scores('\n'.join(abstract), '\n'.join(cand[0]))[0]['rouge-l']['f'], cand[0]) for cand in candidate]

for re_sc, (ro_sc, sent) in zip(compute_txt_redundancy_score(candidate_sets), origin_cands):
    origin_scores.append((re_sc.item(), ro_sc, sent))


In [174]:
origin_scores = sorted(origin_scores, key=lambda x: -x[0], reverse=True)
origin_scores

[(0.2530487804878049,
  0.447368416398892,
  ["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness']),
 (0.26037735849056604,
  0.44776118916462465,
  ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness .',
   'he was not booked by the referee but could face a heavy retrospective ban .']),
 (0.3902439024390244,
  0.5116279025851812,
  ["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the 

In [175]:
origin_scores = sorted(origin_scores, key=lambda x: x[1], reverse=True)
origin_scores

[(0.3902439024390244,
  0.5116279025851812,
  ["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .']),
 (0.26037735849056604,
  0.44776118916462465,
  ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness .',
   'he was not booked by the referee but could face a heavy retrospective ban .']),
 (0.2530487804878049,
  0.447368416398892,
  ["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was n

In [176]:
origin_scores = sorted(origin_scores, key=lambda x: -x[0]*0.1 +x[1], reverse=True)
origin_scores

[(0.3902439024390244,
  0.5116279025851812,
  ["juan arango bites jesus zavela in a moment of madness in club tijuana 's 4-3 defeat by monterrey in the mexican league .",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat .',
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down .']),
 (0.2530487804878049,
  0.447368416398892,
  ["club tijuana star juan arango conjured memories luis suarez in his team 's 4-3 defeat by monterrey in the mexican league - but it was not through prodigious scoring .",
   'he was not booked by the referee but could face a heavy retrospective ban .',
   'juan arango ( left ) bites the shoulder of opponent jesus zavela in a moment of madness']),
 (0.26037735849056604,
  0.44776118916462465,
  ["juan arango bites jesus zavela in club tijuana 's 4-3 defeat by monterre

### refined redundancy & rouge

In [250]:
fined_cands_set

[(0.954600745627402,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.',
   'he was not booked by the referee but could face a heavy retrospective ban.']),
 (0.9319011184411028,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.']),
 (0.8820415315846278,
  ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
   'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.',
   'he was

In [258]:
refined_cands_set_ = [[s for s in cs[1]] for cs in fined_cands_set]
refined_cands_set_

[["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
  'the venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.',
  'he was not booked by the referee but could face a heavy retrospective ban.'],
 ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
  'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.',
  'he was not booked by the referee but could face a heavy retrospective ban.'],
 ["club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.",
  'the venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.',
  'he was not bo

In [259]:
compute_txt_redundancy_score(refined_cands_set_)

tensor([0.2209, 0.2259, 0.2259, 0.0000, 0.0000, 0.2383, 0.2756, 0.2846, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.2896, 0.2529, 0.2529, 0.2650, 0.2650,
        0.2650, 0.2530, 0.0000, 0.0000, 0.2768, 0.2768, 0.2727, 0.2727, 0.2684,
        0.2684, 0.2787, 0.0000, 0.2787, 0.2787, 0.2852, 0.2852, 0.2768, 0.0000,
        0.0000, 0.0000, 0.2684, 0.2756, 0.2660, 0.2660, 0.2660, 0.0000, 0.2888,
        0.2888, 0.2888, 0.0000, 0.0000, 0.0000, 0.2767, 0.2727, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2738, 0.2738,
        0.2738, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
       dtype=torch.float64)

In [260]:
redun_score = compute_txt_redundancy_score(refined_cands_set_)

scores = []
for re_sc, (ro_sc, rouge_sc, sent) in zip(redun_score, [(cs[0], 
                                               rouge.get_scores('\n'.join(abstract), '\n'.join(cs[1]))[0]['rouge-l']['f'],
                                               '\n'.join(cs[1])) for cs in fined_cands_set]):
    scores.append((re_sc.item(), ro_sc, rouge_sc, sent))


## TEST

In [235]:
text = "he was not booked by the referee but could face a heavy retrospective ban.\narango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.\njuan arango bites jesus zavela in club tijuana's 4 - 3 defeat by monterrey in the mexican league."
text_2 = "juan arango bites jesus zavela in a moment of madness in club tijuana's 4 - 3 defeat by monterrey in the mexican league.\nthe venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.\nhe was not booked by the referee but could face a heavy retrospective ban."

for cand in candidate:
    
    r = rouge.get_scores('\n'.join(cand[0]), text)[0]
    print(r, '\n')
    b = np.mean(get_cos_similarity(article, text.split('\n')))
    print(b, '\n')
    
    r = rouge.get_scores('\n'.join(cand[0]), text_2)[0]
    print(r, '\n')
    b = np.mean(get_cos_similarity(article, text_2.split('\n')))
    print(b, '\n\n')

{'rouge-1': {'r': 0.5957446808510638, 'p': 0.5833333333333334, 'f': 0.5894736792110804}, 'rouge-2': {'r': 0.4528301886792453, 'p': 0.42105263157894735, 'f': 0.4363636313702479}, 'rouge-l': {'r': 0.574468085106383, 'p': 0.5625, 'f': 0.5684210476321331}} 

0.10898081428954975 

{'rouge-1': {'r': 0.7441860465116279, 'p': 0.6666666666666666, 'f': 0.7032966983117982}, 'rouge-2': {'r': 0.5185185185185185, 'p': 0.49122807017543857, 'f': 0.5045044995081569}, 'rouge-l': {'r': 0.6976744186046512, 'p': 0.625, 'f': 0.6593406543557543}} 

0.7700408398990929 


{'rouge-1': {'r': 0.8723404255319149, 'p': 0.7068965517241379, 'f': 0.7809523760072563}, 'rouge-2': {'r': 0.8113207547169812, 'p': 0.5972222222222222, 'f': 0.68799999511552}, 'rouge-l': {'r': 0.8723404255319149, 'p': 0.7068965517241379, 'f': 0.7809523760072563}} 

0.10898081428954975 

{'rouge-1': {'r': 0.9069767441860465, 'p': 0.6724137931034483, 'f': 0.7722772228330557}, 'rouge-2': {'r': 0.9074074074074074, 'p': 0.6805555555555556, 'f': 0.7

In [239]:
r = rouge.get_scores('\n'.join(article), text)[0]
print(r, '\n')
b = np.mean(get_cos_similarity(article, text.split('\n')))
print(b, '\n')
    
r = rouge.get_scores('\n'.join(article), text_2)[0]
print(r, '\n')
b = np.mean(get_cos_similarity(article, text_2.split('\n')))
print(b, '\n\n')

{'rouge-1': {'r': 0.8936170212765957, 'p': 0.35, 'f': 0.5029940079314426}, 'rouge-2': {'r': 0.7358490566037735, 'p': 0.2154696132596685, 'f': 0.3333333298294251}, 'rouge-l': {'r': 0.8723404255319149, 'p': 0.3416666666666667, 'f': 0.49101796002725095}} 

0.10898081428954975 

{'rouge-1': {'r': 0.9302325581395349, 'p': 0.3333333333333333, 'f': 0.490797542128044}, 'rouge-2': {'r': 0.7407407407407407, 'p': 0.22099447513812154, 'f': 0.3404255283751924}, 'rouge-l': {'r': 0.8837209302325582, 'p': 0.31666666666666665, 'f': 0.4662576648274305}} 

0.7700408398990929 




In [261]:
scores = sorted(scores, key=lambda x: x[2], reverse=True)
scores

[(0.26595744680851063,
  0.5999987743168007,
  0.5333333286542222,
  "he was not booked by the referee but could face a heavy retrospective ban.\narango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.\njuan arango bites jesus zavela in club tijuana's 4 - 3 defeat by monterrey in the mexican league."),
 (0.26595744680851063,
  0.5999987743168007,
  0.5333333286542222,
  "he was not booked by the referee but could face a heavy retrospective ban.\narango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.\njuan arango bites jesus zavela in club tijuana's 4 - 3 defeat by monterrey in the mexican league."),
 (0.26595744680851063,
  0.5999987743168007,
  0.5333333286542222,
  "he was not booked by the referee but could face a heavy retrospective ban.\narango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.\njuan arango bites jes

In [262]:
scores = sorted(scores, key=lambda x: -x[0], reverse=True)
scores

[(0.0,
  0.5527758954408039,
  0.4761904712522046,
  'he was not booked by the referee but could face a heavy retrospective ban.\narango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.'),
 (0.0,
  0.5527758954408039,
  0.4761904712522046,
  'he was not booked by the referee but could face a heavy retrospective ban.\narango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.'),
 (0.0,
  0.5527758954408039,
  0.4761904712522046,
  'he was not booked by the referee but could face a heavy retrospective ban.\narango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.'),
 (0.0,
  0.5884327720329133,
  0.4687499950781251,
  "arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2 - 0 down.\njuan arango bites jesus zavela in club tijuana's 4 - 3 defeat by monterrey in the mexican league."

In [263]:
scores = sorted(scores, key=lambda x: x[1], reverse=True)
scores

[(0.22093023255813954,
  1.4903150272600552,
  0.31578946903047095,
  "club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.\nthe venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.\nhe was not booked by the referee but could face a heavy retrospective ban."),
 (0.22590361445783133,
  1.405851051312519,
  0.34210525850415513,
  "club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.\nthe venezuelan icon sank his teeth into the shoulder of jesus zavala in a moment of madness.\nhe was not booked by the referee but could face a heavy retrospective ban."),
 (0.22590361445783133,
  1.405851051312519,
  0.34210525850415513,
  "club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterr

In [264]:
rouge.get_scores('\n'.join(abstract), scores[0][-1])

[{'rouge-1': {'r': 0.25, 'p': 0.42857142857142855, 'f': 0.31578946903047095},
  'rouge-2': {'r': 0.08333333333333333,
   'p': 0.15625,
   'f': 0.10869564763705122},
  'rouge-l': {'r': 0.25, 'p': 0.42857142857142855, 'f': 0.31578946903047095}}]

In [265]:
scores = sorted(scores, key=lambda x: -x[0]*0.1 + x[1], reverse=True)
scores

[(0.22093023255813954,
  1.4903150272600552,
  0.31578946903047095,
  "club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.\nthe venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.\nhe was not booked by the referee but could face a heavy retrospective ban."),
 (0.0,
  1.384731303417963,
  0.17910447274671432,
  "club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.\nthe venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat."),
 (0.22590361445783133,
  1.405851051312519,
  0.34210525850415513,
  "club tijuana star juan arango conjured memories luis suarez in his team's 4 - 3 defeat by monterrey in the mexican league - but it was not through prodigious scoring.\nthe venez

In [156]:
scores[0]

(0.23825503355704697,
 0.7617588029286911,
 "juan arango bites jesus zavela in a moment of madness in club tijuana's 4 - 3 defeat by monterrey in the mexican league.\nthe venezuelan icon sank his teeth into the shoulder of the opponent as his temper flared in the defeat.\nhe was not booked by the referee but could face a heavy retrospective ban.")

In [185]:
rouge.get_scores('\n'.join(abstract), scores[0][-1])

[{'rouge-1': {'r': 0.32558139534883723, 'p': 0.5, 'f': 0.39436619240626863},
  'rouge-2': {'r': 0.09259259259259259,
   'p': 0.15625,
   'f': 0.11627906509464593},
  'rouge-l': {'r': 0.3023255813953488,
   'p': 0.4642857142857143,
   'f': 0.36619717832176163}}]

In [102]:
from rouge import Rouge
rouge = Rouge()

rouge.get_scores('\n'.join(abstract), '\n'.join(scores[10][-1]))[0]['rouge-l']['f'] * 100

3.5087714299785264

In [66]:
abstract

['juan arango escaped punishment from the referee for biting jesus zavela .',
 'he could face a retrospective punishment for the incident .',
 "arango had earlier scored a free kick in his team 's 4-3 defeat ."]

### abstract redundancy & rouge

In [67]:
compute_txt_redundancy_score([abstract])

tensor([0.2944], dtype=torch.float64)

In [68]:
np.mean(get_cos_similarity(article, abstract))

0.06993467965281458