In [2]:
import utils
from rouge_score import rouge_scorer, scoring
import matplotlib.pyplot as plt
import numpy as np
from keras.models import Model, load_model
import pandas as pd
from sklearn.model_selection import train_test_split
import random

In [3]:
# setup ROUGE scorer and tokenizer
ROUGE_METRICS = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
scorer = rouge_scorer.RougeScorer(ROUGE_METRICS, use_stemmer=True)

# Load trained encoder-decoder model
encoder_model = load_model('../trained_models/encoder_model.h5', compile=False)
decoder_model = load_model('../trained_models/decoder_model.h5', compile=False)

















In [5]:
post_pre = pd.read_csv('../data/ed_cleaned_data.csv')
post_pre = post_pre.loc[:, ~post_pre.columns.str.contains('^Unnamed')]
post_pre = post_pre.fillna("")

x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(post_pre["text"]),
    np.array(post_pre["summary"]),
    test_size=0.1,
    random_state=0,
    shuffle=False,
)

max_text_len = 100
max_summary_len = 20

art_tokenizer, x_tr, x_val = utils.build_tokenizer(x_tr, x_val, max_text_len)
head_tokenizer, y_tr, y_val = utils.build_tokenizer(y_tr, y_val, max_summary_len)

# Size of vocabulary (+1 for padding token)
art_voc = art_tokenizer.num_words + 1
head_voc = head_tokenizer.num_words + 1

reverse_target_word_index = head_tokenizer.index_word
reverse_source_word_index = art_tokenizer.index_word
target_word_index = head_tokenizer.word_index

% of rare words in vocabulary:  45.7757871354066
% of rare words in vocabulary:  48.27797149324035


In [6]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    (e_out, e_h, e_c) = encoder_model.predict(input_seq, verbose=False)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq] + [e_out, e_h, e_c], verbose=False)

        # Sample a token
        sampled_token_index = random.choice(np.argsort(output_tokens[0, -1, :])[3:5])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop word.
        if sampled_token == 'eostok' or len(decoded_sentence.split()) >= max_summary_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)

    return decoded_sentence

In [7]:
def seq2summary(input_seq):
    # To convert sequence to summary
    newString = ''
    for i in input_seq:
        if i != 0 and i != target_word_index['sostok'] and i != target_word_index['eostok']:
            newString = newString + reverse_target_word_index[i] + ' '
    return newString

def seq2text(input_seq):
    # To convert sequence to text
    newString = ''
    for i in input_seq:
        if i != 0:
            newString = newString + reverse_source_word_index[i] + ' '
    return newString

In [9]:
ah = {} # GET TESTING HEADLINES FOR EACH DATASET
ph = {} # GET ED PREDICTED HEADLINES FOR EACH DATASET

for i in range(0, 1000):
    actual_headline = seq2summary(y_tr[i])
    predicted_headline = decode_sequence(x_tr[i].reshape(1, max_text_len))
    ah[i] = actual_headline
    ph[i] = predicted_headline
    print(f'{i}:')
    print(f'\tactual: {actual_headline}')
    print(f'\tpredic: {predicted_headline}')


0:
	actual: start hillary offered nothing to nevertrump conservatives end 
	predic:  drought elie hollande e pay challenging assailant digs oppose ’gun confront carry corbyn profiling ouster recommend marco target’s beijing’s
1:
	actual: start trump just made an unprecedented change to the national security council end 
	predic:  legalize ford reduce counting tortured syrian prior river gorsuch matter’ widow spox sabotage before before philippine frank christmas tiger
2:
	actual: start risky flight from south pole arrives in chile end 
	predic:  legalize laden facebook fallen ’we fact fracking wake incidents lochte ballot y premier bieber manchin mcconnell veterans failures duke
3:
	actual: start trump tells reporters work just ask israel’ end 
	predic:  legalize adviser face a politics’ nationalists ’trump partners prominent viola create spent organization pipeline runner courage building manuel ticket
4:
	actual: start exclusive—regnery why law enforcement will like justice neil gors

KeyboardInterrupt: 