In [1]:
import utils
from rouge_score import rouge_scorer
import numpy as np
from keras.models import load_model
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import os

In [2]:
# setup ROUGE scorer and tokenizer
ROUGE_METRICS = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
scorer = rouge_scorer.RougeScorer(ROUGE_METRICS, use_stemmer=True)

# Load trained encoder-decoder model
encoder_model = load_model('../trained_models/encoder_model.h5', compile=False)
decoder_model = load_model('../trained_models/decoder_model.h5', compile=False)

















In [3]:
post_pre = pd.read_csv('../data/ed_cleaned_data.csv')
post_pre = post_pre.loc[:, ~post_pre.columns.str.contains('^Unnamed')]
post_pre = post_pre.fillna("")

#Train and test split
x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(post_pre["text"]),
    np.array(post_pre["summary"]),
    test_size=0.1,
    random_state=0,
    shuffle=False,
)

max_text_len = 100
#This will cap the max length of the summaries produced to be 10 words long
max_summary_len = 10

#Tokenizer for articles
art_tokenizer, x_tr, x_val = utils.build_tokenizer(x_tr, x_val, max_text_len)
#Tokenizer for headlines
head_tokenizer, y_tr, y_val = utils.build_tokenizer(y_tr, y_val, max_summary_len)

# Size of vocabulary (+1 for padding token)
art_voc = art_tokenizer.num_words + 1
head_voc = head_tokenizer.num_words + 1

reverse_target_word_index = head_tokenizer.index_word
reverse_source_word_index = art_tokenizer.index_word
target_word_index = head_tokenizer.word_index

% of rare words in vocabulary:  45.7757871354066
% of rare words in vocabulary:  48.27797149324035


In [4]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    (e_out, e_h, e_c) = encoder_model.predict(input_seq, verbose=False)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate first word with start word
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        #Predicts value from decoder
        (output_tokens, h, c) = decoder_model.predict([target_seq] + [e_out, e_h, e_c], verbose=False)

        #Gets a random sample from the highest probability indices 
        sampled_token_index = 0
        while sampled_token_index == 0:
            sampled_token_index = random.choice(np.argsort(output_tokens[0, -1, :])[0:5])
        
        #Gets the word associated with sampled token
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop word.
        if sampled_token == 'eostok' or len(decoded_sentence.split()) >= max_summary_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)

    return decoded_sentence

In [5]:
def seq2summary(input_seq):
    """
    Converts sequence of tokenized values to its respective article.
    """
    newString = ''
    for i in input_seq:
        if i != 0 and i != target_word_index['sostok'] and i != target_word_index['eostok']:
            newString = newString + reverse_target_word_index[i] + ' '
    return newString

def seq2text(input_seq):
    """
    Converts sequence of tokenized values to its respective headline.

    """
    newString = ''
    for i in input_seq:
        if i != 0:
            newString = newString + reverse_source_word_index[i] + ' '
    return newString

In [6]:
ah = {} # GET TESTING HEADLINES FOR EACH DATASET
ph = {} # GET ED PREDICTED HEADLINES FOR EACH DATASET

if 'results_ed.csv' in os.listdir('../data/'):
    preds_df = pd.read_csv('../data/results_ed.csv')
    preds = {'predicted headline': preds_df['predicted headline'].tolist(), 'actual headline': preds_df['actual headline'].tolist()}
else:
    preds = {'predicted headline': [], 'actual headline': []}
    
    
i = len(preds['predicted headline'])

#Loops through 1000 articles to predict their headline then load it into csv file
while i <= 1000:
    actual_headline = seq2summary(y_tr[i])[6:-4]
    predicted_headline = decode_sequence(x_tr[i].reshape(1, max_text_len))
    ah[i] = actual_headline
    ph[i] = predicted_headline
    preds['predicted headline'].append(predicted_headline)
    preds['actual headline'].append(actual_headline[6:-4]) # Remove start and end from actual headline for results
    print(f'{i}:')
    print(f'\tactual: {actual_headline}')
    print(f'\tpredic: {predicted_headline}')
    if i % 50 == 0 and i != 0:
        preds_df = pd.DataFrame.from_dict(preds, orient='columns')
        preds_df.to_csv('../data/results_ed.csv')
    i += 1


0:
	actual: start hillary offered nothing to nevertrump conservatives end 
	predic:  krauthammer
1:
	actual: an unprecedented change to the national security council end 
	predic:  whoopi 84 84 preserve ampt york service’ undo turley
2:
	actual: risky flight from south pole arrives in chile end 
	predic:  krauthammer service’ install sostok flashback supporter’s turley rasmussen politico
3:
	actual: start trump tells reporters work just ask israel’ end 
	predic:  flashback ‘for abolish ‘for thrones’ kneel turley bottom’ sostok
4:
	actual: why law enforcement will like justice neil gorsuch end 
	predic:  hamilton’ country’ korea semitic turley turley turley service’ fabric
5:
	actual: start harry belafonte don’t die’ end 
	predic:  krauthammer interaction preserve sostok whoopi sostok krauthammer turley hamilton’
6:
	actual: start breitbart news daily trump addresses congress end 
	predic:  delingpole imminent’ solve turley rehear turley bottom’ fracture kent
7:
	actual: start migrants 