In [69]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Embedding, RepeatVector, concatenate, TimeDistributed
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import utils
from sklearn.model_selection import train_test_split
import numpy as np
import random


In [70]:
# Paths of the article and headline text files that the loading cell reads.
CLEANED_ARTICLES_FILE = "../data/cleaned_articles_ed.txt"
CLEANED_HEADLINES_FILE = "../data/cleaned_headlines_ed.txt"

In [190]:
# Number of article/headline pairs kept for this experiment. Defined once so
# the two corpora cannot accidentally be truncated to different sizes.
NUM_SAMPLES = 1000

# Each file is split on '\n', so every line is treated as one record.
with open(CLEANED_ARTICLES_FILE, 'r', encoding="utf-8") as articles_file:
    articles = articles_file.read().split('\n')[:NUM_SAMPLES]

with open(CLEANED_HEADLINES_FILE, 'r', encoding="utf-8") as headlines_file:
    headlines = headlines_file.read().split('\n')[:NUM_SAMPLES]

# Articles and headlines are paired by position further down (they are split
# into train/test sets together), so fail here with a clear message instead of
# later with a shape error if the two files do not line up.
if len(articles) != len(headlines):
    raise ValueError(
        f"Article/headline count mismatch: {len(articles)} articles vs "
        f"{len(headlines)} headlines"
    )

In [191]:
# Build one tokenizer (plus the encoded corpus it returns) per text source:
# first the articles, then the headlines.
(art_tokenizer, encoded_art), (head_tokenizer, encoded_head) = map(
    utils.create_tokenizer, (articles, headlines)
)

In [192]:
# Length settings. MAX_HEADLINE_LENGTH caps the number of generated words in
# decode_sequence; MAX_LENGTH is not referenced by the code in this cell.
MAX_LENGTH = 55
MAX_HEADLINE_LENGTH = 15

# Vocabulary sizes: number of known words plus one extra slot
# (presumably for the padding id 0 - confirm against utils.create_tokenizer).
encoder_vocab_size = 1 + len(art_tokenizer.word_index)
decoder_vocab_size = 1 + len(head_tokenizer.word_index)
print(encoder_vocab_size, decoder_vocab_size, sep='\n')

# NOTE(review): len(encoded_*) looks like the number of encoded sequences
# (it prints 1000, the sample cap), not the length of a single sequence, yet it
# is used below as the padding width. Every sequence is therefore padded to the
# dataset size. Later cells rely on these two values, so they are kept as-is.
src_txt_length = len(encoded_art)
sum_txt_length = len(encoded_head)
print(src_txt_length, sum_txt_length, sep='\n')

# Right-pad ('post') every encoded sequence with zeros up to the widths above.
padded_encoded_art = pad_sequences(
    encoded_art,
    maxlen=src_txt_length,
    padding='post',
)
padded_encoded_head = pad_sequences(
    encoded_head,
    maxlen=sum_txt_length,
    padding='post',
)

# Show the first padded article and headline as a sanity check.
print(padded_encoded_art[0], padded_encoded_head[0], sep='\n')


7669
2307
1000
1000
[ 484    1  228   44   16  283    3  879  114  420 3468  712   11 1434
   45  448  264    3  786  370    4  101  880   20   11    1   51   44
  334  195  322  181   12  370    4  712   24   33 1435    7  420 3468
 3472    8  196 2319    2    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0  

In [100]:
# Hold out 10% of the (article, headline) pairs for testing; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(padded_encoded_art),
    np.array(padded_encoded_head),
    test_size=.10,
    random_state=82,
)

In [148]:

#https://machinelearningmastery.com/encoder-decoder-models-text-summarization-keras/

# I would choose Alternate 1 from this link.
# I believe that is how we should create our encoder-decoder model.

#NOTE: Example code here
# article input model
# inputs1 = Input(shape=(src_txt_length,))
# article1 = Embedding(encoder_vocab_size, 128)(inputs1)
# article2 = LSTM(128)(article1)
# article3 = RepeatVector(sum_txt_length)(article2)
# # summary input model
# inputs2 = Input(shape=(sum_txt_length,))
# summ1 = Embedding(encoder_vocab_size, 128)(inputs2)


# # decoder model
# decoder1 = concatenate([article3, summ1])
# decoder2 = LSTM(128)(decoder1)
# outputs = Dense(decoder_vocab_size, activation='softmax')(decoder2)

# # tie it together [article, summary] [word]
# model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# model.compile(loss='categorical_crossentropy', optimizer='adam')

# model.summary()


# Training-time encoder-decoder (seq2seq) model.
# The names defined here (encoder_inputs, encoder_output1, state_h, state_c,
# decoder_inputs, decoder_emb_layer, decoder_lstm1, decoder_outputs_layer) are
# reused by the inference cell, so they must keep these exact names.

# encoder input model
# Fixed-length sequence of article token ids -> 128-dim embeddings -> LSTM.
# NOTE(review): neither Embedding passes mask_zero, so padded (id 0) timesteps
# appear to be processed and scored like real tokens - confirm this is intended.
encoder_inputs = Input(shape=(src_txt_length,))
encoder_emb_layer = Embedding(encoder_vocab_size, 128, trainable=True)
encoder_emb = encoder_emb_layer(encoder_inputs)
encoder_lstm1 = LSTM(128, return_sequences=True, return_state=True, dropout=.4, recurrent_dropout=.4)

# return_state=True yields the final hidden/cell states alongside the
# per-timestep outputs; only the two states feed the decoder below.
encoder_output1, state_h, state_c = encoder_lstm1(encoder_emb)


# decoder output model
# Variable-length sequence of headline token ids (shape=(None,)), so the same
# layers can later be driven one token at a time.
decoder_inputs = Input(shape=(None, ))
decoder_emb_layer = Embedding(decoder_vocab_size, 128, trainable=True)
decoder_emb = decoder_emb_layer(decoder_inputs)
decoder_lstm1 = LSTM(128, return_sequences=True, return_state=True, dropout=.4, recurrent_dropout=.4)

# The decoder LSTM starts from the encoder's final states; its own returned
# states are not needed during training and are discarded.
decoder_outputs, _, _ = decoder_lstm1(decoder_emb, initial_state=[state_h, state_c])

# dense layer
# Softmax over the headline vocabulary, applied to every decoder timestep.
decoder_outputs_layer = (Dense(decoder_vocab_size, activation='softmax'))
decoder_outputs = decoder_outputs_layer(decoder_outputs)

# tie it together
# Inputs: [article ids, headline ids]; output: per-timestep word distribution.
# The sparse loss is paired with integer word-id targets in the fit cell.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# NOTE(review): if this callback is re-enabled, the monitor key probably needs
# to be 'val_loss' (underscore) and fit() must receive validation data - confirm.
# es = EarlyStopping(monitor='val-loss', mode='min', verbose=1, patience=2)
model.summary()

Model: "model_32"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_90 (InputLayer)       [(None, 1000)]               0         []                            
                                                                                                  
 input_91 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 embedding_38 (Embedding)    (None, 1000, 128)            981632    ['input_90[0][0]']            
                                                                                                  
 embedding_39 (Embedding)    (None, None, 128)            295296    ['input_91[0][0]']            
                                                                                           

In [76]:
# #https://medium.com/@duncanboynton/seq2seq-news-article-summary-using-an-encoder-decoder-lstm-to-summarize-text-5de56fccfbf6

# # 10 points

# def data_generator(X: list, y: list, num_sequences_per_batch: int, vocab_size: int) -> (np.array,np.array):
#     '''
#     Returns data generator to be used by feed_forward
#     https://wiki.python.org/moin/Generators
#     https://realpython.com/introduction-to-python-generators/
    
#     Yields batches of embeddings and labels to go with them.
#     Use one hot vectors to encode the labels 
#     (see the to_categorical function)
    
#     If for_feedforward is True: 
#     Returns data generator to be used by feed_forward
#     else: Returns data generator for RNN model
#     '''    
#     one = []
#     two = []    
#     for idx, i in enumerate(X):
#         if idx > 0 and idx % (num_sequences_per_batch) == 0:
#             yield np.array(one), to_categorical(two, num_classes=vocab_size)
#             one = []
#             two = []
#         one.append(i)
#         two.append(y[idx])
        
        
# word_generator = data_generator(X_train, y_train, 100, decoder_vocab_size)



In [102]:
# steps_per_epoch_words = len(X_train)//128  # Number of batches per epoch

# Teacher forcing: the decoder is fed each headline without its last token and
# is trained to predict the same headline shifted one step left. The targets
# get a trailing axis of size 1, i.e. shape (samples, timesteps - 1, 1).
model.fit(
    x=[X_train, y_train[:, :-1]],
    y=y_train[:, 1:, np.newaxis],
    batch_size=100,
    epochs=5,
)
    # validation_data=([X_test, y_test[:, :-1]], y_test.reshape(y_test.shape[0], y_test.shape[1], 1)[:, 1:]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1757f942e30>

In [149]:
# Inference Models
#
# Two models that reuse the trained layers so decoding can run step by step:
#   * encoder_model: article ids -> (encoder outputs, final h, final c)
#   * decoder_model: (previous token, encoder outputs, h, c)
#                    -> (next-word distribution, new h, new c)

# Encode the input sequence to get the feature vector
encoder_model = Model(
    inputs=encoder_inputs,
    outputs=[encoder_output1, state_h, state_c],
)

# Decoder setup

# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(128, ))
decoder_state_input_c = Input(shape=(128, ))

# Placeholder for the encoder's per-timestep outputs. Its time axis must match
# the ENCODER sequence length (src_txt_length); it was previously declared with
# the headline length, which only worked while both lengths were equal.
# This input is not connected to any decoder output here; it is kept so the
# four-input call made by decode_sequence keeps working.
decoder_hidden_state_input = Input(shape=(src_txt_length, 128))

# Get the embeddings of the decoder sequence
dec_emb2 = decoder_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm1(
    dec_emb2,
    initial_state=[decoder_state_input_h, decoder_state_input_c],
)

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_outputs_layer(decoder_outputs2)

# Final decoder model
decoder_model = Model(
    [decoder_inputs, decoder_hidden_state_input,
     decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, state_h2, state_c2],
)

In [205]:
def decode_sequence(input_seq):
    """Greedily generate a headline for one encoded article.

    Args:
        input_seq: padded article token ids for a single sample; callers pass
            an array reshaped to (1, src_txt_length).

    Returns:
        The generated words as one string, each word preceded by a single
        space (so a non-empty result starts with a space). Generation stops at
        '<eos>', at an id that has no vocabulary entry (such as the padding
        id 0), or once MAX_HEADLINE_LENGTH - 1 words have been produced.
    """
    # Encode the input as state vectors.
    (e_out, e_h, e_c) = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first word of target sequence with the start word.
    # NOTE(review): assumes id 1 is the start token in head_tokenizer - confirm.
    target_seq[0, 0] = 1

    decoded_sentence = ''

    while True:
        (output_tokens, h, c) = decoder_model.predict(
            [target_seq] + [e_out, e_h, e_c], verbose=0)

        # Greedy choice: most probable word at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Ids without a vocabulary entry (notably the padding id 0) used to
        # raise KeyError here; treat them as the end of the headline instead.
        sampled_token = head_tokenizer.index_word.get(sampled_token_index)
        if sampled_token is None or sampled_token == '<eos>':
            break

        decoded_sentence += ' ' + sampled_token

        # Exit condition: hit the maximum headline length.
        if len(decoded_sentence.split()) >= MAX_HEADLINE_LENGTH - 1:
            break

        # Feed the chosen word back in as the next decoder input (length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the decoder states over to the next step.
        (e_h, e_c) = (h, c)

    return decoded_sentence

In [206]:
# To convert sequence to summary
def seq2summary(input_seq):
    """Turn a sequence of headline token ids back into text.

    Ids equal to 0 and the id of '<eos>' are skipped. Every remaining word is
    followed by a single space, so a non-empty result ends with a space.
    """
    kept_words = [
        head_tokenizer.index_word[token_id]
        for token_id in input_seq
        if token_id != 0 and token_id != head_tokenizer.word_index['<eos>']
    ]
    return ''.join(word + ' ' for word in kept_words)


# To convert sequence to text
def seq2text(input_seq):
    """Turn a sequence of article token ids back into text.

    Ids equal to 0 are skipped. Every remaining word is followed by a single
    space, so a non-empty result ends with a space.
    """
    return ''.join(
        art_tokenizer.index_word[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )



In [207]:
# Show the first 19 test samples: source article, reference headline and the
# headline produced by the model.
for sample_idx in range(19):
    article_ids = X_test[sample_idx]
    print('Review:', seq2text(article_ids))
    print('Original summary:', seq2summary(y_test[sample_idx]))
    # decode_sequence expects a batch of one: shape (1, src_txt_length).
    article_batch = article_ids.reshape(1, src_txt_length)
    print('Predicted summary:', decode_sequence(article_batch))
    print('\n')

Review: islamabad directorate general of immigration and passports has contributed rs 86 billion to national exchequer on account of passport and visa fee during last five years the introduction of new measures as per modern requirements by ministry of interior have resulted in enhanced earning in passport and visa fee 
Original summary: passport visa fees fetch rs 86571 bln five year 
Predicted summary:  relief long long long long long long tranche 66 66 succeeds cabinet another q


Review: sydney china's factories flatlined in june as exports shrank and jobs were cut a worrying trend evident across asia that argues for yet more policy stimulus as doubts gather over the potency of measures taken so far the hard times signalled by a range of surveys was not what the 
Original summary: asian factories struggle brexit throws up new thr 
Predicted summary:  relief long long long long long long tranche 66 66 succeeds cabinet another q


Review: karachi pakistan shares closed lower on monda

In [153]:
# Decode the first test article on its own, passed as a batch of one.
first_test_article = X_test[0].reshape(1, src_txt_length)
print(decode_sequence(first_test_article))

 relief long long long long long long tranche 66 66 succeeds cabinet another q


In [78]:
# Run the training model over the whole test set. The reference headlines
# (y_test) are supplied as the decoder input here.
result = model.predict(x=[X_test, y_test])



In [162]:
# EOS = '<eos>'

# headlines = []

# for i in result:
#     sentence = []
#     for j in range(200):
#         highest = (i[j][1:].argsort()[-5:][::-1])
#         choice = random.choices(highest)[0]
#         sentence.append(choice)
#         if choice == 1:
#             break
        
#     headlines.append([head_tokenizer.index_word[i] for i in sentence])
    
# # for j in headlines:
# #     print(j)        
# for i in X_test:
#     print([art_tokenizer.index_word[k] for k in i if k != 0])
        

['islamabad', 'directorate', 'general', 'of', 'immigration', 'and', 'passports', 'has', 'contributed', 'rs', '86', 'billion', 'to', 'national', 'exchequer', 'on', 'account', 'of', 'passport', 'and', 'visa', 'fee', 'during', 'last', 'five', 'years', 'the', 'introduction', 'of', 'new', 'measures', 'as', 'per', 'modern', 'requirements', 'by', 'ministry', 'of', 'interior', 'have', 'resulted', 'in', 'enhanced', 'earning', 'in', 'passport', 'and', 'visa', 'fee']
['sydney', "china's", 'factories', 'flatlined', 'in', 'june', 'as', 'exports', 'shrank', 'and', 'jobs', 'were', 'cut', 'a', 'worrying', 'trend', 'evident', 'across', 'asia', 'that', 'argues', 'for', 'yet', 'more', 'policy', 'stimulus', 'as', 'doubts', 'gather', 'over', 'the', 'potency', 'of', 'measures', 'taken', 'so', 'far', 'the', 'hard', 'times', 'signalled', 'by', 'a', 'range', 'of', 'surveys', 'was', 'not', 'what', 'the']
['karachi', 'pakistan', 'shares', 'closed', 'lower', 'on', 'monday', 'as', 'investors', 'took', 'a', 'cautio