In [145]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Embedding, RepeatVector, concatenate, TimeDistributed
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import utils
from sklearn.model_selection import train_test_split
import numpy as np
import random


In [2]:
CLEANED_ARTICLES_FILE = '../data/cleaned_articles_ed.txt'
CLEANED_HEADLINES_FILE = '../data/cleaned_headlines_ed.txt'

In [120]:
with open(CLEANED_ARTICLES_FILE,'r', encoding="utf-8") as file:
    data = file.read()
    articles = data.split('\n')     
    
with open(CLEANED_HEADLINES_FILE,'r', encoding="utf-8") as file:
    data = file.read()
    headlines = data.split('\n') 
    
articles = articles[:1000]
headlines = headlines[:1000]

In [121]:
art_tokenizer, encoded_art = utils.create_tokenizer(articles)
head_tokenizer, encoded_head = utils.create_tokenizer(headlines)

In [128]:
MAX_LENGTH = 1000
MAX_HEADLINE_LENGTH = 1000

encoder_vocab_size = len(art_tokenizer.word_index)
decoder_vocab_size = len(head_tokenizer.word_index)

print(encoder_vocab_size)
print(decoder_vocab_size)


src_txt_length = len(encoded_art)
sum_txt_length = len(encoded_head)

print(src_txt_length)
print(sum_txt_length)

padded_encoded_art = pad_sequences(encoded_art,  maxlen=MAX_LENGTH, padding='post')
padded_encoded_head = pad_sequences(encoded_head,  maxlen=MAX_HEADLINE_LENGTH, padding='post')

print((padded_encoded_art[0]))
print((padded_encoded_head[0]))


7669
2307
1000
1000
[1401  485    1  229   45   17  284    3  880  115  421 3469  713   12
 1435   46  449  265    3  787  371    4  102  881   21   12    1   52
   45  335  196  323  182   13  371    4  713   25   34 1436    8  421
 3469 3473    9  197 2320    2 1401    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0  

In [129]:
X_train, X_test, y_train, y_test = train_test_split(np.array(padded_encoded_art), np.array(padded_encoded_head), test_size=.10, random_state=82)


In [130]:

#https://machinelearningmastery.com/encoder-decoder-models-text-summarization-keras/

#I would choose Alternate 1 from this link
#That I belive is how we should create our encoder deccoder model

#NOTE: Example code here
# article input model
# inputs1 = Input(shape=(src_txt_length,))
# article1 = Embedding(encoder_vocab_size, 128)(inputs1)
# article2 = LSTM(128)(article1)
# article3 = RepeatVector(sum_txt_length)(article2)
# # summary input model
# inputs2 = Input(shape=(sum_txt_length,))
# summ1 = Embedding(encoder_vocab_size, 128)(inputs2)


# # decoder model
# decoder1 = concatenate([article3, summ1])
# decoder2 = LSTM(128)(decoder1)
# outputs = Dense(decoder_vocab_size, activation='softmax')(decoder2)

# # tie it together [article, summary] [word]
# model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# model.compile(loss='categorical_crossentropy', optimizer='adam')

# model.summary()


# encoder input model
inputs = Input(shape=(src_txt_length,))
encoder1 = Embedding(encoder_vocab_size, 128)(inputs)
encoder2 = LSTM(128)(encoder1)
encoder3 = RepeatVector(sum_txt_length)(encoder2)

# decoder output model
decoder1 = LSTM(128, return_sequences=True)(encoder3)
outputs = Dense(decoder_vocab_size, activation='softmax')(decoder1)

# tie it together
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_18 (InputLayer)       [(None, 1000)]            0         
                                                                 
 embedding_17 (Embedding)    (None, 1000, 128)         981632    
                                                                 
 lstm_28 (LSTM)              (None, 128)               131584    
                                                                 
 repeat_vector_14 (RepeatVe  (None, 1000, 128)         0         
 ctor)                                                           
                                                                 
 lstm_29 (LSTM)              (None, 1000, 128)         131584    
                                                                 
 dense_14 (Dense)            (None, 1000, 2307)        297603    
                                                          

In [131]:
#https://medium.com/@duncanboynton/seq2seq-news-article-summary-using-an-encoder-decoder-lstm-to-summarize-text-5de56fccfbf6

# 10 points

def data_generator(X: list, y: list, num_sequences_per_batch: int, vocab_size: int) -> (np.array,np.array):
    '''
    Returns data generator to be used by feed_forward
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/
    
    Yields batches of embeddings and labels to go with them.
    Use one hot vectors to encode the labels 
    (see the to_categorical function)
    
    If for_feedforward is True: 
    Returns data generator to be used by feed_forward
    else: Returns data generator for RNN model
    '''    
    one = []
    two = []    
    for idx, i in enumerate(X):
        if idx > 0 and idx % (num_sequences_per_batch) == 0:
            yield np.array(one), to_categorical(two, num_classes=vocab_size)
            one = []
            two = []
        one.append(i)
        two.append(y[idx])
        
        
word_generator = data_generator(X_train, y_train, 100, decoder_vocab_size)



In [132]:
steps_per_epoch_words = len(X_train)//100  # Number of batches per epoch


model.fit(word_generator, epochs=5, steps_per_epoch=steps_per_epoch_words//5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x25a6249d0f0>

In [134]:
result = model.predict(X_test)



In [194]:

final = []
temp = ''
for i in result:
    for j in range(10):
        weights = i[j]
        idx = random.choices(range(decoder_vocab_size), weights=weights)[0]
        temp = temp + head_tokenizer.index_word[idx] + ' '
    final.append(temp)
    temp = ''
    
print(final)

print(' '.join([art_tokenizer.index_word[i] for i in X_test[0] if i != 0]))

['following nestle map rig signed q tumbles forb jet cr ', 'fuel praises resigns 6.19 resist contract nasdaq covering shipbuilder collapse ', 'prolonged files observe invite 146 moratorium gunmaker acknowledges with completed ', 'final selfie per billi sou nawaz votes operations jan income ', 'itc thirsty occurs less chance forge rs5 ongoing domestic files ', 'rs739 shipbuilder 2nd salaried ventures quaid sale nissan washington 742 ', 'mixed feet back acknowledges 20521 curbs russia limited agency spain ', 'sou cheered tig based against review shaky pcin private should ', 'books finalised paradise covering highs september soft 610 exporters 9 ', 'expat operati chinese itc resources items ron add payment slump ', '18.20 modi most fy16 supply per helping futuretech payers row ', 'engines shops remaining guessed wrong 3565 worri chief govts sustain ', 'yemen seeking dubai ups downgrades needs stronger cess currenci gulf ', 'ireland norways basically revises months overcome aircraft edged 