In [1]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Embedding, RepeatVector, concatenate, TimeDistributed
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import utils
from sklearn.model_selection import train_test_split
import numpy as np
import random


In [2]:
CLEANED_ARTICLES_FILE = '../data/cleaned_articles_ed.txt'
CLEANED_HEADLINES_FILE = '../data/cleaned_headlines_ed.txt'

In [3]:
with open(CLEANED_ARTICLES_FILE,'r', encoding="utf-8") as file:
    data = file.read()
    articles = data.split('\n')     
    
with open(CLEANED_HEADLINES_FILE,'r', encoding="utf-8") as file:
    data = file.read()
    headlines = data.split('\n') 
    
articles = articles[:200]
headlines = headlines[:200]

In [4]:
art_tokenizer, encoded_art = utils.create_tokenizer(articles)
head_tokenizer, encoded_head = utils.create_tokenizer(headlines)

In [5]:
MAX_LENGTH = 55
MAX_HEADLINE_LENGTH = 15

encoder_vocab_size = len(art_tokenizer.word_index) + 1
decoder_vocab_size = len(head_tokenizer.word_index) + 1

print(encoder_vocab_size)
print(decoder_vocab_size)


src_txt_length = len(encoded_art)
sum_txt_length = len(encoded_head)

print(src_txt_length)
print(sum_txt_length)

padded_encoded_art = pad_sequences(encoded_art,  maxlen=src_txt_length, padding='post')
padded_encoded_head = pad_sequences(encoded_head,  maxlen=sum_txt_length, padding='post')

print((padded_encoded_art[0]))
print((padded_encoded_head[0]))


2564
724
200
200
[ 216    1  144   73   16  286    2  446  158  607  995  287   11  996
   36  186  159    2  608  288    4   53  447   18   11    1   58   73
  124  109  385   89   19  288    4  287   22   39  609    7  607  995
 1000    9  160 1001    3    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0

In [6]:
X_train, X_test, y_train, y_test = train_test_split(np.array(padded_encoded_art), np.array(padded_encoded_head), test_size=.10, random_state=82)

In [7]:

#https://machinelearningmastery.com/encoder-decoder-models-text-summarization-keras/

#I would choose Alternate 1 from this link
#That I belive is how we should create our encoder deccoder model

#NOTE: Example code here
# article input model
# inputs1 = Input(shape=(src_txt_length,))
# article1 = Embedding(encoder_vocab_size, 128)(inputs1)
# article2 = LSTM(128)(article1)
# article3 = RepeatVector(sum_txt_length)(article2)
# # summary input model
# inputs2 = Input(shape=(sum_txt_length,))
# summ1 = Embedding(encoder_vocab_size, 128)(inputs2)


# # decoder model
# decoder1 = concatenate([article3, summ1])
# decoder2 = LSTM(128)(decoder1)
# outputs = Dense(decoder_vocab_size, activation='softmax')(decoder2)

# # tie it together [article, summary] [word]
# model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# model.compile(loss='categorical_crossentropy', optimizer='adam')

# model.summary()


# encoder input model
encoder_inputs = Input(shape=(src_txt_length,))
encoder_emb = Embedding(encoder_vocab_size, 128, trainable=True)(encoder_inputs)
encoder_lstm1 = LSTM(128, return_sequences=True, return_state=True, dropout=.4, recurrent_dropout=.4)

encoder_output1, state_h, state_c = encoder_lstm1(encoder_emb)




# decoder output model
decoder_inputs = Input(shape=(None, ))
decoder_emb_layer = Embedding(decoder_vocab_size, 128)
decoder_emb = decoder_emb_layer(decoder_inputs)
decoder_lstm1 = LSTM(128, return_sequences=True, dropout=.4, recurrent_dropout=.4)

decoder_outputs = decoder_lstm1(decoder_emb, initial_state=[state_h, state_c])

# dense layer
decoder_outputs = (Dense(decoder_vocab_size, activation='softmax'))(decoder_outputs)

# tie it together
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# es = EarlyStopping(monitor='val-loss', mode='min', verbose=1, patience=2)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 200)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 200, 128)             328192    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 128)            92672     ['input_2[0][0]']             
                                                                                              

In [8]:
# #https://medium.com/@duncanboynton/seq2seq-news-article-summary-using-an-encoder-decoder-lstm-to-summarize-text-5de56fccfbf6

# # 10 points

# def data_generator(X: list, y: list, num_sequences_per_batch: int, vocab_size: int) -> (np.array,np.array):
#     '''
#     Returns data generator to be used by feed_forward
#     https://wiki.python.org/moin/Generators
#     https://realpython.com/introduction-to-python-generators/
    
#     Yields batches of embeddings and labels to go with them.
#     Use one hot vectors to encode the labels 
#     (see the to_categorical function)
    
#     If for_feedforward is True: 
#     Returns data generator to be used by feed_forward
#     else: Returns data generator for RNN model
#     '''    
#     one = []
#     two = []    
#     for idx, i in enumerate(X):
#         if idx > 0 and idx % (num_sequences_per_batch) == 0:
#             yield np.array(one), to_categorical(two, num_classes=vocab_size)
#             one = []
#             two = []
#         one.append(i)
#         two.append(y[idx])
        
        
# word_generator = data_generator(X_train, y_train, 100, decoder_vocab_size)



In [9]:
# steps_per_epoch_words = len(X_train)//128  # Number of batches per epoch

model.fit(
    [X_train, y_train[:,:-1]],
    y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:,1:],
    epochs=4,
    batch_size=100)
    # validation_data=([X_test, y_test[:, :-1]], y_test.reshape(y_test.shape[0], y_test.shape[1], 1)[:, 1:]))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1757a059210>