In [1]:
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

2023-03-15 16:16:37.013122: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the dataset
data = pd.read_csv('train.csv', nrows=1000)

# Preprocess the articles and highlights
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-z0-9.,!?%$ ]', '', text)
    return text

data['article'] = data['article'].apply(preprocess_text)
data['highlights'] = data['highlights'].apply(preprocess_text)
data['highlights'] = data['highlights'].apply(lambda x: '<sos> ' + x + ' <eos>')


In [3]:
# Tokenize the articles and highlights
article_tokenizer = Tokenizer()
article_tokenizer.fit_on_texts(data['article'])
article_sequences = article_tokenizer.texts_to_sequences(data['article'])

highlight_tokenizer = Tokenizer()
highlight_tokenizer.fit_on_texts(data['highlights'])
highlight_sequences = highlight_tokenizer.texts_to_sequences(data['highlights'])

max_article_len = max([len(seq) for seq in article_sequences])
max_highlight_len = max([len(seq) for seq in highlight_sequences])

article_sequences_padded = pad_sequences(article_sequences, maxlen=max_article_len)
highlight_sequences_padded = pad_sequences(highlight_sequences, maxlen=max_highlight_len)

# Create input and target sequences
X = article_sequences_padded
y = highlight_sequences_padded[:, :-1]
y_target = highlight_sequences_padded[:, 1:]

In [4]:
embedding_dim = 128
hidden_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_article_len,))
encoder_embedding = Embedding(len(article_tokenizer.word_index) + 1, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(len(highlight_tokenizer.word_index) + 1, embedding_dim)
decoder_embedded = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(hidden_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded, initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(len(highlight_tokenizer.word_index) + 1, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

2023-03-15 16:16:39.782800: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
batch_size = 64
epochs = 5

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit([X, y], np.expand_dims(y_target, -1),
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2,
                    callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [6]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(hidden_dim,))
decoder_state_input_c = Input(shape=(hidden_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedded_inference = decoder_embedding(decoder_inputs)
decoder_outputs_inference, state_h_inference, state_c_inference = decoder_lstm(decoder_embedded_inference, initial_state=decoder_states_inputs)

In [8]:
# Create the inference decoder model
decoder_states_inference = [state_h_inference, state_c_inference]
decoder_outputs_inference = decoder_dense(decoder_outputs_inference)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs_inference] + decoder_states_inference)

def generate_summary(article_text):
    article_input = preprocess_text(article_text)
    article_sequence = article_tokenizer.texts_to_sequences([article_input])
    article_sequence_padded = pad_sequences(article_sequence, maxlen=max_article_len)

    encoder_states_value = encoder_model.predict(article_sequence_padded)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = highlight_tokenizer.word_index['<sos>']
    summary = []

    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + encoder_states_value)
        token_index = np.argmax(output_tokens[0, -1, :])
        token = highlight_tokenizer.index_word[token_index]

        if token == '<eos>' or len(summary) >= max_highlight_len:
            break

        summary.append(token)
        target_seq[0, 0] = token_index
        encoder_states_value = [h, c]

    return ' '.join(summary)

# Test the model on a sample article
sample_article = str(data.sample(1)['article'])
print(sample_article)
generated_summary = generate_summary(sample_article)
print("Generated summary:", generated_summary)


552    cnn  spains controversial, and highly restrict...
Name: article, dtype: object


KeyError: '<sos>'