In [None]:
import os
import numpy as np
import pandas as pd

from headline_generation.preprocessing import DataPreProcessor
from headline_generation.model import HeadlineGenerator
from headline_generation.utils import generate_embedding_matrix_from_file 

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.initializers import Constant

## Force CPU execution: expose an empty list of devices for the 'GPU' type
tf.config.set_visible_devices([], 'GPU')

## Reproducibility: seed TensorFlow and NumPy with the same value
from tensorflow.random import set_seed
from numpy.random import seed

SEED = 42

set_seed(SEED)  # TensorFlow global seed
seed(SEED)      # NumPy legacy global seed


## Loading Data

In [None]:
# Collect the `headline` column of every CSV in `dataset_dir` whose file name
# contains 'Article', then print basic word-count statistics.
headlines = list()

dataset_dir = 'dataset/'
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `headlines` (and everything derived from it) identical on every run/machine.
for filename in sorted(os.listdir(dataset_dir)):
    if 'Article' not in filename:
        continue
    filepath = os.path.join(dataset_dir, filename)
    # Missing headlines are read as NaN floats, which str.split cannot handle;
    # drop them before they reach the length computation below.
    headlines.extend(pd.read_csv(filepath).headline.dropna().tolist())

if not headlines:
    # Fail with a readable message instead of letting np.max() raise on an
    # empty sequence inside the report below.
    raise ValueError(
        f"No headlines loaded from {dataset_dir!r}: expected CSV files with "
        "'Article' in their name and a 'headline' column"
    )

# Length of each headline measured in whitespace-separated words
headline_lengths = [len(headline.split()) for headline in headlines]
print(f'''
    Number of Headlines: {len(headlines):,}
    Maximum Headline length: {np.max(headline_lengths)}
    Minimum Headline length: {np.min(headline_lengths)}
    Average Headline length: {np.mean(headline_lengths):.2f}
    STD of Headline length: {np.std(headline_lengths):.2f}
''')


## Preprocessing Data

In [None]:
# Settings handed to DataPreProcessor.preprocess() in the next cell.
padding_type = 'pre'
out_of_vocabulary_token = '<OOV>'

# Cap the sentence length at mean + 2 standard deviations of the per-headline
# word counts, truncated to an integer.
length_mean = np.mean(headline_lengths)
length_std = np.std(headline_lengths)
max_sentence_len = int(length_mean + (2 * length_std))

In [None]:
# Build the preprocessor and run its two stages as one fluent chain; each
# stage's return value is the object the next stage is called on, and the
# final one is kept as `data_preprocessor` for later cells.
data_preprocessor = (
    DataPreProcessor(headlines)
    .clean_data()
    .preprocess(out_of_vocabulary_token, max_sentence_len, padding_type)
)

# Model inputs (X) and targets (y)
X, y = data_preprocessor.get_features_and_labels()

In [None]:
# Sanity check: shape of the features X returned by get_features_and_labels()
X.shape

In [None]:
# Sanity check: shape of the labels y returned by get_features_and_labels()
y.shape

## The Model

### Hyperparameters

In [None]:
# --- Sizes derived from the preprocessed data ---
# NOTE(review): the +2 offset presumably reserves ids beyond the vocabulary
# (e.g. padding / OOV) - confirm against DataPreProcessor.
vocab_size = len(data_preprocessor.vocab) + 2
# One token shorter than the padded sentence length.
# NOTE(review): presumably because the last token of each sequence is the label - confirm.
sequence_len = max_sentence_len - 1

# --- Pre-trained word vectors ---
# The file name indicates 50-dimensional vectors; keep embedding_dim in sync with it.
embedding_dim = 50
embedding_file_path = 'glove.6B/glove.6B.50d.txt'

# --- Regularisation and training ---
dropout_rate = 0.2
epochs = 100
loss_fn = 'categorical_crossentropy'
optimizer = 'adam'
# NOTE(review): learning_rate is not read by any cell visible in this notebook;
# the optimizer is passed by name, so this value presumably has no effect on
# training - TODO wire it in or remove it.
learning_rate = 0.01

### Embedding Matrix

In [None]:
# Build the embedding matrix from the pre-trained vector file, using the
# tokenizer's word -> index mapping. The helper also returns how many words
# were matched (hits) and how many were not (misses).
word_index = data_preprocessor.tokenizer.word_index
embedding_result = generate_embedding_matrix_from_file(
    embedding_file_path, vocab_size, embedding_dim, word_index
)
embedding_matrix, hits, misses = embedding_result

print(f'Found embeddings for {hits} words, could not find embeddings for {misses} words')

### Model

In [None]:
# Layers are instantiated in the same order they appear in the final stack.

# Frozen embedding lookup initialised from the pre-computed embedding_matrix
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=sequence_len,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

# Two stacked LSTMs, each with `sequence_len` units; the first returns the
# full sequence so the second LSTM receives one vector per time step.
recurrent_stack = [
    LSTM(sequence_len, return_sequences=True),
    Dropout(dropout_rate),
    LSTM(sequence_len),
]

# Dense head ending in a softmax over the whole vocabulary
classifier_head = [
    Dense(64, activation='relu'),
    Dropout(dropout_rate),
    Dense(vocab_size, activation='softmax'),
]

layers = [embedding_layer, *recurrent_stack, *classifier_head]

In [None]:
# Instantiate the project's HeadlineGenerator and hand it the layer list,
# the loss name and the optimizer name defined in the cells above.
# NOTE(review): presumably create() assembles and compiles a Keras model from
# these - confirm in headline_generation.model.
headline_generator = HeadlineGenerator()
headline_generator.create(layers, loss_fn, optimizer)

In [None]:
# Train on the features X and labels y; `epochs` is passed as the third
# positional argument.
# NOTE(review): the commented-out load_saved() cell below suggests train()
# also saves the model to disk - confirm in headline_generation.model.
headline_generator.train(X, y, epochs)

In [None]:
# Optional alternative to the training cell: uncomment the two lines below to
# reuse a saved model instead of training again.
# NOTE(review): presumably load_saved() restores a model from the given .h5
# file; the file name is an example from one run - replace it with an
# existing file.
# headline_generator = HeadlineGenerator()
# headline_generator.load_saved('headline_generator-11:25:34.h5')

### Generating Headlines

In [None]:
# Generate a headline seeded with the text '<START> Woman'.
# NOTE(review): the leading 10 and word_sample_size=5 are presumably the
# number of words to generate and the size of the candidate pool sampled at
# each step - confirm in HeadlineGenerator.generate_headline.
generation_options = dict(
    word_sample_size=5,
    initial_sentence='<START> Woman',
)
headline_generator.generate_headline(
    10, sequence_len, data_preprocessor.tokenizer, **generation_options
)