In [None]:
import os
import re
import string

import numpy as np
import pandas as pd

from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow import one_hot
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import Constant


## To replicate the results
from tensorflow.random import set_seed
from numpy.random import seed

# Seed TensorFlow's (tensorflow.random.set_seed) and NumPy's (numpy.random.seed)
# global random generators with the same fixed value.
set_seed(42)
seed(42)


## Defining Utility Functions

In [None]:
def generate_n_grams(sentence):
    """Return every word-prefix of ``sentence`` that is at least two words long.

    The sentence is split on whitespace; the result lists the first two words,
    then the first three, and so on up to the whole sentence, each re-joined
    with single spaces. A sentence with fewer than two words yields ``[]``.
    """
    words = sentence.split()
    return [' '.join(words[:end]) for end in range(2, len(words) + 1)]


def generate_headline(start, max_sentence_length, sequence_len, tokenizer, model, word_sample_size):
    """Generate a headline by repeatedly sampling the next word from the model.

    Args:
        start: Seed text made of whitespace-separated words,
            e.g. ``'<START> new york'``.
        max_sentence_length: Target upper bound on the number of words in the
            result, counting the seed words and the closing ``'<END>'``.
        sequence_len: Length the token sequence is padded/truncated to before
            it is handed to ``model.predict``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``sequences_to_texts``.
        model: Model whose ``predict`` returns one score per token index for
            each input row.
        word_sample_size: How many of the highest-scoring tokens to choose
            from (uniformly at random) at each step.

    Returns:
        The generated words joined by single spaces, always ending in
        ``'<END>'``.
    """
    generated_sentence = start.split()

    # Budget = limit - words already present - one slot for the closing '<END>'.
    # The original used a fixed `max_sentence_length - 2`, which is only right
    # for a one-word seed and overshoots the limit for longer seeds.
    words_to_generate = max_sentence_length - len(generated_sentence) - 1

    for _ in range(words_to_generate):
        # A list of words is passed as one already-split "text".
        token_ids = tokenizer.texts_to_sequences([generated_sentence])
        padded_token_ids = pad_sequences(token_ids, maxlen=sequence_len)

        # verbose=0 keeps predict() from printing a progress bar on every step.
        scores = np.array(model.predict(padded_token_ids, verbose=0)[0], dtype=float)

        # Index 0 is the padding value used by pad_sequences and has no word;
        # exclude it so an empty string is never appended to the headline.
        scores[0] = -np.inf

        # Keep the sample size within [1, number of real tokens].
        sample_size = max(1, min(word_sample_size, scores.shape[0] - 1))
        top_token_ids = np.argpartition(scores, -sample_size)[-sample_size:]
        pred_token = np.random.choice(top_token_ids, size=1)

        pred_text = tokenizer.sequences_to_texts([pred_token])[0]
        generated_sentence.append(pred_text)
        if pred_text == '<END>':
            return ' '.join(generated_sentence)

    # The model never emitted '<END>' within the budget: close the headline.
    generated_sentence.append('<END>')
    return ' '.join(generated_sentence)

## Loading Data

In [None]:
dataset_dir = 'dataset/'

# os.listdir returns entries in arbitrary order; sorting keeps the headline
# order (and everything derived from it) the same on every run and machine.
article_files = sorted(
    filename for filename in os.listdir(dataset_dir) if 'Article' in filename
)

# Read each matching CSV once and concatenate at the end, instead of
# re-copying the accumulated array with np.append on every iteration.
headline_chunks = [
    pd.read_csv(os.path.join(dataset_dir, filename)).headline.values
    for filename in article_files
]
headlines = np.concatenate(headline_chunks) if headline_chunks else np.array([])

f'Extracted a total of {headlines.shape[0]} headlines from the dataset'

## Cleaning Data

In [None]:
# Normalise the raw headlines in a single chain:
#   1. wrap them in a Series,
#   2. lowercase every headline,
#   3. drop every character that is neither a word character nor whitespace.
headlines = (
    pd.Series(headlines)
    .apply(str.lower)
    .apply(lambda text: re.sub(r'[^\w\s]', '', text))
)


## Preprocessing Data

#### Adding &lt;START&gt; and &lt;END&gt; tokens

In [None]:
# Wrap every headline in the sentence-boundary markers.
headlines = headlines.map(lambda text: f'<START> {text} <END>')

#### Generating n grams

In [None]:
vocab = set()
ngram_list = []

for headline in tqdm(headlines):
    # Every word of the headline belongs to the vocabulary.
    vocab.update(headline.split())
    # Collect in a Python list: appending is O(1), whereas np.append copies the
    # whole accumulated array on every iteration.
    ngram_list.extend(generate_n_grams(headline))

# Convert to an array once, after all n-grams have been collected.
headline_ngrams = np.array(ngram_list)

f'There are {len(vocab)} words in the dataset'

### Tokenizing the data

In [None]:
# Word-level tokenizer: no character filtering, case left untouched, and
# words it was not fitted on are mapped to the '<OOV>' token.
tokenizer = Tokenizer(
    oov_token='<OOV>',
    filters=[],
    lower=False,
)

In [None]:
# `vocab` is a set, and the iteration order of a set of strings can change
# from one Python process to the next. Every word occurs exactly once here, so
# the tokenizer assigns indices in the order it sees the words. Sorting makes
# the word -> index mapping reproducible, which is required for the saved
# model to stay consistent with a tokenizer rebuilt in a fresh kernel.
tokenizer.fit_on_texts(sorted(vocab))

In [None]:
# Preview the first few word -> index entries instead of dumping the whole vocabulary.
dict(list(tokenizer.word_index.items())[:10])

In [None]:
# Convert every n-gram string into its sequence of integer token ids.
tokenized_headline_ngrams = tokenizer.texts_to_sequences(headline_ngrams)

In [None]:
# Preview the first few token sequences instead of printing every n-gram.
tokenized_headline_ngrams[:5]

### Checking length of headlines in dataset and Padding

In [None]:
# Number of whitespace-separated words in each headline.
headline_lengths = headlines.apply(lambda text: len(text.split()))

print(f'''
    Maximum Headline length: {headline_lengths.max()}
    Minimum Headline length: {headline_lengths.min()}
    Average Headline length: {headline_lengths.mean():.2f}
    STD of Headline length: {headline_lengths.std():.2f}
''')

#### Since the longest headline is not too long (e.g. 150 or 200 words), we can use the length of the longest headline for padding.
#### If it were around 150 or 200 words, we would truncate the longer sentences and use a smaller padding length, e.g. mean_length + (2 * std of length).

In [None]:
# Pad every n-gram at the front (padding='pre') up to the longest headline's length.
padded_tokenized_headline_ngrams = pad_sequences(
    tokenized_headline_ngrams,
    maxlen=headline_lengths.max(),
    padding='pre',
)

In [None]:
# Preview the first five padded sequences.
padded_tokenized_headline_ngrams[:5]

#### Separating Features and Labels

In [None]:
# Every column except the last is the model input; the last column holds the
# token to predict, one-hot encoded over len(vocab) + 2 classes.
X = padded_tokenized_headline_ngrams[:, :-1]
y = one_hot(padded_tokenized_headline_ngrams[:, -1], depth=len(vocab) + 2)

In [None]:
# Preview the first five feature rows.
X[:5]

In [None]:
# Shape of a single one-hot encoded label.
y[0].shape

## Defining Hyper Parameters

In [None]:
# --- Sizes derived from the data ---
# Two more than the number of distinct words; the same value is used as the one-hot depth.
vocab_size = len(vocab) + 2
# Input width: one less than the longest headline, since the last token is the label.
sequence_len = headline_lengths.max() - 1

# --- Tunable settings ---
embedding_dim = 50
dropout_rate = 0.2
learning_rate = 0.01

## Using Pre-Trained Embeddings

In [None]:
# Map each word in the GloVe file to its embedding vector.
embeddings_index = {}

# The encoding is given explicitly so the file is decoded the same way on every
# platform; without it open() falls back to the locale's default encoding.
with open('glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": split off the word, then parse the numbers.
        word, embedding = line.split(maxsplit=1)
        embedding = np.fromstring(embedding, "f", sep=" ")
        embeddings_index[word] = embedding

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Row i holds the pre-trained vector of the word whose token index is i.
# Rows for words without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

hits, misses = 0, 0
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[index] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

## Creating Model

In [None]:
# Frozen embedding initialised from embedding_matrix -> two stacked LSTMs
# -> dense hidden layer -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=sequence_len,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(units=sequence_len, return_sequences=True))
model.add(Dropout(rate=dropout_rate))
model.add(LSTM(units=sequence_len))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(rate=dropout_rate))
model.add(Dense(units=vocab_size, activation='softmax'))

In [None]:
# Pass the configured `learning_rate` to Adam. The string shortcut 'adam'
# builds the optimizer with its default rate and ignores the hyperparameter
# defined in this notebook.
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
)

In [None]:
# Print the model's summary.
model.summary()

In [None]:
# Train on all of X / y for 100 epochs (no validation data is passed).
model.fit(X, y, epochs=100)

In [None]:
# Write the trained model to 'headline_generating_model.h5' in the working directory.
model.save('headline_generating_model.h5')

## Evaluating the Model

In [None]:
# Rebind `model` to the copy loaded from 'headline_generating_model.h5'.
model = load_model('headline_generating_model.h5')

In [None]:
# Generate a headline from the bare '<START>' marker
# (max_sentence_length=15, choosing among the 5 top-scoring words at each step).
generate_headline('<START>', 15, sequence_len, tokenizer, model, word_sample_size=5)

In [None]:
# Generate a headline seeded with the word "new"
# (max_sentence_length=15, choosing among the 5 top-scoring words at each step).
generate_headline('<START> new', 15, sequence_len, tokenizer, model, word_sample_size=5)

In [None]:
# Generate a headline seeded with "new york"
# (max_sentence_length=10, choosing among the 5 top-scoring words at each step).
generate_headline('<START> new york', 10, sequence_len, tokenizer, model, word_sample_size=5)

In [None]:
# Same seed and settings as the previous cell; generate_headline picks each word
# with np.random.choice, so a repeated call can return a different headline.
generate_headline('<START> new york', 10, sequence_len, tokenizer, model, word_sample_size=5)