In [2]:
import os
import re
import string

import numpy as np
import pandas as pd

from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow import one_hot
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import Constant


## To replicate the results
from tensorflow.random import set_seed
from numpy.random import seed

set_seed(42)
seed(42)


## Defining Utility Functions

In [3]:
def generate_n_grams(sentence):

    n_grams = []
    sentence_words = sentence.split()
    for i in range(2, len(sentence_words) + 1):
        n_grams.append(' '.join(sentence_words[0: i]))

    return n_grams


def generate_headline(start, max_sentence_length, sequence_len, tokenizer, model, word_sample_size):

    generated_sentence = start.split()
    for i in range (max_sentence_length-2):

        generated_sentence_tokens = tokenizer.texts_to_sequences([generated_sentence])
        generated_sentence_padded_tokens = pad_sequences(generated_sentence_tokens, maxlen=sequence_len)

        pred_tokens = model.predict(generated_sentence_padded_tokens)[0]
        top_n_pred_tokens = np.argpartition(pred_tokens, -word_sample_size)[-word_sample_size:]
        pred_token = np.random.choice(top_n_pred_tokens, size=1)

        pred_text = tokenizer.sequences_to_texts([pred_token])[0]
        generated_sentence.append(pred_text)
        if pred_text == '<END>':
            return ' '.join(generated_sentence)


    generated_sentence.append('<END>')
    return ' '.join(generated_sentence)

## Loading Data

In [4]:
headlines = np.array([])

dataset_dir = 'dataset/'
for filename in os.listdir(dataset_dir):
    filepath = dataset_dir + filename
    if 'Article' in filename:
        headlines = np.append(headlines, pd.read_csv(filepath).headline.values)

f'Extracted a total of {headlines.shape[0]} headlines from the dataset'

'Extracted a total of 9335 headlines from the dataset'

## Cleaning Data

In [5]:
headlines = pd.Series(headlines)

## Converting to Lowercase
headlines = headlines.apply(str.lower)

## Removing Punctuations
headlines = headlines.apply(
    lambda headline: re.sub(r'[^\w\s]', '', headline))


## Preprocessing Data

#### Adding &lt;SRART&gt; and &lt;END&gt; tokens 

In [9]:
headlines = headlines.apply(
    lambda headline: f'<START> {headline} <END>')

#### Generating n grams

In [12]:
vocab = set()
headline_ngrams = np.array([])

for headline in tqdm(headlines):
    n_grams = generate_n_grams(headline)
    vocab = vocab.union(set(n_grams[-1].split()))
    # headline_ngrams.append(generate_n_grams(headline))
    headline_ngrams = np.append(headline_ngrams, generate_n_grams(headline))

f'There are {len(vocab)} words in the dataset'

100%|██████████| 9335/9335 [00:19<00:00, 470.12it/s] 


'There are 11276 words in the dataset'

### Tokenizing the data

In [20]:
tokenizer = Tokenizer(oov_token='<OOV>', filters=[], lower=False)

In [21]:
tokenizer.fit_on_texts(vocab)

In [22]:
tokenizer.word_index

{'<OOV>': 1,
 'infinity': 2,
 'dims': 3,
 'movies': 4,
 'snowflakes': 5,
 'undermining': 6,
 'likely': 7,
 'mere': 8,
 'tilting': 9,
 'proposal': 10,
 'enjoy': 11,
 'tragedy': 12,
 'seniors': 13,
 'karate': 14,
 'bizarre': 15,
 'applications': 16,
 'openmarriage': 17,
 'mounting': 18,
 'napping': 19,
 'cart': 20,
 'poor': 21,
 'carvey': 22,
 'inherit': 23,
 'attack': 24,
 'nirvana': 25,
 'job': 26,
 'love': 27,
 'hockey': 28,
 'wolf': 29,
 'shocking': 30,
 'manafort': 31,
 'chill': 32,
 'hills': 33,
 'nurse': 34,
 'sasse': 35,
 'spicer': 36,
 'oil': 37,
 'frees': 38,
 'fork': 39,
 'gangs': 40,
 'carrie': 41,
 'eloquence': 42,
 'molder': 43,
 'deneuve': 44,
 'dicks': 45,
 'year': 46,
 'lifelong': 47,
 'seedy': 48,
 'airconditioning': 49,
 'cajun': 50,
 'bland': 51,
 'jeff': 52,
 'rescue': 53,
 'superhighway': 54,
 'editor': 55,
 'coda': 56,
 'subjects': 57,
 'princes': 58,
 'eaten': 59,
 'denies': 60,
 'lift': 61,
 'remaking': 62,
 'those': 63,
 'volatility': 64,
 'kangaroos': 65,
 'mur

In [23]:
tokenized_headline_ngrams = tokenizer.texts_to_sequences(headline_ngrams)

In [24]:
tokenized_headline_ngrams

[[2317, 7999],
 [2317, 7999, 6588],
 [2317, 7999, 6588, 5519],
 [2317, 7999, 6588, 5519, 1080],
 [2317, 7999, 6588, 5519, 1080, 11165],
 [2317, 7999, 6588, 5519, 1080, 11165, 6483],
 [2317, 7999, 6588, 5519, 1080, 11165, 6483, 1224],
 [2317, 7999, 6588, 5519, 1080, 11165, 6483, 1224, 9875],
 [2317, 7999, 6588, 5519, 1080, 11165, 6483, 1224, 9875, 809],
 [2317, 7999, 6588, 5519, 1080, 11165, 6483, 1224, 9875, 809, 2474],
 [2317, 7379],
 [2317, 7379, 1965],
 [2317, 7379, 1965, 2612],
 [2317, 7379, 1965, 2612, 2474],
 [2317, 3739],
 [2317, 3739, 4815],
 [2317, 3739, 4815, 4480],
 [2317, 3739, 4815, 4480, 4059],
 [2317, 3739, 4815, 4480, 4059, 2474],
 [2317, 5458],
 [2317, 5458, 5620],
 [2317, 5458, 5620, 9132],
 [2317, 5458, 5620, 9132, 3739],
 [2317, 5458, 5620, 9132, 3739, 9933],
 [2317, 5458, 5620, 9132, 3739, 9933, 1080],
 [2317, 5458, 5620, 9132, 3739, 9933, 1080, 5605],
 [2317, 5458, 5620, 9132, 3739, 9933, 1080, 5605, 2000],
 [2317, 5458, 5620, 9132, 3739, 9933, 1080, 5605, 2000, 2

### Checking length of headlines in dataset and Padding

In [25]:
headline_lengths = headlines.map(str.split).map(len)

print(f'''
    Maximum Headline length: {headline_lengths.max()}
    Minimum Headline length: {headline_lengths.min()}
    Average Headline length: {headline_lengths.mean():.2f}
    STD of Headline length: {headline_lengths.std():.2f}
''')


    Maximum Headline length: 26
    Minimum Headline length: 3
    Average Headline length: 8.55
    STD of Headline length: 3.26



#### Since them longest headline is not too long e.g 150 or 200, we can use the length of longest headline for padding
#### If it was around 150 or 200 then we would truncate the longer sentences and use a padding length of a smaller value e.g mean_length + (2 * std of length)

In [26]:
padded_tokenized_headline_ngrams = pad_sequences(tokenized_headline_ngrams, maxlen=headline_lengths.max(), padding='pre')

In [27]:
padded_tokenized_headline_ngrams[:5]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,  2317,  7999],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,  2317,  7999,  6588],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,  2317,  7999,  6588,  5519],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  2317,  7999,  6588,  5519,  1080],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,  2317,  

#### Separating Features and Labels

In [28]:
X = padded_tokenized_headline_ngrams[:, :-1]
y = padded_tokenized_headline_ngrams[:, -1]

y = one_hot(y, depth=len(vocab) + 2)

In [29]:
X[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 2317],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 2317, 7999],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        2317, 7999, 6588],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 2317,
        7999, 6588, 5519],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0, 2317, 7999,
        6588, 5519, 1080]], dtype=int32)

In [32]:
y[0].shape

TensorShape([11278])

## Defining Hyper Parameters

In [35]:
vocab_size = len(vocab) + 2
embedding_dim = 50
sequence_len = headline_lengths.max() - 1
dropout_rate = 0.2
learning_rate = 0.01

## Using Pre-Trained Embeddings

In [19]:
embeddings_index = dict()
with open('glove.6B/glove.6B.50d.txt') as f:
    for line in f:
        word, embedding = line.split(maxsplit=1)
        embedding = np.fromstring(embedding, "f", sep=" ")
        embeddings_index[word] = embedding

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [25]:
hits = 0
misses = 0

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        hits += 1
    else:
        misses += 1

print("Converted %d words (%d misses)" % (hits, misses))

Converted 10612 words (665 misses)


## Creating Model

In [26]:
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=sequence_len,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False
    ),
    LSTM(units=sequence_len, return_sequences=True),
    Dropout(rate=dropout_rate),
    LSTM(units=sequence_len),
    Dense(units=64, activation='relu'),
    Dropout(rate=dropout_rate),
    Dense(units=vocab_size, activation='softmax')
])

In [148]:
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [149]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 25, 50)            563900    
                                                                 
 lstm_4 (LSTM)               (None, 25, 25)            7600      
                                                                 
 dropout_4 (Dropout)         (None, 25, 25)            0         
                                                                 
 lstm_5 (LSTM)               (None, 25)                5100      
                                                                 
 dense_4 (Dense)             (None, 64)                1664      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 11278)            

In [150]:
model.fit(X, y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x4281e4340>

In [151]:
model.save('headline_generating_model.h5')

## Evaluating the Model

In [33]:
model = load_model('headline_generating_model.h5')

In [157]:
generate_headline('<START>', 15, sequence_len, tokenizer, model, word_sample_size=5)



'<START> the americans season 6 2 3 recap a wolf <END>'

In [158]:
generate_headline('<START> new', 15, sequence_len, tokenizer, model, word_sample_size=5)



'<START> new jersey ruling proposal on the month in trump war a us and <END>'

In [163]:
generate_headline('<START> new york', 10, sequence_len, tokenizer, model, word_sample_size=5)



'<START> new york plan cuts for north homeless wonkish and in <END>'

In [164]:
generate_headline('<START> new york', 10, sequence_len, tokenizer, model, word_sample_size=5)



'<START> new york city transit <END>'