In [1]:
import pandas as pd
import os
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils as ku
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.models import Sequential

In [2]:
# Directory that holds the headline CSV files.
path = r"Data/New york news headlines"
all_headlines = []
# os.listdir returns entries in arbitrary order; sorting makes the file that
# is picked before the `break` below the same on every run and platform.
for file in sorted(os.listdir(path)):
    if 'Articles' in file:
        # os.path.join builds a portable path instead of gluing on '//'.
        f = pd.read_csv(os.path.join(path, file))
        all_headlines.extend(list(f.headline.values))
        # Only the first matching file is loaded.
        break
# Drop placeholder headlines, and any non-string values (e.g. NaN from empty
# CSV cells) that the character-wise cleaning step could not iterate over.
all_headlines = [h for h in all_headlines if isinstance(h, str) and h != "Unknown"]

In [3]:
def clean_text(txt):
    """Strip ASCII punctuation from ``txt`` and lower-case what remains.

    Only characters listed in ``string.punctuation`` are removed; anything
    else (including typographic quotes such as ’) is kept as-is.
    """
    kept_chars = []
    for ch in txt:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars).lower()

# Normalise every headline, then preview the first ten cleaned entries.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuela’s descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted',
 'the caged beast awakens',
 'an everunfolding story',
 'o’reilly thrives as settlements add up',
 'mouse infestation',
 'divide in gop now threatens trump tax plan']

In [4]:
# Notebook-level tokenizer: fitted inside get_sequence_of_tokens and read again
# by generate_text, so both must see this same instance.
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and expand it into n-gram prefixes.

    Every line is converted to token ids, and each prefix of length 2 up to
    the full line length becomes one training sequence.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``total_words`` is
        the size of the tokenizer's word index plus one.
    """
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    ngram_sequences = []
    for headline in corpus:
        encoded = tokenizer.texts_to_sequences([headline])[0]
        # Prefixes end at positions 2..len(encoded); one-token lines add nothing.
        ngram_sequences.extend(
            encoded[:end] for end in range(2, len(encoded) + 1)
        )
    return ngram_sequences, vocab_size
    
# Build the n-gram training sequences from the cleaned headlines; total_words
# is kept at notebook level because later cells read it.
inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [5]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into predictors and labels.

    All sequences are left-padded ('pre') to the length of the longest one.
    The last token of each padded row becomes the label and the preceding
    tokens become the predictors.

    Args:
        input_sequences: list of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        # Fall back to the vocabulary size computed earlier in the notebook.
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array() copy.
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Turn the n-gram sequences into model inputs and one-hot labels, and keep the
# padded length, which create_model and generate_text both need.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [6]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Args:
        max_sequence_len: length of the padded training sequences, label
            included; the model input is one token shorter.
        total_words: vocabulary size, used for both the embedding input
            dimension and the softmax output width.
        embedding_dim: size of each word embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout fraction applied after the LSTM (default 0.1).

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy, Adam).
    """
    # The last token of every padded sequence is the label, not an input.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Add Input Embedding Layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the network for this vocabulary size and padded length, then print
# its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 10)            24830     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2483)              250783    
                                                                 
Total params: 320,013
Trainable params: 320,013
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Train for a fixed 100 epochs on the full set of predictors/labels; no
# validation data or callbacks are passed (EarlyStopping is imported at the
# top of the notebook but is not used here).
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
151/151 - 8s - loss: 7.4274 - 8s/epoch - 51ms/step
Epoch 2/100
151/151 - 3s - loss: 6.9442 - 3s/epoch - 21ms/step
Epoch 3/100
151/151 - 3s - loss: 6.8233 - 3s/epoch - 21ms/step
Epoch 4/100
151/151 - 3s - loss: 6.7426 - 3s/epoch - 20ms/step
Epoch 5/100
151/151 - 3s - loss: 6.6712 - 3s/epoch - 21ms/step
Epoch 6/100
151/151 - 3s - loss: 6.5929 - 3s/epoch - 20ms/step
Epoch 7/100
151/151 - 3s - loss: 6.5349 - 3s/epoch - 20ms/step
Epoch 8/100
151/151 - 3s - loss: 6.4206 - 3s/epoch - 20ms/step
Epoch 9/100
151/151 - 3s - loss: 6.3299 - 3s/epoch - 20ms/step
Epoch 10/100
151/151 - 3s - loss: 6.2353 - 3s/epoch - 20ms/step
Epoch 11/100
151/151 - 3s - loss: 6.1467 - 3s/epoch - 19ms/step
Epoch 12/100
151/151 - 3s - loss: 6.0574 - 3s/epoch - 20ms/step
Epoch 13/100
151/151 - 3s - loss: 5.9658 - 3s/epoch - 20ms/step
Epoch 14/100
151/151 - 3s - loss: 5.8763 - 3s/epoch - 20ms/step
Epoch 15/100
151/151 - 3s - loss: 5.7862 - 3s/epoch - 20ms/step
Epoch 16/100
151/151 - 3s - loss: 5.6976 - 3s/epo

<keras.callbacks.History at 0x176ac088f70>

In [13]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by greedily predicting ``next_words`` more words.

    On each step the current text is tokenized with the notebook-level
    ``tokenizer``, left-padded to the model's input length, and the word with
    the highest predicted probability is appended.

    Args:
        seed_text: starting phrase.
        next_words: number of words to append.
        model: trained model exposing ``predict``.
        max_sequence_len: padded sequence length used at training time
            (label included); inputs are padded to ``max_sequence_len - 1``.

    Returns:
        The seed text plus the generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0 keeps predict from printing a progress bar on every step.
        probabilities = model.predict(token_list, verbose=0)
        # argmax over a single-row batch returns a 1-element array; unwrap it
        # to a plain int so the lookup does not depend on array truthiness.
        predicted = int(np.argmax(probabilities, axis=1)[0])
        # An index with no word (e.g. the padding index 0) yields "".
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [16]:
# Seed phrase and number of words to generate for each demo line.
# NOTE(review): "preident" looks like a typo for "president"; kept verbatim.
demo_prompts = [
    ("united states", 10),
    ("preident trump", 6),
    ("donald trump", 4),
    ("india and china", 4),
    ("new york", 4),
    ("science and technology", 5),
]
for seed_phrase, word_count in demo_prompts:
    print(generate_text(seed_phrase, word_count, model, max_sequence_len))

United States Up A Grizzled Rivalry Gets A Freshfaced Makeover Too Future
Preident Trump May Sign Orders That Could Expand
Donald Trump A ‘Criminal’ Book For
India And China Players A Uniforms And
New York Today A Belated Budget
Science And Technology A Tribal Identity Voting Against
