In [50]:
import pandas as pd
import string
import numpy as np
import json
import csv
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import tensorflow as tf
tf.random.set_seed(2)
from numpy.random import seed
from tqdm import tqdm

In [51]:
#remove punctuations and convert text to lowercase
def clean_text(text):
    text = ''.join(e for e in text if e not in string.punctuation).lower()
    
    text = text.encode('utf8').decode('ascii', 'ignore')
    return text

In [76]:
corpus= []
with open('../corpus_data/CNN_Articels_clean_2.csv', newline='', encoding='utf-8') as csvfile:
    reader = list(csv.DictReader(csvfile))
    for row in tqdm(reader):
        if row["Category"] == 'news':
            corpus.append(clean_text(row['Second headline']))
    
print(len(corpus))

100%|██████████| 22332/22332 [00:00<00:00, 260287.88it/s]

10739





In [77]:
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    #get tokens
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    #convert to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
#print(inp_sequences, total_words)

In [78]:
def generate_padded_sequences(input_sequences):
  max_sequence_len = max([len(x) for x in input_sequences])
  input_sequences = np.array(pad_sequences(input_sequences,  maxlen=max_sequence_len, padding='pre'))
  predictors, label = input_sequences[:,:-1], input_sequences[:, -1]
  label = tf.keras.utils.to_categorical(label, num_classes = total_words)
  return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(max_sequence_len)

28


In [79]:
def create_model(max_sequence_len, total_words):
  input_len = max_sequence_len - 1
  model = Sequential()
 
  # Add Input Embedding Layer
  model.add(Embedding(total_words, 10, input_length=input_len))
 
  # Add Hidden Layer 1 — LSTM Layer
  model.add(LSTM(100))
  model.add(Dropout(0.1))
 
  # Add Output Layer
  model.add(Dense(total_words, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam')
 
  return model

model = create_model(max_sequence_len, total_words)
model.fit(predictors, label, epochs=20, verbose=5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x26c7e56d880>

In [80]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1,  padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        predicted_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                predicted_word = word
                break
        seed_text += " "+ predicted_word

    return seed_text.title()

print(generate_text("", max_sequence_len, model, max_sequence_len))

 Of The Pandemic Are A New Newest Poses For The World Of The Year Of The Bulls Amigos Rebel Police Police Say The Real Future Of The Year


In [81]:
datafiles = ['yake1', 'yake3', 'yake5']

for datafile in datafiles:
    output = open(f'../outputs/lstm/{datafile}_output.json', 'w', encoding='utf-8')
    with open(f'../data/{datafile}.json') as json_file:
        data = json.load(json_file)

        for index in tqdm(data):
            for i in range(len(data[index])):
                data[index][i] = generate_text(data[index][i], 10, model, max_sequence_len)

        json.dump(data, output)
                


100%|██████████| 1611/1611 [47:49<00:00,  1.78s/it] 
100%|██████████| 1611/1611 [48:51<00:00,  1.82s/it]
100%|██████████| 1611/1611 [47:17<00:00,  1.76s/it]
