In [3]:
import random
import pickle
import heapq

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [4]:
# Load the news articles and merge every article body into one corpus string.
text_df = pd.read_csv("news.csv")
# A row whose ``text`` cell is empty is read as a float NaN, which would make
# ``str.join`` raise TypeError; drop such rows and coerce the rest to ``str``.
text = text_df["text"].dropna().astype(str).tolist()
joined_text = " ".join(text)

# Keep a copy of the merged corpus on disk.
with open("joined_text.txt", "w", encoding="utf-8") as f:
    f.write(joined_text)

In [5]:
# Only the leading one million characters of the corpus are used downstream.
partial_text = joined_text[0:1_000_000]

In [7]:
# Word tokenizer: each maximal run of ``\w`` characters becomes one token.
tokenizer = RegexpTokenizer(r"\w+")

# Lower-case first so that tokens differing only in case are merged.
lowercased_text = partial_text.lower()
tokens = tokenizer.tokenize(lowercased_text)

In [8]:
# Vocabulary: the distinct tokens, plus a lookup from each token to its
# position in ``unique_tokens``.
unique_tokens = np.unique(tokens)
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [38]:
import numpy as np
from scipy.sparse import lil_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

n_words = 10  # number of consecutive tokens the model sees as context

# Sliding windows over the token stream: every sample is ``n_words``
# consecutive tokens and its target is the token that immediately follows.
n_windows = max(len(tokens) - n_words, 0)
input_words = [tokens[start:start + n_words] for start in range(n_windows)]
next_word = [tokens[start + n_words] for start in range(n_windows)]

n_samples = len(input_words)
n_unique_tokens = len(unique_tokens)

# Vocabulary id of every token, looked up once.  The vocabulary was built from
# these same tokens, so every lookup succeeds and no membership test is needed.
token_ids = np.array([unique_token_index[token] for token in tokens], dtype=np.int64)

# One-hot encodings kept sparse.  Each row of X is the flattened
# (n_words, n_unique_tokens) window; each row of y marks the target token.
X = lil_matrix((n_samples, n_words * n_unique_tokens), dtype=bool)
y = lil_matrix((n_samples, n_unique_tokens), dtype=bool)

if n_samples:
    sample_rows = np.arange(n_samples)
    # Fill one window position at a time with array-indexed assignment instead
    # of one Python-level scalar write per non-zero entry.  Sample i holds
    # token i + j at position j, i.e. column j * n_unique_tokens + id.
    for j in range(n_words):
        X[sample_rows, j * n_unique_tokens + token_ids[j:j + n_samples]] = True
    # The target of sample i is token i + n_words.
    y[sample_rows, token_ids[n_words:n_words + n_samples]] = True

# CSR supports the row slicing the batch generator relies on.
X = X.tocsr()
y = y.tocsr()

def data_generator():
    """Endlessly yield dense ``(inputs, targets)`` batches from sparse X and y.

    Reads the notebook-level ``X``, ``y``, ``n_samples``, ``batch_size``,
    ``n_words`` and ``n_unique_tokens``.  Each pass walks the rows in order in
    steps of ``batch_size`` (the final batch of a pass may be shorter), then
    starts over from the first row.

    Yields:
        tuple: inputs reshaped to ``(rows, n_words, n_unique_tokens)`` and
        targets of shape ``(rows, n_unique_tokens)``.
    """
    while True:
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            # Only the current slice is densified.
            flat_inputs = X[start:stop].toarray()
            targets = y[start:stop].toarray()
            inputs = flat_inputs.reshape((-1, n_words, n_unique_tokens))
            yield inputs, targets

# Batch configuration; ``batch_size`` is also read by ``data_generator``.
batch_size = 128
steps_per_epoch = n_samples // batch_size

# Two stacked LSTM layers followed by a softmax over the whole vocabulary.
model = Sequential([
    LSTM(128, input_shape=(n_words, n_unique_tokens), return_sequences=True),
    LSTM(128),
    Dense(n_unique_tokens),
    Activation("softmax"),
])

optimizer = RMSprop(learning_rate=0.01)
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# The generator never terminates, so the epoch length is given explicitly.
history = model.fit(
    data_generator(),
    steps_per_epoch=steps_per_epoch,
    epochs=10,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
# Persist the trained network.
model.save("text_gen_model2.h5")

# Persist what ``model.fit`` returned.
# NOTE(review): this pickles the whole returned object rather than only its
# metrics; confirm that is intended.
with open("history2.p", mode="wb") as history_file:
    pickle.dump(history, history_file)

In [41]:
# Restore the saved network and the object pickled next to it.
model = load_model("text_gen_model2.h5")
# Use a context manager so the file handle is closed after loading.
# NOTE: unpickling runs arbitrary code; only load files this notebook wrote.
with open("history2.p", "rb") as f:
    history = pickle.load(f)

In [42]:
def predict_next_word(input_text, n_best):
    """Return the vocabulary indices of the ``n_best`` most likely next words.

    The text is lower-cased and split with the notebook's ``tokenizer`` (the
    same one that produced the vocabulary), so punctuation does not create
    out-of-vocabulary words.  Only the last ``n_words`` tokens are used; a
    longer input no longer overruns the one-hot buffer.  A shorter input fills
    the leading positions and leaves the remaining ones all-zero.

    Args:
        input_text: Context text.
        n_best: How many candidate indices to return.

    Returns:
        numpy.ndarray: ``n_best`` indices into ``unique_tokens``, in no
        particular order.

    Raises:
        KeyError: If a token is not in ``unique_token_index``.
    """
    context = tokenizer.tokenize(input_text.lower())[-n_words:]
    encoded = np.zeros((1, n_words, len(unique_tokens)))
    for position, word in enumerate(context):
        encoded[0, position, unique_token_index[word]] = 1

    predictions = model.predict(encoded)[0]
    # argpartition puts the n_best largest scores in the tail without sorting.
    return np.argpartition(predictions, -n_best)[-n_best:]

In [43]:
# Five candidate next-word indices for a ten-word prompt.
possible = predict_next_word(
    input_text="I will have to look into this thing because I",
    n_best=5,
)



In [44]:
# Show the word behind each predicted vocabulary index, one per line.
for candidate in unique_tokens[possible]:
    print(candidate)

could
will
would
can
had


In [45]:
def generate_text(input_text, n_words, creativity=3):
    """Extend ``input_text`` by ``n_words`` generated words.

    Each step feeds the most recent tokens to ``predict_next_word`` and picks
    one of its ``creativity`` candidates uniformly at random.

    Args:
        input_text: Seed text; returned unchanged at the start of the result.
        n_words: Number of words to append.  This parameter shadows the
            notebook-level context-window size of the same name, so the
            window length is read from the model's input shape instead.
        creativity: Number of top candidates to sample from at each step.

    Returns:
        str: The seed words followed by the generated words, space-separated.
    """
    window = model.input_shape[1]
    word_sequence = input_text.split()
    # Tokenized view of everything produced so far, kept in step with
    # ``word_sequence`` so the text is not re-tokenized on every iteration.
    context = tokenizer.tokenize(" ".join(word_sequence).lower())
    for _ in range(n_words):
        # Always condition on the latest ``window`` tokens, whatever the
        # length of the seed.
        sub_sequence = " ".join(context[-window:])
        try:
            choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
        except (KeyError, IndexError):
            # Best-effort fallback (e.g. a seed word outside the vocabulary):
            # keep generating with a random vocabulary word.
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        context.extend(tokenizer.tokenize(str(choice).lower()))
    return " ".join(word_sequence)

In [46]:
# Append 100 words, sampling among the 10 best candidates at each step.
generate_text(
    "I will have to look into this thing because I",
    n_words=100,
    creativity=10,
)



'I will have to look into this thing because I would see we need your wife to keep hillary national orders study sharpton solution over women obamacare food biden give 100 off north points the order should look be from obamacare obama has taken me more weeks between us white states on a major re man on a white court based by the current federal crisis of undocumented leader vote executive carney pledges on bad sharpton obama s memoir in congress finds white site obama to search away into putin 100 away in been un back and whether america were women based to keep his executive border america could help'

In [47]:
# Same settings with a different ten-word prompt.
generate_text(
    "The president of the United States announced yesterday that he",
    n_words=100,
    creativity=10,
)



'The president of the United States announced yesterday that he got a stunning idea we d see you need the obama speech i is like what it means in to talk that he found more putin than with people study white man administration and executive qaeda declaring putin to raise our name which gets over putin s executive gov joe calls off ukraine the world media broke off isis un among study of economic employees for violent gov obamacare phone executive hebdo class act paul williamson obama said putin and executive john williamson john carney obamacare s dogs of federal leader order putin to search on back obamacare with economic'

In [48]:
# Five candidate next words for another prompt, printed one per line.
top_candidates = predict_next_word("The president will most likely not be there to help", 5)
for candidate in unique_tokens[top_candidates]:
    print(candidate)

whether
me
obama
us
your
