In [2]:
import random
import pickle
import heapq

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [5]:
# Load the news corpus and flatten every article body into one long string.
# NOTE(review): the path is an absolute, environment-specific location; adjust
# it when running somewhere else.
text_df = pd.read_csv("/content/sample_data/fake_or_real_news.csv")
# Drop missing article bodies and coerce to str: pandas represents empty CSV
# fields as float NaN, which would make " ".join() raise TypeError.
text = text_df["text"].dropna().astype(str).tolist()
joined_text = " ".join(text)

# Keep a plain-text copy of the full corpus on disk.
with open("joined_text.txt", "w", encoding="utf-8") as f:
    f.write(joined_text)

In [6]:
# Work with only the first 1,000,000 characters of the corpus.
partial_text = joined_text[:1_000_000]

In [7]:
# Word-level tokenizer: tokens are the matches of \w+, so punctuation and
# whitespace never become tokens.
tokenizer = RegexpTokenizer(pattern=r"\w+")
# Lower-case first so the vocabulary is case-insensitive.
tokens = tokenizer.tokenize(text=partial_text.lower())

In [8]:
# Sorted array of the distinct tokens; a token's position is its integer id.
unique_tokens = np.unique(tokens)
# Reverse lookup: token -> id.
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [9]:
n_words = 10  # context window: number of consecutive words fed to the model

# Slide a window of n_words tokens over the corpus. Each window is one
# training input; the token immediately after it is the matching target.
window_starts = range(len(tokens) - n_words)
input_words = [tokens[start:start + n_words] for start in window_starts]
next_word = [tokens[start + n_words] for start in window_starts]

In [10]:
# One-hot containers, initially all False.
# X[sample, position, token_id] marks the token at each position of the input window.
X = np.zeros(shape=(len(input_words), n_words, len(unique_tokens)), dtype=np.bool_)
# y[sample, token_id] marks the token that follows the window.
y = np.zeros(shape=(len(next_word), len(unique_tokens)), dtype=np.bool_)

In [11]:
# Fill in the one-hot encodings: one True per (sample, position) in X and one
# True per sample in y.
for sample, (words, target) in enumerate(zip(input_words, next_word)):
    for position, word in enumerate(words):
        X[sample, position, unique_token_index[word]] = True
    y[sample, unique_token_index[target]] = True

In [12]:
# Two stacked LSTM layers followed by a softmax over the whole vocabulary.
model = Sequential([
    # return_sequences=True hands the full output sequence to the next LSTM.
    LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True),
    LSTM(128),
    # One output unit per vocabulary token, normalized to probabilities below.
    Dense(len(unique_tokens)),
    Activation("softmax"),
])

In [13]:
# Train the network; `history` keeps only the per-epoch metrics dict, not the
# Keras History object itself.
optimizer = RMSprop(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=10,
    shuffle=True,
).history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
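# Optional extra training run (disabled): uncomment to train for 5 more epochs.
# Note that doing so replaces `history` with the metrics of the new run only.
# history = model.fit(X, y, batch_size=128, epochs=5, shuffle=True).history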

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# Persist the trained network and its training metrics so that later cells can
# reload them without retraining.
model.save("/content/sample_data/text_gen_model2.h5")
with open("history2.p", mode="wb") as history_file:
    pickle.dump(history, history_file)

  saving_api.save_model(


In [15]:
# Reload the trained network and its training metrics from disk.
model = load_model("/content/sample_data/text_gen_model2.h5")
# Open the pickle through a context manager so the file handle is closed
# deterministically; the bare open() used before was never closed.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook wrote itself.
with open("history2.p", "rb") as f:
    history = pickle.load(f)

In [16]:
def predict_next_word(input_text, n_best):
    """Return the vocabulary indices of the ``n_best`` most likely next words.

    The text is lower-cased, tokenized with the notebook-level ``tokenizer``
    (the one used to build the training tokens), one-hot encoded into an array
    of shape ``(1, n_words, len(unique_tokens))`` and passed to ``model``.

    Args:
        input_text: Context to continue. Only its last ``n_words`` tokens are
            used. A shorter context fills the leading positions and leaves the
            remaining ones all-zero. Words missing from the vocabulary are
            encoded as an all-zero row instead of raising ``KeyError``.
        n_best: Number of candidates wanted. Values larger than the vocabulary
            are clamped to it; values <= 0 yield an empty array.

    Returns:
        1-D integer array of indices into ``unique_tokens``. The indices are
        the top ``n_best`` by predicted probability, in no particular order.
    """
    # Tokenize like the training data rather than with a bare str.split(), so
    # punctuation attached to a word does not create an unknown key. Keeping
    # only the most recent n_words tokens prevents indexing past the window.
    context = tokenizer.tokenize(input_text.lower())[-n_words:]

    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(context):
        token_id = unique_token_index.get(word)
        if token_id is not None:  # out-of-vocabulary words stay all-zero
            X[0, i, token_id] = 1

    # verbose=0 suppresses the per-call progress bar, which is pure noise when
    # this function is called in a loop.
    predictions = model.predict(X, verbose=0)[0]

    # Guard the argpartition call: -0 would select every index, and a count
    # larger than the vocabulary would be out of bounds.
    n_best = min(n_best, len(predictions))
    if n_best <= 0:
        return np.array([], dtype=int)
    return np.argpartition(predictions, -n_best)[-n_best:]

In [17]:
# Ask the model for five candidate continuations of a ten-word prompt.
possible = predict_next_word(
    input_text="I will have to look into this thing because I",
    n_best=5,
)



In [18]:
# Translate the predicted vocabulary indices back into words, one per line.
for candidate in unique_tokens[possible]:
    print(candidate)

had
don
just
think
have


In [19]:
def generate_text(input_text, n_words, creativity=3):
    """Extend ``input_text`` with ``n_words`` generated words.

    At every step the most recent tokens are passed to ``predict_next_word``
    and one of its ``creativity`` candidates is picked at random and appended.

    Args:
        input_text: Prompt to continue. It is returned unchanged (original
            casing and punctuation) at the start of the result.
        n_words: Number of words to generate.
        creativity: Number of top candidates to sample from at each step;
            larger values give more varied output.

    Returns:
        The prompt followed by the generated words, joined by single spaces.
    """
    # The ``n_words`` parameter (how many words to generate) shadows the
    # notebook-level ``n_words`` (the model's context window length). Read the
    # global explicitly so the window is not sized by the generation count.
    context_size = globals()["n_words"]

    word_sequence = input_text.split()
    # Tokenize the prompt once. Generated words come straight from the
    # vocabulary, so they can be appended without re-tokenizing everything.
    context_tokens = tokenizer.tokenize(input_text.lower())

    for _ in range(n_words):
        # Always feed the most recent context_size tokens, whatever the
        # prompt length was.
        sub_sequence = " ".join(context_tokens[-context_size:])
        try:
            candidates = predict_next_word(sub_sequence, creativity)
            choice = unique_tokens[random.choice(candidates)]
        except (KeyError, IndexError):
            # Best-effort fallback for an out-of-vocabulary context word
            # (KeyError) or an empty candidate list (IndexError). Any other
            # error is a real failure and is allowed to propagate.
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        context_tokens.append(choice)
    return " ".join(word_sequence)

In [21]:
# Generate 100 words, sampling from the 10 most likely candidates at each step.
generate_text(
    "I will have to look into this thing because I",
    n_words=100,
    creativity=10,
)



'I will have to look into this thing because I just believe it they are the s why that the only thing is there that people will give what we would do all this will all your comment of the us family in america you don again can the same time of the future we said in america it means about the government it can do a path and has the they ve also long been more research at the polls were using it about two months ago he is a new republican candidate for her president she said there was a chance he will look a president of at'

In [20]:
# Same settings as the previous demo, with a different ten-word prompt.
generate_text(
    "The president of the United States announced yesterday that he",
    n_words=100,
    creativity=10,
)



'The president of the United States announced yesterday that he took from new mexico for american voters like bush leaders toward both candidates and it is doing with his support with their own support with all terrorist candidates in they see it on the rest of the people that will use it their terrorist program at government since they got by official now and their reported to u to keep her victory in a general country a few years to do all but he can be part at the time and i do not tell i was to help our support into this justice department said it are already close'

In [None]:
# Show the five most likely next words for another ten-word prompt.
for candidate in unique_tokens[
    predict_next_word("The president will most likely not be there to help", n_best=5)
]:
    print(candidate)

american
the
our
us
president
