Imports

In [None]:
import random
import pickle
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from tensorflow import keras
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Activation
from keras.optimizers import RMSprop

Data extraction and tokenization

In [None]:
# Load the corpus; this cell reads the "text" column of fake_news.csv.
text_df = pd.read_csv("fake_news.csv")
# Empty CSV fields are parsed as NaN (a float), and str.join raises TypeError
# on non-string items, so drop missing rows and coerce the rest to str.
text = text_df["text"].dropna().astype(str).tolist()
joined_text = " ".join(text)
# Keep only the first 10,000 characters so the one-hot tensors stay small.
partial_text = joined_text[:10000]
# r"\w+" keeps runs of word characters and discards punctuation/whitespace.
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())
# np.unique returns the sorted distinct tokens; this is the vocabulary.
unique_tokens = np.unique(tokens)
print("Token size : ", len(tokens))
# Despite its name, this maps each token to its index in unique_tokens
# (the one-hot column used later), not to a frequency count.
freq_unique_tokens = {token: idx for idx, token in enumerate(unique_tokens)}
freq_unique_tokens

Context windows and next-word targets

In [None]:
# Number of consecutive tokens used as context for predicting the next one.
n_words = 10

# Slide a window over the token list: window k covers tokens[k : k + n_words]
# and its target is the token immediately after the window.
input_words = [
    tokens[start: start + n_words]
    for start in range(len(tokens) - n_words)
]
next_words = [
    tokens[start + n_words]
    for start in range(len(tokens) - n_words)
]

next_words

Training data (one-hot encoded inputs and targets)

In [None]:
# X[sample, position, token_index] is True when the token at `position` of the
# sample's context window is the vocabulary entry `token_index`.
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
# Y[sample, token_index] is True for the single word that follows the window.
Y = np.zeros((len(next_words), len(unique_tokens)), dtype=bool)

for sample_idx, (context, target) in enumerate(zip(input_words, next_words)):
    for position, token in enumerate(context):
        X[sample_idx, position, freq_unique_tokens[token]] = True
    Y[sample_idx, freq_unique_tokens[target]] = True

Model

In [None]:
# Two stacked LSTM layers followed by a dense layer with one unit per
# vocabulary entry; the softmax turns those outputs into a distribution.
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True),
    LSTM(128),
    Dense(len(unique_tokens)),
    Activation("softmax"),
])

In [None]:
# Categorical cross-entropy matches the one-hot targets in Y.
model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# Left as the last expression so the returned History object is displayed.
model.fit(x=X, y=Y, epochs=30, batch_size=128, shuffle=True)

In [41]:
# Write the trained model to disk, then rebind `model` to the reloaded copy
# so the cells below run against exactly what was saved.
model.save(filepath="mymodel.keras")
model = load_model(filepath="mymodel.keras")

Prediction

In [42]:
def predict_next_word(input_text, n_best):
    """Return vocabulary indices of the ``n_best`` most likely next words.

    The text is lower-cased and tokenized with the notebook's ``tokenizer``
    (the same one used to build the training tokens), one-hot encoded into a
    ``(1, n_words, len(unique_tokens))`` array and passed to ``model``.

    Args:
        input_text: Context string. Only the last ``n_words`` tokens are
            used; shorter inputs fill the leading positions and leave the
            remaining positions all-zero.
        n_best: Number of candidate indices to return. Values larger than
            the vocabulary size are clamped; values <= 0 yield an empty array.

    Returns:
        1-D NumPy array of indices into ``unique_tokens``, ordered from the
        highest to the lowest predicted score.
    """
    if n_best <= 0:
        # predictions[-0:] would select every index, so handle this up front.
        return np.array([], dtype=np.intp)

    # Use the training tokenizer rather than str.split(): split() keeps
    # punctuation attached ("thing,"), which never matches a vocabulary key.
    words = tokenizer.tokenize(input_text.lower())
    # Keep only the most recent n_words tokens; indexing past n_words along
    # axis 1 of x would raise IndexError.
    words = words[max(len(words) - n_words, 0):]

    x = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(words):
        idx = freq_unique_tokens.get(word)
        if idx is not None:
            x[0, i, idx] = True
        # Out-of-vocabulary words keep an all-zero row instead of raising
        # KeyError, so the remaining words stay at their positions.

    predictions = model.predict(x)[0]

    # argpartition raises if kth exceeds the array length, so clamp first.
    n_best = min(n_best, len(predictions))
    top = np.argpartition(predictions, -n_best)[-n_best:]
    # argpartition leaves the selected entries unordered; sort them so the
    # most probable candidate comes first.
    return top[np.argsort(predictions[top])[::-1]]

In [50]:
# Ask for the five candidate next words for a ten-word prompt.
possible = predict_next_word("He will have to look into this thing and he", 5)
# Index the vocabulary array with all candidate indices at once to get the words.
print(list(unique_tokens[possible]))

['really', 'unapologetic', 'not', 'democrats', 'tape']
