Adapted from https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

In [1]:
import pandas as pd

In [1]:
import string
 
# load doc into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding='utf8') as file:
        return file.read()
 
# turn a doc into clean tokens
def clean_doc(doc):
    """Turn raw text into a list of lower-case, purely alphabetic tokens.

    '--' is treated as a word separator, the text is split on whitespace,
    ASCII punctuation is stripped from every token, and any token that is
    not entirely alphabetic afterwards is discarded.

    Args:
        doc: The raw document text.

    Returns:
        The cleaned tokens, in document order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in doc.replace('--', ' ').split():
        word = raw.translate(strip_punctuation)
        # the alphabetic check runs on the stripped token, before lower-casing
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned
 
# save lines of text to file, one sequence per line
def save_doc(lines, filename):
    """Write the given strings to a UTF-8 text file, one per line.

    The strings are joined with '\\n', so no newline follows the last one.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even if write() raises.
    with open(filename, 'w', encoding='utf8') as file:
        file.write(data)
 
# load document
in_filename = 'fake_news.txt'
doc = load_doc(in_filename)
print(doc[:200])

# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# organize into sequences of tokens: each line holds 50 + 1 consecutive words
length = 50 + 1
# Slide a window of `length` tokens over the corpus, one token at a time.
# The upper bound is len(tokens) + 1 so the final window, tokens[-length:],
# is emitted too (stopping at len(tokens) silently drops it).
sequences = [' '.join(tokens[i - length:i]) for i in range(length, len(tokens) + 1)]
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'fake_sequences.txt'
save_doc(sequences, out_filename)

and there it is michigan governor gretchen whitmer bans buying us flags during lockdown want is wrong with this woman
said there is no game plan to distribute coronavirus vaccine is he right
are you p
['and', 'there', 'it', 'is', 'michigan', 'governor', 'gretchen', 'whitmer', 'bans', 'buying', 'us', 'flags', 'during', 'lockdown', 'want', 'is', 'wrong', 'with', 'this', 'woman', 'said', 'there', 'is', 'no', 'game', 'plan', 'to', 'distribute', 'coronavirus', 'vaccine', 'is', 'he', 'right', 'are', 'you', 'panicking', 'about', 'enough', 'follow', 'our', 'guide', 'why', 'the', 'hiend', 'home', 'sales', 'market', 'is', 'immune', 'to', 'pandemic', 'conditions', 'realestate', 'tallahassee', 'coronavirus', 'was', 'produced', 'in', 'a', 'laboratory', 'former', 'cia', 'intel', 'officer', 'ahb', 'lmaoo', 'killin', 'me', 'here', 'in', 'other', 'news', 'im', 'immune', 'to', 'the', 'coronavirus', 'for', 'unknown', 'reasons', 'photos', 'show', 'part', 'of', 'huoshenshan', 'hospital', 'in', 'wuhan', 'bu

In [2]:
from numpy import array
from pickle import dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

# load
# lines = pd.read_csv('fake_news.csv')['tweet'].values.tolist()

# load doc into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding="utf8") as file:
        return file.read()
 
# load the training sequences written by the preprocessing cell
in_filename = 'fake_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size: +1 because word_index ids start at 1 (see the 5304-wide
# output layer in the model summary)
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output: every column but the last is the input
# context, the last column is the word to predict
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

# save the model to file
model.save('model.h5')
# save the tokenizer; the context manager makes sure the pickle is flushed
# and the handle closed
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            265200    
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 5304)              535704    
                                                                 
Total params: 951,804
Trainable params: 951,804
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/1

In [15]:
from random import randint
from pickle import load
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# load doc into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding='utf8') as file:
        return file.read()

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words that continue ``seed_text``.

    On every step the running text (seed plus the words generated so far) is
    encoded with ``tokenizer``, cut down to its last ``seq_length`` token ids,
    and passed to ``model``; the highest-scoring id is mapped back to a word
    and appended to the running text.

    Args:
        model: Object with a ``predict(batch, verbose=...)`` method returning
            one row of scores per input row.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``.
        seq_length: Number of token ids fed to the model on each step.
        seed_text: Text used to start the generation.
        n_words: How many words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not
        included). An id with no entry in ``word_index`` contributes an
        empty string.
    """
    # Build the id -> word lookup once instead of scanning word_index for
    # every generated word. setdefault keeps the first word seen for an id,
    # which is what a first-match linear scan would return.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length, dropping the oldest ids
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict scores for each word; verbose=0 keeps predict() from
        # printing a progress bar for every generated word
        scores = model.predict(encoded, verbose=0)
        # the batch holds a single row, so take its arg-max as a plain int
        yhat = int(np.argmax(scores, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# load cleaned text sequences
in_filename = 'fake_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# every line holds the input words plus one trailing target word
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('model.h5')

# load the tokenizer
# NOTE: pickle can execute arbitrary code on load; only unpickle files you
# created yourself (here: the tokenizer.pkl written by the training cell).
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

for k in range(10):
    print(f'Text {k+1}')
    # select a seed text
    print('Seed')
    # randint includes both endpoints, so the upper bound must be the last
    # valid index; len(lines) itself would raise IndexError
    seed_text = lines[randint(0, len(lines) - 1)]
    print(seed_text + '\n')

    # generate new text
    generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
    print('Generated')
    print(generated)
    print('--------------------------------------------------------------------------------')

Text 1
real government planned agendas italy did not follow who protocol and did an autopsy on a corpse that died from and realised that coronavirus is actually not a virus but a bacterium which gets amplified with electromagnetic radiation that also produces inflammation and hypoxia and multiple other claims coronavirus bioweapon how

Generated
china stole coronavirus from canada and weaponized it via coronaviruschina may god save us from china parentodo tras leer lo siguiente hacemos una consulta a mano alzada hay que reír belgium health minister puts ban on nonessential sexual activities of persons or greater in indoor areas coronavirus detection is possible
--------------------------------------------------------------------------------
Text 2
most protection against viral particles for the wearer but other masks are effective in reducing the reach of because they help stop asymptomatic individuals from unknowingly spreading the disease news coronavirus agrees not to infect anyone 