# Text corpus to generative text

Imports

In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np

from keras.utils import plot_model

Using TensorFlow backend.


## Add data

Tokenize

In [2]:
# Shared module-level tokenizer: dataset_preparation fits it on the corpus and
# generate_text reuses it to encode the seed text and look up predicted words.
tokenizer = Tokenizer()

Variables

In [3]:
# Number of epochs passed to model.fit in create_model.
epochSize = 6
# Embedding output dimension and unit count of both LSTM layers in create_model.
hidden_size = 200

In [4]:
def dataset_preparation(data, delimiter="^"):
    """Turn a raw text corpus into padded n-gram training arrays.

    The corpus is lower-cased and split on ``delimiter``, and the module-level
    ``tokenizer`` is fitted on the resulting segments. Each segment is expanded
    into all of its token prefixes of length two or more, the prefixes are
    left-padded to a common length, and the last token of every padded row is
    split off as a one-hot label.

    Args:
        data: The whole corpus as a single string.
        delimiter: Separator between corpus segments. Defaults to ``"^"``,
            which is the separator this notebook's corpus uses.

    Returns:
        A tuple ``(predictors, label, max_sequence_len, total_words)`` where
        ``predictors`` holds every padded row without its last token,
        ``label`` is the one-hot encoding of those last tokens,
        ``max_sequence_len`` is the padded row length (label included) and
        ``total_words`` is the tokenizer vocabulary size plus one.

    Raises:
        ValueError: If no segment yields at least two tokens, so there is
            nothing to train on.
    """
    # Segments are separated by the delimiter character, not by newlines.
    corpus = data.lower().split(delimiter)

    tokenizer.fit_on_texts(corpus)
    # word_index starts at 1; the extra slot leaves room for index 0, which
    # pad_sequences uses as its padding value.
    total_words = len(tokenizer.word_index) + 1

    # For tokens [a, b, c] this produces [a, b] and [a, b, c].
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )

    if not input_sequences:
        raise ValueError(
            "corpus contains no segment with at least two tokens; "
            "nothing to build training sequences from"
        )

    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    input_sequences = np.asarray(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # Last column is the word to predict; everything before it is the context.
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

## Training

In [5]:
def create_model(predictors, label, max_sequence_len, total_words):
    """Build, compile and fit the next-word prediction network.

    The stack is an embedding layer, two LSTM layers and a softmax dense layer
    over the vocabulary. Layer widths come from the module-level
    ``hidden_size`` and the epoch count from the module-level ``epochSize``.

    Args:
        predictors: Training inputs passed to ``fit``.
        label: Training targets passed to ``fit``.
        max_sequence_len: Padded sequence length; the embedding layer is
            declared with ``input_length`` one less than this.
        total_words: Embedding input dimension and width of the output layer.

    Returns:
        The fitted ``Sequential`` model.
    """
    context_len = max_sequence_len - 1

    # Embedding -> LSTM (returns full sequence) -> LSTM -> softmax over vocabulary.
    layer_stack = [
        Embedding(total_words, hidden_size, input_length=context_len, name='embeddingINPUT'),
        LSTM(hidden_size, use_bias=False, dropout=0.1, return_sequences=True, name='hiddenLSTM1'),
        LSTM(hidden_size, use_bias=False, name='hiddenLSTM2'),
        Dense(total_words, activation='softmax', name='denseOUTPUT'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    # Stop when the training loss has not improved for five consecutive epochs.
    stop_on_plateau = EarlyStopping(
        monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto'
    )

    network.fit(
        predictors,
        label,
        epochs=epochSize,
        verbose=1,
        callbacks=[stop_on_plateau],
    )
    return network

In [6]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Uses the module-level ``tokenizer`` to encode the running text and the
    module-level ``model`` to predict the next word; the most probable word is
    appended and the process repeats.

    Args:
        seed_text: Text to start from. It is kept verbatim at the start of
            the result.
        next_words: Number of words to append.
        max_sequence_len: Padded sequence length used during training; the
            model input is padded to one less than this.

    Returns:
        ``seed_text`` followed by ``next_words`` space-separated predictions.
        A prediction with no vocabulary entry contributes an empty word.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    context_len = max_sequence_len - 1

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=context_len, padding='pre')

        # argmax over the softmax output is what Sequential.predict_classes
        # computed; predict_classes is deprecated and absent from newer
        # TensorFlow releases, so call predict directly.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

In [7]:
# Read the whole corpus into memory; the context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage
# collector.
# NOTE(review): no encoding is given, so the platform default applies —
# confirm it matches how data.txt was saved.
with open('data.txt') as corpus_file:
    data = corpus_file.read()

In [8]:
# Build the training arrays; max_sequence_len and total_words are reused below
# to size the model and to pad the input during generation.
predictors, label, max_sequence_len, total_words = dataset_preparation(data)

In [None]:
# Build and train the network. generate_text reads this module-level `model`,
# so this cell must run before any generation cell.
model = create_model(predictors, label, max_sequence_len, total_words)
#plot_model(model, to_file='model.png')

Epoch 1/6


In [None]:
# Generate 30 words continuing the seed "The".
output = generate_text("The", 30, max_sequence_len)
print(output)

In [9]:
# plot_model(model, to_file='model.png')

In [10]:
# Generate 10 words continuing the seed "So".
output = generate_text("So", 10, max_sequence_len)
print(output)

So alignment highly demonstrate that the potential to improve outcomes enhance


In [11]:
# Generate 100 words continuing the seed "AI".
output = generate_text("AI", 100, max_sequence_len)
print(output)

AI and social good ai offers great potential for promoting the public good for example in the realms of education housing public health and sustainability we see great value in collaborating with public and private organizations including academia scientific societies ngos social entrepreneurs and interested private citizens to promote discussions and catalyze efforts to address society’s most pressing challenges some of these projects may address deep societal challenges and will be moonshots – ambitious big bets that could have far reaching impacts others may be creative ideas that could quickly produce positive results by harnessing ai advances are widely shared and


In [12]:
# Save the trained model to the file model.h5.
model.save("model.h5")

In [13]:
# Show the padded sequence length returned by dataset_preparation.
print(max_sequence_len)

496


In [20]:
# Generate 100 words continuing the seed "Goals".
output = generate_text("Goals", 100, max_sequence_len)
print(output)

Goals and developing is their pricing by humans if we are providing general purpose tools integrating tools for customers or developing custom solutions for this are the benefits and potential risks of their products and the actions they have taken to deliver benefits and avoid minimise or mitigate the risks they must ensure that processes are in place to address the concerns and complaints of users and other parties and that these are transparent we believe that effective communication when coupled with a principled approach to ethical considerations is a competitive advantage and will lead to progress even when hard moral
