# Word-Wise Text Generation

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import string

import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

## Dataset

In [2]:
data = open('covid-19_article.txt','r').read().lower()

In [3]:
tokens = data.split()
'remove punctuations form string'
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]
print("There are %d total tokens and %d unique tokens in data" %((len(tokens),len(set(tokens)))))

There are 5310 total tokens and 1618 unique tokens in data


In [4]:
'Making sequences of length 50'
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 5259


### Creating new text file in which each lines are of length 50  

In [5]:
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [6]:
out_filename = 'covid_data.txt'
save_doc(sequences, out_filename)

In [7]:
seq = open('covid_data.txt','r').read()
lines = seq.split('\n')

### Tokenizer function converts each lines of words into numbers with the help of LabelEncoder() 

In [8]:
def Tokenizer(lines):
    tokenizer = LabelEncoder()
    seq_gen = tokenizer.fit(tokens)
    sequences = []
    for i in range(len(lines)):
        temp = lines[i].split(' ')
        sequences.append(seq_gen.transform(temp))
    return sequences,seq_gen

In [9]:
sequences,tokenizer = Tokenizer(lines)

In [10]:
vocab_size = len(tokenizer.classes_)
vocab_size

1618

### Creating dataset for training

In [11]:
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
print(X.shape)
y = np.eye(vocab_size)[y.reshape(-1)]
print(y.shape)
seq_length = X.shape[1]

(5259, 50)
(5259, 1618)


## Loading Glove vectors

In [12]:
embeddings_index = dict()
f = open('glove.6B.50d.txt','r',encoding = 'utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [13]:
embedding_matrix = np.zeros((vocab_size, 50))
token_obj = dict(zip(tokenizer.classes_, tokenizer.transform(tokenizer.classes_)))
for word, i in token_obj.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Model Training

In [14]:
def text_generator(Input_shape):
    
    sentence_indices = Input(Input_shape)
    embedding_layer = Embedding(vocab_size, 50, trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([embedding_matrix])
    embeddings = embedding_layer(sentence_indices)
    
    X = LSTM(400 , return_sequences = True)(embeddings)
    X = LSTM(400 , return_sequences = False)(X)
    X = Dense(128 , activation = 'relu')(X)
    X = Dense(vocab_size , activation = 'softmax')(X)
    
    model = Model(sentence_indices , X)
    return model

In [15]:
model = text_generator((seq_length))

In [16]:
model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam')

In [17]:
'I have only trained till 25 because it takes long time to train the model'
model.fit(X , y , epochs = 50 , batch_size = 32 , shuffle = True )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50

KeyboardInterrupt: 

In [56]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            80900     
_________________________________________________________________
lstm (LSTM)                  (None, 50, 400)           721600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 400)               1281600   
_________________________________________________________________
dense (Dense)                (None, 128)               51328     
_________________________________________________________________
dense_1 (Dense)              (None, 1618)              208722    
Total params: 2,344,150
Trainable params: 2,263,250
Non-trainable params: 80,900
______________________________________________

## Generating text

In [35]:
def generate_text():
    res = X[99]
    string_mapped = res
    string_mapped = list(string_mapped)
    for i in range(500):
        x = np.reshape(res,(1,seq_length))
        y_pred = model.predict(x,verbose=0)
        pred = np.argmax(y_pred)
        string_mapped.append(pred)
        res = list(res)
        res = res[1:]
        res.append(pred)
        res = np.array(res)
    print('\n\nOutput:')
    print('\'',' '.join(tokenizer.inverse_transform(string_mapped)),'\'')

In [40]:
generate_text()



Output:
' most of the vaccines are under design and preparation there are some that have entered efficacy evaluation in animals and initial clinical trials this review mainly focused on the progress and our prospects on field of vaccine development against sarscov2 18 years ago in 2002 the world was astonished by the appearance of severe acute respiratory syndrome sars supported by a zoonotic coronavirus called sarscov from the guangdong province of southern china after about 10 years in 2012 another similar coronavirus triggered the middle east respiratory syndrome merscov in saudi arabia both caused severe pneumonia killing 774 and 858 people with 8700 cases of confirmed infection for the former and 2494 for the latter causing significant economic losses 8 years later despite the mers outbreak remaining in certain parts of the world at the end of 2019 a new zoonotic coronavirus sarscov2 and responsible of coronavirus disease covid19 arose from wuhan hubei province china it spread r