In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import time

## Dataset

In [2]:
# The original .csv dataset was converted to a plain .txt file beforehand.
# Read it inside a context manager so the file handle is closed as soon as
# the text is in memory, then lower-case everything to keep the character
# vocabulary small.
with open('covid-19_article.txt', 'r') as article_file:
    data = article_file.read().lower()
print(data[:400])

in december 2019, the outbreak of pneumonia caused by a novel coronavirus, severe acute 
respiratory syndrome coronavirus 2 (sars-cov-2), has led to a serious pandemic in china and other 
countries worldwide. so far, more than 460,000 confirmed cases were diagnosed in nearly 190 
countries, causing globally over 20,000 deaths. currently, the epidemic is still spreading and there is 
no effective m


In [3]:
# Character-level vocabulary: every distinct character occurring in the corpus.
unique_chars = set(data)
chars = list(unique_chars)
data_len, char_len = len(data), len(chars)  # char_len is the vocab size
print("There are %d total characters and %d unique characters in data." % (data_len, char_len))

There are 36416 total characters and 56 unique characters in data.


In [4]:
# Put the vocabulary in sorted order so the character ids derived from it
# do not depend on set iteration order.
chars = list(chars)
chars.sort()

In [5]:
# Two-way lookup between characters and integer ids, plus the whole corpus
# encoded as an array of ids.
index_to_char = np.array(chars)
char_to_index = dict(zip(chars, range(len(chars))))
int_text = np.array([char_to_index[ch] for ch in data])

In [6]:
seq_length = 150  # number of characters in each training input
# Each chunk holds seq_length + 1 ids so it can later be split into an input
# and a target shifted by one character; a trailing partial chunk is dropped.
chunk_size = seq_length + 1
char_dataset = tf.data.Dataset.from_tensor_slices(int_text)
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

## Creating training dataset

In [7]:
def create_input_target_pair(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every (seq_length + 1)-id chunk into an (input, target) pair.
dataset = sequences.map(create_input_target_pair)

In [8]:
# Decode the first (input, target) pair back into text to confirm that the
# target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    input_chars = index_to_char[input_example.numpy()]
    target_chars = index_to_char[target_example.numpy()]
    print('Input data: ', repr(''.join(input_chars)))
    print('Target data:', repr(''.join(target_chars)))

Input data:  'in december 2019, the outbreak of pneumonia caused by a novel coronavirus, severe acute \nrespiratory syndrome coronavirus 2 (sars-cov-2), has led to a'
Target data: 'n december 2019, the outbreak of pneumonia caused by a novel coronavirus, severe acute \nrespiratory syndrome coronavirus 2 (sars-cov-2), has led to a '


In [9]:
Batch_size = 64
buffer_size = 10000  # size of the shuffle buffer

# Build the training pipeline from `sequences` instead of re-wrapping
# `dataset`: the cell is then idempotent, so running it a second time does
# not shuffle and batch an already-batched dataset.
dataset = (
    sequences
    .map(create_input_target_pair)
    .shuffle(buffer_size)
    .batch(Batch_size, drop_remainder=True)
)
dataset

<BatchDataset shapes: ((64, 150), (64, 150)), types: (tf.int32, tf.int32)>

In [10]:
# Model hyperparameters: embedding vector size and number of LSTM units.
embedding_dim, rnn_units = 256, 1024

## Text generator model

In [11]:
def text_model(char_len, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        char_len: Vocabulary size; used as the embedding input dimension and
            as the number of output units of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of LSTM units.
        batch_size: Fixed batch size baked into the input shape (the LSTM is
            stateful, so the batch dimension is declared up front).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose Dense layer has no
        activation argument, i.e. it outputs one raw score per character.
    """
    layers = tf.keras.layers
    embedding = layers.Embedding(char_len, embedding_dim, batch_input_shape=[batch_size, None])
    recurrent = layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = layers.Dense(char_len)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [12]:
# Training model: built with the training batch size.
model = text_model(
    char_len=char_len,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=Batch_size,
)

In [13]:
# Only testing the shape: push one batch through the untrained model and
# check that it yields one score per vocabulary entry for every position.
for input_example_batch, target_example_batch in dataset.take(1):
    example_prediction = model(input_example_batch)
    expected_shape = (Batch_size, seq_length, char_len)
    assert example_prediction.shape == expected_shape, "Shape error"

In [14]:
# Draw one character id per timestep from the scores of the first sequence
# in the batch, then drop the trailing sample axis.
sampled = tf.random.categorical(example_prediction[0], num_samples=1)
sampled_indices = tf.squeeze(sampled, axis=-1).numpy()

## Model Training

In [15]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalised scores
    rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [16]:
# Loss of the untrained model on the example batch, as a baseline.
example_loss = loss(target_example_batch, example_prediction)
print("Prediction shape: ", example_prediction.shape)
mean_example_loss = example_loss.numpy().mean()
print("Loss:      ", mean_example_loss)

Prediction shape:  (64, 150, 56)
Loss:       4.0254836


In [17]:
# Train with the Adam optimizer against the logits-based loss defined above.
model.compile(loss=loss, optimizer='adam')

In [18]:
# Checkpoint configuration: weights only, with the epoch number substituted
# into the file name through the {epoch} placeholder.
lstm_dir_checkpoints = './training_checkpoints_LSTM'
checkpoint_prefix = os.path.join(lstm_dir_checkpoints, "checkpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [19]:
epochs = 110  # number of training epochs passed to model.fit

In [20]:
# Train the model; the checkpoint callback writes weights during training.
history = model.fit(
    dataset,
    callbacks=[checkpoint_callback],
    epochs=epochs,
)

Epoch 1/110
Epoch 2/110
Epoch 3/110
Epoch 4/110
Epoch 5/110
Epoch 6/110
Epoch 7/110
Epoch 8/110
Epoch 9/110
Epoch 10/110
Epoch 11/110
Epoch 12/110
Epoch 13/110
Epoch 14/110
Epoch 15/110
Epoch 16/110
Epoch 17/110
Epoch 18/110
Epoch 19/110
Epoch 20/110
Epoch 21/110
Epoch 22/110
Epoch 23/110
Epoch 24/110
Epoch 25/110
Epoch 26/110
Epoch 27/110
Epoch 28/110
Epoch 29/110
Epoch 30/110
Epoch 31/110
Epoch 32/110
Epoch 33/110
Epoch 34/110
Epoch 35/110
Epoch 36/110
Epoch 37/110
Epoch 38/110
Epoch 39/110
Epoch 40/110
Epoch 41/110
Epoch 42/110
Epoch 43/110
Epoch 44/110
Epoch 45/110
Epoch 46/110
Epoch 47/110
Epoch 48/110
Epoch 49/110
Epoch 50/110
Epoch 51/110
Epoch 52/110
Epoch 53/110
Epoch 54/110
Epoch 55/110
Epoch 56/110
Epoch 57/110
Epoch 58/110
Epoch 59/110
Epoch 60/110
Epoch 61/110
Epoch 62/110
Epoch 63/110
Epoch 64/110
Epoch 65/110
Epoch 66/110
Epoch 67/110
Epoch 68/110
Epoch 69/110
Epoch 70/110
Epoch 71/110
Epoch 72/110
Epoch 73/110
Epoch 74/110
Epoch 75/110
Epoch 76/110
Epoch 77/110
Epoch 78

In [21]:
# Show the path of the most recent checkpoint in the checkpoint directory.
tf.train.latest_checkpoint(lstm_dir_checkpoints)

'./training_checkpoints_LSTM\\checkpt_110'

In [22]:
# Rebuild the same architecture with batch size 1 for generating a single
# sequence, and restore the most recently saved training weights into it.
latest_checkpoint = tf.train.latest_checkpoint(lstm_dir_checkpoints)
model = text_model(char_len, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            14336     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 56)             57400     
Total params: 5,318,712
Trainable params: 5,318,712
Non-trainable params: 0
_________________________________________________________________


## Generating text

In [23]:
def generate_text(model, start_string, num_generate=1000, temperature=0.5):
    """Generate text one character at a time, seeded with ``start_string``.

    Args:
        model: Stateful character model built for batch size 1. It is called
            on a ``(1, n)`` batch of character ids and must return scores of
            shape ``(1, n, vocab_size)``.
        start_string: Seed text. It is lower-cased before being encoded,
            because the vocabulary is built from lower-cased training text.
        num_generate: Number of characters to sample (default 1000).
        temperature: Divisor applied to the scores before sampling. Lower
            values make the output more predictable, higher values more
            surprising (default 0.5).

    Returns:
        ``start_string`` exactly as given, followed by the generated text.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If the seed contains a character that is not present in
            ``char_to_index``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # The vocabulary only contains lower-case text, so encode a lower-cased
    # copy of the seed; the caller's original string is still what is returned.
    seed_text = start_string.lower()
    unknown = sorted(set(seed_text) - set(char_to_index))
    if unknown:
        raise KeyError(
            "start_string contains characters not in the vocabulary: %r" % (unknown,)
        )

    # Encode the seed as a batch of one sequence of character ids.
    input_eval = [char_to_index[s] for s in seed_text]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # The model is stateful: clear any state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # Drop the batch dimension -> (timesteps, vocab_size).
        predictions = tf.squeeze(predictions, 0)

        # Scale the scores by the temperature, then sample and keep the
        # sample drawn for the last timestep.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the new character back in; the recurrent state carries
        # the context of everything seen so far.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(index_to_char[predicted_id])

    return start_string + ''.join(text_generated)

In [27]:
# Ask for a seed string, then print a blank separator line followed by the
# text generated from that seed.
test = input("Enter your starting string: ")
print()
generated_text = generate_text(model, start_string=test)
print(generated_text)

Enter your starting string: severe acute respiratory syndrome-coronavirus-2 outbreak has rapidly reached pandemic proportions and has become a major threat to global health

severe acute respiratory syndrome-coronavirus-2 outbreak has rapidly reached pandemic proportions and has become a major threat to global health. the 
peastices of the ever coronavirus (sars-cov) and migdlies and cintrean parameters. healthcare profession and ment mananement vaccine are 
proved with and martorment, and 55% co from tha specific to be of the covid-19 epidemic. this is an infection cases. melicully human to have of eninome care as in the transmission this network has and engeriance treatment options war computien werious and infectinule treamment of patients wite sederialogical and nampreas. as for offective wat 13%, the world ant hovical encection and 
sectormations is the stodnts in covid-19 patients, with 
covid-19 coult im/hevent and proved to infection prevention and control of the disease caname