In [None]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.1-cp36-cp36m-manylinux2010_x86_64.whl (22.2 MB)
[K     |████████████████████████████████| 22.2 MB 13.7 MB/s eta 0:00:01
[?25hCollecting scipy>=0.19.1
  Downloading scipy-1.5.4-cp36-cp36m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 29.6 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Collecting joblib>=0.11
  Downloading joblib-1.0.1-py3-none-any.whl (303 kB)
[K     |████████████████████████████████| 303 kB 52.7 MB/s eta 0:00:01
Installing collected packages: scipy, threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.0.1 scikit-learn-0.24.1 scipy-1.5.4 threadpoolctl-2.1.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import io
import unicodedata
import re
import os
from sklearn.model_selection import train_test_split
import numpy as np
import time

In [None]:
# Paths of the two text files read below: the model input and the model target.
input_file, target_file = 'divina_textonly.txt', 'divina_syll_textonly.txt'

In [None]:
# Read both corpora as raw bytes and decode them as UTF-8. The context managers
# close the file handles; the target corpus is read from `target_file`.
with open(input_file, 'rb') as input_handle:
    input_text_raw = input_handle.read().decode(encoding='utf-8')
with open(target_file, 'rb') as target_handle:
    target_text_raw = target_handle.read().decode(encoding='utf-8')
print('Length of input text: {} characters'.format(len(input_text_raw)))
print('Length of target text: {} characters'.format(len(target_text_raw)))

Length of input text: 558637 characters
Length of target text: 873431 characters


In [None]:
# Distinct characters of each raw corpus, in sorted order, and how many there are.
input_vocab, target_vocab = (
    sorted(set(raw_text)) for raw_text in (input_text_raw, target_text_raw)
)
input_vocab_size, target_vocab_size = len(input_vocab), len(target_vocab)

In [None]:
# Report the size of each vocabulary (the target line reports the target size).
print('Input vocab size: {}'.format(input_vocab_size))
print('Target vocab size: {}'.format(target_vocab_size))

Input vocab size: 79
Target vocab size: 79


In [None]:
def preprocess(text):
    """Split ``text`` on newlines and wrap each non-blank line in '^' ... '$'.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. Returns the wrapped lines as a list.
    """
    wrapped_lines = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped != '':
            wrapped_lines.append('^' + stripped + '$')
    return wrapped_lines

# Turn both raw corpora into lists of '^'...'$'-wrapped lines.
input_text_lines, target_text_lines = map(preprocess, (input_text_raw, target_text_raw))

In [None]:
# Character-level tokenizer: no filtering and no lower-casing, so every
# character of the preprocessed lines (including '^' and '$') gets its own id.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True, lower=False)
# The vocabulary is built from the target lines only. With no oov_token set,
# a character that occurs only in the input lines would be dropped on encoding.
tokenizer.fit_on_texts(target_text_lines)

# Encode the preprocessed lines produced by preprocess() as lists of ids.
input_text_lines_enc = tokenizer.texts_to_sequences(input_text_lines)
target_text_lines_enc = tokenizer.texts_to_sequences(target_text_lines)

In [None]:
def pad(x):
    """Return ``x`` padded by Keras ``pad_sequences`` with ``padding='post'``."""
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return pad_sequences(x, padding='post')

In [None]:
# Pad the encoded input and target lines; each collection is padded separately.
input_text, target_text = pad(input_text_lines_enc), pad(target_text_lines_enc)

In [None]:
# Display the padded input ids (one row per preprocessed line).
input_text

<tf.Tensor: shape=(14233, 55), dtype=int32, numpy=
array([[14, 53,  3, ...,  0,  0,  0],
       [14, 17,  5, ...,  0,  0,  0],
       [14, 12, 23, ...,  0,  0,  0],
       ...,
       [14, 17,  4, ...,  0,  0,  0],
       [14, 11, 30, ...,  0,  0,  0],
       [14,  9, 21, ...,  0,  0,  0]], dtype=int32)>

In [None]:
# Display the padded target ids (one row per preprocessed line).
target_text

<tf.Tensor: shape=(14233, 65), dtype=int32, numpy=
array([[14,  1, 53, ...,  0,  0,  0],
       [14,  1, 17, ...,  0,  0,  0],
       [14,  1, 12, ...,  0,  0,  0],
       ...,
       [14,  1, 17, ...,  0,  0,  0],
       [14,  1, 11, ...,  0,  0,  0],
       [14,  1,  9, ...,  0,  0,  0]], dtype=int32)>

In [None]:
# Split input/target rows into aligned train and test sets. A fixed
# random_state makes the split identical on every run of the notebook.
input_train, input_test, target_train, target_test = train_test_split(
    input_text, target_text, random_state=42
)

In [None]:
embedding_dim = 256
units = 1024
# Both inputs carry ids produced by `tokenizer`, whose ids start at 1 with 0
# left for padding. The embedding tables and the output layer are therefore
# sized from the tokenizer instead of from the raw character count.
num_token_ids = len(tokenizer.word_index) + 1

encoder_input = tf.keras.layers.Input(shape=(None,))
encoder_embedded = tf.keras.layers.Embedding(input_dim=num_token_ids, output_dim=embedding_dim)(
    encoder_input
)

# Return states in addition to output; only the two states are used below
_, state_h, state_c = tf.keras.layers.LSTM(units, return_state=True, name="encoder")(
    encoder_embedded
)
encoder_state = [state_h, state_c]

decoder_input = tf.keras.layers.Input(shape=(None,))
decoder_embedded = tf.keras.layers.Embedding(input_dim=num_token_ids, output_dim=embedding_dim)(
    decoder_input
)

# Pass the 2 states to a new LSTM layer, as initial state
# NOTE(review): without return_sequences=True this LSTM emits only its last
# step, so the model yields one prediction per sequence pair - confirm intent.
decoder_output = tf.keras.layers.LSTM(units, name="decoder")(
    decoder_embedded, initial_state=encoder_state
)
output = tf.keras.layers.Dense(num_token_ids)(decoder_output)

# Bound to `model` (the name summarised below) so it does not collide with the
# `decoder` instance that a later cell creates.
model = tf.keras.Model([encoder_input, decoder_input], output)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    20480       input_4[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 256)    20480       input_5[0][0]                    
____________________________________________________________________________________________

In [None]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a GRU that returns its sequence and state.

    Args:
        enc_units: number of GRU units.
        embedding_dim: size of each embedding vector.
        vocab_size: number of rows of the embedding table.
    """

    def __init__(self, enc_units, embedding_dim, vocab_size):
        super().__init__()
        self.enc_units = enc_units
        self.embedding_dim = embedding_dim

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
        )

    def call(self, x, hidden=None):
        """Embed ``x`` and run the GRU, starting from ``hidden`` when given.

        When ``hidden`` is None the GRU's own initial state is used.
        Returns the GRU output sequence and the last entry of the GRU state
        along its first axis.
        """
        embedded = self.embedding(x)
        initial_state = self.gru.get_initial_state(embedded) if hidden is None else hidden
        sequence_output, final_state = self.gru(embedded, initial_state=initial_state)

        # NOTE(review): [-1] keeps a single entry of the state's first axis -
        # presumably relies on a batch of one sequence; confirm with callers.
        return sequence_output, final_state[-1]

In [None]:
class Decoder(tf.keras.Model):
    """Embedding + GRU + dense projection used to decode one step at a time.

    Args:
        dec_units: number of GRU units.
        embedding_dim: size of each embedding vector.
        vocab_size: rows of the embedding table and width of the output logits.
    """

    def __init__(self, dec_units, embedding_dim, vocab_size):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        """Decode the ids in ``x`` starting from the state ``hidden``.

        Args:
            x: token ids fed to the embedding layer.
            hidden: GRU initial state, either a single state vector of shape
                (dec_units,) or a batch of states of shape (batch, dec_units).

        Returns:
            A pair (logits, state): the dense projection of the GRU output and
            the GRU state, which can be passed back in as ``hidden``.
        """
        x = self.embedding(x)

        # The encoder hands over a rank-1 state while the state returned by
        # this GRU is already (batch, units). Add the batch axis only when it
        # is missing, so the decoder can be called step after step.
        if hidden.shape.rank == 1:
            hidden = tf.expand_dims(hidden, 0)

        x, state = self.gru(x, initial_state=hidden)

        # The GRU output is already (batch, dec_units), so it goes straight
        # into the projection. The former reshape to (1, 2024) could never
        # match a 1024-unit output and raised a ValueError.
        x = self.fc(x)

        return x, state

In [None]:
# Tokenizer ids run from 1 to len(word_index) and 0 is the padding value, so
# the embedding tables and the output layer need len(word_index) + 1 entries.
num_token_ids = len(tokenizer.word_index) + 1
encoder = Encoder(1024, 256, num_token_ids)
decoder = Decoder(1024, 256, num_token_ids)

In [None]:
# Optimizer that train_step applies gradients with (Adam, default settings).
optimizer = tf.keras.optimizers.Adam()

# The decoder emits raw logits; reduction='none' keeps one loss value per
# example so the caller decides how to reduce them.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced training step and return its averaged loss.

    Args:
        inp: encoder input ids, passed to ``encoder`` unchanged.
        targ: target ids indexed by time step first, i.e. ``targ[t]`` holds
            the ids expected at step ``t``.
        enc_hidden: initial encoder state, or None.

    Returns:
        Scalar tensor: the summed per-step losses divided by the number of
        target steps.
    """
    loss = 0

    with tf.GradientTape() as tape:
        _, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        # Decoding starts from the '^' start-of-line marker
        dec_input = tf.expand_dims([tokenizer.word_index['^']], 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[0]):
            predictions, dec_hidden = decoder(dec_input, dec_hidden)

            # Per-example loss for this step; positions whose target is the
            # padding id 0 are masked out so padding does not drive training.
            step_loss = loss_function(targ[t], predictions)
            not_padding = tf.cast(tf.not_equal(targ[t], 0), step_loss.dtype)
            # Reduce to a scalar so the accumulated loss is a single number
            loss += tf.reduce_mean(step_loss * not_padding)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[t], 1)

    batch_loss = (loss / int(targ.shape[0]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
EPOCHS = 20

# Checkpoint of the optimizer and both networks, written under checkpoint_dir
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

# Every training row is fed as its own step
steps_per_epoch = len(input_train)

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = None
    total_loss = 0.0

    for batch, (inp, targ) in enumerate(zip(input_train, target_train)):
        # Encoder layers expect (batch, time): one sequence holding all ids of the row
        inp = tf.expand_dims(inp, 0)
        # train_step indexes the target by time step, so time stays on axis 0
        targ = tf.expand_dims(targ, 1)
        # Collapse to a Python float so it can be summed and formatted
        batch_loss = float(tf.reduce_mean(train_step(inp, targ, enc_hidden)))
        total_loss += batch_loss

        if batch % 100 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss:.4f}')
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

ValueError: in user code:

    <ipython-input-300-12bcf96db4c3>:18 train_step  *
        predictions, dec_hidden = decoder(dec_input, dec_hidden)
    <ipython-input-306-a5297abf5375>:16 call  *
        x = tf.reshape(x, (1, 2024))
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py:195 reshape
        result = gen_array_ops.reshape(tensor, shape, name)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_array_ops.py:8378 reshape
        "Reshape", tensor=tensor, shape=shape, name=name)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:750 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py:592 _create_op_internal
        compute_device)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:3536 _create_op_internal
        op_def=op_def)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:2016 __init__
        control_input_ops, op_def)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1856 _create_c_op
        raise ValueError(str(e))

    ValueError: Cannot reshape a tensor with 1024 elements to shape [1,2024] (2024 elements) for '{{node decoder_38/Reshape}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32](decoder_38/gru_69/strided_slice_2, decoder_38/Reshape/shape)' with input shapes: [1,1024], [2] and with input tensors computed as partial shapes: input[1] = [1,2024].


In [None]:
# First row of the target ids with the leading column dropped.
target_text[:, 1:][0]

(64,)