<a href="https://colab.research.google.com/github/ashikshafi08/Learning_Tensorflow/blob/main/Experiments/De_shuffling_text_using_tfa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# De-Scrambling the text with Sequence-to-Sequence with Attention Mechanism. 



In [1]:
# Downloading tensorflow addons 
!pip install tensorflow-addons

Collecting tensorflow-addons
[?25l  Downloading https://files.pythonhosted.org/packages/66/4b/e893d194e626c24b3df2253066aa418f46a432fdb68250cde14bf9bb0700/tensorflow_addons-0.13.0-cp37-cp37m-manylinux2010_x86_64.whl (679kB)
[K     |████████████████████████████████| 686kB 4.0MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.13.0


In [2]:
!pip install aicrowd-cli
API_KEY = 'b0fd3331ed02024ed40b448baf316d82' 
!aicrowd login --api-key $API_KEY

# Downloading the Dataset
!rm -rf data
!mkdir data
!aicrowd dataset download --challenge de-shuffling-text -j 3 -o data



Collecting aicrowd-cli
[?25l  Downloading https://files.pythonhosted.org/packages/1f/57/59b5a00c6e90c9cc028b3da9dff90e242ad2847e735b1a0e81a21c616e27/aicrowd_cli-0.1.7-py3-none-any.whl (49kB)
[K     |████████████████████████████████| 51kB 1.9MB/s 
[?25hCollecting tqdm<5,>=4.56.0
[?25l  Downloading https://files.pythonhosted.org/packages/b4/20/9f1e974bb4761128fc0d0a32813eaa92827309b1756c4b892d28adfb4415/tqdm-4.61.1-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 3.2MB/s 
Collecting requests-toolbelt<1,>=0.9.1
[?25l  Downloading https://files.pythonhosted.org/packages/60/ef/7681134338fc097acef8d9b2f8abe0458e4d87559c689a8c306d0957ece5/requests_toolbelt-0.9.1-py2.py3-none-any.whl (54kB)
[K     |████████████████████████████████| 61kB 6.4MB/s 
Collecting gitpython<4,>=3.1.12
[?25l  Downloading https://files.pythonhosted.org/packages/27/da/6f6224fdfc47dab57881fe20c0d1bc3122be290198ba0bf26a953a045d92/GitPython-3.1.17-py3-none-any.whl (166kB)
[K     |█████████

In [3]:
# Importing all the packages we need 
import tensorflow as tf 
import tensorflow_addons as tfa
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Importing the data 

train_data = pd.read_csv('data/train.csv')
val_data = pd.read_csv('data/val.csv')
test_data = pd.read_csv('data/test.csv')

# Printing out all shapes of our data 
print(f'Shape of the train data: {train_data.shape}')
print(f'Shape of the validation data: {val_data.shape}')
print(f'Shape of the test data: {test_data.shape}')

Shape of the train data: (40001, 2)
Shape of the validation data: (4001, 2)
Shape of the test data: (10000, 3)


In [5]:
# How does our train data looks like? 
train_data.head()

Unnamed: 0,text,label
0,"presented here Furthermore, naive improved. im...","Furthermore, the naive implementation presente..."
1,vector a in a form vector multidimensional spa...,Those coefficients form a vector in a multidim...
2,compatible of The model with recent is model s...,The model is compatible with a recent model of...
3,but relevance outlined. hemodynamics its based...,"The model is based on electrophysiology, but i..."
4,of transitions lever-like involve reorientatio...,Conformational transitions in macromolecular c...


In [6]:
# Shuffling our train data 
train_data_shuffled = train_data.sample(frac = 1 , random_state = 42)
train_data_shuffled.head() , train_data_shuffled.shape

(                                                    text                                              label
 32824  on work, supervised label image the segmentati...  In our work, we focus on the weakly supervised...
 16298  we small of a for set work, In this features i...  In this work, we propose a small set of featur...
 30180  ($G_h^{Der}$ to factors the contributes $\tau_...  The increment of both factors ($G_h^{Der}$ and...
 6689   new precise particular, for entailment. bounds...  In particular, we provide new precise analytic...
 26893  a these causation Incorporating features, defi...  Incorporating these three features, a definiti...,
 (40001, 2))

In [7]:
# Splitting sentences and labels
train_sentences = train_data_shuffled['text'].to_numpy()
train_labels = train_data_shuffled['label'].to_numpy()

val_sentences = val_data['text'].to_numpy()
val_labels = val_data['label'].to_numpy()

test_sentences = test_data['text'].to_numpy()
test_labels = test_data['label'].to_numpy()


# Checking the shapes 
print(f'Shape of the train sentences: {train_sentences.shape}')
print(f'Shape of the validation sentences: {val_sentences.shape}')
print(f'Shape of the train labels: {train_labels.shape}')
print(f'Shape of the validation labels: {val_labels.shape}')

Shape of the train sentences: (40001,)
Shape of the validation sentences: (4001,)
Shape of the train labels: (40001,)
Shape of the validation labels: (4001,)


In [26]:
# Creating a tf.data.dataset of our sentences and labels 

train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences , train_labels)).shuffle(1000)
val_dataset = tf.data.Dataset.from_tensor_slices((val_sentences , val_labels))

# Adding a batch 
train_dataset = train_dataset.batch(64).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

train_dataset , val_dataset

(<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.string)>,
 <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.string)>)

In [9]:
# Looking into our train_dataset just a batch (only 5 first texts in a batch)
for scrambled_text , unscrambled_text in train_dataset.take(1):
  print(f'Below is the Scrambled version:\n {scrambled_text[:5]}')
  print('\n----------\n')
  print(f'Below is the Un-Scrambled version:\n {unscrambled_text[:5]}')

Below is the Scrambled version:
 [b'introns. depends that of assume the length the exonization process We on'
 b'types combines proposed The approach of information. two different'
 b'(SPIC). algorithm, this We of SPI SPI call Continuous the variation'
 b'data), could and personalized. be estimation/tracking which human such as highly pose small behavior'
 b'problem. enhancement computer resolution classical is image vision Low a']

----------

Below is the Un-Scrambled version:
 [b'We assume that the exonization process depends on the length of introns.'
 b'The proposed approach combines two different types of information.'
 b'We call this variation of the SPI algorithm, SPI Continuous (SPIC).'
 b'small data), such as human pose and behavior estimation/tracking which could be highly personalized.'
 b'Low resolution image enhancement is a classical computer vision problem.']


In [10]:
train_dataset

<PrefetchDataset shapes: ((None,), (None,)), types: (tf.string, tf.string)>

In [11]:
# Getting the example input batch annd target batch 

example_input_batch , example_target_batch = next(iter(train_dataset))

example_input_batch.shape , example_target_batch.shape

(TensorShape([64]), TensorShape([64]))

In [12]:
# Creating text vectorization layer for the scrambled words 
max_vocab_length = 10000

input_text_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = 'lower_and_strip_punctuation' , 
    ngrams = 2 , 
    max_tokens = max_vocab_length 
)

# Fitting on our train sentences (scrambled words )
input_text_vectorizer.adapt(train_sentences)

In [13]:
# First 10 words from the vocabulary 
input_text_vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'of', 'a', 'in', 'to', 'is', 'and', 'we']

In [14]:
# Creating a text vectorization layer for the unscrambled words 
output_text_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = 'lower_and_strip_punctuation' , 
    ngrams = 2, 
    max_tokens = max_vocab_length
)

# Fitting on our train labels (unscrambled words)
output_text_vectorizer.adapt(train_labels)

In [15]:
# First 10 words from the vocab 
output_text_vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'of', 'a', 'in', 'to', 'is', 'and', 'we']

In [16]:
# Passing a scrambled text (strings) into our layer 
scrambled_tokens = input_text_vectorizer(scrambled_text)
scrambled_tokens[:3]

<tf.Tensor: shape=(3, 29), dtype=int64, numpy=
array([[   1, 1036,   14,    3, 1605,    2, 1822,    2,    1,  117,    9,
          13,    1,    1, 1583,    1,    1,    1,    1,    1,    1, 7771,
         745,    0,    0,    0,    0,    0,    0],
       [ 468, 1517,   23,    2,   31,    3,   66,   42,   56,    1,    1,
         406,  857, 1451, 2884,    1, 6242,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0],
       [   1,   39,   11,    9,    3,    1,    1,  980, 1066,    2,  761,
           1, 2837,  378,  297,    1,    1,    1,    1,    1,    1,    0,
           0,    0,    0,    0,    0,    0,    0]])>

In [17]:
# Creating a numpy array of the vocabulary
input_vocab = np.array(input_text_vectorizer.get_vocabulary())
output_vocab = np.array(output_text_vectorizer.get_vocabulary())

In [18]:
# Indexing our scrambled tokens into the array of vocbulary
tokens = input_vocab[scrambled_tokens.numpy()]
print(f'Actual sequence:\n\n {scrambled_text[:3]}\n')
print(f'\nThe sequence in tokens:\n\n {tokens[:3]}')

Actual sequence:

 [b'introns. depends that of assume the length the exonization process We on'
 b'types combines proposed The approach of information. two different'
 b'(SPIC). algorithm, this We of SPI SPI call Continuous the variation']


The sequence in tokens:

 [['[UNK]' 'depends' 'that' 'of' 'assume' 'the' 'length' 'the' '[UNK]'
  'process' 'we' 'on' '[UNK]' '[UNK]' 'that of' '[UNK]' '[UNK]' '[UNK]'
  '[UNK]' '[UNK]' '[UNK]' 'process we' 'we on' '' '' '' '' '' '']
 ['types' 'combines' 'proposed' 'the' 'approach' 'of' 'information' 'two'
  'different' '[UNK]' '[UNK]' 'proposed the' 'the approach' 'approach of'
  'of information' '[UNK]' 'two different' '' '' '' '' '' '' '' '' '' ''
  '' '']
 ['[UNK]' 'algorithm' 'this' 'we' 'of' '[UNK]' '[UNK]' 'call'
  'continuous' 'the' 'variation' '[UNK]' 'algorithm this' 'this we'
  'we of' '[UNK]' '[UNK]' '[UNK]' '[UNK]' '[UNK]' '[UNK]' '' '' '' '' ''
  '' '' '']]


In [27]:
# Defining some important parameters 
inp_vocab_size = len(input_vocab)
lab_vocab_size = len(output_vocab) # this will be our label 
embedding_dim = 256
units = 1024
max_length = 15
BATCH_SIZE = 64

#### Encoder 

In [20]:
class Encoder(tf.keras.Model):
  def __init__(self , vocab_size , embedding_dim , enc_units , batch_size):
    super(Encoder, self).__init__()
    self.batch_size = batch_size # batch size
    self.enc_units = enc_units # Encoder units / units
    self.embedding = tf.keras.layers.Embedding(vocab_size , embedding_dim) # the embedding layer

    ## LSTM layer in our Encoder 
    self.lstm_layer = tf.keras.layers.LSTM(self.enc_units , 
                                           return_sequences = True , 
                                           return_state = True , 
                                           recurrent_initializer = 'glorot_uniform')
    
  def call(self , x , hidden):
    x = self.embedding(x) # 
    output , h , c = self.lstm_layer(x , initial_state = hidden)
    return output , h , c

  def initialize_hidden_state(self): 
    return [tf.zeros((self.batch_size , self.enc_units)) , tf.zeros((self.batch_size , self.enc_units))]




- hidden_state --> output of the lstm layer

[Difference between return state and return sequence](https://machinelearningmastery.com/return-sequences-and-return-states-for-lstms-in-keras/#:~:text=The%20output%20of%20an%20LSTM,the%20cell%20state%2C%20or%20c.&text=The%20LSTM%20hidden%20state%20output,last%20time%20step%20(again).)

In [21]:
dum = tf.zeros((64 , 1024))


In [22]:
# Test the Encoder layer we built 
encoder = Encoder(vocab_size = inp_vocab_size ,
                  embedding_dim = embedding_dim , 
                  enc_units = units , 
                  batch_size = BATCH_SIZE)

In [23]:
# This will initialize the hidden state of our lstm layer 
encoder.initialize_hidden_state()

[<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>]

In [24]:
sample_hidden = encoder.initialize_hidden_state()  # Sample input

# Apply the text vectorizer and turn the sequence into tokens before passing into Encoder
sample_output , sample_h , sample_c = encoder(input_text_vectorizer(example_input_batch) , sample_hidden)

print(f'Encoder output shape: (batch size , sequence length , units) --> {sample_output.shape}')
print(f'Encoder h vector shape: (batch_size , units) --> {sample_h.shape}')
print(f'Encoder c vector shape: (batch_size , units) --> {sample_c.shape}')

Encoder output shape: (batch size , sequence length , units) --> (64, 29, 1024)
Encoder h vector shape: (batch_size , units) --> (64, 1024)
Encoder c vector shape: (batch_size , units) --> (64, 1024)


#### Decoder 

Some threads I used to refer: 
- https://stackoverflow.com/questions/48187283/whats-the-difference-between-lstm-and-lstmcell



In [47]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size , embedding_dim , dec_units , batch_size , attention_type = 'luong'):
    super(Decoder , self).__init__()
    self.batch_size = batch_size 
    self.dec_units = dec_units 
    self.attention_type = attention_type 

    # Embedding layer 
    self.embedding = tf.keras.layers.Embedding(vocab_size , embedding_dim)

    # Final Dense layer where softmax will be applied (fully connected layer)
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Define the fundamental cell for decoder recurrent structure
    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)

    # Sampler 
    self.sampler = tfa.seq2seq.sampler.TrainingSampler() 

    # Create attention mechanism with the memory = None
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units , 
                                                              None , 
                                                              self.batch_size*[max_length], self.attention_type)
    
    # Define the decoder with respect to fundamental rnn cell 
    self.rnn_cell = self.build_rnn_cell(batch_size)

    # Define the decoder with respect to fundamental rnn cell 
    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell , sampler = self.sampler, 
                                            output_layer = self.fc)
    
 
  def build_attention_mechanism(self , dec_units , memory , memory_sequence_length , attention_type = 'luong'):
    '''
    1. attention_type --> Which sort of attention (Bahdanau , Luong)
    2. dec_units: Final dimension of attention outputs (Decoder units)
    3. memory: Encoder hidden states of shape (batch_size , max_length_inputs , enc_units)
    4. memory_sequence_length: 1D array of shape (batch_size) with every element set to max_length_input (for masking purpose)
    '''

    if (attention_type == 'bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units = dec_units , 
                                           memory = memory , 
                                           memory_sequence_length = memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units = dec_units , memory = memory , memory_sequence_length= memory_sequence_length)  

  def build_rnn_cell(self , batch_size):
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell , 
                                            self.attention_mechanism, 
                                            attention_layer_size = self.dec_units)
    return rnn_cell

  def build_initial_state(self , batch_size , encoder_state , Dtype):
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size , dtype = Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state = encoder_state)
    return decoder_initial_state

  def call(self , inputs , initial_state):
    x = self.embedding(inputs)
    outputs, _ , _ = self.decoder(x , initial_state = initial_state , sequence_length = self.batch_size*[max_length -1])
    return outputs

In [48]:
# Setting up our decoder 
decoder = Decoder(vocab_size= lab_vocab_size , 
                  embedding_dim = embedding_dim , 
                  dec_units = units , 
                  batch_size = BATCH_SIZE , 
                  attention_type = 'luong')


In [49]:
# Sample sequence to be passed inside the decoder
sample_x = tf.random.uniform((BATCH_SIZE , max_length))
sample_x[:1]

<tf.Tensor: shape=(1, 15), dtype=float32, numpy=
array([[0.5982404 , 0.5174072 , 0.7417691 , 0.98033535, 0.20018566,
        0.58651006, 0.58131933, 0.4040811 , 0.8046969 , 0.95714545,
        0.75586677, 0.83635545, 0.7425797 , 0.15451884, 0.5775627 ]],
      dtype=float32)>

In [50]:
# Passing the output of our encoder into the decoder attention
decoder.attention_mechanism.setup_memory(sample_output)

In [51]:
# Getting the initial state 
initial_state = decoder.build_initial_state(BATCH_SIZE , [sample_h , sample_c] , tf.float32)
initial_state[:2]

([<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
  array([[-0.00332343,  0.00951187,  0.00872059, ...,  0.00271745,
           0.00340977,  0.00204801],
         [-0.0031446 ,  0.00791592,  0.00978961, ..., -0.00080166,
          -0.00122762, -0.0031764 ],
         [-0.00295936,  0.00692794,  0.00766462, ..., -0.00340543,
          -0.00272821, -0.00033349],
         ...,
         [-0.00570125,  0.00837591,  0.00742682, ...,  0.00542367,
           0.00557165,  0.00188923],
         [-0.00610816,  0.00962569,  0.00756236, ...,  0.0036595 ,
           0.00469161,  0.00099883],
         [-0.002564  ,  0.00709025,  0.00763696, ...,  0.0014222 ,
           0.00215575,  0.00052292]], dtype=float32)>,
  <tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
  array([[-0.00675011,  0.01907639,  0.01731787, ...,  0.0054781 ,
           0.00679825,  0.00406175],
         [-0.0063671 ,  0.01584223,  0.01940845, ..., -0.00161185,
          -0.00245117, -0.00630689],
         [-0.00598931,  0.01

In [52]:
# Getting the decoder outputs 
sample_decoder_outputs = decoder(sample_x , initial_state)
print(f'Decoder output shape: {sample_decoder_outputs.rnn_output.shape}')

Decoder output shape: (64, 14, 10000)


### Define the optimizer and the loss function 

In [53]:
# We're going to use the Adam optimizer 
optimizer = tf.keras.optimizers.Adam()

In [57]:
steps_per_epoch = len(train_dataset) // BATCH_SIZE
steps_per_epoch 

9

In [60]:
# Creating a function for our loss function 
def loss_function(real , pred):
  '''
  Target shape = (Batch size , max_length) --> (64 , 15)
  Prediction shape = (batch size , max_length , lab_vocab_size)
  '''
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits= True , reduction = 'none')
  loss = cross_entropy(y_true = real , y_pred = pred)
  # Output 0 for y = 0 else output 1
  mask = tf.logical_not(tf.math.equal(real , 0)) 
  mask = tf.cast(mask , dtype = loss.dtype)
  loss = mask*loss
  loss = tf.reduce_mean(loss)
  return loss

### Creating a checkpoint 

This checkpoint will keep track of, 
- Optimizer 
- Encoder 
- Decoder 

In [61]:
import os
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir , 'ckpt')

# Initializing our checkpoint
checkpoint = tf.train.Checkpoint(optimizer = optimizer , 
                                 encoder = encoder , 
                                 decoder = decoder)

### One `train_step` operations 

Creating a customizable train function. 

Have to read this https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit

In [83]:
@tf.function
def train_step(inp , targ , enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output , enc_h , enc_c = encoder(inp , enc_hidden)
    
    # Decoder input
    dec_input = targ[: , :-1] # Ignore <end> token
    real = targ[: , 1:] # Ignore <start> token

    # Set the Attention Mechanism object with encoder_outputs 
    decoder.attention_mechanism.setup_memory(enc_output)

    # Create AttentionWrapperState as initial_state for decoder 
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE , [enc_h , enc_c] , tf.float32)
    
    # Getting the prediction from the decoder 
    pred = decoder(dec_input , decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real , logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss , variables)
  optimizer.apply_gradients(zip(gradients , variables))

  return loss

SyntaxError: ignored

## Train the model 

In [82]:
import time
EPOCHS  = 10


for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  print(enc_hidden[0].shape , enc_hidden[1].shape)

  for (batch , (inp , targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp , targ , enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print(f'Epoch {epoch + 1} Batch{batch} Loss{batch_loss.numpy()}\n')

  # Saving the model checkpoints for every 2 epochs 
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix= checkpoint_prefix)

  print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch}\n')
  print(f'Time taken for 1 epoch {time.time() - start} sec\n')

(64, 1024) (64, 1024)


InvalidArgumentError: ignored

In [74]:
test_enc_hidden = encoder.initialize_hidden_state()
test_enc_hidden

[<tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(64, 1024), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>]