[View in Colaboratory](https://colab.research.google.com/github/assaflehr/language-style-transfer/blob/master/notebooks/ST_EXP1_sentence_autoendcoder.ipynb)

Exp1 report:

First check it's working at all (use 160 examples) with short-sentences(<12 words).  Took few hours to find initial working model. encoder-decoder based on simple (one-directional) CudaGRU. a lot of code taken from tf eager [tutorial](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb#scrollTo=cnxC7q-j3jFD)
num_examples = 160 , crop_sentence_max_words=12 , BATCH_SIZE = 64 , embedding_dim = 256 , units = 1024 


Then increase to 16000 examples (all file) and let it run


In [0]:
# Try experimenting with the size of that dataset
num_examples = 16000 #16000  #max = 14716+3679 in current file
crop_sentence_max_words=12
BATCH_SIZE = 64
embedding_dim = 256
units = 1024  #hidden size in encoder and decoder (as one pass to the other)


### Exp 1 



In [21]:
from __future__ import absolute_import, division, print_function

# Import TensorFlow >= 1.9 and enable eager execution
import tensorflow as tf

tf.enable_eager_execution()



ValueError: ignored

In [3]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import time

print(tf.__version__)
if  tf.test.gpu_device_name() != '/device:GPU:0':
  raise SystemError('GPU device not found')



# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w,crop_sentence_max_words):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w= ' '.join(w.split(' ')[:crop_sentence_max_words])
    w = '<start> ' + w + ' <end>'
    
    return w
 
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return sentence list
def create_dataset(path, num_examples,crop_sentence_max_words,verbose=True):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    sentences = [preprocess_sentence(s,crop_sentence_max_words) for s  in lines[:num_examples]]
    return sentences
  
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()
    
    self.create_index()
    
  def create_index(self):
    for phrase in self.lang:
      self.vocab.update(phrase.split(' '))
    
    self.vocab = sorted(self.vocab)
    
    self.word2idx['<pad>'] = 0
    for index, word in enumerate(self.vocab):
      self.word2idx[word] = index + 1
    
    for word, index in self.word2idx.items():
      self.idx2word[index] = word
      
      
def max_length(tensor):
    return max(len(t) for t in tensor)


def load_dataset(path, num_examples,crop_sentence_max_words=9999,verbose=True):
    # creating cleaned input, output pairs
    sentences = create_dataset(path, num_examples,crop_sentence_max_words)
    if verbose:
      print (sentences[:2])

    # index language using the class defined above    
    targ_lang = LanguageIndex(sentences)
    
    # Vectorize the input and target languages
    
    target_tensor = [[targ_lang.word2idx[s] for s in en.split(' ')] for en in sentences]
    
    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_tar = max_length(target_tensor)
      
    # Padding the input and output tensor to the maximum length
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, 
                                                                  maxlen=max_length_tar, 
                                                                  padding='post')
    
    return  target_tensor,  targ_lang,  max_length_tar
  
  # Download the file
path_to_file = tf.keras.utils.get_file(
    'train.modern.nltktok', origin='https://raw.githubusercontent.com/harsh19/Shakespearizing-Modern-English/master/data/train.modern.nltktok',
    extract=False)
print (path_to_file)


target_tensor,  targ_lang,  max_length_targ = load_dataset(path_to_file, num_examples,crop_sentence_max_words=crop_sentence_max_words)
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(target_tensor, target_tensor, test_size=0.2)

  
# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val),'max_length_targ',max_length_targ
  



1.9.0-rc1
Downloading data from https://raw.githubusercontent.com/harsh19/Shakespearizing-Modern-English/master/data/train.modern.nltktok
/content/.keras/datasets/train.modern.nltktok
['<start> i have half a mind to hit you before you speak again <end>', '<start> but if antony is alive , healthy , friendly with caesar , <end>']


(12800, 12800, 3200, 3200, 'max_length_targ', 14)

### model

In [4]:
def gru(units):
  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)
  # the code automatically does that.
  if tf.test.is_gpu_available():
    print ('using CuDNNGRU')
    return tf.keras.layers.CuDNNGRU(units, 
                                    return_sequences=True, 
                                    return_state=True, 
                                    recurrent_initializer='glorot_uniform')
  else:
    return tf.keras.layers.GRU(units, 
                               return_sequences=True, 
                               return_state=True, 
                               recurrent_activation='sigmoid', 
                               recurrent_initializer='glorot_uniform')

  
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        """
        """
        x = self.embedding(x)
        outputs, states = self.gru(x, initial_state = hidden)        
        return outputs, states
    
    def get_initial_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))  

      
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x, hidden):
        # enc_output shape == (batch_size, max_length, hidden_size)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # passing the concatenated vector to the GRU
        output, new_hidden = self.gru(x, initial_state = hidden)
        
        # output shape == (batch_size * max_length, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * max_length, vocab)
        x = self.fc(output)
        
        return x,new_hidden
        
    #def get_initial_hidden_state(self):
    #    return tf.zeros((self.batch_sz, self.dec_units)) 


BUFFER_SIZE = len(input_tensor_train)

vocab_tar_size = len(targ_lang.word2idx)
vocab_inp_size= vocab_tar_size 

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))

      
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)      

using CuDNNGRU
using CuDNNGRU


## EVAL SINGLE

In [22]:


def evaluate_single(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    #attention_plot = np.zeros((max_length_targ, max_length_inp))
    
    sentence = preprocess_sentence(sentence,crop_sentence_max_words)

    inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    vis_tensor ('inputs',inputs)
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    vis_tensor ('enc_hidden',enc_hidden)
    vis_tensor ('enc_out',enc_out)
    
    
    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden = decoder(dec_input, dec_hidden)
        
        predicted_id = tf.multinomial(tf.exp(predictions), num_samples=1)[0][0].numpy()

        result += targ_lang.idx2word[predicted_id] + ' '

        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence
  
def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    result, sentence = evaluate_single(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
        
    print('Input:      {}'.format(sentence))
    print('Predicted : {}'.format(result))
print (vocab_inp_size)   


translate("I haven't done anything to you .", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)
print ('\n\n\n')
#translate("Good madam , restrain yourself .", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)
#translate("life is good .", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)
translate("Good madam , restrain yourself", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)

translate("Good madam , I haven't done anything to you", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)

translate("Ho my god, what happened.", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)

6928
Input:      <start> i haven t done anything to you . <end>
Predicted : i haven t done anything to you . <end> 




Input:      <start> good madam , restrain yourself <end>
Predicted : good madam , restrain yourself ! <end> 
Input:      <start> good madam , i haven t done anything to you <end>
Predicted : good gentlemen , i won t do this you deserve <end> 
Input:      <start> ho my god , what happened . <end>
Predicted : as my children , be patient . <end> 


## Training

1. Pass the *input* through the *encoder* which return *encoder output* and the *encoder hidden state*.
2. The encoder output, encoder hidden state and the decoder input (which is the *start token*) is passed to the decoder.
3. The decoder returns the *predictions* and the *decoder hidden state*.
4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
5. Use *teacher forcing* to decide the next input to the decoder.
6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.
7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate.

In [23]:
verbose=True

def vis_tensor(name,t):
  if verbose:
    cnt=5
    content = (t[0][:cnt].numpy() if len(t.shape.dims)<=2 else t[0][0][:cnt].numpy())
    print (name, t.shape, content),#tf.reduce_max(t).numpy(),tf.reduce_mean(t).numpy(),tf.reduce_min(t).numpy())
    print 
    #print ('\n')

    
#print('Found GPU at: {}'.format(device_name))

optimizer = tf.train.AdamOptimizer()

def loss_function(real, pred):
  mask = 1 - np.equal(real, 0)
  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
  return tf.reduce_mean(loss_)


EPOCHS = 200

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.get_initial_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        verbose = False if batch==10 else False

        with tf.GradientTape() as tape:
            vis_tensor('inp',inp)
            vis_tensor('hidden',hidden)
            enc_output, enc_hidden = encoder(inp, hidden)
            vis_tensor('enc_output',enc_output)
            vis_tensor('enc_hidden',enc_hidden)
            
            dec_hidden = enc_hidden
            
            
            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                vis_tensor('dec_input',dec_input)
                vis_tensor('dec_hidden',dec_hidden)
                predictions, dec_hidden = decoder(dec_input, dec_hidden)
                vis_tensor('predictions',predictions)
                vis_tensor('dec_hidden',dec_hidden)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing, TODO: change it to only sometimes run
                dec_input = tf.expand_dims(targ[:, t], 1)
                
           
        total_loss += (loss / int(targ.shape[1]))
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
      
        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())

        if batch % 10 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         loss.numpy() / int(targ.shape[1])))
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss/len(target_tensor)))
    translate("I haven't done anything to you .", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)
    translate("We try to see how you are doing", encoder, decoder, targ_lang, targ_lang, max_length_targ, max_length_targ)

    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.0000
Epoch 1 Batch 10 Loss 0.0004
Epoch 1 Batch 20 Loss 0.0224
Epoch 1 Batch 30 Loss 0.0148
Epoch 1 Batch 40 Loss 0.0082
Epoch 1 Batch 50 Loss 0.0197
Epoch 1 Batch 60 Loss 0.0179
Epoch 1 Batch 70 Loss 0.0345
Epoch 1 Batch 80 Loss 0.0169
Epoch 1 Batch 90 Loss 0.0228
Epoch 1 Batch 100 Loss 0.0352
Epoch 1 Batch 110 Loss 0.0439
Epoch 1 Batch 120 Loss 0.0370
Epoch 1 Batch 130 Loss 0.0375
Epoch 1 Batch 140 Loss 0.0372
Epoch 1 Batch 150 Loss 0.0324
Epoch 1 Batch 160 Loss 0.0575
Epoch 1 Batch 170 Loss 0.0313
Epoch 1 Batch 180 Loss 0.0500
Epoch 1 Batch 190 Loss 0.0405
Epoch 1 Loss 0.0003
Input:      <start> i haven t done anything to you . <end>
Predicted : i haven t done anything to you . <end> 
Input:      <start> we try to see how you are doing <end>
Predicted : we thought you ought to let us die <end> 
Time taken for 1 epoch 44.424299001693726 sec

Epoch 2 Batch 0 Loss 0.0371
Epoch 2 Batch 10 Loss 0.0514
Epoch 2 Batch 20 Loss 0.0419
Epoch 2 Batch 30 Loss 0.0274
Epoch 

Epoch 4 Batch 80 Loss 0.0205
Epoch 4 Batch 90 Loss 0.0327
Epoch 4 Batch 100 Loss 0.0352
Epoch 4 Batch 110 Loss 0.0227
Epoch 4 Batch 120 Loss 0.0419
Epoch 4 Batch 130 Loss 0.0375
Epoch 4 Batch 140 Loss 0.0327
Epoch 4 Batch 150 Loss 0.0332
Epoch 4 Batch 160 Loss 0.0399
Epoch 4 Batch 170 Loss 0.0218
Epoch 4 Batch 180 Loss 0.0525
Epoch 4 Batch 190 Loss 0.0229
Epoch 4 Loss 0.0004
Input:      <start> i haven t done anything to you . <end>
Predicted : i haven t done anything to you . <end> 
Input:      <start> we try to see how you are doing <end>
Predicted : we seemed to you see how he ! <end> 
Time taken for 1 epoch 44.00268769264221 sec

Epoch 5 Batch 0 Loss 0.0539
Epoch 5 Batch 10 Loss 0.0188
Epoch 5 Batch 20 Loss 0.0362
Epoch 5 Batch 30 Loss 0.0324
Epoch 5 Batch 40 Loss 0.0260
Epoch 5 Batch 50 Loss 0.0140
Epoch 5 Batch 60 Loss 0.0280
Epoch 5 Batch 70 Loss 0.0232
Epoch 5 Batch 80 Loss 0.0417
Epoch 5 Batch 90 Loss 0.0325
Epoch 5 Batch 100 Loss 0.0402
Epoch 5 Batch 110 Loss 0.0376
Epoch 5 B

Epoch 7 Batch 140 Loss 0.0289
Epoch 7 Batch 150 Loss 0.0325
Epoch 7 Batch 160 Loss 0.0337
Epoch 7 Batch 170 Loss 0.0358
Epoch 7 Batch 180 Loss 0.0310
Epoch 7 Batch 190 Loss 0.0229
Epoch 7 Loss 0.0004
Input:      <start> i haven t done anything to you . <end>
Predicted : i haven t done anything to harm . <end> 
Input:      <start> we try to see how you are doing <end>
Predicted : we thought you ought to obey his life ! <end> 
Time taken for 1 epoch 44.33566093444824 sec

Epoch 8 Batch 0 Loss 0.0256
Epoch 8 Batch 10 Loss 0.0211
Epoch 8 Batch 20 Loss 0.0220
Epoch 8 Batch 30 Loss 0.0133
Epoch 8 Batch 40 Loss 0.0255
Epoch 8 Batch 50 Loss 0.0211
Epoch 8 Batch 60 Loss 0.0159
Epoch 8 Batch 70 Loss 0.0243
Epoch 8 Batch 80 Loss 0.0242
Epoch 8 Batch 90 Loss 0.0379
Epoch 8 Batch 100 Loss 0.0301
Epoch 8 Batch 110 Loss 0.0161
Epoch 8 Batch 120 Loss 0.0308
Epoch 8 Batch 130 Loss 0.0385
Epoch 8 Batch 140 Loss 0.0333
Epoch 8 Batch 150 Loss 0.0258
Epoch 8 Batch 160 Loss 0.0415
Epoch 8 Batch 170 Loss 0.0

Epoch 10 Batch 180 Loss 0.0683
Epoch 10 Batch 190 Loss 0.0353
Epoch 10 Loss 0.0005
Input:      <start> i haven t done anything to you . <end>
Predicted : i haven t done anything to you . <end> 
Input:      <start> we try to see how you are doing <end>
Predicted : we didn t expect mark antony to hell ! <end> 
Time taken for 1 epoch 44.03478407859802 sec

Epoch 11 Batch 0 Loss 0.0559
Epoch 11 Batch 10 Loss 0.0614
Epoch 11 Batch 20 Loss 0.0470
Epoch 11 Batch 30 Loss 0.0287
Epoch 11 Batch 40 Loss 0.0510


SystemError: ignored

## TODO: Eval


In [0]:
## TODO: Eval

In [10]:
def eval(encoder, decoder, lang, max_length,val_dataset):
  """
    evaluate total-loss on one EPOC of the val_dataset
  """

  
  hidden = encoder.get_initial_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset): #dataset return (#batch,(x,y)
      loss = 0

      _ , enc_hidden = encoder(inp, hidden)

      dec_hidden = enc_hidden
      
      #
      dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       

      # Teacher forcing - feeding the target as the next input
      for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
          predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)

          loss += loss_function(targ[:, t], predictions)

          # using teacher forcing, TODO: change it to only sometimes run
          dec_input = tf.expand_dims(targ[:, t], 1)

      total_loss += (loss / int(targ.shape[1]))


val_dataset = tf.data.Dataset.from_tensor_slices((target_tensor_val, target_tensor_val)).shuffle(BUFFER_SIZE)
val_dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))
eval(encoder, decoder, lang, max_length,val_dataset)

NameError: ignored

In [19]:
val_dataset = tf.data.Dataset.from_tensor_slices((target_tensor_val, target_tensor_val)).shuffle(BUFFER_SIZE)
val_dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))

for (batch, (inp, targ)) in enumerate(val_dataset):
  print ('batch',batch,'inp',inp.shape,'targ',targ.shape)


batch 0 inp (64, 64, 14) targ (64, 64, 14)
batch 1 inp (64, 64, 14) targ (64, 64, 14)
batch 2 inp (64, 64, 14) targ (64, 64, 14)
