In [0]:
# Initialize drive
# Mount Google Drive inside the Colab VM at /content/drive; force_remount=True
# asks for a re-mount even when a mount is already registered.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Move into the project folder on the drive so that project-local modules and
# relative paths (./checkpoints, ./results) resolve from there.
# NOTE(review): the three %cd calls are relative, so this cell only works when
# the current directory is the parent of `drive` (the mount point above implies
# /content). Re-running the cell without restarting the kernel will not land in
# the same place.
%cd drive
%cd 'My Drive'
%cd AML

In [0]:
# Imports
from __future__ import absolute_import, division, print_function
# Install the pinned TensorFlow 2.0 alpha GPU build before anything imports it.
!pip install tensorflow-gpu==2.0.0a
# NOTE(review): star import. The names used below without a visible import
# (tf, np, os, gc, time, K, warnings, Embedding, Constant, readdata, tokenize,
# create_embedding_indexmatrix and the hyperparameter constants) presumably all
# come from `constants` - confirm against that module.
from constants import *
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

In [0]:
# Run configuration: everything a reader is expected to tune lives here.
RUN_NUMBER = 10
tf.random.set_seed(1)
USE_ANNEALING = False
USE_GRADIENT_CLIPPING = False
EPOCHS=40
BATCH_SIZE = 128

ENCODER_MODEL = 'LSTM'
CHECKPOINT_PATH = './checkpoints/'+ENCODER_MODEL + '/' + str(RUN_NUMBER)
RESULTS_PATH = './results/' + ENCODER_MODEL + '/' + str(RUN_NUMBER) +'/'
# Create the per-run results directory if it doesn't exist yet.
# makedirs (not mkdir) is required because the parent './results/<model>/'
# may be missing too; the existence test is on the directory actually created
# so that a new RUN_NUMBER under an existing model folder is handled as well.
if not os.path.isdir(RESULTS_PATH):
    os.makedirs(RESULTS_PATH)
TRAIN_FROM_SCRATCH = True

In [0]:
# Load the raw corpora: train / test / validation splits for both languages
train_en, train_de, test_en, test_de, val_en, val_de = readdata()

# Tokenize the English side (sequences limited to MAX_INPUT_SIZE)
(tok_train_en, tok_val_en, tok_test_en,
 train_en_sen_len, val_en_sen_len, test_en_sen_len,
 en_dict_w2i, en_dict_i2w, en_max_words) = tokenize(
    train_en, val_en, test_en, max_length=MAX_INPUT_SIZE)

# Register the padding token under id 0 in both lookup directions
en_dict_w2i['<PAD>'] = 0
en_dict_i2w[0] = '<PAD>'
en_vocab_size = np.amax(tok_train_en)

# Tokenize the German side with the same settings
(tok_train_de, tok_val_de, tok_test_de,
 train_de_sen_len, val_de_sen_len, test_de_sen_len,
 de_dict_w2i, de_dict_i2w, de_max_words) = tokenize(
    train_de, val_de, test_de, max_length=MAX_INPUT_SIZE)

# Register the padding token under id 0 in both lookup directions
de_dict_w2i['<PAD>'] = 0
de_dict_i2w[0] = '<PAD>'
de_vocab_size = np.amax(tok_train_de)

# Build the GloVe embedding matrix for the English vocabulary
glove_embedding_matrix = create_embedding_indexmatrix(
    en_max_words,
    embedding_dim=EMBEDDING_DIM,
    dict_en=en_dict_i2w)

In [0]:
# The shuffle buffer spans the whole training set; an epoch is counted in
# full batches only.
BUFFER_SIZE = len(tok_train_en)
steps_per_epoch = len(tok_train_en)//BATCH_SIZE

# Training pairs: shuffled, then batched
train_dataset = (
    tf.data.Dataset.from_tensor_slices((tok_train_en, tok_train_de))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Validation pairs: original order, incomplete final batch dropped
VAL_LEN = len(tok_val_en)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((tok_val_en, tok_val_de))
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Test pairs: original order, incomplete final batch dropped
TEST_LEN = len(tok_test_de)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((tok_test_en, tok_test_de))
    .batch(BATCH_SIZE, drop_remainder=True)
)

# The tokenized train/validation arrays are no longer referenced by name
# below, so drop those names and trigger a collection.
del tok_train_en, tok_train_de, tok_val_en, tok_val_de
gc.collect()

In [0]:
# Initialize embedding and encoder
# Frozen embedding layer whose weights start from the GloVe matrix.
embed = Embedding(input_dim=en_max_words, output_dim=EMBEDDING_DIM,
                  embeddings_initializer=Constant(glove_embedding_matrix),
                  input_length=MAX_INPUT_SIZE,
                  trainable=False)

# Pick the encoder implementation named by ENCODER_MODEL.
# NOTE(review): 'LSTM' and 'CNN' are imported from `Encoder3` while 'ATTN'
# comes from `Encoder` - confirm this split is intentional.
encoder = None
if ENCODER_MODEL == 'LSTM':
  from Encoder3 import LSTMEncoder
  encoder = LSTMEncoder(batch_size=BATCH_SIZE,
                        drop_out=DROP_OUT,
                        r_drop_out=R_DROP_OUT,
                        embedding_dim=EMBEDDING_DIM,
                        max_input_size=MAX_INPUT_SIZE)
elif ENCODER_MODEL == 'CNN':
  from Encoder3 import CNNEncoder
  encoder = CNNEncoder(batch_size=BATCH_SIZE,
                       drop_out=DROP_OUT,
                       embedding_dim=EMBEDDING_DIM,
                       max_input_size=MAX_INPUT_SIZE,
                       kernel_size=KERNEL_SIZE)
elif ENCODER_MODEL == 'ATTN':
  from Encoder import ATTNEncoder
  encoder = ATTNEncoder(batch_size=BATCH_SIZE,
                        drop_out=DROP_OUT,
                        max_input_size=MAX_INPUT_SIZE,
                        embedding_dim= EMBEDDING_DIM)
else:
  # The exception must actually be raised: merely constructing it lets the
  # notebook continue with encoder = None and fail much later.
  raise ValueError('Invalid Encoder Model given: {!r}'.format(ENCODER_MODEL))

In [0]:
from Decoder import LSTMDecoder

# The decoder shares batch size, sequence length, embedding width and dropout
# settings with the encoder; its vocabulary size is the German one.
decoder = LSTMDecoder(
    batch_size=BATCH_SIZE,
    max_input_size=MAX_INPUT_SIZE,
    embedding_dim=EMBEDDING_DIM,
    vocab_size=de_max_words,
    drop_out=DROP_OUT,
    r_drop_out=R_DROP_OUT,
)

In [0]:
# Plain SGD when the learning rate is annealed by hand in the training loop,
# Adam otherwise. Both are configured through `learning_rate`; `lr` is only a
# deprecated backward-compatibility alias.
optimizer = None
if USE_ANNEALING:
  optimizer = tf.keras.optimizers.SGD(learning_rate=SGD_LEARNING_RATE)
else:
  optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
def loss_function(real, pred):
  """Padding-masked sparse softmax cross-entropy for one decoding step.

  Args:
    real: integer target token ids; id 0 is treated as padding.
    pred: unnormalised logits matching `real` with one extra class axis.

  Returns:
    Scalar tensor: the per-position losses with padded positions zeroed,
    averaged over ALL positions (padded ones included in the denominator).
  """
  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
  # Build the mask with TF ops instead of NumPy so the function does not
  # depend on eager tensors being convertible to arrays.
  mask = tf.cast(tf.not_equal(real, 0), loss_.dtype)
  return tf.reduce_mean(loss_ * mask)

In [0]:
# NOTE(review): this prefix points at a hard-coded folder that differs from
# CHECKPOINT_PATH and is not used by any visible cell - confirm it is needed.
checkpoint_prefix = os.path.join('./checkpoints/'+ "adi-chkpts" + '/' +"/2" +'/', "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
# NOTE(review): this overrides the TRAIN_FROM_SCRATCH value set in the
# configuration cell, so the restore branch below always runs.
TRAIN_FROM_SCRATCH= False
if not TRAIN_FROM_SCRATCH:
  # Run one dummy forward pass through encoder and decoder so that their
  # variables exist before the checkpoint values are mapped onto them.
  c_t = decoder.initialize_hidden_state()
  h_t = decoder.initialize_hidden_state()
  example_input_batch, example_target_batch = next(iter(train_dataset))
  H, _, _,_,_ = encoder(embed(example_input_batch))
  z_t = K.cast(tf.expand_dims([de_dict_w2i['bos']] * BATCH_SIZE, 1), dtype='float32')
  context = decoder.initialize_hidden_state()
  predictions, h_t, c_t, _, _, context = decoder(z_t, h_t, c_t, c_t, c_t, H, context)
  # Load weights. latest_checkpoint returns None when the directory holds no
  # checkpoint, and restore(None) is a silent no-op - so only report success
  # when something was actually found.
  latest_checkpoint_path = tf.train.latest_checkpoint(CHECKPOINT_PATH)
  if latest_checkpoint_path is None:
    print('No checkpoint found in {}; continuing with freshly initialised weights'.format(CHECKPOINT_PATH))
  else:
    checkpoint.restore(latest_checkpoint_path)
    print('loaded')


In [0]:
def softmax(x):
    """Return the softmax of `x`, normalised over all of its elements.

    The maximum is subtracted before exponentiating; this leaves the result
    mathematically unchanged but keeps np.exp from overflowing to inf (and
    the ratio from becoming nan) on large logits.

    Args:
        x: array-like of scores.

    Returns:
        NumPy array of the same shape as `x` whose entries sum to 1
        (an empty array when `x` is empty).
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.exp(x)
    exp_x = np.exp(x - np.max(x))
    return exp_x/np.sum(exp_x)
  
def evaluate(inp, targ, compute_perp=False):
  """Run one teacher-forced pass over a batch.

  Uses the module-level `embed`, `encoder` and `decoder`. The batch is assumed
  to hold exactly BATCH_SIZE rows, because the first decoder input is built
  with that size.

  Args:
    inp: batch of source token ids.
    targ: batch of target token ids; column t is the label at step t and the
      decoder input at step t + 1. Id 0 is treated as padding.
    compute_perp: when True, also accumulate the log2-probabilities assigned
      to the non-padding target tokens.

  Returns:
    (batch_loss, perp): `batch_loss` is the sum over decoding steps of
    `loss_function`; `perp` is the summed log2-probability (0 when
    `compute_perp` is False).
  """
  perp, batch_loss = 0, 0
  # Compute embeddings
  inp_embed = embed(inp)
  # Only the encoder's first output is consumed; the decoder starts from
  # freshly initialised states rather than the encoder's final states.
  H, _, _, _, _ = encoder.call(inp_embed)
  # One initialisation per state is enough.
  # NOTE(review): assumes initialize_hidden_state() has no side effects, so
  # calling it once instead of repeatedly gives the same states - confirm.
  h_1 = decoder.initialize_hidden_state()
  c_1 = decoder.initialize_hidden_state()
  h_2 = decoder.initialize_hidden_state()
  c_2 = decoder.initialize_hidden_state()

  context = decoder.initialize_hidden_state()
  z_t = K.cast(tf.expand_dims([de_dict_w2i['bos']] * BATCH_SIZE, 1),
               dtype='float32')   # first decoder input: the 'bos' token
  for t in range(1, MAX_INPUT_SIZE):
    predictions, h_1, c_1, h_2, c_2, context = decoder(z_t, h_1, c_1, h_2, c_2, H, context)
    step_targ = targ[:, t]
    if compute_perp:
      # Log-softmax over each row, computed for the whole batch at once and
      # with the row maximum subtracted for numerical stability.
      logits = np.asarray(predictions).astype(np.float64)
      labels = np.asarray(step_targ).astype(np.int64)
      shifted = logits - logits.max(axis=1, keepdims=True)
      log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
      token_log_probs = log_probs[np.arange(labels.shape[0]), labels]
      # Select the non-padding positions instead of multiplying by a 0/1
      # mask, so a padded position can never contribute 0 * -inf = nan.
      perp += np.sum(token_log_probs[labels != 0]) / np.log(2.0)
    batch_loss += loss_function(step_targ, predictions)
    z_t = tf.expand_dims(step_targ, 1) # using teacher forcing
  return batch_loss, perp

def validate(dataset, encoder, decoder, sent_len=None, compute_perp=False, validate = True):
  """Average the loss (and optionally compute perplexity) over a dataset.

  Args:
    dataset: iterable of (inp, targ) batches.
    encoder, decoder: accepted for interface compatibility; `evaluate` uses
      the module-level models, so these arguments are not read here.
    sent_len: per-sentence token counts of the TARGET side, required when
      `compute_perp` is True.
      NOTE(review): assumed to be in the same order as the sentences in
      `dataset` - confirm against `tokenize`.
    compute_perp: when True, also return the perplexity.
    validate: True selects VAL_LEN as the dataset size, False TEST_LEN.

  Returns:
    (mean loss per batch, perplexity); the second item is 0 when
    `compute_perp` is False.
  """
  if compute_perp and sent_len is None:
    raise ValueError('sent_len is required when compute_perp is True')
  LEN = VAL_LEN if validate else TEST_LEN
  N_BATCH = LEN// BATCH_SIZE
  loss, perp = 0, 0
  for inp, targ in dataset:
    batch_loss, batch_perp = evaluate(inp, targ, compute_perp)
    loss += batch_loss
    perp += batch_perp
  if compute_perp:
    # Only the first N_BATCH full batches are scored (the datasets drop the
    # incomplete final batch), so only those sentences may be counted in the
    # denominator; counting all LEN sentences would deflate the perplexity.
    n_scored = N_BATCH * BATCH_SIZE
    scored_words = sum(length for _, length in zip(range(n_scored), sent_len))
    total_words = scored_words - 2*n_scored #-2 because we exclude BOS and EOS token
    perp = np.power(2.0, -perp/total_words)
  return loss/N_BATCH, perp

In [0]:
# Training loop with validation after every epoch and early stopping.
# The last two validation losses start at a large sentinel so that the first
# epochs always count as an improvement.
[sec_last_loss, last_loss, lowest_val_loss] = [5000, 5000 , 5000]
delta = 0.0            # minimum decrease in validation loss that counts as progress
all_batch_loss = []    # loss of every training batch, in order
N_Params = 0
for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        # Forward pass under the tape so gradients can be taken afterwards
        with tf.GradientTape() as tape:
            batch_loss, _ = evaluate(inp, targ, False)
        all_batch_loss.append(batch_loss)
        # Accumulate the epoch loss exactly once per batch
        total_loss += batch_loss
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        if USE_GRADIENT_CLIPPING:
            gradients = [tf.clip_by_value(grad, -5, 5) for grad in gradients]
        optimizer.apply_gradients(zip(gradients, variables))
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))

    # estimate time
    time_taken = time.time() - start

    # Validation loss and perplexity. The targets are German, so the word
    # count for the perplexity must come from the German sentence lengths.
    # NOTE(review): no checkpoint is written here and lowest_val_loss is never
    # updated, although the original comment announced saving on improvement.
    val_loss, val_perp = validate(val_dataset, encoder, decoder,
                                  val_de_sen_len, True)
    print('Epoch {} finished in {:.1f}s - val loss {:.4f} - val perplexity {:.4f}'.format(
        epoch + 1, time_taken, float(val_loss), float(val_perp)))

    # early stopping
    if USE_ANNEALING:
        if last_loss - val_loss < delta and SGD_LEARNING_RATE > 0.0001 and epoch>5:
            # No progress: divide the learning rate by 10 and keep training
            SGD_LEARNING_RATE /= 10
            print("Learning rate is now", SGD_LEARNING_RATE)
            optimizer = tf.keras.optimizers.SGD(learning_rate=SGD_LEARNING_RATE)
        elif last_loss - val_loss < delta:
            break
        else:
            sec_last_loss = last_loss
            last_loss = val_loss
    else:
        # Stop once neither of the last two epochs improved the validation loss
        if last_loss - val_loss < delta and sec_last_loss - last_loss < delta:
            break
        else:
            sec_last_loss = last_loss
            last_loss = val_loss

In [0]:

# Test set: loss and perplexity, using the German (target) sentence lengths
test_loss, test_perp = validate(test_dataset, encoder, decoder, test_de_sen_len, True, False)
print('test loss: {:.4f} \n'.format(test_loss))
print('test perplexity {:.4f} \n'.format(test_perp))

# Write results
# Make sure the run's results directory exists, then append to the report.
# The `with` block closes the file even if one of the writes fails.
if not os.path.isdir(RESULTS_PATH):
    os.makedirs(RESULTS_PATH)
with open(RESULTS_PATH + 'test_res.txt', "a") as f:
    f.write('Final Results Test Set \n')
    f.write('Test Loss: {:.4f} \n'.format(test_loss))
    f.write('Test Perplexity {:.4f} \n'.format(test_perp))

In [0]:
  # Remove the notebook-level names for the models and the optimizer.
  # NOTE(review): the `checkpoint` object created earlier was built with these
  # same objects, so this alone presumably does not release their memory.
  del encoder, decoder, optimizer