In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from keras.utils.vis_utils import plot_model
from keras.callbacks import History

In [2]:
!curl https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar --output daksh.tar
!tar -xvf  'daksh.tar' 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1915M  100 1915M    0     0   213M      0  0:00:08  0:00:08 --:--:--  219M
dakshina_dataset_v1.0/bn/
dakshina_dataset_v1.0/bn/lexicons/
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv
dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv
dakshina_dataset_v1.0/bn/native_script_wikipedia/
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.valid.text.shuf.txt.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.info.sorted.tsv.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.info.sorted.tsv.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.sorted.tsv.gz
dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.shuf.txt.gz
dakshina_dataset_v1.0/bn/native_script

In [3]:
!pip install wandb
!wandb login
import wandb
from wandb.keras import WandbCallback

[34m[1mwandb[0m: Currently logged in as: [33makshaygrao[0m (use `wandb login --relogin` to force relogin)


In [4]:
# Start a W&B run in the given project/entity. run_best_model attaches WandbCallback
# without calling wandb.init itself, so it relies on a run such as this one being active.
wandb.init(project="DeepLearningAssignment-3", entity='cs21s002-ee21s113-dlassignment-1')

[34m[1mwandb[0m: Currently logged in as: [33makshaygrao[0m (use `wandb login --relogin` to force relogin)


In [5]:
# wandb.init(project="DeepLearningAssignment-3", entity='akshaygrao')

In [6]:
def obtain_input_target_data_from_path(path,tokenizer_obj):
  """Read a tab-separated lexicon file and return (input_texts, target_texts).

  The file is read without a header into string columns "1", "2" and "3".
  Column "2" supplies the input text and column "1" the target text. Every
  target is wrapped as "\\t" + text + "\\n" (start and end markers).

  Args:
    path: location of the TSV file.
    tokenizer_obj: only tested against None. When it is None the rows are
      shuffled with a fixed seed (random_state=1) before being collected.

  Returns:
    Two parallel lists: input texts and wrapped target texts. Rows whose input
    or target equals '</s>' are left out.
  """
  frame = pd.read_csv(path,sep="\t",names=["1", "2","3"]).astype(str)
  if tokenizer_obj is None:
    # Reproducible shuffle of all rows
    frame = frame.sample(frac=1,random_state=1)

  input_texts = []
  target_texts = []
  for target_word, input_word in zip(frame['1'], frame['2']):
    # Skip empty lines/words
    if '</s>' in (target_word, input_word):
      continue
    input_texts.append(input_word)
    target_texts.append("\t" + target_word + "\n")

  return input_texts, target_texts

In [7]:
def convert_text_to_sequences(tokenizer_obj,inp_texts):
  """Turn texts into a post-padded matrix of character token ids.

  Args:
    tokenizer_obj: tokenizer to reuse, or None to create a character-level
      tokenizer (no filters) and fit it on inp_texts.
    inp_texts: list of strings to encode.

  Returns:
    (padded_sequences, tokenizer): the output of pad_sequences with
    padding='post', and the tokenizer that was used (the new one when
    tokenizer_obj was None).
  """
  tokenizer = tokenizer_obj
  if tokenizer is None:
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    tokenizer.fit_on_texts(inp_texts)

  sequences = tokenizer.texts_to_sequences(inp_texts)
  padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences,padding='post')
  return padded_sequences,tokenizer

In [8]:
def pre_process_data(path,input_tokenizer=None,target_tokenizer=None,input_length=None,target_length=None):
  """Convert the dataset at `path` into input and target token-id sequences.

  Args:
    path: TSV file to load.
    input_tokenizer / target_tokenizer: tokenizers to reuse; when None a new
      one is fitted on the loaded texts. input_tokenizer is also what decides
      whether the rows get shuffled (see obtain_input_target_data_from_path).
    input_length / target_length: when BOTH are given, each matrix is extended
      with zero columns up to that width.
      NOTE(review): assumes the requested width is not smaller than the width
      produced by the tokenizer step; confirm for the dev/test splits.

  Returns:
    (input_texts, input_tensor, input_tokenizer,
     target_texts, target_tensor, target_tokenizer)
  """
  def _extend_with_zero_columns(tensor, width):
    # The tokenizer step pads only up to the longest sequence in this split;
    # append the missing zero columns to reach the requested width.
    extra_columns = tf.zeros((tensor.shape[0],width-tensor.shape[1]))
    return tf.concat([tensor,extra_columns],axis=1)

  input_texts, target_texts = obtain_input_target_data_from_path(path,input_tokenizer)
  input_tensor,input_tokenizer = convert_text_to_sequences(input_tokenizer,input_texts)
  target_tensor,target_tokenizer = convert_text_to_sequences(target_tokenizer,target_texts)

  if input_length is not None and target_length is not None:
    input_tensor = _extend_with_zero_columns(input_tensor,input_length)
    target_tensor = _extend_with_zero_columns(target_tensor,target_length)

  return input_texts,input_tensor,input_tokenizer,target_texts,target_tensor,target_tokenizer

In [9]:
# Language code used to build the lexicon file paths below
# (<code>/lexicons/<code>.translit.sampled.<split>.tsv inside the extracted dataset).
transliteration_target_language = 'kn'

In [10]:
def _lexicon_path(split):
  """Return the lexicon TSV path of the given split ('train', 'dev' or 'test') for the selected language."""
  language = transliteration_target_language
  return "/content/dakshina_dataset_v1.0/"+language+"/lexicons/"+language+".translit.sampled."+split+".tsv"

# Only the training dataset is used to fit the tokenizers; the other splits reuse
# that vocabulary and are padded to the widths of the training tensors.
train_input_texts,train_input_tensor,input_tokenizer,train_target_texts,train_target_tensor,target_tokenizer = pre_process_data(_lexicon_path("train"))
val_input_texts,val_input_tensor,val_input_tokenizer,val_target_texts,val_target_tensor,val_target_tokenizer = pre_process_data(_lexicon_path("dev"),input_tokenizer,target_tokenizer,train_input_tensor.shape[1],train_target_tensor.shape[1])
test_input_texts,test_input_tensor,test_input_tokenizer,test_target_texts,test_target_tensor,test_target_tokenizer = pre_process_data(_lexicon_path("test"),input_tokenizer,target_tokenizer,train_input_tensor.shape[1],train_target_tensor.shape[1])

In [11]:
# Sanity check on one training example: the wrapped target text, its padded
# token ids, and the characters one per line.
sample_index = 15
sample_target_text = train_target_texts[sample_index]
print(sample_target_text)
print(train_target_tensor[sample_index])
for target_char in sample_target_text:
  print(target_char)
# Number of training pairs and shape of the padded encoder input matrix.
print(len(train_input_texts))
print(train_input_tensor.shape)

	ಘಟನೆಗಳಿಗೆ

[ 1 51 24 10 15 12 21  4 12 15  2  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]
	
ಘ
ಟ
ನ
ೆ
ಗ
ಳ
ಿ
ಗ
ೆ


50624
(50624, 26)


In [12]:
# Vocabulary sizes: one more than the number of known characters, so that id 0
# (the value used for padding in the tensors above) is also a valid index.
num_encoder_tokens = 1 + len(input_tokenizer.word_index)
num_decoder_tokens = 1 + len(target_tokenizer.word_index)
# Padded sequence widths of the training tensors.
max_encoder_seq_length = train_input_tensor.shape[-1]
max_decoder_seq_length = train_target_tensor.shape[-1]

In [13]:
# Vocabulary sizes followed by padded sequence lengths, one value per line.
print(
    num_encoder_tokens,
    num_decoder_tokens,
    max_encoder_seq_length,
    max_decoder_seq_length,
    sep="\n",
)

27
64
26
26


In [14]:
def build_layered_RNN_model(rnn_type,embedding_in_dim,embedding_out_dim,layers,dropout,inp_length,model_out_dim,initial_state = None):
  """Build an Input -> Embedding -> stacked recurrent layers sub-graph.

  Every recurrent layer is created with return_sequences=True and
  return_state=True, and every layer is called with the same `initial_state`.

  Args:
    rnn_type: 'LSTM', 'GRU' or 'RNN' (the latter maps to SimpleRNN).
    embedding_in_dim: vocabulary size of the embedding layer.
    embedding_out_dim: embedding vector size.
    layers: number of stacked recurrent layers (must be >= 1).
    dropout: dropout passed to each recurrent layer.
    inp_length: length of the token-id input sequence.
    model_out_dim: number of units of each recurrent layer.
    initial_state: optional state tensor(s) handed to every recurrent layer.

  Returns:
    (model_states, last_layer_output, model_inputs) where model_states is the
    state list of the LAST recurrent layer ([h, c] for LSTM, [state] otherwise),
    last_layer_output is its full output sequence and model_inputs is the
    keras.Input feeding the embedding.

  Raises:
    ValueError: for an unsupported rnn_type or layers < 1 (previously these
      surfaced later as an UnboundLocalError).
  """
  # Public rnn_type name -> class name inside keras.layers.
  layer_class_names = {'LSTM': 'LSTM', 'GRU': 'GRU', 'RNN': 'SimpleRNN'}
  if rnn_type not in layer_class_names:
    raise ValueError("Unsupported rnn_type %r; expected one of %s" % (rnn_type, sorted(layer_class_names)))
  if layers < 1:
    raise ValueError("layers must be at least 1, got %r" % (layers,))
  rnn_layer_class = getattr(keras.layers, layer_class_names[rnn_type])

  # Input layer: takes the tokenized (integer id) sequence.
  model_inputs = keras.Input(shape=(inp_length,))
  # Embedding layer
  embed = keras.layers.Embedding(embedding_in_dim, embedding_out_dim)(model_inputs)

  layer_input = embed
  model_states = []
  for _ in range(layers):
    rnn_layer = rnn_layer_class(model_out_dim, return_sequences=True,return_state=True,dropout=dropout)
    # With return_state=True the call yields the output sequence followed by
    # the layer's state tensor(s): (h, c) for LSTM, a single state otherwise.
    layer_output, *model_states = rnn_layer(layer_input, initial_state=initial_state)
    layer_input = layer_output

  return model_states,layer_input,model_inputs


In [15]:

#Build the model
def build_model(rnn_type,embedding_dim,encoder_layers,decoder_layers,dropout,latent_dim):
  """Assemble the training-time encoder/decoder model.

  The encoder and decoder stacks are both created by build_layered_RNN_model
  with the same rnn_type, embedding size, dropout and latent size. The decoder
  stack receives the encoder's final states as its initial state, and a softmax
  Dense layer named 'final' maps its output sequence to decoder-token
  probabilities. Vocabulary sizes and sequence lengths come from the
  notebook-level num_*_tokens / max_*_seq_length variables.

  Returns:
    keras.Model taking [encoder_inputs, decoder_inputs] and producing the
    per-timestep token distribution.
  """
  shared_settings = dict(rnn_type=rnn_type,embedding_out_dim=embedding_dim,dropout=dropout,model_out_dim=latent_dim)

  encoder_states,_,encoder_inputs = build_layered_RNN_model(
      embedding_in_dim=num_encoder_tokens,
      layers=encoder_layers,
      inp_length=max_encoder_seq_length,
      **shared_settings)

  _,decoder_sequence,decoder_inputs = build_layered_RNN_model(
      embedding_in_dim=num_decoder_tokens,
      layers=decoder_layers,
      inp_length=max_decoder_seq_length,
      initial_state=encoder_states,
      **shared_settings)

  token_probabilities = keras.layers.Dense(num_decoder_tokens, activation="softmax",name='final')(decoder_sequence)

  return keras.Model([encoder_inputs, decoder_inputs], token_probabilities)

In [16]:
def get_inference_encoder_model(model,encoder_layers):
  """Build a model mapping the encoder input to the encoder's final state(s).

  The layer at index encoder_layers+3 of model.layers is taken to be the last
  encoder recurrent layer.
  NOTE(review): this relies on the layer ordering Keras produces for models
  built by build_model; re-check the offset if the architecture changes.

  Returns:
    keras.Model from model.input[0] to [h, c] (LSTM) or [state] (GRU / other RNN).
  """
  last_encoder_layer = model.layers[encoder_layers+3]

  if isinstance(last_encoder_layer, keras.layers.LSTM):
    # LSTM output: sequence, hidden state, cell state
    _, state_h_enc, state_c_enc = last_encoder_layer.output
    encoder_states = [state_h_enc, state_c_enc]
  elif isinstance(last_encoder_layer, (keras.layers.GRU, keras.layers.RNN)):
    # Single-state layers: sequence, state
    _, state = last_encoder_layer.output
    encoder_states = [state]

  return keras.Model(model.input[0], encoder_states)

In [17]:
def get_inference_decoder_model(model,encoder_layers,decoder_layers,latent_dim):
  """Build the single-step decoder used at inference time.

  The decoder consumes one token id per call (the previous prediction or the
  start marker) plus one explicit state input per decoder layer state, and
  returns the token distribution followed by the updated states.

  Layer lookup is by position in model.layers: encoder_layers+2 is used as the
  decoder embedding and encoder_layers+4 onwards as the decoder recurrent layers
  ("+4" accounts for the input and embedding layers of encoder and decoder).
  NOTE(review): these offsets depend on the layer ordering of models produced by
  build_model; verify them if the architecture changes.

  Returns:
    keras.Model with inputs [token] + state inputs (h0, c0, h1, c1, ... for
    LSTM; s0, s1, ... otherwise) and outputs [probabilities] + new states in
    the same order.
  """
  # One token id per step.
  decoder_inputs =  keras.Input(shape=( 1))
  embedded_token = model.layers[encoder_layers+2](decoder_inputs)

  first_decoder_index = encoder_layers+4
  first_decoder_layer = model.layers[first_decoder_index]

  # Name prefixes of the state inputs each decoder layer needs.
  if isinstance(first_decoder_layer, keras.layers.LSTM):
    state_name_prefixes = ('inp3_', 'inp4_')   # hidden state, cell state
  elif isinstance(first_decoder_layer, (keras.layers.GRU, keras.layers.RNN)):
    state_name_prefixes = ('inp3_',)           # single state
  else:
    state_name_prefixes = ()

  # Inputs through which each layer's state is supplied / state outputs of each layer.
  decoder_states_inputs = []
  decoder_states = []
  previous_decoder_output = None
  layer_input = embedded_token

  if state_name_prefixes:
    for i in range(decoder_layers):
      init_state = [keras.Input(shape=(latent_dim,),name=prefix+str(i)) for prefix in state_name_prefixes]
      decoder_layer = model.layers[i+first_decoder_index]
      layer_output, *layer_states = decoder_layer(layer_input, initial_state=init_state)

      decoder_states_inputs.extend(init_state)
      decoder_states.extend(layer_states)
      previous_decoder_output = layer_output
      layer_input = layer_output

  decoder_outputs = model.get_layer('final')(previous_decoder_output)
  return keras.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [18]:
def build_inference_model(model,encoder_layers,decoder_layers,latent_dim):
    """Return (encoder_model, decoder_model) inference models derived from a trained `model`."""
    return (
        get_inference_encoder_model(model,encoder_layers),
        get_inference_decoder_model(model,encoder_layers,decoder_layers,latent_dim),
    )

In [19]:
# Reverse lookup of the target tokenizer: token id -> character.
index_to_char_target = {token_index: char for char, token_index in target_tokenizer.word_index.items()}

In [20]:
def decode_batch_of_sequences(rnn_type,input_seq,encoder_model,decoder_model,batch_size,encoder_layers,decoder_layers):
    """Greedily decode a batch of input sequences into predicted words.

    Args:
      rnn_type: kept for interface compatibility; the state layout is now
        detected from what encoder_model.predict returns.
      input_seq: batch of encoder token-id sequences.
      encoder_model / decoder_model: inference models from build_inference_model.
      batch_size: number of sequences in input_seq.
      encoder_layers: unused, kept for interface compatibility.
      decoder_layers: number of decoder layers; the encoder state(s) are
        repeated once per decoder layer.

    Returns:
      List of batch_size strings (without start/end markers).
    """
    encoder_states = encoder_model.predict(input_seq)
    # The old test `rnn_type=='GRU' or 'RNN'` was always true, so LSTM states
    # ([h, c]) were wrapped a second time. Normalise to a flat list instead:
    # predict() gives a list for multi-output models and a bare array otherwise.
    if isinstance(encoder_states, (list, tuple)):
        encoder_states = list(encoder_states)
    else:
        encoder_states = [encoder_states]

    # The encoder state is fed to every decoder layer.
    decoder_input_state_values = encoder_states * decoder_layers

    # Previously predicted character index for every word in the batch;
    # decoding starts from the start marker "\t".
    prev_char_index = np.zeros((batch_size, 1))
    prev_char_index[:, 0] = target_tokenizer.word_index['\t']

    predicted_words = ["" for _ in range(batch_size)]
    done = [False for _ in range(batch_size)]
    for _ in range(max_decoder_seq_length):
        decoder_out = decoder_model.predict(tuple([prev_char_index] + list(decoder_input_state_values)))
        # First output: token probabilities; remaining outputs: updated layer
        # states, which become the state inputs of the next step.
        output_probability = decoder_out[0]
        decoder_input_state_values = decoder_out[1:]

        best_token_indices = np.argmax(output_probability[:, -1, :], axis=-1)
        for j in range(batch_size):
            if done[j]:
                continue
            sampled_token_index = int(best_token_indices[j])
            # Index 0 is the padding id; treat it like the end marker.
            if sampled_token_index == 0:
                sampled_char = '\n'
            else:
                sampled_char = index_to_char_target[sampled_token_index]
            if sampled_char == '\n':
                done[j] = True
                continue
            predicted_words[j] += sampled_char
            # Feed the prediction back in as the next decoder input.
            prev_char_index[j, 0] = sampled_token_index

        # Nothing left to extend: skip the remaining decoder calls.
        if all(done):
            break
    return predicted_words

In [21]:
def decode_batch_of_sequences_for_bigger_beam_width(rnn_type,input_seq,encoder_model,decoder_model,batch_size,encoder_layers,decoder_layers,beam_search_width):
    """Decode each input sequence with beam search and return one word per sequence.

    This replaces the earlier implementation, which
      * scored hypotheses as `0 * log(p)` (always zero) instead of summing log-probabilities,
      * shared one mutable prev_char_index array between all beams,
      * multiplied GRU state arrays numerically by decoder_layers,
      * carried beam lists over from one word to the next, and
      * dropped every hypothesis that reached the end marker.
    It is self-contained: hypotheses are plain tuples and scores use np.log.

    Args:
      rnn_type, encoder_layers: unused, kept for interface compatibility.
      input_seq: batch of encoder token-id sequences.
      encoder_model / decoder_model: inference models from build_inference_model.
      batch_size: number of sequences to decode from input_seq.
      decoder_layers: number of decoder layers (encoder state is repeated per layer).
      beam_search_width: number of hypotheses kept per step (values < 1 act as 1).

    Returns:
      List of batch_size strings: the highest-scoring finished hypothesis per
      input, or the best unfinished one if none finished within
      max_decoder_seq_length steps.
    """
    print("input_seq:"+str(input_seq.shape))
    beam_search_width = max(1, int(beam_search_width))
    start_token_index = target_tokenizer.word_index['\t']
    predicted_words = ["" for _ in range(batch_size)]

    for j in range(batch_size):
        current_seq = tf.expand_dims(input_seq[j], 0)
        encoder_states = encoder_model.predict(current_seq)
        # predict() gives a list for multi-output models (LSTM: [h, c]) and a
        # bare array otherwise; normalise to a flat list of state arrays.
        if isinstance(encoder_states, (list, tuple)):
            encoder_states = list(encoder_states)
        else:
            encoder_states = [encoder_states]
        # The encoder state is fed to every decoder layer.
        initial_states = encoder_states * decoder_layers

        # Live hypothesis: (log_probability, text_so_far, last_token_index, decoder_states)
        live_beams = [(0.0, "", start_token_index, initial_states)]
        # Finished hypothesis: (log_probability, text)
        finished_beams = []

        for _ in range(max_decoder_seq_length):
            candidates = []
            for log_probability, text_so_far, last_token_index, states in live_beams:
                # Fresh array per hypothesis so beams never share input state.
                prev_char_index = np.zeros((1, 1))
                prev_char_index[:, 0] = last_token_index
                decoder_out = decoder_model.predict(tuple([prev_char_index] + list(states)))
                step_probabilities = decoder_out[0][0][-1, :]
                next_states = list(decoder_out[1:])

                for token_index in np.argsort(step_probabilities)[-beam_search_width:]:
                    token_index = int(token_index)
                    # Floor the probability so log() never sees an exact zero.
                    token_probability = max(float(step_probabilities[token_index]), 1e-12)
                    score = log_probability + float(np.log(token_probability))
                    # Index 0 is the padding id; treat it like the end marker
                    # (same convention as the greedy decoder).
                    if token_index == 0:
                        sampled_char = '\n'
                    else:
                        sampled_char = index_to_char_target[token_index]
                    if sampled_char == '\n':
                        finished_beams.append((score, text_so_far))
                    else:
                        candidates.append((score, text_so_far + sampled_char, token_index, next_states))

            candidates.sort(key=lambda beam: beam[0], reverse=True)
            live_beams = candidates[:beam_search_width]
            if not live_beams:
                break
            # Log-probabilities only decrease as a hypothesis grows, so once a
            # finished hypothesis beats the best live one it cannot be overtaken.
            if finished_beams and max(beam[0] for beam in finished_beams) >= live_beams[0][0]:
                break

        if finished_beams:
            predicted_words[j] = max(finished_beams, key=lambda beam: beam[0])[1]
        elif live_beams:
            predicted_words[j] = live_beams[0][1]

    return predicted_words

In [22]:
def test_accuracy(rnn_type,encoder_model,decoder_model,encoder_layers,decoder_layers,beam_search_width=1):
  """Evaluate the inference models on the test split and log every prediction.

  Each word is appended as "<input> <target> <prediction>" to success.txt
  (exact match) or failure.txt (mismatch).

  Args:
    rnn_type, encoder_layers, decoder_layers: forwarded to the decoding function.
    encoder_model / decoder_model: inference models from build_inference_model.
    beam_search_width: 0 or 1 selects greedy decoding, anything else beam search.

  Returns:
    (word_accuracy, char_accuracy): fraction of exactly matching words, and
    fraction of target characters matched at the same position in the
    prediction. Either value is 0.0 when there is nothing to evaluate.
  """
  num_words = test_input_tensor.shape[0]
  # Get all the predicted words
  if(beam_search_width == 0 or beam_search_width == 1):
    pred=decode_batch_of_sequences(rnn_type,test_input_tensor,encoder_model,decoder_model,num_words,encoder_layers,decoder_layers)
  else:
    pred=decode_batch_of_sequences_for_bigger_beam_width(rnn_type,test_input_tensor,encoder_model,decoder_model,num_words,encoder_layers,decoder_layers,beam_search_width)

  success=0
  success_char = 0
  total_chars = 0
  # Open both logs once (still append mode) instead of reopening a file per
  # word; utf-8 is explicit because the targets are non-ASCII text.
  with open("success.txt", "a", encoding="utf-8") as success_log, \
       open("failure.txt", "a", encoding="utf-8") as failure_log:
    for seq_index in range(num_words):
      predicted_word = pred[seq_index]
      # Strip the "\t" start marker and "\n" end marker.
      target_word=test_target_texts[seq_index][1:-1]

      total_chars += len(target_word)
      # zip stops at the shorter word, so only positions present in both count.
      success_char += sum(1 for expected, actual in zip(target_word, predicted_word) if expected == actual)

      record = test_input_texts[seq_index]+' '+target_word+' '+predicted_word+'\n'
      if target_word == predicted_word:
        success+=1
        success_log.write(record)
      else:
        failure_log.write(record)

  print("success:"+str(success))
  print("success_char:"+str(success_char))
  word_accuracy = float(success)/float(num_words) if num_words else 0.0
  char_accuracy = float(success_char)/float(total_chars) if total_chars else 0.0
  return word_accuracy,char_accuracy

In [23]:
def batch_validate(rnn_type,encoder_model,decoder_model,encoder_layers,decoder_layers,beam_search_width=1):
  """Score the inference models on the validation split.

  A beam_search_width of 0 or 1 uses greedy decoding; any other value uses the
  beam-search decoder.

  Returns:
    (word_accuracy, char_accuracy): share of exactly matching words, and share
    of target characters matched at the same position in the prediction.
  """
  num_words = val_input_tensor.shape[0]

  # Get all the predicted words
  if beam_search_width in (0, 1):
    pred = decode_batch_of_sequences(rnn_type,val_input_tensor,encoder_model,decoder_model,num_words,encoder_layers,decoder_layers)
  else:
    pred = decode_batch_of_sequences_for_bigger_beam_width(rnn_type,val_input_tensor,encoder_model,decoder_model,num_words,encoder_layers,decoder_layers,beam_search_width)

  success = 0
  success_char = 0
  total_chars = 0
  for seq_index in range(num_words):
    predicted_word = pred[seq_index]
    # Drop the start ("\t") and end ("\n") markers of the stored target.
    target_word = val_target_texts[seq_index][1:-1]

    if predicted_word == target_word:
      success += 1

    # Position-wise character comparison over the overlapping prefix.
    total_chars += len(target_word)
    success_char += sum(1 for expected, actual in zip(target_word, predicted_word) if expected == actual)

  print("success:"+str(success))
  print("success_char:"+str(success_char))
  return float(success)/float(num_words),float(success_char)/float(total_chars)

In [24]:
# Default hyperparameters handed to wandb.init(config=...) in HP_tuning_run.
default_config = {
        "rnn_type": "LSTM",
        "dropout": 0.5,
        "encoder_layers":3,
        "decoder_layers":4,
        "latent_dim": 64,
        "epochs": 10,
        "lr": 0.0001,
        "embedding_out_dim": 64,
        # Beam width passed to batch_validate (0 or 1 means greedy decoding).
        # Stored as an int, like the sweep values, instead of the former False.
        "beam_search":0,
        "batch_size":64
    }

# Keras History callback instance, created once here and passed in the
# callbacks list of every model.fit call (HP_tuning_run and run_best_model).
history = History()

In [25]:
def HP_tuning_run():
    """Train and validate one model for the current W&B run configuration.

    Steps: pick a distribution strategy, start the W&B run (default_config
    supplies the default values), build and compile the model, train it on the
    training tensors, save it to an .h5 file and reload it, build the inference
    encoder/decoder from the reloaded model, and log validation accuracies.

    Logged metrics are numeric and written in a single wandb.log call;
    previously they were logged as strings in three separate calls, although
    `word_val_acc` is the metric the sweep maximises.
    """
    # Create a MirroredStrategy when a GPU is available, else use the default strategy.
    if tf.config.list_physical_devices('GPU'):
        strategy = tf.distribute.MirroredStrategy()
    else:
        strategy = tf.distribute.get_strategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    wandb.init(config=default_config, magic=True,project="DeepLearningAssignment-3", entity='cs21s002-ee21s113-dlassignment-1')
    config = wandb.config
    print("Config: "+str(config))
    run_name = str(config).replace("{", "").replace("}","").replace(":","-")
    wandb.run.name = run_name

    # Build AND compile inside the strategy scope so that model and optimizer
    # are created under the same strategy.
    with strategy.scope():
      model = build_model(config.rnn_type,config.embedding_out_dim,config.encoder_layers,config.decoder_layers,config.dropout,config.latent_dim)
      model.compile(optimizer=keras.optimizers.Adam(config.lr), loss=keras.losses.SparseCategoricalCrossentropy(reduction='none'), metrics=["accuracy"])

    plot_model(model, to_file='model.png', show_shapes=True, show_dtype=True,show_layer_names=True)

    model.summary()

    # Training targets: the decoder input shifted left by one step, with a
    # zero column appended to keep the original width.
    decoder_targets = tf.concat([train_target_tensor[:,1:],tf.zeros((train_target_tensor.shape[0],1))], axis=1)
    model.fit([train_input_tensor, train_target_tensor],decoder_targets,batch_size=config.batch_size,epochs=config.epochs,shuffle=True,callbacks=[WandbCallback(), history])

    # Save as HDF5 under a name derived from the run name, then reload it.
    model_path = f'{run_name.replace(",","-")}.h5'
    model.save(model_path)
    inf = keras.models.load_model(model_path)
    encoder_inference_model,decoder_inference_model=build_inference_model(inf,encoder_layers=config.encoder_layers,decoder_layers=config.decoder_layers,latent_dim=config.latent_dim)
    plot_model(encoder_inference_model, to_file='encoder_model.png', show_shapes=True)
    plot_model(decoder_inference_model, to_file='decoder_model.png', show_shapes=True)

    word_val_acc,char_val_acc=batch_validate(config.rnn_type,encoder_inference_model,decoder_inference_model,config.encoder_layers,config.decoder_layers,config.beam_search)
    print("word_val_acc"+str(word_val_acc))
    print("char_val_acc"+str(char_val_acc))
    wandb.log({
        "word_val_acc": round(word_val_acc,5),
        "char_val_acc": round(char_val_acc,5),
        "language": str(transliteration_target_language),
    })
    


In [26]:
def run_best_model():
    """Train the hard-coded best configuration and evaluate it on the test split.

    Returns:
      (model, encoder_inference_model, decoder_inference_model)

    Uses WandbCallback without calling wandb.init, so a W&B run must already be
    active when this is called.
    """
    encoder_layers = 3
    decoder_layers = 3
    epochs = 20
    lr = 0.0001
    latent_dim = 2048
    rnn_type = 'GRU'
    embedding_dim = 128
    dropout = 0.3
    bs = 64
    beam_width = 0   # 0 or 1 selects greedy decoding in test_accuracy

    # Create a MirroredStrategy when a GPU is available, else use the default strategy.
    if tf.config.list_physical_devices('GPU'):
        strategy = tf.distribute.MirroredStrategy()
    else:
        strategy = tf.distribute.get_strategy()
    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    # Build AND compile inside the strategy scope so that model and optimizer
    # are created under the same strategy.
    with strategy.scope():
      # decoder_layers is passed explicitly; encoder_layers used to be passed
      # twice, which only worked because both values are 3.
      model = build_model(rnn_type,embedding_dim,encoder_layers,decoder_layers,dropout,latent_dim)
      model.compile(optimizer=keras.optimizers.Adam(lr), loss=keras.losses.SparseCategoricalCrossentropy(reduction='none'), metrics=["accuracy"])

    plot_model(model, to_file='best_model.png', show_shapes=True, show_dtype=True,show_layer_names=True)

    model.summary()

    # Training targets: the decoder input shifted left by one step, with a
    # zero column appended to keep the original width.
    decoder_targets = tf.concat([train_target_tensor[:,1:],tf.zeros((train_target_tensor.shape[0],1))], axis=1)
    model.fit([train_input_tensor, train_target_tensor],decoder_targets,batch_size=bs,epochs=epochs,shuffle=True,callbacks=[WandbCallback(), history])

    encoder_inference_model,decoder_inference_model=build_inference_model(model,encoder_layers=encoder_layers,decoder_layers=decoder_layers,latent_dim=latent_dim)
    plot_model(encoder_inference_model, to_file='best_encoder_model.png', show_shapes=True)
    plot_model(decoder_inference_model, to_file='best_decoder_model.png', show_shapes=True)

    word_val_acc,char_val_acc=test_accuracy(rnn_type,encoder_inference_model,decoder_inference_model,encoder_layers,decoder_layers,beam_width)
    print("Test word_val_acc"+str(word_val_acc))
    print("Test char_val_acc"+str(char_val_acc))

    return model,encoder_inference_model,decoder_inference_model

In [27]:
# Train the hard-coded best configuration, evaluate it on the test split, and keep
# the trained model together with its inference encoder and decoder.
model,encoder_inference_model,decoder_inference_model = run_best_model()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 26, 128)      3456        ['input_1[0][0]']                
                                                                                                  
 gru (GRU)                      [(None, 26, 2048),   13381632    ['embedding[0][0]']              
                                 (None, 2048)]                                                    
                                                                      

In [28]:
# Persist the trained model and both inference sub-models as HDF5 files.
for _h5_path, _keras_model in (
    ('best_model.h5', model),
    ('best_encoder_inference_model.h5', encoder_inference_model),
    ('best_decoder_inference_model.h5', decoder_inference_model),
):
    _keras_model.save(_h5_path)



In [None]:
# Candidate values per hyperparameter; expanded below into {"values": [...]} entries.
_search_space = {
    "rnn_type": ["LSTM","GRU"],
    "dropout": [0.2,0.3,0.5],
    "encoder_layers": [2,3],
    "decoder_layers": [2,3],
    "latent_dim": [512,1024,2048],
    "epochs": [15,20],
    "lr": [0.0001],
    "embedding_out_dim": [64,128],
    "beam_search": [0],
    "batch_size": [64],
}

# Bayesian sweep maximising word_val_acc, with hyperband early termination.
sweep_config = {
    "name": "Assignment 3 - deeper",
    "method": "bayes",
    "metric": {"goal": "maximize", "name": "word_val_acc"},
    "early_terminate": {"type": "hyperband", "min_iter": 2, "eta": 2},
    "project": 'DeepLearningAssignment-3',
    "parameters": {
        parameter_name: {"values": candidate_values}
        for parameter_name, candidate_values in _search_space.items()
    },
}

In [None]:
# sweep_id = wandb.sweep(sweep_config,  project='DeepLearningAssignment-3', entity='akshaygrao')
# sweep_id = wandb.sweep(sweep_config,  project='DeepLearningAssignment-3', entity='cs21s002-ee21s113-dlassignment-1')
# Hard-coded sweep id handed to wandb.agent below; the wandb.sweep calls above,
# which would produce an id from sweep_config, are left disabled.
sweep_id="ohk45oaz"

In [None]:
# wandb.agent(sweep_id, function=HP_tuning_run, project='DeepLearningAssignment-3', entity='akshaygrao')
# Start a sweep agent for sweep_id; it invokes HP_tuning_run for the runs it starts.
wandb.agent(sweep_id, function=HP_tuning_run, project='DeepLearningAssignment-3', entity='cs21s002-ee21s113-dlassignment-1')

[34m[1mwandb[0m: Agent Starting Run: ly8xpgof with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search: 0
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_out_dim: 128
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	latent_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	rnn_type: LSTM


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Config: {'batch_size': 64, 'beam_search': 0, 'decoder_layers': 2, 'dropout': 0.2, 'embedding_out_dim': 128, 'encoder_layers': 2, 'epochs': 20, 'latent_dim': 512, 'lr': 0.0001, 'rnn_type': 'LSTM'}
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 26, 128)      3456        ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                

VBox(children=(Label(value='0.590 MB of 0.590 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▄▅▆▆▆▇▇▇▇▇█████
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▆▆▅▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁

0,1
accuracy,0.96602
char_val_acc,0.63569
epoch,19
language,kn
loss,0.1153
word_val_acc,0.27651


[34m[1mwandb[0m: Agent Starting Run: psqw09bp with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search: 0
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_out_dim: 64
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	latent_dim: 2048
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	rnn_type: LSTM


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1




Config: {'batch_size': 64, 'beam_search': 0, 'decoder_layers': 2, 'dropout': 0.5, 'embedding_out_dim': 64, 'encoder_layers': 3, 'epochs': 15, 'latent_dim': 2048, 'lr': 0.0001, 'rnn_type': 'LSTM'}
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 26, 64)       1728        ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 26, 2048),   17309696    ['embedding[0][0]']              
                                 (None, 2048),                                                  

[34m[1mwandb[0m: [32m[41mERROR[0m Error while calling W&B API: context deadline exceeded (<Response [500]>)


Epoch 15/15
success:1397
success_char:26590
word_val_acc0.2778993435448578
char_val_acc0.654668111089226



VBox(children=(Label(value='0.726 MB of 0.726 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▃▄▅▆▆▇▇▇▇█████
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▆▅▄▃▂▂▂▂▁▁▁▁▁▁

0,1
accuracy,0.98625
char_val_acc,0.65467
epoch,14
language,kn
loss,0.04547
word_val_acc,0.2779


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: dsi6b3bf with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search: 0
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_out_dim: 64
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	latent_dim: 1024
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	rnn_type: LSTM


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1




Config: {'batch_size': 64, 'beam_search': 0, 'decoder_layers': 3, 'dropout': 0.2, 'embedding_out_dim': 64, 'encoder_layers': 3, 'epochs': 20, 'latent_dim': 1024, 'lr': 0.0001, 'rnn_type': 'LSTM'}
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 26, 64)       1728        ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 26, 1024),   4460544     ['embedding[0][0]']              
                                 (None, 1024),                                                  

VBox(children=(Label(value='0.859 MB of 0.859 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▃▄▅▅▆▆▇▇▇▇▇████████
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▆▅▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.98985
char_val_acc,0.64871
epoch,19
language,kn
loss,0.03512
word_val_acc,0.26676


[34m[1mwandb[0m: Agent Starting Run: 59n4lex2 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search: 0
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_out_dim: 64
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	latent_dim: 2048
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	rnn_type: GRU


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1




Config: {'batch_size': 64, 'beam_search': 0, 'decoder_layers': 2, 'dropout': 0.5, 'embedding_out_dim': 64, 'encoder_layers': 2, 'epochs': 15, 'latent_dim': 2048, 'lr': 0.0001, 'rnn_type': 'GRU'}
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 26, 64)       1728        ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                 

VBox(children=(Label(value='0.552 MB of 0.552 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▆▆▇▇▇▇████
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▆▆▅▄▃▃▂▂▂▁▁▁▁▁

0,1
accuracy,0.9866
char_val_acc,0.68195
epoch,14
language,kn
loss,0.04659
word_val_acc,0.31649


[34m[1mwandb[0m: Agent Starting Run: bgd4n6as with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search: 0
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_out_dim: 64
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	latent_dim: 1024
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	rnn_type: LSTM


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1




Config: {'batch_size': 64, 'beam_search': 0, 'decoder_layers': 2, 'dropout': 0.2, 'embedding_out_dim': 64, 'encoder_layers': 3, 'epochs': 20, 'latent_dim': 1024, 'lr': 0.0001, 'rnn_type': 'LSTM'}
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 26)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 26, 64)       1728        ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 26, 1024),   4460544     ['embedding[0][0]']              
                                 (None, 1024),                                                  

In [None]:
# Build a model for a single manual (non-sweep) training run.
# NOTE(review): build_model is defined outside this section; the positional
# arguments presumably map to (rnn_type, embedding_out_dim, encoder_layers,
# decoder_layers, dropout, latent_dim) -- confirm against its signature.
model = build_model(
    'LSTM',
    32,
    2,
    3,
    0.3,
    64,
)

In [None]:
# Compile with Adam and sparse categorical cross-entropy.
# reduction='none' asks the loss object not to reduce its per-element values
# itself; any aggregation is left to the Keras training loop.
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(reduction='none'),
    metrics=["accuracy"],
)

In [None]:
# Train with both the input and the target tensor as model inputs. The labels
# are the target tensor shifted left by one position, with a column of zeros
# appended so the label width equals the target width.
# NOTE(review): zero is presumably the padding index -- confirm with the tokenizer.
hist = model.fit(
    [train_input_tensor, train_target_tensor],
    tf.concat(
        [
            train_target_tensor[:, 1:],
            tf.zeros((train_target_tensor.shape[0], 1)),
        ],
        axis=1,
    ),
    batch_size=32,
    epochs=5,
    shuffle=True,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Save the trained model and reload it from the *same* location.
# The path is defined once so that save and load cannot drift apart; the
# previous version saved to a relative path but loaded from a hard-coded
# absolute "/content/..." path, which only worked when the working directory
# happened to be /content.
S2S_MODEL_PATH = "s2s.keras"
model.save(S2S_MODEL_PATH)

# Run inferencing
# Restore the model and construct the encoder and decoder.
inf = keras.models.load_model(S2S_MODEL_PATH)
# NOTE(review): encoder_layers / decoder_layers / latent_dim presumably have to
# match the values used when the model was built -- confirm against build_model.
encoder_model, decoder_model = build_inference_model(
    inf,
    encoder_layers=2,
    decoder_layers=3,
    latent_dim=64,
)

In [None]:
# Run validation with the inference encoder/decoder and report both metrics
# returned by batch_validate.
val_acc, char_val_acc = batch_validate(
    'LSTM',
    encoder_model,
    decoder_model,
    2,
    3,
)
print(f"val_acc:{val_acc!s}")
print(f"char_val_acc:{char_val_acc!s}")