In [None]:
!curl --header "Host: storage.googleapis.com" --header "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77" --header "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header "Accept-Language: en-US,en;q=0.9" --header "Referer: https://github.com/google-research-datasets/dakshina" "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar" -L -o "dakshina_dataset_v1.0.tar"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1915M  100 1915M    0     0   108M      0  0:00:17  0:00:17 --:--:-- 32.0M


In [None]:
import shutil

# Extract the downloaded Dakshina tarball under /content/.
shutil.unpack_archive(
    filename="/content/dakshina_dataset_v1.0.tar",
    extract_dir='/content/',
)

In [None]:
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential,Model,load_model
from keras.layers import Dense,LSTM,GRU,SimpleRNN,Input,Dropout,TimeDistributed,RepeatVector,dot,BatchNormalization,concatenate,multiply,Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
!pip install wandb

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/98/5f/45439b4767334b868e1c8c35b1b0ba3747d8c21be77b79f09eed7aa3c72b/wandb-0.10.30-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 7.5MB/s 
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/27/da/6f6224fdfc47dab57881fe20c0d1bc3122be290198ba0bf26a953a045d92/GitPython-3.1.17-py3-none-any.whl (166kB)
[K     |████████████████████████████████| 174kB 34.6MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl
Collecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/fd/01/ff260a18caaf4457eb028c96eeb405

In [None]:
import wandb
from wandb.keras import WandbCallback


In [None]:
class seq2seq:
    """Character-level encoder-decoder transliteration model on a Dakshina lexicon.

    Column 1 of each lexicon TSV (romanized word) is the encoder input and
    column 0 (native-script word) is the decoder target. Every word is wrapped
    as ``'\\t' + word + '\\n'`` so that ``'\\t'`` marks the start and ``'\\n'``
    the end of a sequence.

    Typical use: ``create_data()`` -> ``create_model()`` -> ``run(model)``.

    Args:
        cell: Recurrent layer type: ``"lstm"``, ``"rnn"`` or ``"gru"``.
        embedding_size: Output dimension of both embedding layers.
        latent_dim: Number of units in every recurrent layer.
        encoder_layers_size: Number of stacked encoder recurrent layers.
        decoder_layers_size: Number of stacked decoder recurrent layers.
        dropouts: ``dropout`` rate passed to every recurrent layer.
        epochs: Number of training epochs used by ``run``.
        batch_size: Training batch size used by ``run``.
    """

    def __init__(self,cell,embedding_size,latent_dim,encoder_layers_size,decoder_layers_size,dropouts,epochs,batch_size):
        self.cell = cell
        self.embedding_size = embedding_size
        self.latent_dim = latent_dim
        self.encoder_layers_size = encoder_layers_size
        self.decoder_layers_size = decoder_layers_size
        self.dropouts=dropouts
        self.epochs=epochs
        self.batch_size=batch_size

    @staticmethod
    def _read_pairs(path):
        """Read a tab-separated lexicon file and drop rows with a missing field.

        Malformed lines are skipped with a warning. ``error_bad_lines`` was
        removed in pandas 2.x, so the equivalent ``on_bad_lines="warn"`` is
        used whenever the installed pandas (>= 1.3) supports it.
        """
        major, minor = (int(part) for part in pd.__version__.split(".")[:2])
        if (major, minor) >= (1, 3):
            bad_line_kwargs = {"on_bad_lines": "warn"}
        else:
            bad_line_kwargs = {"error_bad_lines": False}
        return pd.read_csv(path, sep="\t", header=None, **bad_line_kwargs).dropna()

    def _ids_to_word(self, ids):
        """Map target-token ids back to text, with every '\\t' and '\\n' removed."""
        chars = ''.join(self.inv_target_token_index[int(token_id)] for token_id in ids)
        return re.sub('\t|\n', '', chars)

    def get_data(self,path):
        """Encode one lexicon split with the vocabularies built by ``create_data``.

        Reads ``self.max_length_y``, ``self.decoder_tokens``,
        ``self.input_token_index`` and ``self.target_token_index``.

        Returns:
            A 3-tuple ``(encoder_ids, decoder_ids, decoder_target_data)``:
            two lists of per-word token-id lists (unpadded), and a float32
            one-hot array of shape ``(rows, max_length_y, decoder_tokens)``
            holding the decoder input shifted left by one step, with every
            position after the end of the word set to ``'\\n'``.

        Raises:
            KeyError: If a word contains a character missing from the vocabulary.
            IndexError: If a target word is longer than ``self.max_length_y``.
        """
        pairs = self._read_pairs(path)
        source_words = ['\t' + word + '\n' for word in pairs[1]]
        target_words = ['\t' + word + '\n' for word in pairs[0]]

        decoder_target_data = np.zeros(
            (len(target_words), self.max_length_y, self.decoder_tokens), dtype="float32")
        end_index = self.target_token_index["\n"]

        for row, target_text in enumerate(target_words):
            # The target is one step ahead of the decoder input, so the leading '\t' is skipped.
            for step, char in enumerate(target_text[1:]):
                decoder_target_data[row, step, self.target_token_index[char]] = 1.0
            # Fill every remaining time step with the end-of-word token.
            decoder_target_data[row, len(target_text) - 1:, end_index] = 1.0

        encoder_ids = [[self.input_token_index[char] for char in word] for word in source_words]
        decoder_ids = [[self.target_token_index[char] for char in word] for word in target_words]
        return encoder_ids, decoder_ids, decoder_target_data

    def create_vocab(self,path):
        """Build sorted character vocabularies and maximum lengths from one split.

        Returns:
            ``(telugu_list, english_list, max_length_x, max_length_y)`` where the
            lists are the sorted unique characters (including '\\t' and '\\n')
            of column 0 and column 1 respectively, and the lengths count the
            wrapping '\\t' / '\\n' characters.

        Raises:
            ValueError: If the file has no usable rows.
        """
        pairs = self._read_pairs(path)
        x = ['\t' + word + '\n' for word in pairs[1]]
        y = ['\t' + word + '\n' for word in pairs[0]]

        english_list = sorted({char for word in x for char in word})
        telugu_list = sorted({char for word in y for char in word})

        max_length_x = max(len(word) for word in x)
        max_length_y = max(len(word) for word in y)

        return telugu_list,english_list,max_length_x,max_length_y

    def create_data(self, data_dir="/content/dakshina_dataset_v1.0/te/lexicons", lang="te"):
        """Load the train/dev/test lexicons and store padded id arrays on ``self``.

        The vocabulary and maximum lengths come from the training split only, so
        a dev/test word with an unseen character or a longer target makes
        ``get_data`` raise.

        Args:
            data_dir: Directory holding the ``<lang>.translit.sampled.*.tsv`` files.
            lang: Language code used as the file-name prefix.
        """
        train_path = f"{data_dir}/{lang}.translit.sampled.train.tsv"
        cv_path = f"{data_dir}/{lang}.translit.sampled.dev.tsv"
        test_path = f"{data_dir}/{lang}.translit.sampled.test.tsv"

        telugu_list,english_list,self.max_length_x,self.max_length_y = self.create_vocab(train_path)
        self.encoder_tokens = len(english_list)
        self.decoder_tokens = len(telugu_list)

        # Char -> index lookups.
        self.input_token_index = {char: i for i, char in enumerate(english_list)}
        self.target_token_index = {char: i for i, char in enumerate(telugu_list)}

        # Index -> char lookups.
        self.inv_input_token_index = {i: char for char, i in self.input_token_index.items()}
        self.inv_target_token_index = {i: char for char, i in self.target_token_index.items()}

        encoder_train,decoder_train,self.decoder_target_train = self.get_data(train_path)
        encoder_cv,decoder_cv,self.decoder_target_cv = self.get_data(cv_path)
        encoder_test,decoder_test,self.decoder_target_test = self.get_data(test_path)

        # pad_sequences pads with id 0 by default. NOTE(review): with the sorted
        # vocabulary id 0 is presumably '\t', which word decoding strips - confirm.
        def pad(sequences, maxlen):
            return sequence.pad_sequences(sequences, maxlen=maxlen, padding="post")

        self.encoder_train = pad(encoder_train, self.max_length_x)
        self.decoder_train = pad(decoder_train, self.max_length_y)
        self.encoder_cv = pad(encoder_cv, self.max_length_x)
        self.decoder_cv = pad(decoder_cv, self.max_length_y)
        self.encoder_test = pad(encoder_test, self.max_length_x)
        self.decoder_test = pad(decoder_test, self.max_length_y)

    def create_model(self):
        """Build the (uncompiled) teacher-forced encoder-decoder Keras model.

        The final state of the last encoder layer initialises the first decoder
        layer only; further decoder layers start from their default state.
        ``decoder_layers_size`` below 1 still yields one decoder layer.

        Returns:
            A ``Model`` mapping ``[encoder_ids, decoder_ids]`` to a softmax over
            ``decoder_tokens`` at every decoder time step.

        Raises:
            ValueError: If ``cell`` is not supported or ``encoder_layers_size < 1``.
        """
        rnn_classes = {"lstm": LSTM, "rnn": SimpleRNN, "gru": GRU}
        if self.cell not in rnn_classes:
            raise ValueError(
                "cell must be one of %s, got %r" % (sorted(rnn_classes), self.cell))
        if self.encoder_layers_size < 1:
            raise ValueError("encoder_layers_size must be at least 1")
        rnn_class = rnn_classes[self.cell]

        def new_rnn():
            return rnn_class(self.latent_dim, return_sequences=True, return_state=True,
                             dropout=self.dropouts)

        encoder_inputs = Input(shape=(None,))
        x = Embedding(self.encoder_tokens,self.embedding_size,input_length=self.max_length_x)(encoder_inputs)
        for _layer in range(self.encoder_layers_size):
            # LSTM returns (sequence, h, c); SimpleRNN/GRU return (sequence, state).
            x, *encoder_states = new_rnn()(x)

        decoder_inputs = Input(shape=(None,))
        decoder_seq = Embedding(self.decoder_tokens,self.embedding_size,input_length=self.max_length_y)(decoder_inputs)
        decoder_seq, *_states = new_rnn()(decoder_seq, initial_state=encoder_states)
        for _layer in range(self.decoder_layers_size-1):
            decoder_seq, *_states = new_rnn()(decoder_seq)

        decoder_outputs = TimeDistributed(Dense(self.decoder_tokens, activation='softmax'))(decoder_seq)

        return Model([encoder_inputs, decoder_inputs], decoder_outputs)

    def percentage_of_correct_test_predictions(self,model,batch_size=512):
        """Return the fraction (0..1) of test words predicted exactly right.

        Predictions are teacher-forced: the model is given the true decoder
        input, and the per-step argmax is compared with that input after
        removing '\\t' and '\\n' from both strings. Words are scored in batches
        of ``batch_size`` rather than one model call per word.
        """
        total = self.encoder_test.shape[0]
        if total == 0:
            return 0.0

        count = 0
        for start in tqdm(range(0, total, batch_size)):
            encoder_batch = self.encoder_test[start:start + batch_size]
            decoder_batch = self.decoder_test[start:start + batch_size]
            probabilities = np.asarray(model([encoder_batch, decoder_batch]))
            predicted_ids = probabilities.argmax(axis=-1)
            for actual_row, predicted_row in zip(decoder_batch, predicted_ids):
                if self._ids_to_word(actual_row) == self._ids_to_word(predicted_row):
                    count += 1
        return count/total

    def run(self,model):
        """Compile ``model`` and train it, logging metrics to Weights & Biases.

        ``Word_Accuracy_Callback`` runs first and adds the word-level
        ``"validation accuracy"`` to the epoch logs, so ``WandbCallback`` must
        monitor exactly that key (higher is better) for best-model saving to fire.
        """
        model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=["accuracy"])

        model.fit(
            [self.encoder_train,self.decoder_train], self.decoder_target_train,
            batch_size=self.batch_size,
            epochs=self.epochs,
            validation_data=([self.encoder_cv,self.decoder_cv], self.decoder_target_cv),
            callbacks=[
                Word_Accuracy_Callback(self),
                WandbCallback(monitor='validation accuracy', mode='max', save_model=True),
            ],
            verbose=1)
        


In [None]:
class Word_Accuracy_Callback(tf.keras.callbacks.Callback):
  """Adds word-level validation accuracy to the Keras logs after every epoch.

  A word counts as correct when the teacher-forced per-step argmax, with every
  '\\t' and '\\n' removed, equals the validation decoder input with the same
  characters removed. The value is stored in ``logs["validation accuracy"]``
  so that callbacks listed after this one can read it.

  Args:
    obj: Object exposing ``encoder_cv``, ``decoder_cv`` and
      ``inv_target_token_index`` (the ``seq2seq`` instance being trained).
    batch_size: Number of validation words scored per forward pass.
  """

  def __init__(self,obj,batch_size=512):
    # Let the Keras base class set up its own attributes first.
    super().__init__()
    self.obj = obj
    self.batch_size = batch_size

  def _decode(self,ids):
    """Turn a row of target-token ids into text without '\\t' and '\\n'."""
    text = ''.join(self.obj.inv_target_token_index[int(token_id)] for token_id in ids)
    return re.sub('\t|\n','',text)

  def on_epoch_end(self,epoch,logs=None):
    encoder_cv = self.obj.encoder_cv
    decoder_cv = self.obj.decoder_cv
    total = encoder_cv.shape[0]

    count = 0
    # Score the split in batches instead of one forward pass per word.
    for start in range(0,total,self.batch_size):
      encoder_batch = encoder_cv[start:start+self.batch_size]
      decoder_batch = decoder_cv[start:start+self.batch_size]
      probabilities = np.asarray(self.model([encoder_batch,decoder_batch],training=False))
      predicted_ids = probabilities.argmax(axis=-1)
      for actual_row,predicted_row in zip(decoder_batch,predicted_ids):
        if self._decode(actual_row) == self._decode(predicted_row):
          count+=1

    # An empty validation split reports 0.0 instead of dividing by zero.
    accuracy = count/total if total else 0.0
    if logs is not None:
      logs["validation accuracy"] = accuracy
    print("vallidation accuracy =", accuracy)


In [None]:
# Authenticate (prompting for an API key if needed) and open a run in the project.
wandb.init(
    entity='adi00510',
    project='assignment 3 question 2_',
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Random-search space for the seq2seq hyperparameters.
# The sweep metric is the word-level accuracy that Word_Accuracy_Callback logs
# as "validation accuracy" each epoch; plain "accuracy" is the per-character
# training accuracy and is a poor target for model selection.
sweep_config={
    'method': 'random',
    'metric': {
        'name': 'validation accuracy',
        'goal': 'maximize'
    },
    'parameters':{
        'epochs':{
            'values':[3,5,6]
        },
        'embedding_size':{
            'values':[8,12,16,20]
        },
        'encoder_layers_size':{
            'values':[1,2,3]
        },
        'decoder_layers_size':{
            'values':[1,2,3]
        },
        'cell':{
            'values':["rnn","gru","lstm"]
        },
        'dropouts':{
            'values':[0,0.2,0.3,0.4]
        },
        'latent_dim':{
            'values':[16,32,64,128,256,512]
        },
        'batch_size':{
            'values':[32,64]
        }
    }
}

In [None]:
!pip install --upgrade wandb
import wandb
!wandb login 3c967c63b099a3b2acd600aa30008e7de1ea6498

Requirement already up-to-date: wandb in /usr/local/lib/python3.7/dist-packages (0.10.30)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Register the sweep defined by sweep_config and keep its id for the agent.
sweep_id = wandb.sweep(
    sweep_config,
    entity="adi00510",
    project='assignment 3 question 2_',
)



Create sweep with ID: x5jjfioq
Sweep URL: https://wandb.ai/adi00510/assignment%203%20question%202_/sweeps/x5jjfioq


In [None]:
def train():
    """Train one seq2seq model using the hyperparameters of the current W&B run.

    The defaults below apply only to values the sweep does not supply, so their
    keys must match the attribute names read from ``wandb.config``.
    """
    config_defaults={
      'epochs':5,
      'embedding_size':16,
      'dropouts':0,
      'encoder_layers_size':2,
      'decoder_layers_size':2,
      'cell':"lstm",
      'latent_dim':512,
      'batch_size': 64
       }

    wandb.init(config=config_defaults)
    config=wandb.config

    s2s=seq2seq(config.cell,config.embedding_size,config.latent_dim,config.encoder_layers_size,config.decoder_layers_size,config.dropouts,config.epochs,config.batch_size)
    s2s.create_data()
    model=s2s.create_model()
    s2s.run(model)



    

In [None]:
# Launch an agent that repeatedly calls train() with configurations drawn from the sweep.
# NOTE(review): no run count is given, so this presumably keeps going until interrupted - confirm.
wandb.agent(sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: chl404lv with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell: lstm
[34m[1mwandb[0m: 	decoder_layers_size: 3
[34m[1mwandb[0m: 	dropouts: 0.2
[34m[1mwandb[0m: 	embedding_size: 20
[34m[1mwandb[0m: 	encoder_layers_size: 3
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	latent_dim: 64


Epoch 1/3
vallidation accuracy = 0.0
Epoch 2/3
vallidation accuracy = 0.0
Epoch 3/3
vallidation accuracy = 0.00017596339961288053


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,2.0
loss,0.91317
accuracy,0.7359
val_loss,0.77441
val_accuracy,0.76989
validation accuracy,0.00018
_runtime,719.0
_timestamp,1621492372.0
_step,2.0


0,1
epoch,▁▅█
loss,█▃▁
accuracy,▁▅█
val_loss,█▆▁
val_accuracy,▁▄█
validation accuracy,▁▁█
_runtime,▁▄█
_timestamp,▁▄█
_step,▁▅█


[34m[1mwandb[0m: Agent Starting Run: sp8t5pna with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell: gru
[34m[1mwandb[0m: 	decoder_layers_size: 1
[34m[1mwandb[0m: 	dropouts: 0.2
[34m[1mwandb[0m: 	embedding_size: 20
[34m[1mwandb[0m: 	encoder_layers_size: 3
[34m[1mwandb[0m: 	epochs: 6
[34m[1mwandb[0m: 	latent_dim: 512
[34m[1mwandb[0m: Currently logged in as: [33madi00510[0m (use `wandb login --relogin` to force relogin)


Epoch 1/6
vallidation accuracy = 0.06704205525250748
Epoch 2/6
