In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import pandas as pd
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import csv
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [3]:
val_df = pd.read_csv("/kaggle/input/assignment3/aksharantar_sampled/tel/tel_valid.csv", header=None)
train_df = pd.read_csv("/kaggle/input/assignment3/aksharantar_sampled/tel/tel_train.csv", header=None)
test_df = pd.read_csv("/kaggle/input/assignment3/aksharantar_sampled/tel/tel_test.csv", header=None)
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


In [4]:
tsv_file = open("/kaggle/input/assignment3/aksharantar_sampled/tel/tel_train.csv")
read_tsv = csv.reader(tsv_file)
val_tsv_file = open("/kaggle/input/assignment3/aksharantar_sampled/tel/tel_valid.csv")
val_read_tsv = csv.reader(val_tsv_file)
test_tsv_file = open("/kaggle/input/assignment3/aksharantar_sampled/tel/tel_test.csv")
test_read_tsv = csv.reader(test_tsv_file)

In [5]:
train_X = []
train_Y = []
test_X = []
test_Y = []
val_X = []
val_Y = []
for i in read_tsv:   
    train_Y.append(i[1])
    train_X.append(i[0])
for i in val_read_tsv:
    val_Y.append(i[1])
    val_X.append(i[0])
for i in test_read_tsv:
    test_Y.append(i[1])
    test_X.append(i[0])

test_Y = np.array(test_Y)
test_X = np.array(test_X)
for i in range(test_Y.shape[0]):
    test_Y[i] = "\t" + test_Y[i] + "\n"
train_Y = np.array(train_Y)
train_X = np.array(train_X)
for i in range(train_Y.shape[0]):
    train_Y[i] = "\t" + train_Y[i] + "\n"
val_Y = np.array(val_Y)
val_X = np.array(val_X)
for i in range(val_Y.shape[0]):
    val_Y[i] = "\t" + val_Y[i] + "\n"

In [19]:
train_X

array(['vargaalavaarine', 'vastadira', 'factamfos', ...,
       'venakkiteesukoovaalane', 'roopaantaraalu', 'chendindindi'],
      dtype='<U28')

In [6]:
print(test_Y)
print(test_X)

['\tవిత్తనాన్ని\n' '\tప్రయాణికులు\n' '\tహసన్\n' ... '\tతెలంగాణ\n'
 '\tపటేల్\n' '\tపేడ\n']
['vithananni' 'prayaanikulu' 'hassan' ... 'telamgaanha' 'patel' 'peda']


In [7]:
# english_characters = set()
# devnagri_characters = set()
input_corpus = set()
output_corpus = set()
for word in train_X:
    for char in word:
        if char not in input_corpus:
            input_corpus.add(char)

for word in train_Y:
    for char in word:
        if char not in output_corpus:
            output_corpus.add(char)

# Validation set
# v_english_characters = set()
# v_devnagri_characters = set()
val_input_corpus = set()
val_output_corpus = set()

for word in val_X:
    for char in word:
        if char not in val_input_corpus:
            val_input_corpus.add(char)

for word in val_Y:
    for char in word:
        if char not in val_output_corpus:
            val_output_corpus.add(char)

In [8]:
input_corpus.add(" ")
output_corpus.add(" ")
input_corpus = sorted(list(input_corpus))
output_corpus = sorted(list(output_corpus))
num_encoder_tokens = len(input_corpus)
num_decoder_tokens = len(output_corpus)

In [9]:
max_encoder_seq_length = max([len(txt) for txt in train_X]) + 2
max_decoder_seq_length = max([len(txt) for txt in train_Y])


In [10]:
print("Number of samples:", len(train_X))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 51200
Number of unique input tokens: 27
Number of unique output tokens: 65
Max sequence length for inputs: 30
Max sequence length for outputs: 21


In [11]:
hindi_alphabets = [chr(alpha) for alpha in range(2304, 2432)]
hindi_alphabet_size = len(hindi_alphabets)

In [12]:
input_char_index = dict([(char, i) for i, char in enumerate(input_corpus)])
output_char_index = dict([(char, i) for i, char in enumerate(output_corpus)])

input_data = np.zeros((max_encoder_seq_length,len(train_X)), dtype="int64")
target_data = np.zeros((max_decoder_seq_length,len(train_X)), dtype="int64")

for i, (x, y) in enumerate(zip(train_X, train_Y)):
    for t, char in enumerate(x):
        input_data[t, i] = input_char_index[char]
        
    input_data[t + 1 :,i] = input_char_index[" "]
    
    for t, char in enumerate(y):
        target_data[t, i] = output_char_index[char]
            
    target_data[t + 1 :,i] = output_char_index[" "]
    
input_data_val = np.zeros((max_encoder_seq_length,len(val_X)), dtype="int64")
target_data_val = np.zeros((max_decoder_seq_length,len(val_X)), dtype="int64")


for i, (x, y) in enumerate(zip(val_X, val_Y)):
    for t, char in enumerate(x):
        input_data_val[t, i] = input_char_index[char]
        
    input_data_val[t + 1 :,i] = input_char_index[" "]
    
    for t, char in enumerate(y):
        target_data_val[t, i] = output_char_index[char]
            
    target_data_val[t + 1 :,i] = output_char_index[" "]

In [13]:
# convertin numpy arrays to tensors
input_data = torch.tensor(input_data,dtype=torch.int64)
target_data = torch.tensor(target_data,dtype=torch.int64)
input_data_val = torch.tensor(input_data_val,dtype=torch.int64)
target_data_val = torch.tensor(target_data_val,dtype=torch.int64)

In [14]:
#LSTM RUN Only
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)

        return hidden, cell
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)

        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.2):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = num_decoder_tokens

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [15]:
num_epochs = 2
learning_rate = 0.001
batch_size = 32
load_model = False
input_size_encoder = num_encoder_tokens
input_size_decoder = num_decoder_tokens
output_size = num_decoder_tokens
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 256  # Needs to be the same for both RNN's
num_enc_layers = 2
num_dec_layers = 2
enc_dropout = 0.1
dec_dropout = 0.1
training = False

In [16]:
#LSTM RUN Only
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_enc_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_dec_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)

In [17]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

train_ds_x = torch.split(input_data,batch_size,dim=1)
train_ds_y = torch.split(target_data,batch_size,dim=1)
input_data_val = input_data_val.to(device)
target_data_val = target_data_val.to(device)
target_val = target_data_val[1:].reshape(-1)

In [18]:
# this cell is only for training, not to be used now as we have saved the model


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()
    model.train()

    for i, (x,y) in enumerate(zip(train_ds_x,train_ds_y)):
        # Get input and targets and get to cuda
        inp_data = x.to(device)
        target = y.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()
        
        
#torch.save(model.state_dict(),'models\model_pytorch_noAT_state.pt')
#torch.save(model,'models\model_pytorch_noAT.pt')

[Epoch 0 / 2]
[Epoch 1 / 2]


In [19]:
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.1, inplace=False)
    (embedding): Embedding(27, 256)
    (rnn): LSTM(256, 256, num_layers=2, dropout=0.1)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.1, inplace=False)
    (embedding): Embedding(65, 256)
    (rnn): LSTM(256, 256, num_layers=2, dropout=0.1)
    (fc): Linear(in_features=256, out_features=65, bias=True)
  )
)

In [20]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_char_index.items())
reverse_target_char_index = dict((i, char) for char, i in output_char_index.items())

In [22]:
def translate(model, word, input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device):
    
    word_t = ''
    data = np.zeros((max_encoder_seq_length,1), dtype="int64")
    for t, char in enumerate(word):
        data[t, 0] = input_char_index[char]
        
    data[t + 1 :,0] = input_char_index[" "]
    
    data = torch.tensor(data,dtype=torch.int64).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(data)
        
    x = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)

    for t in range(1, max_decoder_seq_length):
        output, hidden, cell = model.decoder(x, hidden, cell)
        print(output.shape)
        best_guess = output.argmax(1)
        x = best_guess
        ch = reverse_target_char_index[x.item()]
        if ch == '\n':
            break
        else:
            word_t = word_t+ch

    return word_t

In [None]:
total_words = len(train_X)
correct_pred = 0
for i in range(total_words):
    #print(train_Y[i])
    decoded_sentence = translate(model,train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device)
#    print(translate(model, train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
             # reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
             # num_encoder_tokens, num_decoder_tokens, device))
    #print(decoded_sentence)
    if train_Y[i][1:-1]== decoded_sentence:
        correct_pred += 1
       # print('True')
test_accuracy = correct_pred / total_words

print(test_accuracy)

In [23]:
total_words = len(val_X)
correct_pred = 0
for i in range(20):
    #print(train_Y[i])
    decoded_sentence = translate(model,val_X[i], input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device)
#    print(translate(model, train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
             # reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
             # num_encoder_tokens, num_decoder_tokens, device))
    #print(decoded_sentence)
    if val_Y[i][1:-1]== decoded_sentence:
        correct_pred += 1
       # print('True')
test_accuracy = correct_pred / total_words

print(test_accuracy)

torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])
torch.Size([1, 65])


In [None]:
total_words = len(test_X)
correct_pred = 0
for i in range(total_words):
    #print(test_Y[i][1:-1])
    decoded_sentence = translate(model,test_X[i], input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device)
#    print(translate(model, train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
             # reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
             # num_encoder_tokens, num_decoder_tokens, device))
    #print(decoded_sentence)
    #print('\n')
    if test_Y[i][1:-1]== decoded_sentence:
        correct_pred += 1
       # print('True')
test_accuracy = correct_pred / total_words

print(test_accuracy)

In [38]:
num_epochs = 2
learning_rate = 0.001
batch_size = 32
load_model = False
input_size_encoder = num_encoder_tokens
input_size_decoder = num_decoder_tokens
output_size = num_decoder_tokens
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 256  # Needs to be the same for both RNN's
num_enc_layers = 2
num_dec_layers = 2
enc_dropout = 0.1
dec_dropout = 0.1
training = False

In [39]:
def training(num_encoder_tokens,input_embedding_size, dp, cell_type, hidden_size, num_enc_layers, num_dec_layers,num_epochs,output_size,input_size_decoder):
    encoder_net = Encoder(input_size_encoder,input_embedding_size, hidden_size, num_enc_layers,dp).to(device)
    decoder_net = Decoder(input_size_decoder,input_embedding_size,hidden_size,output_size,num_dec_layers,dp).to(device)

    model = Seq2Seq(encoder_net, decoder_net).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    train_ds_x = torch.split(input_data,batch_size,dim=1)
    train_ds_y = torch.split(target_data,batch_size,dim=1)
    #print(train_ds_x)
    for epoch in range(num_epochs):
        print(f"[Epoch {epoch} / {num_epochs}]")

        model.eval()
        model.train()

        for i, (x,y) in enumerate(zip(train_ds_x,train_ds_y)):
        # Get input and targets and get to cuda
            inp_data = x.to(device)
            target = y.to(device)

            # Forward prop
            output = model(inp_data, target)


            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()
        total_words = len(val_X)
        correct_pred = 0
        model.eval()
        for i in range(total_words):
       # print(val_Y[i][1:-1])
            decoded_sentence = translate(model,val_X[i], input_char_index, output_char_index, reverse_input_char_index, 
                      reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
                      num_encoder_tokens, num_decoder_tokens, device)
            if val_Y[i][1:-1]== decoded_sentence:
                 correct_pred += 1
        #print(decoded_sentence)
        #print('\n')
        test_accuracy = correct_pred / total_words

        print(test_accuracy)

In [40]:
training(input_size_encoder ,256, 0.1, "LSTM", 256, 2, 2,20,num_decoder_tokens,num_decoder_tokens)

[Epoch 0 / 20]
0.082763671875
[Epoch 1 / 20]
0.276123046875
[Epoch 2 / 20]
0.35693359375
[Epoch 3 / 20]
0.380126953125
[Epoch 4 / 20]
0.415283203125
[Epoch 5 / 20]
0.436767578125
[Epoch 6 / 20]
0.443603515625
[Epoch 7 / 20]
0.46240234375
[Epoch 8 / 20]
0.455810546875
[Epoch 9 / 20]
0.47509765625
[Epoch 10 / 20]
0.46923828125
[Epoch 11 / 20]
0.4716796875
[Epoch 12 / 20]
0.48388671875
[Epoch 13 / 20]


KeyboardInterrupt: 

In [None]:
sweep_config = {
    'method': 'bayes',
    'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
    'parameters': {'embedding_size': {'values': [128, 256, 512]},
                   'hidden_size': {'values': [128, 256, 512]},
                   'cell_type': {'values': ['LSTM']},
                   'num_layers': {'values': [1,2,3]},
                   'batch_size': {'values': [128,256,512]},
                   'dropout': {'values': [0.1, 0.2, 0.3, 0.4]},
                   'epochs' = :{'values':[10,20,30,40]}
                }}

In [None]:
# sweep_config = {
#     'method': 'bayes',
#     'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
#     'parameters': {'input_embedding_size': {'values': [128, 256, 512]},
#                    'hidden_layer_size': {'values': [128, 256, 512]},
#                    'cell_type': {'values': ['LSTM', 'RNN', 'GRU']},
#                    'num_layers': {'values': [1,2,3]},
#                    'batch_size': {'values': [128,256,512]},
#                    'dropout': {'values': [0.1, 0.2, 0.3, 0.4]}
#                 }}

In [None]:
def train():
    var1 = wandb.init()
    var2 = var1.config
    epochs = 10

    model, encoder_layers, decoder_layers = training(var2.input_embedding_size, var2.dropout, var2.cell_type , var2.hidden_layer_size, var2.num_layers, var2.num_layers)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=var2.batch_size,
        epochs=epochs,
        callbacks=[WandbCallback()]
    )

    encoder_model, decoder_model = inferencing(model,var2.num_layers, var2.num_layers,encoder_layers,decoder_layers,var2.cell_type,var2.hidden_layer_size)
    correct = 0
    n = val_devnagri.shape[0]
    for i in range(n):
        input = encoder_val_input_data[i:i+1]
        output = decode_sequence(input,encoder_model, decoder_model)
        if output.strip() == val_devnagri[i].strip():
            correct += 1
    wandb.log({'val_accuracy' : correct*100/n})

In [52]:
## This cell is to Run RNN
class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(EncoderRNN, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)
        outputs, hidden = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)
        # hidden shape: (num_layers, N, hidden_size)
        return hidden





In [133]:
##For RNN
class DecoderRNN(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(DecoderRNN, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)
      #  self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, hidden = self.rnn(embedding, hidden)
        predictions = self.fc(outputs)
     #   predictions = self.softmax(predictions)
        predictions = predictions.squeeze(0)

        return predictions, hidden

In [160]:
#Defining the complete model for RNN
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = num_decoder_tokens

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
           # output, hidden, cell = self.decoder(x, hidden, cell)
            output, hidden= self.decoder(x, hidden)
            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [161]:
#ENN RUN Only
encoder_net = EncoderRNN(
    input_size_encoder, encoder_embedding_size, hidden_size, num_enc_layers, enc_dropout
).to(device)

decoder_net = DecoderRNN(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_dec_layers,
    dec_dropout
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)

In [162]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

train_ds_x = torch.split(input_data,batch_size,dim=1)
train_ds_y = torch.split(target_data,batch_size,dim=1)
input_data_val = input_data_val.to(device)
target_data_val = target_data_val.to(device)
target_val = target_data_val[1:].reshape(-1)

In [163]:
# this cell is only for training, not to be used now as we have saved the model


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()
    model.train()

    for i, (x,y) in enumerate(zip(train_ds_x,train_ds_y)):
        # Get input and targets and get to cuda
        inp_data = x.to(device)
        target = y.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()
        
        
#torch.save(model.state_dict(),'models\model_pytorch_noAT_state.pt')
#torch.save(model,'models\model_pytorch_noAT.pt')

[Epoch 0 / 10]
[Epoch 1 / 10]
[Epoch 2 / 10]
[Epoch 3 / 10]
[Epoch 4 / 10]
[Epoch 5 / 10]


KeyboardInterrupt: 

In [None]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_char_index.items())
reverse_target_char_index = dict((i, char) for char, i in output_char_index.items())

In [164]:
def translate(model, word, input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device):
    
    word_t = ''
    data = np.zeros((max_encoder_seq_length,1), dtype="int64")
    for t, char in enumerate(word):
        data[t, 0] = input_char_index[char]
        
    data[t + 1 :,0] = input_char_index[" "]
    
    data = torch.tensor(data,dtype=torch.int64).to(device)

    with torch.no_grad():
        hidden = model.encoder(data)
        
    x = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)

    for t in range(1, max_decoder_seq_length):
        output, hidden = model.decoder(x, hidden)
        best_guess = output.argmax(1)
        x = best_guess
        ch = reverse_target_char_index[x.item()]
        if ch == '\n':
            break
        else:
            word_t = word_t+ch

    return word_t

In [None]:
# #LSTM Beam Search
# def beam_search(model, word, input_char_index, output_char_index, reverse_input_char_index,
#                 reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length,
#                 num_encoder_tokens, num_decoder_tokens, beam_width, device):

#     word_t = ''

#     # Encode the input word
#     data = np.zeros((max_encoder_seq_length, 1), dtype="int64")
#     for t, char in enumerate(word):
#         data[t, 0] = input_char_index[char]

#     data[t + 1:, 0] = input_char_index[" "]

#     data = torch.tensor(data, dtype=torch.int64).to(device)

#     with torch.no_grad():
#         hidden,cell = model.encoder(data)

#     # Initialize beam
#     initial_sequence = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)
#     beam = [(0.0, initial_sequence, hidden.unsqueeze(0))]  # [(score, sequence, hidden)]

#     for _ in range(max_decoder_seq_length):
#         candidates = []
#         for score, seq, hidden in beam:
#             last_token = seq[-1].item()
#             if last_token == output_char_index['\n']:
#                 # If the sequence ends with the end token, add it to the candidates
#                 candidates.append((score, seq, hidden))
#                 continue

#             x = torch.tensor(np.array(last_token).reshape(1,)).to(device)
#             output, hidden,cell = model.decoder(x, hidden.squeeze(0),cell)
#             probabilities = F.softmax(output, dim=1)

#             # Get the top-k probabilities and tokens
#             topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)

#             for prob, token in zip(topk_probs[0], topk_tokens[0]):
#                 new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
#                 new_hidden = hidden.clone().unsqueeze(0)
#                 candidates.append((score + torch.log(prob).item(), new_seq, new_hidden))

#         # Select top-k candidates based on the accumulated scores
#         beam = heapq.nlargest(beam_width, candidates, key=lambda x: x[0])

#     # Select the best sequence from the beam as the output
#     best_score, best_sequence, _ = beam[0]
#     word_t = ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:-1]])

#     return word_t

In [None]:
num_epochs = 2
learning_rate = 0.01
batch_size = 32
load_model = False
input_size_encoder = num_encoder_tokens
input_size_decoder = num_decoder_tokens
output_size = num_decoder_tokens
encoder_embedding_size = 128
decoder_embedding_size = 128
hidden_size = 128  # Needs to be the same for both RNN's
num_enc_layers = 1
num_dec_layers = 1
enc_dropout = 0.1
dec_dropout = 0.1
training = False

In [165]:
total_words = len(val_X)
correct_pred = 0
for i in range(total_words):
    #print(val_Y[i][1:-1])
    decoded_sentence = translate(model,val_X[i], input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device)
#    print(translate(model, train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
             # reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
             # num_encoder_tokens, num_decoder_tokens, device))
   # print(decoded_sentence)
    #print('\n')
    if val_Y[i][1:-1]== decoded_sentence:
        correct_pred += 1
       # print('True')
test_accuracy = correct_pred / total_words

print(test_accuracy)

0.0


In [231]:
# def beam_search(model, word, input_char_index, output_char_index, reverse_input_char_index,
#               reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length,
#               num_encoder_tokens, num_decoder_tokens, beam_width, device):

#     word_t = ''

#     # Encode the input word
#     data = np.zeros((max_encoder_seq_length, 1), dtype="int64")
#     for t, char in enumerate(word):
#         data[t, 0] = input_char_index[char]

#     data[t + 1:, 0] = input_char_index[" "]

#     data = torch.tensor(data, dtype=torch.int64).to(device)

#     with torch.no_grad():
#         hidden = model.encoder(data)

#     # Initialize beam
#     beam = [(torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device), hidden, 0)]  # [(sequence, hidden, score)]

#     x = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)
#     for _ in range(max_decoder_seq_length):
#         candidates = []
#         for seq, hidden, score in beam:
#             last_token = seq[-1]
#             output, hidden = model.decoder(x, hidden)
#             probabilities = F.softmax(output, dim=1)

#             # Expand the beam
#             topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)
#             for prob, token in zip(topk_probs[0], topk_tokens[0]):
#                 new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
#                 new_hidden = hidden
#                 new_score = score + prob.item()
#                 candidates.append((new_seq, new_hidden, new_score))

#         # Select top-k candidates
#         candidates.sort(key=lambda x: x[2], reverse=True)
#         beam = candidates[:beam_width]

#         # Check if the sequences in the beam have reached the end token
#         end_token_idx = output_char_index['\n']
#         for seq, _, _ in beam:
#             if seq[-1].item() == end_token_idx:
#                 # If the end token is reached, select the sequence as the output
#                 word_t = ''.join([reverse_target_char_index[token.item()] for token in seq[1:-1]])
#                 return word_t

#     # If the maximum sequence length is reached, select the best sequence in the beam as the output
#     best_sequence = beam[0][0]
#     word_t = ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:-1]])

#     return word_t
# def beam_search(model, word, input_char_index, output_char_index, reverse_input_char_index,
#               reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length,
#               num_encoder_tokens, num_decoder_tokens, beam_width, device):

#     word_t = ''

#     # Encode the input word
#     data = np.zeros((max_encoder_seq_length, 1), dtype="int64")
#     for t, char in enumerate(word):
#         data[t, 0] = input_char_index[char]

#     data[t + 1:, 0] = input_char_index[" "]

#     data = torch.tensor(data, dtype=torch.int64).to(device)

#     with torch.no_grad():
#         hidden = model.encoder(data)

#     # Initialize beam
#     beam = [(torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device), hidden, 0)]  # [(sequence, hidden, score)]

#     x = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)
#     for _ in range(max_decoder_seq_length):
#         candidates = []
#         for seq, hidden, score in beam:
#             last_token = seq[-1]
#             output, hidden = model.decoder(x, hidden)
#             probabilities = F.softmax(output, dim=1)

#             # Expand the beam
#             topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)
#             for prob, token in zip(topk_probs[0], topk_tokens[0]):
#                 new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
#                 new_hidden = hidden
#                 new_score = score + torch.log(prob).item()
#                 candidates.append((new_seq, new_hidden, new_score))

#         # Select top-k candidates
#         candidates.sort(key=lambda x: x[2] / len(x[0]), reverse=True)
#         beam = candidates[:beam_width]

#         # Check if the sequences in the beam have reached the end token
#         end_token_idx = output_char_index['\n']
#         for seq, _, _ in beam:
#             if seq[-1].item() == end_token_idx:
#                 # If the end token is reached, select the sequence as the output
#                 word_t = ''.join([reverse_target_char_index[token.item()] for token in seq[1:-1]])
#                 return word_t

#     # If the maximum sequence length is reached, select the best sequence in the beam as the output
#     best_sequence = beam[0][0]
#     word_t = ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:-1]])

#     return word_t
# def beam_search(model, word, input_char_index, output_char_index, reverse_input_char_index,
#               reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length,
#               num_encoder_tokens, num_decoder_tokens, beam_width, device):

#     word_t = ''

#     # Encode the input word
#     data = np.zeros((max_encoder_seq_length, 1), dtype="int64")
#     for t, char in enumerate(word):
#         data[t, 0] = input_char_index[char]

#     data[t + 1:, 0] = input_char_index[" "]

#     data = torch.tensor(data, dtype=torch.int64).to(device)

#     with torch.no_grad():
#         hidden = model.encoder(data)

#     # Initialize beam
#     beam = [(torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device), hidden, 0)]  # [(sequence, hidden, score)]

#     x = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)
#     for _ in range(max_decoder_seq_length):
#         candidates = []
#         for seq, hidden, score in beam:
#             last_token = seq[-1]
#             output, hidden = model.decoder(x, hidden)
#             probabilities = F.softmax(output, dim=1)

#             # Expand the beam
#             topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)
#             for prob, token in zip(topk_probs[0], topk_tokens[0]):
#                 new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
#                 new_hidden = hidden.clone().unsqueeze(0)  # Update the hidden state
#                 new_score = score + torch.log(prob).item()
#                 candidates.append((new_seq, new_hidden, new_score))

# #         # Select top-k candidates
#         candidates.sort(key=lambda x: x[2] / len(x[0]), reverse=True)
#         beam = candidates[:beam_width]

# #         # Check if the sequences in the beam have reached the end token
# #         end_token_idx = output_char_index['\n']
# #         for seq, _, _ in beam:
# #             if seq[-1].item() == end_token_idx:
# #                 # If the end token is reached, select the sequence as the output
# #                 word_t = ''.join([reverse_target_char_index[token.item()] for token in seq[1:-1]])
# #                 return word_t

# #     # If the maximum sequence length is reached, select the best sequence in the beam as the output
# #     best_sequence = beam[0][0]
# #     word_t = ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:-1]])

# #     return word_t
import heapq

def beam_search(model, word, input_char_index, output_char_index, reverse_input_char_index,
                reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length,
                num_encoder_tokens, num_decoder_tokens, beam_width, device):

    word_t = ''

    # Encode the input word
    data = np.zeros((max_encoder_seq_length, 1), dtype="int64")
    for t, char in enumerate(word):
        data[t, 0] = input_char_index[char]

    data[t + 1:, 0] = input_char_index[" "]

    data = torch.tensor(data, dtype=torch.int64).to(device)

    with torch.no_grad():
        hidden,cell = model.encoder(data)

    # Initialize beam
    initial_sequence = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)
    beam = [(0.0, initial_sequence, hidden.unsqueeze(0))]  # [(score, sequence, hidden)]

    for _ in range(max_decoder_seq_length):
        candidates = []
        for score, seq, hidden in beam:
            last_token = seq[-1].item()
            if last_token == output_char_index['\n']:
                # If the sequence ends with the end token, add it to the candidates
                candidates.append((score, seq, hidden))
                continue

            x = torch.tensor(np.array(last_token).reshape(1,)).to(device)
            output, hidden,cell = model.decoder(x, hidden.squeeze(0),cell)
            probabilities = F.softmax(output, dim=1)

            # Get the top-k probabilities and tokens
            topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)

            for prob, token in zip(topk_probs[0], topk_tokens[0]):
                new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                new_hidden = hidden.clone().unsqueeze(0)
                candidates.append((score + torch.log(prob).item(), new_seq, new_hidden))

        # Select top-k candidates based on the accumulated scores
        beam = heapq.nlargest(beam_width, candidates, key=lambda x: x[0])

    # Select the best sequence from the beam as the output
    best_score, best_sequence, _ = beam[0]
    word_t = ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:-1]])

    return word_t

In [242]:
total_words = len(val_X)
correct_pred = 0
for i in range(total_words):
  #  print(val_Y[i][1:-1])
    decoded_sentence = beam_search(model,val_X[i], input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens,4,device)
#    print(translate(model, train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
             # reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
             # num_encoder_tokens, num_decoder_tokens, device))
    #print(decoded_sentence)
    #print('\n')
    if val_Y[i][1:-1]== decoded_sentence:
        correct_pred += 1
       # print('True')
test_accuracy = correct_pred / total_words

print(test_accuracy)

0.004638671875


In [250]:
import heapq

def beam_search(model, word, input_char_index, output_char_index, reverse_input_char_index,
                reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length,
                num_encoder_tokens, num_decoder_tokens, beam_width, device, length_penalty=0.6):

    word_t = ''

    # Encode the input word
    data = np.zeros((max_encoder_seq_length, 1), dtype="int64")
    for t, char in enumerate(word):
        data[t, 0] = input_char_index[char]

    data[t + 1:, 0] = input_char_index[" "]

    data = torch.tensor(data, dtype=torch.int64).to(device)

    with torch.no_grad():
        hidden,cell = model.encoder(data)

    # Initialize beam
    initial_sequence = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)
    beam = [(0.0, initial_sequence, hidden.unsqueeze(0))]  # [(score, sequence, hidden)]

    for _ in range(max_decoder_seq_length):
        candidates = []
        for score, seq, hidden in beam:
            last_token = seq[-1].item()
            if last_token == output_char_index['\n']:
                # If the sequence ends with the end token, add it to the candidates
                candidates.append((score, seq, hidden))
                continue

            x = torch.tensor(np.array(last_token).reshape(1,)).to(device)
            output, hidden,cell = model.decoder(x, hidden.squeeze(0),cell)
            probabilities = F.softmax(output, dim=1)

            # Get the top-k probabilities and tokens
            topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)

            for prob, token in zip(topk_probs[0], topk_tokens[0]):
                new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                new_hidden = hidden.clone().unsqueeze(0)
                length_penalty_factor = ((len(new_seq) - 1) / 5) ** length_penalty  # Adjust penalty factor as needed
                candidates.append((score + torch.log(prob).item() / length_penalty_factor, new_seq, new_hidden))

        # Select top-k candidates based on the accumulated scores
        beam = heapq.nlargest(beam_width, candidates, key=lambda x: x[0])

    # Select the best sequence from the beam as the output
    best_score, best_sequence, _ = max(beam, key=lambda x: x[0])
    word_t = ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:-1]])

    return word_t

In [259]:
total_words = len(val_X)
correct_pred = 0
for i in range(total_words):
  #  print(val_Y[i][1:-1])
    decoded_sentence = beam_search(model,val_X[i], input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens,1,device)
#    print(translate(model, train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
             # reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
             # num_encoder_tokens, num_decoder_tokens, device))
    #print(decoded_sentence)
    #print('\n')
    if val_Y[i][1:-1]== decoded_sentence:
        correct_pred += 1
       # print('True')
test_accuracy = correct_pred / total_words

print(test_accuracy)

0.26953125


In [260]:
#For GRU Code start from here

In [24]:
#For GRU
class EncoderGRU(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(EncoderGRU, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)
        outputs, hidden = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)
        return hidden

class DecoderGRU(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(DecoderGRU, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)
       # self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, hidden = self.rnn(embedding, hidden)
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)
        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        # predictions = self.softmax(predictions)
        predictions = predictions.squeeze(0)
        return predictions, hidden



In [25]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.2):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = num_decoder_tokens

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden= self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden = self.decoder(x, hidden)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [43]:
num_epochs = 10
learning_rate = 0.001
batch_size = 32
load_model = False
input_size_encoder = num_encoder_tokens
input_size_decoder = num_decoder_tokens
output_size = num_decoder_tokens
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 256  # Needs to be the same for both RNN's
num_enc_layers = 1
num_dec_layers = 1
enc_dropout = 0.1
dec_dropout = 0.1
training = False

In [44]:
#EGRU RUN Only
encoder_net = EncoderGRU(
    input_size_encoder, encoder_embedding_size, hidden_size, num_enc_layers, enc_dropout
).to(device)

decoder_net = DecoderGRU(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_dec_layers,
    dec_dropout
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)

In [45]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

train_ds_x = torch.split(input_data,batch_size,dim=1)
train_ds_y = torch.split(target_data,batch_size,dim=1)
input_data_val = input_data_val.to(device)
target_data_val = target_data_val.to(device)
target_val = target_data_val[1:].reshape(-1)

In [None]:
# this cell is only for training, not to be used now as we have saved the model


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    model.eval()
    model.train()

    for i, (x,y) in enumerate(zip(train_ds_x,train_ds_y)):
        # Get input and targets and get to cuda
        inp_data = x.to(device)
        target = y.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()
        
        
#torch.save(model.state_dict(),'models\model_pytorch_noAT_state.pt')
#torch.save(model,'models\model_pytorch_noAT.pt')

[Epoch 0 / 10]
[Epoch 1 / 10]
[Epoch 2 / 10]
[Epoch 3 / 10]
[Epoch 4 / 10]
[Epoch 5 / 10]


In [None]:
def translate(model, word, input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device):
    
    word_t = ''
    data = np.zeros((max_encoder_seq_length,1), dtype="int64")
    for t, char in enumerate(word):
        data[t, 0] = input_char_index[char]
        
    data[t + 1 :,0] = input_char_index[" "]
    
    data = torch.tensor(data,dtype=torch.int64).to(device)

    with torch.no_grad():
        hidden = model.encoder(data)
        
    x = torch.tensor(np.array(output_char_index['\t']).reshape(1,)).to(device)
    
    for t in range(1, max_decoder_seq_length):
         output, hidden = model.decoder(x, hidden)
       #  print(output.shape)
         best_guess = output.argmax(1)
         x = best_guess
         ch = reverse_target_char_index[x.item()]
         if ch == '\n':
           break
         else:
            word_t = word_t+ch

    return word_t

In [None]:
total_words = len(val_X)
correct_pred = 0
for i in range(total_words):
    #print(train_Y[i])
    decoded_sentence = translate(model,val_X[i], input_char_index, output_char_index, reverse_input_char_index, 
              reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
              num_encoder_tokens, num_decoder_tokens, device)
#    print(translate(model, train_X[i], input_char_index, output_char_index, reverse_input_char_index, 
             # reverse_target_char_index, max_encoder_seq_length, max_decoder_seq_length, 
             # num_encoder_tokens, num_decoder_tokens, device))
    #print(decoded_sentence)
    if val_Y[i][1:-1]== decoded_sentence:
        correct_pred += 1
       # print('True')
test_accuracy = correct_pred / total_words

print(test_accuracy)