In [1]:
!pip install --upgrade -q jax
!pip install --upgrade -q jaxlib
!pip install --upgrade -q trax
!pip install tf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import random
import numpy as np
import string
import trax
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training
import re
import pickle
import os
import tf
import tensorflow_datasets as tfds
from termcolor import colored
from sklearn.model_selection import train_test_split
import textwrap
wrapper = textwrap.TextWrapper(width=70)

In [3]:
import requests
import os
if 'TPU_DRIVER_MODE' not in globals():
  url = 'http://' + os.environ['COLAB_TPU_ADDR'].split(':')[0] + ':8475/requestversion/tpu_driver0.1-dev20191206'
  resp = requests.post(url)
  TPU_DRIVER_MODE = 1

# The following is required to use TPU Driver as JAX's backend.
from jax.config import config
config.FLAGS.jax_xla_backend = "tpu_driver"
config.FLAGS.jax_backend_target = "grpc://" + os.environ['COLAB_TPU_ADDR']
print(config.FLAGS.jax_backend_target)

grpc://10.15.92.194:8470


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
def preprocess(text):
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    text = text.lower()
    text=  re.sub(' _ ','',text)
    text = re.sub(r'\d','',text)
    text = re.sub(r'\s+',' ',text)
    text = text.strip()
    return text



In [6]:
dataset_root = "/content/gdrive/MyDrive/Colab upload/"

if os.path.exists(dataset_root + "preprocessed_data.pickle"):
    with open(dataset_root + "preprocessed_data.pickle", 'rb') as f:
        english_sentences, hindi_sentences = pickle.load(f)


In [7]:
print(len(english_sentences), len(hindi_sentences))
print()
english_sentences[:3], hindi_sentences[:3]

1044041 1044041



(['give your application an accessibility workout',
  'accerciser accessibility explorer',
  'the default plugin layout for the bottom panel'],
 ['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
  'एक्सेर्साइसर पहुंचनीयता अन्वेषक',
  'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'])

In [8]:
english_sentences[:3]

['give your application an accessibility workout',
 'accerciser accessibility explorer',
 'the default plugin layout for the bottom panel']

In [9]:
def preprocess_hindi(text):
    text = re.sub('{.+}','',text)
    text = re.sub('  ',' ' ,text)
    text = re.sub('%','',text)
    text= re.sub('\(','',text)
    text= re.sub('}','',text)
    text= re.sub('\)','',text)
    text= re.sub('{','',text)
    text= re.sub('\[','',text)
    text= re.sub('\]','',text)
    text= re.sub('\t','',text)
    return text




In [10]:
#a="जल कनेक्शन की स्थिति (बाहरी वेबसाइट जो एक नई विंडों में खुलती हैं}"
#a="आप% पर वेबपृष्ठ  तक पहुंच प्राप्त करने के लिए अधिकृत नहीं हैं. आपको साइन {0}, {1} इन करने की आवश्यकता हो सकती है."
a="अल्ज़ीरिया \tसीईपी जारी है"
a=preprocess_hindi(a)
print(a)

अल्ज़ीरिया सीईपी जारी है


In [11]:
hindi_sentences = [preprocess_hindi(hi) for hi in hindi_sentences]
print(len(hindi_sentences))
print(hindi_sentences[:3])

1044041
['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'एक्सेर्साइसर पहुंचनीयता अन्वेषक', 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका']


In [12]:

english_sentences_train, english_sentences_test, hindi_sentences_train, hindi_sentences_test = train_test_split(english_sentences, hindi_sentences, test_size=0.2, random_state=1)

english_sentences_train, english_sentences_eval, hindi_sentences_train, hindi_sentences_eval = train_test_split(english_sentences_train, hindi_sentences_train, test_size=0.25, random_state=1)

In [13]:
def train_stream_fn():
  # open the first language file (e.g. English sentences)
  #with open(dataset_root + "parallel/IITB.en-hi.en",'r') as f1:
    # open the second language file (e.g. French sentences)
    #with open(dataset_root + "parallel/IITB.en-hi.hi",'r') as f2:
      # looping over the two files to combine the two translation toghether and yields them.
    for a in zip(english_sentences_train,hindi_sentences_train):
        yield(a)
        

In [14]:
def eval_stream_fn():
  # open the first language file (e.g. English sentences)
  #with open(dataset_root + "parallel/IITB.en-hi.en",'r') as f1:
    # open the second language file (e.g. French sentences)
    #with open(dataset_root + "parallel/IITB.en-hi.hi",'r') as f2:
      # looping over the two files to combine the two translation toghether and yields them.
    for a in zip(english_sentences_eval,hindi_sentences_eval):
        yield(a)
        

In [15]:
train_stream = train_stream_fn()
print('train data (en, hi) tuple:', next(train_stream))
print()

train data (en, hi) tuple: ('speaks the current flat review object', 'मौजूदा समतल रिव्यू के बारे में बोलें')



In [16]:
eval_stream = eval_stream_fn()
print('eval data (en, hi) tuple:', next(eval_stream))
print()

eval data (en, hi) tuple: ('any masses of lymphoid tissue that are similar to tonsils', 'लिम्फोइड ऊतक का समूह जो गलतुंडिका के समान है। ')



In [17]:
EOS = 1

# generator helper function to append EOS to each sentence
def append_eos(stream):
    for (inputs, targets) in stream:
        inputs_with_eos = list(inputs) + [EOS]
        targets_with_eos = list(targets) + [EOS]
        yield np.array(inputs_with_eos), np.array(targets_with_eos)

In [18]:
VOCAB_DIR='/content/gdrive/MyDrive/Colab upload/vocab/'
VOCAB_FILE='summarize32k.subword.subwords'

tokenized_train_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(train_stream)

tokenized_eval_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(eval_stream)

In [19]:
tokenized_train_stream = append_eos(tokenized_train_stream)
print(next(tokenized_train_stream))

(array([19275,   527, 88640, 64769,     4,     1]), array([33700,     4, 81833,     4, 62684,     4, 62656,     4, 62691,
           4, 62632,     4, 62706,     4, 62635,     4, 62690,     4,
           1]))


In [20]:
tokenized_eval_stream = append_eos(tokenized_eval_stream)
print(next(tokenized_eval_stream))

(array([  213, 84721,     4,  1302,   527,  4012,   835,   132,   213,
        3747,   357,     1]), array([62731,     4, 62632,     4, 33230,     4, 62632,     4, 62706,
           4, 33449,     4, 62720,     4, 63737,   391, 62848,     4,
       63142,     4, 62684,     4, 63417,   391, 62684,     4, 62694,
         391, 33901,     4, 62635,     4, 62671,     4,     6, 62684,
           4, 62635,     4, 62671,     4, 62632,     4, 62732,     4,
       62684,     4, 62635,     4, 63040,     4, 62695,   391, 33321,
           4, 62823,     4, 62635,     4, 62706,     4, 62694,   391,
       62814,     4, 34651,   391,     1]))


In [21]:
filtered_train_stream = trax.data.FilterByLength(
    max_length=512, length_keys=[0, 1])(tokenized_train_stream)
filtered_eval_stream = trax.data.FilterByLength(
    max_length=512, length_keys=[0, 1])(tokenized_eval_stream)

train_input, train_target = next(filtered_train_stream)

print(colored(f'Single tokenized example input:', 'red' ), train_input)
print(colored(f'Single tokenized example target:', 'red'), train_target)

[31mSingle tokenized example input:[0m [  476   100   410    19   213    60 70308     4   100   124    19   288
  2754    39  1151   757   320   156   181   320    33   100  1355    86
  2754    23    46  2441   320   156   186   130  3200   229    86   320
   616   869  7763     1]
[31mSingle tokenized example target:[0m [33270     4 62808     4 33279   669 27634     4 62720     4 69109   391
 62684     4 62748     4 64381     4 33585     4 62635   391 81842     4
 62950     4 62715     4 62706     4 62748   391 33302     4 78146   391
 62814     4 35501   391 81853     4 62720     4 69109   391 33302     4
 78146   391 62823     4 62635     4 33206     4 62635   391 62684     4
 62656   391 62720     4 62694     4 62671     4 62694   391 62691     4
 62635     4 63472     4 62684     4 62632     4 62732     4 62635   391
 62684     4 62656     4 62732     4 62635   391 62823     4 62635     4
 33306     4 62635   391 81853     4 62690     4 75426     4 62684     4
 62656   391 62

In [22]:
# Setup helper functions for tokenizing and detokenizing sentences

def tokenize(input_str, vocab_file=None, vocab_dir=None):
    """Encodes a string to an array of integers

    Args:
        input_str (str): human-readable string to encode
        vocab_file (str): filename of the vocabulary text file
        vocab_dir (str): path to the vocabulary file
  
    Returns:
        numpy.ndarray: tokenized version of the input string
    """
    
    # Set the encoding of the "end of sentence" as 1
    EOS = 1
    
    # Use the trax.data.tokenize method. It takes streams and returns streams,
    # we get around it by making a 1-element stream with `iter`.
    inputs =  next(trax.data.tokenize(iter([input_str]),
                                      vocab_file=vocab_file, vocab_dir=vocab_dir))
    
    # Mark the end of the sentence with EOS
    inputs = list(inputs) + [EOS]
    
    # Adding the batch dimension to the front of the shape
    batch_inputs = np.reshape(np.array(inputs), [1, -1])
    
    return batch_inputs


def detokenize(integers, vocab_file=None, vocab_dir=None):
    """Decodes an array of integers to a human readable string

    Args:
        integers (numpy.ndarray): array of integers to decode
        vocab_file (str): filename of the vocabulary text file
        vocab_dir (str): path to the vocabulary file
  
    Returns:
        str: the decoded sentence.
    """
    
    # Remove the dimensions of size 1
    integers = list(np.squeeze(integers))
    
    # Set the encoding of the "end of sentence" as 1
    EOS = 1
    
    # Remove the EOS to decode only the original tokens
    if EOS in integers:
        integers = integers[:integers.index(EOS)] 
    
    return trax.data.detokenize(integers, vocab_file=vocab_file, vocab_dir=vocab_dir)

In [23]:

# Detokenize an input-target pair of tokenized sentences
print(colored(f'Single detokenized example input:', 'red'), detokenize(train_input, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored(f'Single detokenized example target:', 'red'), detokenize(train_target, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print()

# Tokenize and detokenize a word that is not explicitly saved in the vocabulary file.
# See how it combines the subwords -- 'hell' and 'o'-- to form the word 'hello'.
print(colored(f"tokenize('hello'): ", 'green'), tokenize('hello', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored(f"detokenize([22611, 79, 1]): ", 'green'), detokenize([41371,4,1], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))

[31mSingle detokenized example target:[0m कह दो, "मैं कोई पहला रसूल तो नहीं हूँ। और मैं नहीं जानता कि मेरे साथ क्या किया जाएगा और न यह कि तुम्हारे साथ क्या किया जाएगा। मैं तो बस उसी का अनुगामी हूँ, जिसकी प्रकाशना मेरी ओर की जाती है और मैं तो केवल एक स्पष्ट सावधान करनेवाला हूँ।" 

[32mtokenize('hello'): [0m [[41371     4     1]]
[32mdetokenize([22611, 79, 1]): [0m hello


In [24]:
boundaries =  [8,   16,  32, 64, 128, 256, 512]
batch_sizes = [64, 64, 64, 64, 64, 64, 64, 64]

# Create the generators.
train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_eval_stream)

# Add masking for the padding (0s).
train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream)

In [25]:
input_batch, target_batch, mask_batch = next(train_batch_stream)

# let's see the data type of a batch
print("input_batch data type: ", type(input_batch))
print("target_batch data type: ", type(target_batch))

# let's see the shape of this particular batch (batch length, sentence length)
print("input_batch shape: ", input_batch.shape)
print("target_batch shape: ", target_batch.shape)

input_batch data type:  <class 'numpy.ndarray'>
target_batch data type:  <class 'numpy.ndarray'>
input_batch shape:  (64, 32)
target_batch shape:  (64, 128)


In [26]:
# pick a random index less than the batch size.
index = random.randrange(len(input_batch))

# use the index to grab an entry from the input and target batch
print(colored('THIS IS THE ENGLISH SENTENCE: \n', 'red'), detokenize(input_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR), '\n')
print(colored('THIS IS THE TOKENIZED VERSION OF THE ENGLISH SENTENCE: \n ', 'red'), input_batch[index], '\n')
print(colored('THIS IS THE HINDI TRANSLATION: \n', 'red'), detokenize(target_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR), '\n')
print(colored('THIS IS THE TOKENIZED VERSION OF THE HINDI TRANSLATION: \n', 'red'), target_batch[index], '\n')

[31mTHIS IS THE ENGLISH SENTENCE: 
[0m settings for the built in live video processor only 

[31mTHIS IS THE TOKENIZED VERSION OF THE ENGLISH SENTENCE: 
 [0m [ 6305  1019   213   601   132   630   846 90055     4    86     1     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0] 

[31mTHIS IS THE HINDI TRANSLATION: 
[0m केवल निर्मित लाइव वीडियो प्रोसेसर के लिए सेटिंग्स 

[31mTHIS IS THE TOKENIZED VERSION OF THE HINDI TRANSLATION: 
[0m [62684     4 62694     4 33216     4 62690     4 62656     4 62671     4
 62632     4 62720     4 62656     4 62706     4 62715     4 62635     4
 33671     4 62731     4 62695     4 62839     4 62656     4 62732     4
 62748   391 62713     4 62632     4 62671     4 62748     4 62691     4
 62694     4 67815     4 62684     4 62694   391 62715     4 62656     4
 63206     4 62691     4 62694     4 62757     4 64380     4 62856     4
 62632     4 62691     4     1     0     

In [27]:
'''for i in range(100000):
    print(i)
    val=next(train_stream)
    print('train data (en, hi) tuple:', val)
    token_val=tokenize(val, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
    print(token_val)
    '''

"for i in range(100000):\n    print(i)\n    val=next(train_stream)\n    print('train data (en, hi) tuple:', val)\n    token_val=tokenize(val, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)\n    print(token_val)\n    "

In [28]:
model = trax.models.Transformer(
    input_vocab_size=33600,
    d_model=512, d_ff=2048, dropout = 0.1,
    n_heads=8, n_encoder_layers=6, n_decoder_layers=6,
    max_len=2048, mode='train')


In [29]:
train_task = training.TrainTask(
    # use the train batch stream as labeled data
    labeled_data= train_batch_stream,
    # use the cross entropy loss with LogSoftmax
    loss_layer= tl.CrossEntropyLossWithLogSoftmax(),
    # use the Adafactor optimizer with learning rate of 0.001
    optimizer= trax.optimizers.Adafactor(learning_rate=0.001, epsilon1=1e-30),
    # have 500 warmup steps
    lr_schedule= trax.lr.multifactor(constant=1.0, warmup_steps=500),
    # have a checkpoint every 100 steps
    n_steps_per_checkpoint= 10,
    # saving a checkpoint every 1000 steps on the output_dir
    n_steps_per_permanent_checkpoint = 500
)

In [30]:
eval_task = training.EvalTask(
    # use the eval batch stream as labeled data
    labeled_data=eval_batch_stream,
    # use the cross entropy loss with LogSoftmax and accuracy as metrics
    metrics=[tl.CrossEntropyLossWithLogSoftmax(), tl.WeightedCategoryAccuracy()],
    # you could specify the number of eval batch by n_eval_batches = 64 or any other number
    # but it not specified here as we want to evaluate the whole eval data
    # n_eval_batches = 64
)

In [35]:
output_dir = '/content/gdrive/MyDrive/Colab upload/models'

# # remove old model if it exists. restarts training.
# !rm -rf output_dir

# define the training loop
training_loop = training.Loop(model,
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)

In [34]:
input_dir = '/content/gdrive/MyDrive/Colab upload/model/'

# This loads a checkpoint:
training_loop.load_checkpoint(directory=input_dir, filename="model.pkl.gz")
# Continue training:
#training_loop.run(100)

In [36]:
model.init_from_file(input_dir+'model.pkl.gz',
                     weights_only=True)

(((),
  ((), ((), ((), (), ()))),
  (array([[ 0.0188915 ,  0.06841063, -0.11606378, ...,  0.03213299,
            0.01156224, -0.04660358],
          [ 0.03769593, -0.04923808,  0.03046959, ..., -0.01021302,
            0.03950877, -0.05263529],
          [ 0.12549807,  0.16986407,  0.10040963, ..., -0.28668645,
            0.09472397,  0.00784058],
          ...,
          [-0.10787084, -0.0677245 ,  0.01577769, ...,  0.07449336,
            0.0249789 , -0.02815343],
          [-0.15695521, -0.04102622, -0.12623689, ...,  0.15664099,
            0.05573989, -0.01804593],
          [-0.13206941, -0.24852037, -0.17207748, ...,  0.24719346,
            0.02434103, -0.18965879]], dtype=float32),
   (),
   array([[ 0.24733956,  0.7334061 ,  0.03801225, ...,  0.6249324 ,
           -0.15519635,  1.014179  ],
          [ 0.8059806 ,  0.54032826,  0.7821831 , ...,  1.387547  ,
           -0.5498619 ,  1.3953466 ],
          [ 0.5448354 , -0.4858858 ,  0.89081734, ...,  1.0618771 ,
           

In [None]:
#output_dir = '/content/gdrive/MyDrive/Colab upload/models/'

# This loads a checkpoint:
# Continue training:
training_loop.run(1000)


Step      1: Total number of trainable weights: 80677696
Step      1: Ran 1 train steps in 165.78 secs
Step      1: train CrossEntropyLossWithLogSoftmax |  0.46170485
Step      1: eval  CrossEntropyLossWithLogSoftmax |  0.43246144
Step      1: eval        WeightedCategoryAccuracy |  0.43313372

Step     10: Ran 9 train steps in 829.62 secs
Step     10: train CrossEntropyLossWithLogSoftmax |  0.52646446
Step     10: eval  CrossEntropyLossWithLogSoftmax |  0.45021886
Step     10: eval        WeightedCategoryAccuracy |  0.42435420

Step     20: Ran 10 train steps in 267.75 secs
Step     20: train CrossEntropyLossWithLogSoftmax |  0.55032051
Step     20: eval  CrossEntropyLossWithLogSoftmax |  0.45268971
Step     20: eval        WeightedCategoryAccuracy |  0.41507024

Step     30: Ran 10 train steps in 277.59 secs
Step     30: train CrossEntropyLossWithLogSoftmax |  0.51753670
Step     30: eval  CrossEntropyLossWithLogSoftmax |  0.44151640
Step     30: eval        WeightedCategoryAccuracy

In [None]:
from platform import python_version
python_version()


'3.9.12'

In [None]:
training_loop.run(20)

In [None]:
!python -V

Python 3.9.12


In [27]:
model = trax.models.Transformer(
    input_vocab_size=33600,
    d_model=512, d_ff=2048, dropout = 0.1,
    n_heads=8, n_encoder_layers=6, n_decoder_layers=6,
    max_len=2048, mode='eval')

In [28]:
input_dir = '/content/gdrive/MyDrive/Colab upload/model/'
model.init_from_file(input_dir+'model.pkl.gz',
                     weights_only=True)

(((),
  ((), ((), ((), (), ()))),
  (array([[ 0.0188915 ,  0.06841063, -0.11606378, ...,  0.03213299,
            0.01156224, -0.04660358],
          [ 0.03769593, -0.04923808,  0.03046959, ..., -0.01021302,
            0.03950877, -0.05263529],
          [ 0.12549807,  0.16986407,  0.10040963, ..., -0.28668645,
            0.09472397,  0.00784058],
          ...,
          [-0.10787084, -0.0677245 ,  0.01577769, ...,  0.07449336,
            0.0249789 , -0.02815343],
          [-0.15695521, -0.04102622, -0.12623689, ...,  0.15664099,
            0.05573989, -0.01804593],
          [-0.13206941, -0.24852037, -0.17207748, ...,  0.24719346,
            0.02434103, -0.18965879]], dtype=float32),
   (),
   array([[ 0.24733956,  0.7334061 ,  0.03801225, ...,  0.6249324 ,
           -0.15519635,  1.014179  ],
          [ 0.8059806 ,  0.54032826,  0.7821831 , ...,  1.387547  ,
           -0.5498619 ,  1.3953466 ],
          [ 0.5448354 , -0.4858858 ,  0.89081734, ...,  1.0618771 ,
           

In [29]:
def next_symbol(model, input_tokens, cur_output_tokens, temperature):
    """Returns the index of the next token.
    Args:
        model: the NMT model.
        input_tokens (np.ndarray 1 x n_tokens): tokenized representation of the input sentence
        cur_output_tokens (list): tokenized representation of previously translated words
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
    Returns:
        int: index of the next token in the translated sentence
        float: log probability of the next symbol
    """
    # set the length of the current output tokens
    token_length = len(cur_output_tokens)
    # calculate next power of 2 for padding length 
    padded_length = np.power(2, int(np.ceil(np.log2(token_length + 1))))
    # pad cur_output_tokens up to the padded_length
    padded = cur_output_tokens + [0] * (padded_length - token_length) 
    # model expects the output to have an axis for the batch size in front so
    # convert `padded` list to a numpy array with shape (x, <padded_length>) where the
    # x position is the batch axis.
    padded_with_batch = np.expand_dims(padded, axis=0)
    # the model prediction.
    output, _ = model((input_tokens, padded_with_batch))   
    # get log probabilities from the last token output
    log_probs = output[0, token_length, :]
    # get the next symbol by getting a logsoftmax sample
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))
    return symbol, float(log_probs[symbol])

In [30]:
def sampling_decode(input_sentence, model = None, temperature=0.0, vocab_file=None, vocab_dir=None):
    """Returns the translated sentence.
    Args:
        input_sentence (str): sentence to translate.
        model: the NMT model.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
    Returns:
        tuple: (list, str, float)
            list of int: tokenized version of the translated sentence
            float: log probability of the translated sentence
            str: the translated sentence
    """     
    # encode the input sentence
    input_tokens = tokenize(input_sentence, vocab_file=vocab_file, vocab_dir=vocab_dir)
    # initialize the list of output tokens
    cur_output_tokens = []
    # initialize an integer that represents the current output index
    cur_output = 0  
    # Set the encoding of the "end of sentence" as 1
    EOS = 1
    # check that the current output is not the end of sentence token
    while cur_output != EOS: 
        # update the current output token by getting the index of the next word
        cur_output, log_prob = next_symbol(model, input_tokens, cur_output_tokens, temperature)
        # append the current output token to the list of output tokens
        cur_output_tokens.append(cur_output) 
    # detokenize the output tokens
    sentence = detokenize(cur_output_tokens, vocab_file=vocab_file, vocab_dir=vocab_dir)
    return cur_output_tokens, log_prob, sentence

In [32]:
sampling_decode("hello", model, temperature=0.0, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

TypeError: ignored

In [33]:
def greedy_decode_test(sentence, model=None, vocab_file=None, vocab_dir=None):
    """Prints the input and output of our NMT model using greedy decode
    Args:
        sentence (str): a custom string.
        model: the NMT model.
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
    Returns:
        str: the translated sentence
    """    
    _,_, translated_sentence = sampling_decode(sentence, model, vocab_file=vocab_file, vocab_dir=vocab_dir)   
    print("English: ", sentence)
    print("French: ", translated_sentence)
    return translated_sentence

In [34]:
your_sentence = 'I love languages.'
greedy_decode_test(your_sentence, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR);

TypeError: ignored