<h1 id="tocheading">Spring 2018 NLP Class Project: Neural Machine Translation</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import spacy
import pdb
import os
from underthesea import word_tokenize
import jieba
import numpy as np
import math
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable

# running on cpu
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")



In [3]:
# ! pip install spacy && python -m spacy download en

## Part 0: Project Overview

The goal of this project is to build a neural machine translation system and experience how recent advances have made their way. Each team will build the following sequence of neural translation systems for two language pairs, __Vietnamese (Vi)→English (En)__ and __Chinese (Zh)→En__ (prepared corpora is be provided):

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.
4. [Optional] Build either or both fully self-attention translation system or/and multilingual translation system.

## Part 1: Data Upload & Preprocessing

In [13]:
PAD_token = 0
SOS_token = 1 # start of sentence
UNK_token = 2 # 2 = unk
EOS_token = 3 # end of sentence


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:"<PAD>",
                           SOS_token: "<SOS>",
                           UNK_token:"<UNK>", 
                           EOS_token: "<EOS>"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        # adds the new word to the vocabulary
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [14]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """About "NFC" and "NFD": 
    
    For each character, there are two normal forms: normal form C 
    and normal form D. Normal form D (NFD) is also known as canonical 
    decomposition, and translates each character into its decomposed form. 
    Normal form C (NFC) first applies a canonical decomposition, then composes 
    pre-combined characters again.
    
    About unicodedata.category: 
    
    Returns the general category assigned to the Unicode character 
    unichr as string."""
    
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


In [15]:
# Trim
def normalizeString(s):
    # removes blankspaces at the beginning and the end of the string
    s = unicodeToAscii(s.strip())
    # 
    s = re.sub(r"([.!?])", r" \1", s)
    # 
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    
    return s

In [28]:
def readLangs(lang1, lang2, reverse=False,
             dataset="train"):
    
    """Takes as input;
    - lang1, lang2: either (vi, en) or (zh, en)
    - dataset: one of ("train","dev","test")"""
    
    print("Reading lines...")
    
    eos = [".","?","!","\n"]
    
    # Read the pretokenized lang1 file and split into lines
    lang1_lines = open("../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-%s-%s-processed/%s.tok.%s" % (lang1, lang2, dataset, lang1), encoding="utf-8").\
        read().strip().split("\n")
        
    # Read the lang2 file and split into lines
    lang2_lines = open("../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-%s-%s-processed/%s.tok.%s" % (lang1, lang2, dataset, lang2), encoding="utf-8").\
        read().strip().split("\n")
        
    # Examples of Pretokenized Sentences
    print("Example of Language #1 sentence: " + str(lang1_lines[0]))
    print("Example of Language #2 sentence: " + str(lang2_lines[0]))
    
    # create sentence pairs (lists of length 2 that consist of string pairs)
    # e.g. ["And we &apos;re going to tell you some stories from the sea here in video .",
    #       "我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事  "]
    # check if there are the same number of sentences in each set
    assert len(lang1_lines) == len(lang2_lines), "Two languages must have the same number of sentences. "+ str(len(lang1_lines)) + " sentences were passed for " + str(lang1) + "." + str(len(lang2_lines)) + " sentences were passed for " + str(lang2)+"."
    print("Number of sentences in Language #1 = " + str(len(lang1_lines)))
    print("Number of sentences in Language #2 = " + str(len(lang2_lines)))
    
    # normalize if not Chinese, Chinese normalization is already handeled
    # add <EOS> tag at the end of the sentence for chinese
    if lang1 == "zh":
        lang1_lines = [s + "<EOS>" for s in lang1_lines]
    else:
        # replace .?!\n with <EOS> tag for Vietnamese and English
        lang1_lines = [normalizeString(s).replace(".","<EOS>").\
                       replace("?","<EOS>").replace("!","<EOS>").replace("\n","<EOS>") for s in lang1_lines]
    lang2_lines = [normalizeString(s).replace(".","<EOS>").\
                       replace("?","<EOS>").replace("!","<EOS>").replace("\n","<EOS>") for s in lang2_lines]
    
    # construct pairs
    pair_ran = range(len(lang1_lines))
    pairs = [[lang1_lines[i]] + [lang2_lines[i]] for i in pair_ran]
    
#     # Split every line into pairs and normalize
#     pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [29]:
def prepareData(lang1, lang2, reverse=False, dataset="train"):
    
    input_lang, output_lang, pairs = readLangs(lang1 = lang1, 
                                               lang2 = lang2, 
                                               reverse = reverse, 
                                               dataset = dataset)
    
    print("Read %s sentence pairs" % len(pairs))
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [30]:
input_lang, output_lang, pairs = prepareData(lang1 = 'vi', 
                                             lang2 = 'en', 
                                             reverse = False, 
                                             dataset = "train")
print(random.choice(pairs))

Reading lines...
Example of Language #1 sentence: Khoa_học đằng_sau một tiêu_đề về khí_hậu
Example of Language #2 sentence: Rachel Pike : The science behind a climate headline
Number of sentences in Language #1 = 133317
Number of sentences in Language #2 = 133317
Read 133317 sentence pairs
Trimmed to 133317 sentence pairs
Counting words...
Counted words:
vi 16142
en 47566
['Xem toi lam ay nhe <EOS>', 'So let me see if this will work <EOS>']


In [24]:
input_lang, output_lang, pairs = prepareData(lang1 = 'zh', 
                                             lang2 = 'en', 
                                             reverse = False, 
                                             dataset = "train")
print(random.choice(pairs))

Reading lines...
Example of Language #1 sentence: 深海 海中 的 生命   大卫   盖罗 
Example of Language #2 sentence: Life in the deep oceans
Read 213376 sentence pairs
Trimmed to 213376 sentence pairs
Counting words...
Counted words:
zh 89202
en 59327
['哈马 哈马斯 组织 织成 成员    我们 是 完全 和平 的   我们 希望 让 它 遍及 整个 巴勒 巴勒斯 巴勒斯坦 勒斯 坦 <EOS>', 'Hamas Party Member We were in complete harmony and we wanted to spread it to all of Palestine <EOS>']


### 1.1 Vietnamese to English

In [8]:
# Format: languagepair_language_dataset
# Train 
vien_vi_train, vien_en_train, vi_en_train_pairs = prepareData('vi', 'en', False, dataset="train")
# Dev 
vien_vi_dev, vien_en_dev, vi_en_dev_pairs = prepareData('vi', 'en', False, dataset="dev")
# Test
vien_vi_test, vien_en_test, vi_en_test_pairs = prepareData('vi', 'en', False, dataset="test")

Reading lines...
Read 133317 sentence pairs
Trimmed to 133317 sentence pairs
Counting words...
Counted words:
vi 16142
en 47566
Reading lines...
Read 1268 sentence pairs
Trimmed to 1268 sentence pairs
Counting words...
Counted words:
vi 1368
en 3814
Reading lines...
Read 1553 sentence pairs
Trimmed to 1553 sentence pairs
Counting words...
Counted words:
vi 1323
en 3617


### 1.2 Chinese to English

In [9]:
# Format: languagepair_language_dataset
# Train 
zhen_zh_train, zhen_en_train, zh_en_train_pairs = prepareData('zh', 'en', False, dataset="train")
# Dev 
zhen_zh_dev, zhen_en_dev, zh_en_dev_pairs = prepareData('zh', 'en', False, dataset="dev")
# Test
zhen_zh_test, zhen_en_test, zh_en_test_pairs = prepareData('zh', 'en', False, dataset="test")

Reading lines...
Read 213376 sentence pairs
Trimmed to 213376 sentence pairs
Counting words...
Counted words:
zh 89202
en 59327
Reading lines...
Read 1261 sentence pairs
Trimmed to 1261 sentence pairs
Counting words...
Counted words:
zh 6134
en 3914
Reading lines...
Read 1397 sentence pairs
Trimmed to 1397 sentence pairs
Counting words...
Counted words:
zh 5216
en 3421


In [10]:
zh_en_train_pairs[3]

['我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事  <EOS>',
 'And we apos re going to tell you some stories from the sea here in video <EOS>']

### 1.3: Check Source & Target Vocabs

Since the source and target languages can have very different table lookup layers, it's good practice to have separate vocabularies for each. Thus, we build vocabularies for each language that we will be using. 

In the first class (Lang) of this section, we have already defined vocabularies for all languages. So, there is no need to redefine another function. We chech each vocabulary below.

#### Chinese Vocabulary

In [11]:
print ("The number of words in Chinese training corpus is " + str(zhen_zh_train.n_words))

The number of words in Chinese training corpus is 89202


In [12]:
zhen_zh_train.word2index["格"]

10481

In [13]:
zhen_zh_train.index2word[10481]

'格'

#### Vietnamese Vocabulary

In [14]:
print ("The number of words in Vietnamese training corpus is " + str(vien_vi_train.n_words))

The number of words in Vietnamese training corpus is 16142


In [15]:
vien_vi_train.word2index["Hamburger"]

6750

In [16]:
vien_vi_train.index2word[6750]

'Hamburger'

#### English Vocabulary for Zh-En

In [17]:
print ("The number of words in English training corpus for Zh-En is " + str(zhen_en_train.n_words))

The number of words in English training corpus for Zh-En is 59327


In [18]:
zhen_en_train.word2index["translate"]

1449

In [19]:
zhen_en_train.index2word[1451]

'directly'

#### English Vocabulary for Vi-En

In [20]:
print ("The number of words in English training corpus for Vi-En is " + str(vien_en_train.n_words))

The number of words in English training corpus for Vi-En is 47566


In [21]:
vien_en_train.word2index["machine"]

846

In [22]:
vien_en_train.index2word[846]

'machine'

### 1.4 Prepare Dataloaders

In [23]:
vien_en_dev.word2index["<EOS>"]

24

In [24]:
vien_en_dev.index2word[24]

'<EOS>'

In [26]:
PAD_IDX = 0
SOS_IDX = 1
UNK_IDX = 2
# EOS_IDX = 3
# convert token to id in the dataset
def token2index_dataset(paired_tokens, 
                        lang1_token2id_vocab,
                        lang2_token2id_vocab):
    """Takes as input:
    - paired_tokens: a list of sentence pairs that consist of source & target lang sentences.
    - lang1_token2id_vocab: token2index vocabulary for the first language. 
                            Get by method Lang_dataset.word2index
    - lang2_token2id_vocab: token2index vocabulary for the second language. 
                            Get by method Lang_dataset.word2index
                            
    Returns:
    - indices_data_lang_1, indices_data_lang2: A list of lists where each sub-list holds corresponding indices for each
                                               token in the sentence."""
    indices_data_lang_1, indices_data_lang_2 = [], []
    vocabs = [lang1_token2id_vocab, lang2_token2id_vocab]
    
    # lang1
    for t in range(len(paired_tokens)):
        # replaces token with UNK_IDX if the token is not in vocab
        index_list = [vocabs[0][token] if token in vocabs[0]\
                                    else UNK_IDX for token in paired_tokens[t][0]] 
        indices_data_lang_1.append(index_list)
    # lang2
    for t in range(len(paired_tokens)):
        index_list =  [vocabs[1][token] if token in vocabs[1] \
                                    else UNK_IDX for token in paired_tokens[t][1]] 
        indices_data_lang_2.append(index_list)
        
    return indices_data_lang_1, indices_data_lang_2

# train indices
zhen_zh_train_indices, zhen_en_train_indices = token2index_dataset(zh_en_train_pairs,
                                                                   zhen_zh_train.word2index,
                                                                   zhen_en_train.word2index)

vien_vi_train_indices, vien_en_train_indices = token2index_dataset(vi_en_train_pairs,
                                                                   vien_vi_train.word2index,
                                                                   vien_en_train.word2index)

# dev indices
zhen_zh_dev_indices, zhen_en_dev_indices = token2index_dataset(zh_en_dev_pairs,
                                                               zhen_zh_dev.word2index,
                                                               zhen_en_dev.word2index)

vien_vi_dev_indices, vien_en_dev_indices = token2index_dataset(vi_en_dev_pairs,
                                                               vien_vi_dev.word2index,
                                                               vien_en_dev.word2index)

# test indices
zhen_zh_test_indices, zhen_en_test_indices = token2index_dataset(zh_en_test_pairs,
                                                                 zhen_zh_test.word2index,
                                                                 zhen_en_test.word2index)

vien_vi_test_indices, vien_en_test_indices = token2index_dataset(vi_en_test_pairs,
                                                                 vien_vi_test.word2index,
                                                                 vien_en_test.word2index)

In [28]:
print(zhen_zh_train_indices[0])

[321, 7912, 2, 7912, 310, 2, 4, 2, 1586, 23701, 2, 2, 2, 275, 49581, 2, 2, 2, 5915, 6331, 2, 2, 5868, 16124, 5789, 2]


In [29]:
print(zhen_en_train_indices[0])

[4216, 16909, 49096, 8295, 2, 16909, 2107, 2, 268, 2, 8295, 2, 1735, 8295, 8295, 10558, 2, 1263, 28417, 8295, 158, 2107, 23]


In [30]:
# check length
# train
print ("Chinese training sentence count = "+str(len(zhen_zh_train_indices)))
print ("Chinese-English (En) training sentence count = "+str(len(zhen_en_train_indices)))
print ("\nVietnamese training sentence count = "+str(len(vien_vi_train_indices)))
print ("Vietnamese-English (En) training sentence count = "+str(len(vien_en_train_indices)))
# dev
print ("\nChinese dev sentence count = "+str(len(zhen_zh_dev_indices)))
print ("Chinese-English (En) dev sentence count = "+str(len(zhen_en_dev_indices)))
print ("\nVietnamese dev sentence count = "+str(len(vien_vi_dev_indices)))
print ("Vietnamese-English (En) dev sentence count = "+str(len(vien_en_dev_indices)))
# test
print ("\nChinese test sentence count = "+str(len(zhen_zh_test_indices)))
print ("Chinese-English (En) test sentence count = "+str(len(zhen_en_test_indices)))
print ("\nVietnamese test sentence count = "+str(len(vien_vi_test_indices)))
print ("Vietnamese-English (En) test sentence count = "+str(len(vien_en_test_indices)))

Chinese training sentence count = 213376
Chinese-English (En) training sentence count = 213376

Vietnamese training sentence count = 133317
Vietnamese-English (En) training sentence count = 133317

Chinese dev sentence count = 1261
Chinese-English (En) dev sentence count = 1261

Vietnamese dev sentence count = 1268
Vietnamese-English (En) dev sentence count = 1268

Chinese test sentence count = 1397
Chinese-English (En) test sentence count = 1397

Vietnamese test sentence count = 1553
Vietnamese-English (En) test sentence count = 1553


#### Dataloader

In [35]:
from torch.utils.data import Dataset

In [42]:
## TODO 

MAX_SENTENCE_LENGTH = 15
BATCH_SIZE = 32

# zhen token2index vocabs
zhen_zh_train_token2id = zhen_zh_train.word2index
zhen_en_train_token2id = zhen_en_train.word2index

# vien token2index vocabs
vien_vi_train_token2id = vien_vi_train.word2index
vien_en_train_token2id = vien_en_train.word2index

class TranslationDataset():
    """
    Class that represents a train/dev/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, 
                 data_source, # training indices data of the source language
                 data_target, # training indices data of the target language
                 token2id_source=None, # token2id dict of the source language
                 token2id_target=None  # token2id dict of the target language
                ):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.source_sentences = data_source
        self.target_sentences = data_target
        print("self.source_sentences = " + str(self.source_sentences[0]))
        print("self.target_sentences = " + str(self.target_sentences[0]))
        
        self.token2id_source = token2id_source
        self.token2id_target = token2id_target
        # prints the mandarin token -> # dictionary
        """
        It also contains wrong tokenized words & Latin Alphabet words too
        '11<EOS>': 13275, '为啥': 13276, '指责': 13277, '超载': 13278, 
        '复杂度': 13279, '小玩意': 13280, '小玩意儿': 13281, '此行': 13282, 
        '断面': 13283, '强制': 13284, '制发': 13285, '花瓶': 13286,
        '糖': 13287, '年缴': 13288, '缴纳': 13289, '会费': 13290,
        '99': 13291, 'Photoshop': 13292, '4000<EOS>': 13293, 
        '升级': 13294, '佯': 13295, '谬': 13296, 'Microsoft': 13297, 
        'Word': 13298, '文字': 13299,
        """
#         print("self.token2id_source = " + str(self.token2id_source))
        # prints the english token -> # dictionary
#         print("self.token2id_target = " + str(self.token2id_target))        

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, batch_index):

#         source_word_idx, target_word_idx = [], []
        source_mask, target_mask = [], []
        
        for index in self.source_sentences[batch_index][:MAX_SENTENCE_LENGTH]:
            if index != UNK_IDX:
                source_mask.append(0)
            else:
                source_mask.append(1)
                
        for index in self.target_sentences[batch_index][:MAX_SENTENCE_LENGTH]:
            if index != UNK_IDX:
                target_mask.append(0)
            else:
                target_mask.append(1)
        
        source_indices = self.source_sentences[batch_index][:MAX_SENTENCE_LENGTH]
        target_indices = self.target_sentences[batch_index][:MAX_SENTENCE_LENGTH]
        
        source_list = [source_indices, source_mask, len(source_indices)]
        target_list = [target_indices, target_mask, len(target_indices)]
        
        return source_list + target_list

    
def translation_collate(batch, max_sentence_length):
    """
    Customized function for DataLoader that dynamically pads the 
    batch so that all data have the same length
    """
    source_data, target_data = [], []
    source_mask, target_mask = [], []
    source_lengths, target_lengths = [], []

    for datum in batch:
        source_lengths.append(datum[2])
        target_lengths.append(datum[5])
        
        # PAD
        source_data_padded = np.pad(np.array(datum[0]), 
                                    pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", 
                                    constant_values=0)
        source_data.append(source_data_padded)
        
        source_mask_padded = np.pad(np.array(datum[1]), 
                                    pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", 
                                    constant_values=0)
        source_mask.append(source_mask_padded)
        
        target_data_padded = np.pad(np.array(datum[3]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                                mode="constant", constant_values=0)
        target_data.append(target_data_padded)
        
        target_mask_padded = np.pad(np.array(datum[4]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                               mode="constant", constant_values=0)
        target_mask.append(target_mask_padded)
        
    ind_dec_order = np.argsort(source_lengths)[::-1]
    source_data = np.array(source_data)[ind_dec_order]
    target_data = np.array(target_data)[ind_dec_order]
    source_mask = np.array(source_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    target_mask = np.array(target_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    source_lengths = np.array(source_lengths)[ind_dec_order]
    target_lengths = np.array(target_lengths)[ind_dec_order]
    
    source_list = [torch.from_numpy(source_data), 
               torch.from_numpy(source_mask).float(), source_lengths]
    target_list = [torch.from_numpy(target_data), 
               torch.from_numpy(target_mask).float(), target_lengths]
        
    return source_list + target_list

print("Loading zhen_train_dataset:")
zhen_train_dataset = TranslationDataset(zhen_zh_train_indices,
                                       zhen_en_train_indices,
                                       token2id_source=zhen_zh_train_token2id,
                                       token2id_target=zhen_en_train_token2id)

zhen_train_loader = torch.utils.data.DataLoader(dataset=zhen_train_dataset,
                               batch_size=BATCH_SIZE,
                               collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                               shuffle=False)

print("\nLoading zhen_dev_dataset:")
zhen_dev_dataset = TranslationDataset(zhen_zh_dev_indices,
                                       zhen_en_dev_indices,
                                       token2id_source=zhen_zh_train_token2id,
                                       token2id_target=zhen_en_train_token2id)

zhen_dev_loader = torch.utils.data.DataLoader(dataset=zhen_dev_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                             shuffle=False)

print("\nLoading vien_train_dataset:")
vien_train_dataset = TranslationDataset(vien_vi_train_indices,
                                       vien_en_train_indices,
                                       token2id_source=vien_vi_train_token2id,
                                       token2id_target=vien_en_train_token2id)

vien_train_loader = torch.utils.data.DataLoader(dataset=vien_train_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                             shuffle=False)

print("\nLoading vien_dev_dataset:")
vien_dev_dataset = TranslationDataset(vien_vi_dev_indices,
                                       vien_en_dev_indices,
                                       token2id_source=vien_vi_train_token2id,
                                       token2id_target=vien_en_train_token2id)

vien_dev_loader = torch.utils.data.DataLoader(dataset=vien_dev_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                             shuffle=False)


Loading zhen_train_dataset:
self.source_sentences = [321, 7912, 2, 7912, 310, 2, 4, 2, 1586, 23701, 2, 2, 2, 275, 49581, 2, 2, 2, 5915, 6331, 2, 2, 5868, 16124, 5789, 2]
self.target_sentences = [4216, 16909, 49096, 8295, 2, 16909, 2107, 2, 268, 2, 8295, 2, 1735, 8295, 8295, 10558, 2, 1263, 28417, 8295, 158, 2107, 23]

Loading zhen_dev_dataset:
self.source_sentences = [2, 2, 1232, 1232, 2, 4, 2, 55, 91, 2, 2, 2, 2, 2695, 650, 2, 650, 14, 2, 766, 678, 2, 4782, 2, 2, 2, 281, 2, 2, 2, 27, 2802, 2, 430, 729, 2, 14, 2, 2, 2, 2, 16, 2, 4733, 5952, 2, 2, 2, 2, 2, 2, 2, 2]
self.target_sentences = [2, 2, 2, 2, 2, 3, 2, 2, 30, 199, 2, 3, 2, 2, 2, 419, 2, 419, 2, 2, 2, 2, 2, 30, 2, 1211, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 419, 2, 2, 2, 1211, 2, 2, 2, 51, 2, 2, 51, 2, 2, 2, 199, 2, 2, 2, 1109, 2, 2, 2, 2, 2, 2, 2, 2, 1211, 2, 2, 419, 2, 2, 2, 2, 2, 199, 2, 2, 2, 2, 2, 2, 2]

Loading vien_train_dataset:
self.source_sentences = [3018, 2843, 137, 44, 2, 2843, 137, 1836, 2, 44, 1830, 5219, 2, 1082, 44, 888,

In [43]:
##########################################
#### SAMPLE DATASET - CLEAR OUT LATER ####
##########################################

# zhen_train_dataset = TranslationDataset(zhen_zh_train_indices[:480], # 15 batches
#                                        zhen_en_train_indices[:480],
#                                        token2id_source=zhen_zh_train_token2id,
#                                        token2id_target=zhen_en_train_token2id)

# zhen_train_loader = torch.utils.data.DataLoader(dataset=zhen_train_dataset,
#                                batch_size=BATCH_SIZE,
#                                collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
#                                shuffle=False)

## Part 2: Evaluation Metric

We use BLEU as the evaluation metric. Specifically, we focus on the corpus-level BLEU function. 

The code for BLEU is taken from https://github.com/mjpost/sacreBLEU/blob/master/sacrebleu.py#L1022-L1080

In [44]:
! pip3 install sacrebleu

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [126]:
# import sacrebleu

In [127]:
# TODO

## Part 3: Beam Search Algorithm

In this section, we implement the Beam Search algorithm in Pytorch.

## Part 4: Model

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.

#### Loss Function & Evaluation

In [46]:
# reconstruction loss = binary cross entropy between two (vocab_size x 1) vectors
# used during training, since we can compare the real Y and and the generated Y
# still at each time step of the decoder, we compare up to and including
# the real t-th token and the generated t-th, then optimize

def loss_function(y_hat, y):
    
    """Takes as input;
    - y: correct "log-softmax"(binary vector) that represents the correct t-th token in the target sentence,
                 (vocab_size x 1) vector
    - y_hat: predicted LogSoftmax for the predicted t-th token in the target sentence.
             (vocab_size x 1) vector
    Returns;
    - NLL Loss in training time"""
#     y_hat = torch.log(y_hat) # log softmax
    loss = nn.functional.binary_cross_entropy(y_hat,y)
    
    return loss
    

# generation/inference time - validation loss = BLEU

def compute_BLEU(corpus_hat,corpus):
    ## TODO
    return None


#### Beam Search

### Beam Search

In [48]:
torch.FloatTensor([3,4,2,7,5,3,2]).topk(3)

(tensor([7., 5., 4.]), tensor([3, 4, 1]))

In [49]:
BATCH_SIZE = 32

class BeamSearch(nn.Module):
    
    """network that conducts beam search over the outputs of
     any translator network. The translator networks that can 
     be passed are:
     
     - Translate (for RNN-enc-dec),
     - AttnTranslate (for RNN-enc-dec with attention),
     - CNNtranslate (for CNN-encoder based translation).
     
     The translation networks take care of the encoder-decoder
     choices specific to each task. Please see in below sections."""

    def __init__(self, translator_network, beam_size):
        super().__init__()
        # translator network that returns the logsoftmax
        # over vocabulary size:(vocab_size, 1)
        self.translator_network = translator_network
        self.beam_size = beam_size
        
    def init_search_tree(self, batch_size):
        beam_size = self.beam_size
        self.search_tree = torch.empty(batch_size, beam_size, 1)
        return self
    
    def init_score_tree(self, batch_size):
        beam_size = self.beam_size
        search_tree = self.search_tree
        self.score_tree = torch.zeros(search_tree.size())
        return self
    
    def forward(source_sentence, source_mask, source_lengths,
                target_sentence, target_mask, target_lengths):
        
        self.init_search_tree(BATCH_SIZE)
        self.init_score_tree(BATCH_SIZE)
        
        # at each time step the decoder will give us the logsoftmax
        # of one token (batch_size, vocab_size). 
        output = model(source_sentence, target_sentence,source_mask, 
                       target_mask, source_lengths,target_lengths)
        
        # for each sentence in the batch we get the top k predictions
        # for each token and append it to the search and score trees. 
        for i in range(BATCH_SIZE):
            beam = output[i].topk(beam_size) # (token scores, token indices)
            # cat instead
            self.search_tree[i] = self.search_tree.cat(beam[1]) # cat the indices to the search tree
            self.score_tree[i,:] = beam[0] # append the scores to the score tree 
        
        # we will sum the logs 
        
        

### 2.1: RNN-based Encoder-Decoder without Attention

In [50]:
batch_size = 32

In [54]:
def convert_to_softmax(tensor_of_indices,
                       batch_size,
                       vocab_size = len(zhen_en_train_token2id)):
    """
    - takes as input a time_step vector of the batch (t-th token of each sentence in the batch)
      size: (batch_size, 1)
    - converts it to softmax of (batch_size, vocab_size)
    """
    index_tensor_ = tensor_of_indices.view(-1,1).long()
        
    one_hot = torch.FloatTensor(batch_size, vocab_size).zero_()
    one_hot.scatter_(1, index_tensor_.detach().cpu(), 1)
    
    return one_hot

### Fully self-attention Translation System / Transformer

In [39]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

    
    
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)    

    
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
    
    
    
    
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
    
    
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
    
    
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
    
    
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)
    
    
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)
    
    
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
    
    
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
    
    
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)
    

In [47]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0


def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)
    
    
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
#             nn.init.xavier_uniform_(p)
            nn.init.xavier_uniform_(p)
    return model






In [48]:
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [49]:
tmp_model = make_model(10, 10, 2)
tmp_model

EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1)
        )
        (sublayer): ModuleList(
          (0): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1)
          )
          (1): SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1)
      

In [50]:
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask

In [51]:
def run_epoch(data_iter, model, loss_compute):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg, 
                            batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                    (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

In [52]:
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)

In [53]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [124]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        
        # creates a copy of the Variable
        true_dist = x.data.clone()
        print("true_dist = " + str(true_dist))
        
        # Fills self tensor with the specified value
        true_dist.fill_(self.smoothing / (self.size - 2.0))
        print("true_dist = " + str(true_dist))
        
        # scatter_(dim, index, src) → Tensor
        print("self.confidence = " + str(self.confidence))
        true_dist.scatter_(dim = 1, 
                           index = target.data.unsqueeze(1), 
                           src = self.confidence)
        
        true_dist[:, self.padding_idx] = 0.
        mask = torch.nonzero(target.data == self.padding_idx)
#         if mask.dim() > 0:
        if mask.sum() > 0 and len(mask) > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

In [125]:
crit = LabelSmoothing(5, 0, 0.1)
def loss(x):
    d = x + 3 * 1
    predict = torch.FloatTensor([[0, x / d, 1 / d, 1 / d, 1 / d],
                                 ])
    #print(predict)
    return crit(Variable(predict.log()),
                 Variable(torch.LongTensor([1]))).data[0]



In [119]:
def data_gen(V, batch, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)

In [120]:
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm):
        x = self.generator(x)
        x_loss = x.contiguous().view(-1, x.size(-1)).float()
        y_loss = y.contiguous().view(-1)
        loss = self.criterion(x_loss, y_loss) / norm
                              
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.data[0] * norm

In [121]:
V = 11

criterion = LabelSmoothing(size=V, 
                           padding_idx=0, 
                           smoothing=0.0)

model = make_model(V, V, N=2)

model_opt = NoamOpt(model.src_embed[0].d_model, 
                    1, 
                    400,
                    torch.optim.Adam(model.parameters(), 
                                     lr=0, 
                                     betas=(0.9, 0.98), 
                                     eps=1e-9))

for epoch in range(10):
    model.train()
    run_epoch(data_iter = data_gen(V, 30, 20), 
              model = model, 
              loss_compute = SimpleLossCompute(generator = model.generator, 
                                               criterion = criterion, 
                                               opt = model_opt))
    model.eval()
    print(run_epoch(data_iter = data_gen(V, 30, 5), 
                    model = model, 
                    loss_compute = SimpleLossCompute(generator = model.generator, 
                                                     criterion = criterion, 
                                                     opt = None)))



true_dist = tensor([[-1.9340, -4.1510, -2.6695,  ..., -2.6970, -3.7252, -1.2385],
        [-2.3561, -1.1062, -3.0696,  ..., -2.7260, -3.8816, -1.6418],
        [-3.7725, -2.3680, -4.6145,  ..., -4.4205, -4.0762, -1.8362],
        ...,
        [-2.7200, -1.7991, -4.6643,  ..., -1.3054, -5.5390, -1.0155],
        [-3.3538, -1.7163, -2.9694,  ..., -2.2880, -3.2859, -2.1620],
        [-4.2577, -1.2764, -2.6886,  ..., -2.4744, -2.3833, -1.3999]])
true_dist = tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
self.confidence = 1.0


TypeError: scatter_() received an invalid combination of arguments - got (src=float, index=Tensor, dim=int, ), but expected one of:
 * (int dim, Tensor index, Tensor src)
      didn't match because some of the arguments have invalid types: ([32;1mdim=int[0m, [32;1mindex=Tensor[0m, [31;1msrc=float[0m, )
 * (int dim, Tensor index, Number value)
      didn't match because some of the keywords were incorrect: src
