In [1]:
import sys
import re
import os
from pathlib import Path
from collections import namedtuple

import numpy as np

In [2]:
# Locations of the LASER checkout, the pretrained models and the Tatoeba data,
# all given relative to the notebook's working directory.
LASER_PATH = Path("..")
MODEL_PATH = Path("../models")
DATA_PATH = Path("../data/tatoeba/v1/")

# Publish the absolute checkout location through the LASER environment variable,
# then put that same directory on the import path so the `source.*` imports
# that follow can be resolved.
os.environ["LASER"] = str(LASER_PATH.resolve())
sys.path.append(os.environ["LASER"])

from source.shortcuts import text_file_pipeline
from source.lib.indexing import IndexSearchMultiple

In [3]:
# Both corpus files go through the same pipeline with the same two model files
# and the same options; only the language code and the input file differ.
model_file = str(MODEL_PATH / "bilstm.93langs.2018-12-26.pt")
codes_file = str(MODEL_PATH / "93langs.fcodes")
pipeline_options = {"use_cpu": False, "returns": "index", "batch_size": 20}

# English side of the cmn-eng Tatoeba pair.
data_en, index_en = text_file_pipeline(
    "en",
    str(DATA_PATH / "tatoeba.cmn-eng.eng"),
    model_file,
    codes_file,
    **pipeline_options
)

# Chinese (Mandarin) side of the same pair.
data_zh, index_zh = text_file_pipeline(
    "zh",
    str(DATA_PATH / "tatoeba.cmn-eng.cmn"),
    model_file,
    codes_file,
    **pipeline_options
)

 - Tokenizer: tatoeba.cmn-eng.eng in language en  
 - fast BPE: processing token
 - Encoder: bpe to enc
 - Encoder: 1000 sentences in 4s
 - embedding: /tmp/tmprlouhyym/enc 1000 examples of dim 1024
 - creating FAISS index
 - Tokenizer: tatoeba.cmn-eng.cmn in language zh  
 - fast BPE: processing token
 - Encoder: bpe to enc
 - Encoder: 1000 sentences in 4s
 - embedding: /tmp/tmphoiz6jwm/enc 1000 examples of dim 1024
 - creating FAISS index


## Evaluation

In [4]:
# Cross-lingual similarity search between the two languages. With verbose=True
# the call prints the similarity error for each direction.
err = IndexSearchMultiple(
    [data_en, data_zh],
    [index_en, index_zh],
    verbose=True,
    texts=None,
)

Calculating similarity error (indices):
 - similarity error lang_1=>lang_2  4.10%
 - similarity error lang_2=>lang_1  5.00%


### Error Analysis

Read in the documents:

In [5]:
# Load both sides of the parallel corpus as lists of raw lines (trailing
# newlines kept), keyed by the language suffix of the file name.
documents = {}
for lang in ("eng", "cmn"):
    # Decode explicitly as UTF-8: without `encoding=` Python falls back to the
    # platform locale, which can fail or garble the Mandarin text on systems
    # whose default is not UTF-8.
    with open(DATA_PATH / f"tatoeba.cmn-eng.{lang}", encoding="utf-8") as fin:
        documents[lang] = fin.readlines()
# Both files are expected to have the same number of lines.
print(len(documents["eng"]), len(documents["cmn"]))

1000 1000


#### English to Chinese Mandarin

In [6]:
# For every English embedding, fetch the single best match (k=1) from the
# Chinese index. The first return value is not used here; it is bound to a
# named throwaway rather than `_`, because assigning to `_` in an IPython
# kernel overrides the "last output" variable and stops IPython updating it.
_scores, matched_indices = index_zh.search(data_en, 1)
matched_indices.shape

(1000, 1)

In [7]:
# Number of rows whose top-1 match is the row's own position, i.e. the
# sentence on the same line of the other file.
(matched_indices[:, 0] == np.arange(matched_indices.shape[0])).sum()

959

In [8]:
# List every English sentence whose best Chinese match is not the sentence on
# the same line, showing the retrieved sentence next to the reference one.
top1 = matched_indices[:, 0]
for idx in np.flatnonzero(top1 != np.arange(top1.shape[0])):
    src_text = documents["eng"][idx].strip()
    predicted_text = documents["cmn"][top1[idx]].strip()
    reference_text = documents["cmn"][idx].strip()
    print(
        "source:  ", src_text + "\n",
        "predict: ", predicted_text + "\n",
        "correct: ", reference_text + "\n", sep=""
    )

source:  I'm at a loss for words.
predict: 我興奮得說不出話來。
correct: 我不知道應該說什麼才好。

source:  I just don't know what to say.
predict: 我不知道應該說什麼才好。
correct: 我只是不知道應該說什麼而已……

source:  You should sleep.
predict: 你应该睡觉。
correct: 你應該去睡覺了吧。

source:  So fuckin' what.
predict: 這是什麼啊？
correct: 那又怎樣?

source:  I don't like him any more than he likes me.
predict: 我也不喜欢他。
correct: 我們之間已經沒有感情了。

source:  A guest should not try to make himself superior to the host.
predict: 没有欲望就等于拥有。
correct: 强宾不压主。

source:  Something can't get out of hand.
predict: 这个问题不能解决。
correct: 爱不释手

source:  She was beside herself with joy.
predict: 她陶醉在幸福裡。
correct: 她欣喜若狂。

source:  I am the fastest runner.
predict: 我是最快的跑者。
correct: 我是跑得最快的人。

source:  I am sorry to trouble you.
predict: 我很抱歉打擾你了。
correct: 我很抱歉給你添麻煩了。

source:  I am content with my job.
predict: 我很滿意我的工作。
correct: 我對我的工作感到滿意。

source:  He is an easy mark at cards.
predict: 他是一個撲克牌高手。
correct: 他打牌很容易被騙。

source:  You can see that the architect paid scrupulous at

In [9]:
# Reverse direction: for every Chinese embedding, fetch the single best match
# (k=1) from the English index. The unused first return value is bound to a
# named throwaway rather than `_`, because assigning to `_` in an IPython
# kernel overrides the "last output" variable and stops IPython updating it.
_scores, matched_indices = index_en.search(data_zh, 1)
matched_indices.shape

(1000, 1)

#### Chinese Mandarin to English

In [10]:
# List every Chinese sentence whose best English match is not the sentence on
# the same line, showing the retrieved sentence next to the reference one.
top1 = matched_indices[:, 0]
for idx in np.flatnonzero(top1 != np.arange(top1.shape[0])):
    src_text = documents["cmn"][idx].strip()
    predicted_text = documents["eng"][top1[idx]].strip()
    reference_text = documents["eng"][idx].strip()
    print(
        "source:  ", src_text + "\n",
        "predict: ", predicted_text + "\n",
        "correct: ", reference_text + "\n", sep=""
    )

source:  我不知道應該說什麼才好。
predict: I just don't know what to say.
correct: I'm at a loss for words.

source:  你應該去睡覺了吧。
predict: You should go to bed.
correct: You should sleep.

source:  那又怎樣?
predict: What is this?
correct: So fuckin' what.

source:  我們之間已經沒有感情了。
predict: It would be better for both of us not to see each other anymore.
correct: I don't like him any more than he likes me.

source:  我不同意你這個觀點。
predict: I agree with your opinion.
correct: I can't go along with you on that point.

source:  强宾不压主。
predict: A lion is strong.
correct: A guest should not try to make himself superior to the host.

source:  我不了解他。
predict: I don't understand you.
correct: I can't figure him out.

source:  爱不释手
predict: I miss you.
correct: Something can't get out of hand.

source:  她欣喜若狂。
predict: She is sick.
correct: She was beside herself with joy.

source:  我是最快的跑者。
predict: I am the fastest runner.
correct: I'm the fastest runner.

source:  我很抱歉打擾你了。
predict: I am sorry to trouble you.
correc