In [1]:
import sys
import re
import os
from pathlib import Path
from collections import namedtuple

import numpy as np

In [2]:
# Root of the LASER checkout, relative to this notebook.
LASER_PATH = ".."
# NOTE(review): presumably the LASER modules imported in a later cell read this
# environment variable — confirm against the LASER sources.
os.environ["LASER"] = LASER_PATH

# Make the LASER source directories importable.
sys.path.extend([LASER_PATH + '/source', LASER_PATH + '/source/lib'])

# Input corpus, pretrained model files, and a local directory for intermediate
# artefacts (created on first run, reused afterwards).
DATA_PATH = Path("../data/tatoeba/v1/")
MODEL_PATH = Path("../models")
CACHE_PATH = Path("cache/")
CACHE_PATH.mkdir(exist_ok=True)

In [3]:
# Pre-compiled pattern matching any run of whitespace. Written as a raw string
# so that `\s` reaches the regex engine verbatim rather than being parsed as an
# (invalid) string escape, which newer Python versions warn about.
SPACE_NORMALIZER = re.compile(r"\s+")
# Lightweight record with the fields `srcs`, `tokens` and `lengths`.
Batch = namedtuple('Batch', 'srcs tokens lengths')

In [4]:
from indexing import IndexLoad, IndexTextOpen, IndexTextQuery, IndexSearchKNN, IndexCreate, IndexSearchMultiple
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply

In [5]:
def MarginRatio(em, ofp, params, args, stats, sentences=None):
    """Write the index hits whose margin ratio passes ``args.threshold``.

    For each row of ``em`` the index ``params.idx`` is searched for
    ``args.margin_k`` results. A row counts as a match when its first result
    value divided by the mean of all its result values is at most
    ``args.threshold``.

    Args:
        em: Query batch passed straight to ``params.idx.search``.
        ofp: Writable text stream receiving tab-separated output lines.
        params: Object exposing ``idx`` (searchable index) and ``T`` / ``R``,
            which are forwarded to ``IndexTextQuery`` to fetch the hit text.
        args: Object exposing ``margin_k``, ``threshold`` and
            ``include_source``. ``'always'`` writes the source sentence for
            every row, ``'matches'`` only for rows that match; any other value
            writes no source sentences.
        stats: Mutable counters; ``nbs`` is incremented per row processed and
            ``nbp`` per match written.
        sentences: Source sentences aligned with the rows of ``em``. Required
            only when a source line actually has to be written.

    Raises:
        ValueError: If a source line must be written but ``sentences`` was
            not supplied.
    """
    def _source_line(row):
        # Format the source sentence with BPE continuation markers removed.
        if sentences is None:
            raise ValueError(
                "sentences must be provided when args.include_source is "
                "'always' or 'matches'")
        return '{:d}\t{:6.1f}\t{}\n'.format(
            stats.nbs, 0.0, sentences[row].replace('@@ ', ''))

    # NOTE(review): presumably FAISS-style (distances, indices) arrays with one
    # row per query — confirm against params.idx.
    D, I = params.idx.search(em, args.margin_k)
    Mean = D.mean(axis=1)
    for n in range(D.shape[0]):
        is_match = D[n, 0] / Mean[n] <= args.threshold
        # The source line precedes the hit line so the two read as a pair.
        if args.include_source == 'always' or (
                is_match and args.include_source == 'matches'):
            ofp.write(_source_line(n))
        if is_match:
            txt = IndexTextQuery(params.T, params.R, I[n, 0])
            ofp.write('{:d}\t{:7.5f}\t{}\n'.format(stats.nbs, D[n, 0], txt))
            stats.nbp += 1

        stats.nbs += 1

In [6]:
def buffered_read(fp, buffer_size):
    """Yield the stripped lines of ``fp`` in lists of up to ``buffer_size``.

    A list is emitted as soon as it holds at least ``buffer_size`` entries;
    whatever is left once ``fp`` is exhausted is emitted as a final, shorter
    list. Nothing is yielded for an empty ``fp``.
    """
    chunk = []
    for raw_line in fp:
        chunk.append(raw_line.strip())
        if len(chunk) < buffer_size:
            continue
        yield chunk
        chunk = []

    # Flush the trailing partial chunk, if any.
    if chunk:
        yield chunk

## Preprocessing

In [7]:
# Tokenize the Mandarin side of the Tatoeba cmn-eng pair; the second path, in
# the cache directory, is the tokenizer's output (skipped if it already exists,
# as the recorded output shows).
Token(
    str(DATA_PATH.joinpath("tatoeba.cmn-eng.cmn")),
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.cmn")),
    lang="zh",
    romanize=False,
    lower_case=True,
    gzip=False,
    verbose=True,
    over_write=False,
)

 - Tokenizer: tatoeba.cmn-eng.cmn exists already


In [8]:
# Tokenize the English side of the Tatoeba cmn-eng pair; the second path, in
# the cache directory, is the tokenizer's output (skipped if it already exists,
# as the recorded output shows).
Token(
    str(DATA_PATH.joinpath("tatoeba.cmn-eng.eng")),
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.eng")),
    lang="en",
    romanize=False,
    lower_case=True,
    gzip=False,
    verbose=True,
    over_write=False,
)

 - Tokenizer: tatoeba.cmn-eng.eng exists already


In [9]:
# BPE codes file shipped with the pretrained model; also used by the next cell.
bpe_codes = str(MODEL_PATH.joinpath("93langs.fcodes"))

# Apply BPE to the tokenized English text, writing a `.bpe` file next to it.
BPEfastApply(
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.eng")),
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.eng.bpe")),
    bpe_codes,
    verbose=True,
    over_write=False,
)

 - fast BPE: tatoeba.cmn-eng.eng.bpe exists already


In [10]:
# Apply BPE to the tokenized Mandarin text, using the `bpe_codes` path set in
# the previous cell.
BPEfastApply(
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.cmn")),
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.cmn.bpe")),
    bpe_codes,
    verbose=True,
    over_write=False,
)

 - fast BPE: tatoeba.cmn-eng.cmn.bpe exists already


## Extract Sentence Embeddings

In [11]:
# Build the sentence encoder from the pretrained checkpoint under MODEL_PATH.
# Batches are capped by token count only (no cap on sentence count).
encoder = SentenceEncoder(
    str(MODEL_PATH.joinpath("bilstm.93langs.2018-12-26.pt")),
    cpu=False,
    max_tokens=10000,
    max_sentences=None,
)

In [12]:
# Encode the BPE-processed Mandarin sentences into a cached `.enc` file.
EncodeFile(
    encoder,
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.cmn.bpe")),
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.cmn.enc")),
    verbose=True,
    over_write=False,
)

 - Encoder: tatoeba.cmn-eng.cmn.enc exists already


In [13]:
# Encode the BPE-processed English sentences into a cached `.enc` file.
EncodeFile(
    encoder,
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.eng.bpe")),
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.eng.enc")),
    verbose=True,
    over_write=False,
)

 - Encoder: tatoeba.cmn-eng.eng.enc exists already


## Create Index

In [14]:
# Load each language's embeddings and build an in-memory 'FlatL2' index over
# them (nothing is saved to disk).
data_en, index_en = IndexCreate(
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.eng.enc")),
    'FlatL2',
    verbose=True,
    save_index=False,
)
data_zh, index_zh = IndexCreate(
    str(CACHE_PATH.joinpath("tatoeba.cmn-eng.cmn.enc")),
    'FlatL2',
    verbose=True,
    save_index=False,
)

 - embedding: cache/tatoeba.cmn-eng.eng.enc 1000 examples of dim 1024
 - creating FAISS index
 - embedding: cache/tatoeba.cmn-eng.cmn.enc 1000 examples of dim 1024
 - creating FAISS index


## Evaluation

In [15]:
# Cross-lingual similarity search in both directions; the per-direction error
# rates are printed because verbose=True.
err = IndexSearchMultiple(
    [data_en, data_zh],
    [index_en, index_zh],
    langs=["en", "zh"],
    verbose=True,
)

Calculating similarity error:
 - similarity error en=>zh  4.10%
 - similarity error zh=>en  5.00%


### Error Analysis

Read in the tokenized, BPE-encoded documents:

In [16]:
# Load the BPE-encoded sentences of both languages, one list entry per line,
# keyed by language code. Line i of each list corresponds to row i of the
# embeddings created from the same `.bpe` file.
documents = {}
for lang in ("eng", "cmn"):
    # Explicit UTF-8: the Mandarin text must not be decoded with whatever the
    # platform's default locale encoding happens to be.
    with open(CACHE_PATH / f"tatoeba.cmn-eng.{lang}.bpe", encoding="utf-8") as fin:
        documents[lang] = fin.readlines()
print(len(documents["eng"]), len(documents["cmn"]))

1000 1000


#### English to Chinese Mandarin

In [17]:
# Query the index built from the Mandarin embeddings with every English
# embedding, asking for 1 result per query; only the second returned array is
# kept.
# NOTE(review): presumably the search returns (distances, indices) — confirm.
_, matched_indices = index_zh.search(data_en, 1)
# Show the shape of the result as the cell output.
matched_indices.shape

(1000, 1)

In [18]:
# Count the queries whose top hit is the row with the same position, i.e. the
# aligned sentence in the other language.
(matched_indices[:, 0] == np.arange(matched_indices.shape[0])).sum()

959

In [19]:
# Print every English sentence whose top Mandarin hit is not its aligned row,
# together with the retrieved sentence and the expected one.
for idx in np.where(matched_indices[:, 0] != np.arange(matched_indices.shape[0]))[0]:
    print(
        f'source:  {documents["eng"][idx].strip()}\n'
        f'predict: {documents["cmn"][matched_indices[idx, 0]].strip()}\n'
        f'correct: {documents["cmn"][idx].strip()}\n'
    )

source:  i 'm at a loss for words .
predict: 我@@ 興@@ 奮 得 說 不 出@@ 話 來   。
correct: 我 不 知道 應@@ 該 說 什麼 才 好   。

source:  i just don 't know what to say .
predict: 我 不 知道 應@@ 該 說 什麼 才 好   。
correct: 我 只是 不 知道 應@@ 該 說 什麼 而@@ 已   ..@@ ....

source:  you should sleep .
predict: 你 应该 睡@@ 觉   。
correct: 你 應@@ 該 去 睡@@ 覺 了 吧   。

source:  so fu@@ ck@@ in ' what .
predict: 這@@ 是 什麼 啊   ？
correct: 那 又 怎@@ 樣   ?

source:  i don 't like him any more than he lik@@ es me .
predict: 我 也 不 喜欢 他   。
correct: 我們 之@@ 間 已@@ 經 沒@@ 有 感@@ 情 了   。

source:  a gu@@ est should not try to make himself superior to the ho@@ st .
predict: 没有 欲@@ 望 就 等@@ 于 拥有   。
correct: 强@@ 宾 不@@ 压@@ 主   。

source:  something can 't get out of hand .
predict: 这个 问题 不能 解决   。
correct: 爱@@ 不@@ 释@@ 手

source:  she was be@@ side her@@ self with jo@@ y .
predict: 她 陶@@ 醉 在 幸@@ 福 裡   。
correct: 她 欣@@ 喜@@ 若@@ 狂   。

source:  i am the fast@@ est run@@ ner .
predict: 我 是 最@@ 快 的 跑 者   。
correct: 我 是 跑 得 最@@ 快 的 人   。

source:  i am sorry to t

In [20]:
# Reverse direction: query the index built from the English embeddings with
# every Mandarin embedding, asking for 1 result per query. This rebinds
# `matched_indices`, replacing the English-to-Mandarin result computed earlier.
# NOTE(review): presumably the search returns (distances, indices) — confirm.
_, matched_indices = index_en.search(data_zh, 1)
# Show the shape of the result as the cell output.
matched_indices.shape

(1000, 1)

#### Chinese Mandarin to English

In [21]:
# Print every Mandarin sentence whose top English hit is not its aligned row,
# together with the retrieved sentence and the expected one.
for idx in np.where(matched_indices[:, 0] != np.arange(matched_indices.shape[0]))[0]:
    print(
        f'source:  {documents["cmn"][idx].strip()}\n'
        f'predict: {documents["eng"][matched_indices[idx, 0]].strip()}\n'
        f'correct: {documents["eng"][idx].strip()}\n'
    )

source:  我 不 知道 應@@ 該 說 什麼 才 好   。
predict: i just don 't know what to say .
correct: i 'm at a loss for words .

source:  你 應@@ 該 去 睡@@ 覺 了 吧   。
predict: you should go to bed .
correct: you should sleep .

source:  那 又 怎@@ 樣   ?
predict: what is this ?
correct: so fu@@ ck@@ in ' what .

source:  我們 之@@ 間 已@@ 經 沒@@ 有 感@@ 情 了   。
predict: it would be better for both of us not to see each other any@@ more .
correct: i don 't like him any more than he lik@@ es me .

source:  我 不 同意 你 這@@ 個 觀@@ 點   。
predict: i agree with your opinion .
correct: i can 't go along with you on that point .

source:  强@@ 宾 不@@ 压@@ 主   。
predict: a lion is strong .
correct: a gu@@ est should not try to make himself superior to the ho@@ st .

source:  我 不 了解 他   。
predict: i don 't understand you .
correct: i can 't figure him out .

source:  爱@@ 不@@ 释@@ 手
predict: i miss you .
correct: something can 't get out of hand .

source:  她 欣@@ 喜@@ 若@@ 狂   。
predict: she is si@@ ck .
correct: she was be@@ side her@@ s