In [None]:
import glob
import re
import collections
from metrics_domain_adaptation import utils
import json

# Filenames look like ".../xx_yy_doc<N>.txt": two 2-character language codes and a numeric doc id.
re_langs = re.compile(r".*/(.{2,2})_(.{2,2})_doc(\d+)\.txt")

# data_all[(lang1, lang2)][docname] -> list of rows, each row being the tab-split fields of one line.
# Later cells unpack every row into four fields (system, <unused>, source line, target line).
data_all = collections.defaultdict(dict)
for f in glob.glob("/home/ec2-user/MetricsDomainAdaptation/data/shuoyang/*_doc*.txt"):
    name_match = re_langs.match(f)
    if name_match is None:
        # The glob is looser than the regex (e.g. 3-letter codes, non-numeric doc ids);
        # report such files instead of crashing on `None.groups()`.
        print("Skipping file with unexpected name:", f)
        continue
    lang1, lang2, docname = name_match.groups()
    # Context manager so the handle is closed deterministically; explicit encoding
    # because the documents contain text in many languages.
    with open(f, "r", encoding="utf-8") as doc_file:
        data_all[(lang1, lang2)][docname] = [
            line.rstrip("\n").split("\t")
            for line in doc_file
        ]

# cheatsheet[("xx-yy", "doc<N>")] -> value parsed from the second tab-separated column.
# The first column is e.g. "xx_yy_doc<N>": the first underscore becomes a dash and the
# remainder is split on underscores to form the key tuple.
cheatsheet = {}
with open("/home/ec2-user/MetricsDomainAdaptation/data/shuoyang/cheat_sheet.txt", "r", encoding="utf-8") as cheat_file:
    for line in cheat_file:
        line = line.rstrip("\n")
        if not line.strip():
            # tolerate blank / trailing empty lines instead of failing on a missing column
            continue
        fields = line.split("\t")
        doc_key = tuple(fields[0].replace("_", "-", 1).split("_"))
        # The second column uses single quotes; swap them for double quotes so it parses as JSON.
        cheatsheet[doc_key] = json.loads(fields[1].replace("'", '"'))

In [None]:
# we have a few new languages
# Compare the language pairs found on disk with the project's known list, in both directions.
pairs_in_data = set([f"{x}-{y}" for x, y in data_all.keys()])
pairs_known = set(utils.LANGS2)
print(pairs_in_data - pairs_known)   # present in the data, missing from utils.LANGS2
print(pairs_known - pairs_in_data)   # listed in utils.LANGS2, missing from the data

In [None]:
# lang detect
import langdetect

# langdetect's algorithm is non-deterministic unless the factory seed is fixed;
# pin it so that re-running the notebook flags the same lines.
langdetect.DetectorFactory.seed = 0


def report_language_mismatch(expected_lang, text, label, min_prob=0.9):
    """Print a diagnostic when `text` is confidently detected as another language.

    Args:
        expected_lang: language code the text is supposed to be in.
        text: the segment to check.
        label: "<langs>:<docname>" prefix used in the printed diagnostic.
        min_prob: minimum probability of the top detection for a mismatch to be reported.

    Prints "Undetectable ..." when langdetect cannot process the text at all.
    """
    try:
        top_guess = langdetect.detect_langs(text)[0]
    except langdetect.LangDetectException:
        print("Undetectable", label, text)
        return
    # Drop any region suffix (the part after "-") before comparing with the expected code.
    detected_lang = top_guess.lang.split("-")[0]
    if expected_lang != detected_lang and top_guess.prob >= min_prob:
        print(label, expected_lang, detected_lang, text)


for (lang1, lang2), data_docs  in data_all.items():
    langs = lang1+"-"+lang2
    for docname, doc in data_docs.items():
        for _, _, line_src, line_tgt in doc:
            report_language_mismatch(lang1, line_src, langs+":"+docname)
            report_language_mismatch(lang2, line_tgt, langs+":"+docname)

In [None]:
# malformed segments
# Compiled pattern kept for the (currently disabled) regex-based variant of this check.
re_sent_ok = re.compile(r"^[\w\s\d]*$")

# Flag every source/target segment that contains at most two alphabetic characters.
for lang_pair, docs_by_name in data_all.items():
    pair_label = "-".join(lang_pair)
    for doc_id, rows in docs_by_name.items():
        for _, _, line_src, line_tgt in rows:
            for segment in (line_src, line_tgt):
                n_letters = sum(1 for ch in segment if ch.isalpha())
                if n_letters <= 2:
                    print(pair_label+":"+doc_id, segment)

In [None]:
# Report every (source, target) pair that occurs more than once anywhere in the data.
# all_lines maps "src | tgt" -> the system that produced its first occurrence.
all_lines = {}
for lang_pair, docs_by_name in data_all.items():
    pair_label = "-".join(lang_pair)
    for doc_id, rows in docs_by_name.items():
        systems_for_doc = cheatsheet[(pair_label, "doc"+doc_id)]
        for system_tag, _, line_src, line_tgt in rows:
            # "system<k>" -> entry k of this document's cheat-sheet record
            system_idx = int(system_tag.removeprefix("system"))
            system_name = systems_for_doc[system_idx]
            pair_key = line_src + " | " + line_tgt
            if pair_key not in all_lines:
                all_lines[pair_key] = system_name
                continue
            print(f"{system_name}/{all_lines[pair_key]}", pair_key)

In [None]:

from transformers import pipeline

# One Helsinki-NLP OPUS-MT translation pipeline per language pair present in the data,
# keyed by (lang1, lang2). Pairs with "pt" on either side are left out.
mt_models = {
    (lang1, lang2): pipeline(
        "translation",
        model=f"Helsinki-NLP/opus-mt-{lang1}-{lang2}",
        device=0,
    )
    for lang1, lang2 in data_all
    if "pt" not in (lang1, lang2)
}

In [None]:
import random
import tqdm

def sent_overlap(a, b, max_short_len=4, min_dice=0.15):
    """Decide whether two sentences share enough vocabulary to be plausibly related.

    Both sentences are lowercased and split on whitespace into sets of tokens.

    Args:
        a: first sentence.
        b: second sentence.
        max_short_len: when both token sets have at most this many elements the
            pair is accepted without measuring overlap.
        min_dice: minimum Dice coefficient, 2*|A & B| / (|A| + |B|), for longer pairs.

    Returns:
        True if the pair is accepted, False otherwise.
    """
    tokens_a = set(a.lower().split())
    tokens_b = set(b.lower().split())
    total = len(tokens_a) + len(tokens_b)
    # Two empty sentences have nothing to compare (and would divide by zero below).
    if total == 0:
        return True
    if len(tokens_a) <= max_short_len and len(tokens_b) <= max_short_len:
        return True
    # make sure there's some overlap
    return (2 * len(tokens_a & tokens_b)) / total >= min_dice

# Spot-check: machine-translate sampled lines of every document (in whichever
# directions a model is loaded) and print those whose MT output barely overlaps
# with the other side of the pair.
SAMPLE_SEED = 0        # fixed seed so a rerun inspects the same lines
SAMPLES_PER_DOC = 1
sample_rng = random.Random(SAMPLE_SEED)

for (lang1, lang2), data_docs in tqdm.tqdm(list(data_all.items())):
    langs = lang1+"-"+lang2
    for docname, doc in data_docs.items():
        # min(...) keeps sampling valid for documents with fewer rows than requested (incl. empty ones)
        sampled_rows = sample_rng.sample(doc, k=min(SAMPLES_PER_DOC, len(doc)))
        for _, _, line_src, line_tgt in sampled_rows:
            # (model key, text to translate, text to compare the translation against)
            directions = (
                ((lang1, lang2), line_src, line_tgt),
                ((lang2, lang1), line_tgt, line_src),
            )
            for model_key, text_in, text_ref in directions:
                if model_key not in mt_models:
                    continue
                text_mt = mt_models[model_key](text_in)[0]["translation_text"]
                if not sent_overlap(text_mt, text_ref):
                    print(langs+":"+docname, text_mt, text_ref)
