In [14]:
import os
import shutil
import subprocess
import xml.etree.ElementTree as ET

import numpy as np

In [3]:
# Root folder that is passed as the `truth` argument to grade_lang / gen_groundtruth below.
groundtruth_dir='/data/QbE-Final/scoring_quesst2015_lang_noise'
# NOTE(review): `lang` is rebound by the `for lang in langs` loops in later cells.
lang = 'Chinese'
# NOTE(review): not referenced by the cells visible here, which pass a relative
# "../ckpts/..." path instead — presumably the same directory; confirm.
result_dir = '/data/QbE-Final/ckpts/phrase-mean/allmspanspec_fbank_pitch_phones/2015results/'

In [19]:
def default_cos_to_score(s):
    """Linearly rescale ``s`` into a detection score.

    The mapping is ``10 * (1.5 - s) - 7.5``: an input of 1.5 yields -7.5 and
    smaller inputs yield proportionally larger scores.

    NOTE(review): the name suggests ``s`` is a cosine-based distance; confirm
    against whatever produces the values being converted.
    """
    margin = 1.5 - s
    return 10 * margin - 7.5

In [46]:
# Output templates. The stray four-space indents after each newline are kept on
# purpose so the generated file is byte-identical to what earlier runs produced.
_STDLIST_HEADER = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '    <stdlist termlist_filename="Query.tlist.xml" indexing_time="1.00" '
    'language="mul" index_size="1" system_id="AWE system">\n'
    '    '
)
_STDLIST_FOOTER = "</stdlist>\n"
_TERMLIST_HEADER = (
    '<detected_termlist termid="{}" term_search_time="2.00" oov_term_count="1">\n'
    '    '
)
_TERMLIST_FOOTER = "</detected_termlist>\n"
_TERM_ROW = (
    '<term file="{}" channel="1" tbeg="0" dur="{}" score="{}" decision="{}"/>\n'
    '    '
)


def _read_term_ids(tlist_fn):
    """Return the distinct ``termid`` attributes of the term-list root's children, sorted."""
    return sorted({node.attrib['termid'] for node in ET.parse(tlist_fn).getroot()})


def _read_audio_durations(ecf_fn):
    """Map each audio file's bare name (no directory, no extension) to its ``dur`` attribute."""
    durations = {}
    for node in ET.parse(ecf_fn).getroot():
        stem = os.path.splitext(os.path.basename(node.attrib['audio_filename']))[0]
        durations[stem] = node.attrib['dur']
    return durations


def grade_lang(result, truth, lang, thres=0, qtype=None, score_function=None):
    """Write an ``AWESystem.stdlist.xml`` for one language and run the scoring script on it.

    Args:
        result: Directory containing ``dev_dist_dict.npy``; the output folder
            ``<result>/<lang>`` (or ``<result>/<lang>_<qtype>``) is created inside it.
        truth: Directory containing ``groundtruth_quesst2015_dev_<lang>[_<qtype>]``.
        lang: Language name used to build folder and file names.
        thres: A pair is marked ``YES`` when its (converted) score is strictly
            greater than this value, otherwise ``NO``.
        qtype: Optional suffix selecting the ``<lang>_<qtype>`` folders.
        score_function: Optional callable applied to each stored value before
            thresholding and writing.

    Returns:
        None.

    Side effects:
        Creates the output folder, writes the XML file, and runs
        ``./score-TWV-Cnxe.sh`` inside ``../scoring_quesst2015`` (resolved
        against the current working directory). The caller's working directory
        is left unchanged, so relative paths keep their meaning across calls.
    """
    suffix = lang if qtype is None else f"{lang}_{qtype}"
    working_dir = os.path.join(result, suffix)
    groundtruth_lang = os.path.join(truth, f"groundtruth_quesst2015_dev_{suffix}")

    os.makedirs(working_dir, exist_ok=True)
    output_fn = os.path.join(working_dir, "AWESystem.stdlist.xml")

    # SECURITY: allow_pickle=True unpickles the file and can execute arbitrary
    # code; only load distance dictionaries that you produced yourself.
    dist_dict = np.load(os.path.join(result, 'dev_dist_dict.npy'), allow_pickle=True).item()

    ecf_fn = os.path.join(groundtruth_lang, f"quesst2015_{lang}.ecf.xml")
    tlist_fn = os.path.join(groundtruth_lang, f"quesst2015_{lang}_dev.tlist.xml")

    # Sorted so the generated file is identical from run to run (plain set
    # iteration order of strings varies between interpreter sessions).
    term_ids = _read_term_ids(tlist_fn)
    audio_time_dict = _read_audio_durations(ecf_fn)

    # The header declares UTF-8, so write with that encoding explicitly.
    with open(output_fn, 'w', encoding='utf-8') as f:
        f.write(_STDLIST_HEADER)
        for q in term_ids:
            f.write(_TERMLIST_HEADER.format(q))
            for s, dur in audio_time_dict.items():
                if (q, s) not in dist_dict:
                    continue
                score = dist_dict[(q, s)]
                if score_function is not None:
                    score = score_function(score)
                decision = 'YES' if score > thres else 'NO'
                f.write(_TERM_ROW.format(s, dur, score, decision))
            f.write(_TERMLIST_FOOTER)
        f.write(_STDLIST_FOOTER)

    # Run the scorer in its own directory without os.chdir, and pass absolute
    # paths so they resolve to the folders used above regardless of that cwd.
    # An argument list (no shell) keeps paths with spaces or metacharacters intact.
    scoring_dir = os.path.abspath("../scoring_quesst2015")
    completed = subprocess.run(
        [
            "./score-TWV-Cnxe.sh",
            os.path.abspath(working_dir),
            os.path.abspath(groundtruth_lang),
            "-7.5",  # third script argument; its meaning is defined by the script itself
        ],
        cwd=scoring_dir,
    )
    if completed.returncode != 0:
        # A non-zero status is reported instead of silently ignored; it is not
        # raised because the script's exit-code convention is not known here.
        print(f"warning: score-TWV-Cnxe.sh exited with status "
              f"{completed.returncode} for {working_dir}")

In [23]:
# Grade each language once; a single loop replaces four copy-pasted cells that
# differed only in the language name.
for eval_lang in ['Chinese', 'Albanian', 'Czech', 'Portuguese']:
    grade_lang("../ckpts/phrase-mean/allmspanspec_fbank_pitch_phones/2015results/",
               groundtruth_dir, eval_lang, thres=0, score_function=default_cos_to_score)

In [28]:
def gen_gold_dict(gold_fn):
    """Collect ``(query, search)`` pairs from the ``LEXEME`` lines of a file.

    Each line starting with ``LEXEME`` is split on single spaces; field 1 is
    taken as the search id and field 5 as the query id. Only pairs whose query
    id contains ``quesst2015`` are kept.

    Args:
        gold_fn: Path of the text file to read.

    Returns:
        A set of ``(query_id, search_id)`` tuples (despite the function's name,
        not a dict).
    """
    with open(gold_fn, 'r') as handle:
        records = handle.read().splitlines()

    pairs = set()
    for record in records:
        if not record.startswith('LEXEME'):
            continue
        fields = record.split(' ')
        search_id, query_id = fields[1], fields[5]
        if 'quesst2015' in query_id:
            pairs.add((query_id, search_id))
    return pairs

In [40]:
def gen_groundtruth(truth, lang, query_type,
                    query_truth_dir="/data/QbE-Final/scoring_quesst2015"):
    """Create a ground-truth folder for one language restricted to one query type.

    The ``.ecf.xml`` and ``.tlist.xml`` files of the language are copied into
    ``<truth>/groundtruth_quesst2015_dev_<lang>_<query_type>``. The language's
    ``.rttm`` file is rewritten there: on every ``LEXEME`` line whose
    (field 5, field 1) pair is not returned by ``gen_gold_dict`` for the
    query-type file, field 5 is replaced with ``NO_KEYWORD``. Consecutive
    identical output lines are written only once.

    Args:
        truth: Directory containing ``groundtruth_quesst2015_dev_<lang>``; the
            new folder is created next to it.
        lang: Language name used in folder and file names.
        query_type: Query-type label used in folder and file names.
        query_truth_dir: Directory containing
            ``groundtruth_quesst2015_dev_<query_type>``. Defaults to the path
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        None.
    """
    groundtruth_lang = os.path.join(truth, f"groundtruth_quesst2015_dev_{lang}")
    new_working_dir = os.path.join(truth, f"groundtruth_quesst2015_dev_{lang}_{query_type}")
    os.makedirs(new_working_dir, exist_ok=True)

    # These two files are shared unchanged between the full and per-type folders.
    for name in (f"quesst2015_{lang}.ecf.xml", f"quesst2015_{lang}_dev.tlist.xml"):
        shutil.copyfile(os.path.join(groundtruth_lang, name),
                        os.path.join(new_working_dir, name))

    query_gold_fn = os.path.join(
        query_truth_dir,
        f"groundtruth_quesst2015_dev_{query_type}",
        f"quesst2015_dev_{query_type}.rttm",
    )
    lang_gold_fn = os.path.join(groundtruth_lang, f"quesst2015_{lang}_dev.rttm")
    new_gold_fn = os.path.join(new_working_dir, f"quesst2015_{lang}_dev.rttm")

    gold_set = gen_gold_dict(query_gold_fn)

    with open(lang_gold_fn, 'r') as f:
        lines = f.read().splitlines()

    new_lines = []
    for line in lines:
        if line[:6] == 'LEXEME':
            fields = line.split(' ')
            # Field 1 is the search id, field 5 the query id (same layout that
            # gen_gold_dict reads).
            if (fields[5], fields[1]) not in gold_set:
                fields[5] = 'NO_KEYWORD'
            line = ' '.join(fields)
        new_lines.append(line)

    # Relabelling can leave identical neighbouring lines; keep only the first
    # of each run. Starting from "" also drops an empty first line, as before.
    last_line = ""
    with open(new_gold_fn, 'w') as f_new:
        for line in new_lines:
            if line != last_line:
                last_line = line
                f_new.write(line + '\n')

In [41]:
# Single-case run (Chinese / T1); the loop over `langs` and `types` in the next
# cell regenerates this same folder, so this call is redundant.
gen_groundtruth(groundtruth_dir, 'Chinese', 'T1')

In [42]:
# Languages and query-type labels to process; both lists are reused by the
# grading loop in the following cell.
langs = ['Chinese', 'Albanian', 'Czech', 'Portuguese']
types = ['T1', 'T2', 'T3']

# Build one query-type-restricted ground-truth folder per (language, type) pair.
# Note: the loop variable rebinds the global `lang`.
for lang in langs:
    for t in types:
        gen_groundtruth(groundtruth_dir, lang, t)

In [47]:
# Grade every (language, query type) pair; relies on `langs` and `types` defined
# in the previous cell and on the per-type ground-truth folders created there.
for lang in langs:
    for t in types:
        grade_lang("../ckpts/phrase-mean/allmspanspec_fbank_pitch_phones/2015results/",
           groundtruth_dir, lang, qtype=t, thres=0, score_function=default_cos_to_score)