In [1]:
import argparse
import pyterrier as pt
import pandas as pd
import os
import logging
from pyterrier_dr import BGEM3, FlexIndex
from ir_measures import R, MRR, nDCG

  from .autonotebook import tqdm as notebook_tqdm


# Load Queries

In [2]:
# TREC NeuCLIR edition; interpolated into the dataset id and the data file paths used by later cells.
trec_year = "2023"

In [3]:
dataset = pt.get_dataset(f"irds:neuclir/1/zh/trec-{trec_year}")

# Keep only the human-translated title (ht_title) and expose it under the
# "query" column name.
queries_orig = (
    dataset.get_topics(tokenise_query=False)
    .loc[:, ["qid", "ht_title"]]
    .rename(columns={"ht_title": "query"})
)

# Report how many queries were loaded.
print(f"Loaded {len(queries_orig)} queries for irds:neuclir/1/zh/trec-{trec_year} (HT)")

There are multiple query fields available: ('title', 'description', 'narrative', 'ht_title', 'ht_description', 'mt_title', 'mt_description', 'mt_narrative', 'translation_lang'). To use with pyterrier, provide variant or modify dataframe to add query column.
Loaded 76 queries for irds:neuclir/1/zh/trec-2023 (HT)


In [4]:
# Load the romanised (uroman) queries; assumes they were already tokenised
# before romanisation. The TSV has no header row, so column names are given
# explicitly.
#
# Every column is read as text up front:
#   * dtype=str keeps qids exactly as written in the file (no numeric
#     inference), so they compare equal to the string qids used elsewhere.
#   * keep_default_na=False stops pandas from parsing query text such as
#     "NA", "null" or "None" as missing values, which a later astype(str)
#     would have turned into the literal string "nan". Empty fields are
#     therefore loaded as "" rather than "nan".
queries_translit = pd.read_csv(
    f"/root/nfs/CLIR/data/transliterations/neuclir_1_zh_trec-{trec_year}_uroman.tsv",
    sep="\t",
    header=None,
    names=["qid", "ht_title", "mt_title", "ht_description"],
    dtype=str,
    keep_default_na=False,
).rename(columns={"ht_title": "query"})  # ht_title is the query text used below
print(f"Loaded {len(queries_translit)} queries for transliterated irds:neuclir/1/zh/trec-{trec_year} (HT)")

Loaded 76 queries for transliterated irds:neuclir/1/zh/trec-2023 (HT)


In [5]:
# Preview the first rows of the romanised queries.
queries_translit.head()

Unnamed: 0,qid,query,mt_title,ht_description
0,200,aoweihuitanwuhuilu,fubaihuilutiyulianhehuiaoyunhui,tiyujieshifouyoujigouxingfubaihehuiludenganjian?
1,201,zhongguozaiyilangdetouzi,zhongguoduiyilangdetouzi,zhongguozaiyilangtouzilenaxiechanye?
2,202,"xinxingjishu, jingzhunnongye, zhinengnongye, n...","xinxingjishu, jingquenongye, zhinengnongye, no...",zhengzaigaibiannongyechanyedezuixinxinxingjish...
3,203,zhangcilungeqian,cengjingbeikazhule,chazhaoyouguanzhangcilunzaisuyishiyunhegeqiand...
4,204,chufaxingfenjiyundongtingzhi,dianqiuxingfenjiyundongtingzhi,shiyongshenmeceelaijianshao/tingzhizaitiyuyund...


# Load Retrieval Results

In [6]:
# Location of the saved retrieval results for the original queries
# (directory and file name are adjacent literals joined at compile time).
res_orig_path = (
    "/root/nfs/CLIR/data/retrieval_results/bge-m3-ZH_MMARCO_50/"
    f"bge-m3-ZH_MMARCO_50_neuclir_1_zh_trec-{trec_year}_zh.res.gz"
)

In [7]:
# Location of the saved retrieval results for the romanised queries
# (directory and file name are adjacent literals joined at compile time).
res_trans_path = (
    "/root/nfs/CLIR/data/retrieval_results/bge-m3-ZH_MMARCO_50/"
    f"bge-m3-ZH_MMARCO_50_neuclir_1_zh_trec-{trec_year}_zh_trans_uro.res.gz"
)

In [10]:
# Read the saved results for the original queries and attach each query's
# text by joining on qid.
res_orig = pt.io.read_results(res_orig_path).merge(queries_orig, on="qid")
print(f"Loaded results from disk: {res_orig_path}")

Loaded results from disk: /root/nfs/CLIR/data/retrieval_results/bge-m3-ZH_MMARCO_50/bge-m3-ZH_MMARCO_50_neuclir_1_zh_trec-2023_zh.res.gz


In [11]:
# Read the saved results for the romanised queries and attach each query's
# text by joining on qid.
res_trans = pt.io.read_results(res_trans_path).merge(queries_translit, on="qid")
print(f"Loaded results from disk: {res_trans_path}")

Loaded results from disk: /root/nfs/CLIR/data/retrieval_results/bge-m3-ZH_MMARCO_50/bge-m3-ZH_MMARCO_50_neuclir_1_zh_trec-2023_zh_trans_uro.res.gz


# Analysis

## Sample 10 qids

In [12]:
# Draw 10 queries at random. The fixed seed makes the draw repeatable, so a
# Restart & Run All shows the same queries every time.
sample = queries_orig.sample(10, random_state=42)

In [13]:
# Display the sampled queries.
sample

Unnamed: 0,qid,query
51,251,影音創作者Pewdiepie聲量
19,219,"新科技, 人工智慧 / 人工智能（AI）, 醫療"
60,260,馬航370 海洋無限
28,228,5G優點和缺點
59,259,鬼滅之刃漫畫銷量
12,212,智慧城市、物联网(IoT)
69,269,巨石陣建造者和用途的科學發現
38,238,法赫里扎德遇刺美国对的反应
66,266,2017奧斯卡頒獎失誤
75,275,奧運游泳選手遴選


In [14]:
# Print the text of a single query (qid 251) in its original and romanised
# forms. `.values[0]` raises IndexError if the qid is absent from a frame.
qid = "251"
orig_text = queries_orig.loc[queries_orig["qid"] == qid, "query"]
print(orig_text.values[0])

translit_text = queries_translit.loc[queries_translit["qid"] == qid, "query"]
print(translit_text.values[0])

影音創作者Pewdiepie聲量
yingyinchuangzuozhePewdiepieshengliang


In [15]:
# Print the top-ranked results for `qid` from both runs, showing only the
# "qid", "docno", "rank" and "score" columns, and keep each run's top docnos
# (as NumPy arrays) in `docnos_orig` / `docnos_translit`.
TOP_K = 10
SHOWN_COLUMNS = ["qid", "docno", "rank", "score"]

# Filter once per run and sort by rank explicitly, so "top" does not depend on
# the row order of the results file.
top_orig = res_orig[res_orig["qid"] == qid].sort_values("rank", kind="stable").head(TOP_K)
print(top_orig[SHOWN_COLUMNS])
docnos_orig = top_orig["docno"].values

top_trans = res_trans[res_trans["qid"] == qid].sort_values("rank", kind="stable").head(TOP_K)
print(top_trans[SHOWN_COLUMNS])
docnos_translit = top_trans["docno"].values

       qid                                 docno  rank     score
51000  251  de07f107-88f2-459a-9d76-b886f828843d     0  0.536568
51001  251  4d8b0105-df2a-4403-a1d9-2f53590a44d9     1  0.510152
51002  251  dd3fdb41-678c-4182-af33-7be884584f72     2  0.504455
51003  251  82fef4cb-85e4-4156-81cb-92efb1c20f6d     3  0.495551
51004  251  46d36fd0-1f0f-43f8-b90c-7751c2543181     4  0.492586
51005  251  23a4fe76-8b4a-49a5-a165-a5b29c076c6b     5  0.488505
51006  251  fb79f6bc-dcb4-46f6-b751-f7d480b5ce50     6  0.487089
51007  251  dd48e96f-deb3-4f55-b569-40ac5d3a19f0     7  0.482957
51008  251  137fd728-9d1c-48a3-88d8-3d4ebcac1d96     8  0.479304
51009  251  8e65b7e8-ca5a-4f3f-a8c7-e11c40ee42b4     9  0.478233
       qid                                 docno  rank     score
51000  251  dd70716a-c9ef-4e53-9d99-e7cd813372c9     0  0.469929
51001  251  6ba2f305-1b7d-4e43-b3b6-9fea5679b446     1  0.467884
51002  251  fce1f4e0-dfa6-4adc-b783-68aa23cf7ff0     2  0.462127
51003  251  137fd728-9d1c

In [16]:
# Collect the docnos present in both top lists, in the order they appear in
# the original-query list.
common_docnos = []
for candidate in docnos_orig:
    if candidate in docnos_translit:
        common_docnos.append(candidate)
common_docnos

['4d8b0105-df2a-4403-a1d9-2f53590a44d9',
 '23a4fe76-8b4a-49a5-a165-a5b29c076c6b',
 '137fd728-9d1c-48a3-88d8-3d4ebcac1d96']

In [24]:
# run ir datasets docstore lookup on a docno
docno = "dd48e96f-deb3-4f55-b569-40ac5d3a19f0"

!ir_datasets lookup neuclir/1/zh/trec-{trec_year} $docno --fields title text

/bin/bash: /opt/miniconda3/envs/flagembedding/lib/libtinfo.so.6: no version information available (required by /bin/bash)
17LIVE共同創辦人潘杰賢 收購台灣最大Podcast平台SoundOn - 財經	新加坡投資顧問公司Kollective Ventures（KV）、Turn Capital、17LIVE共同創辦人潘杰賢的個人基金今（27）日宣布，收購台灣最大Podcast平台「SoundOn聲浪」。  SoundOn創辦人顧立楷表示，一直在尋找能提供SoundOn未來發展的最佳策略，KV與Turn Capital曾協助非常多新創公司快速擴張，因此成為本次募資的選擇。未來仍會以SoundOn繼續營運，希望KV和TurnCapital在不久的將來同時加速公司與產業的增長。  潘杰賢表示，本次收購是為了進一步投資與建立Podcast 生態圈。隨著Podcast市場爆發性成長，SoundOn已擁有逾70％市佔，希望今年能更迅速擴張，KV與Turn Capital也看見台灣Podcast市場具發展潛力。  SoundOn由Uber北亞洲區前總經理顧立楷成立，2019年正式上線，目前從Hosting平台（託管服務）、播放器、原創節目、Podcast廣告代理等，採一條龍布局，近萬個創作者投入成為Podcaster，SoundOn流量成長為創立時期的20倍以上。  SoundOn為台灣最大的Hosting平台，每月超過3,500萬次下載，估計在2021年此數字將增長至5億以上。  (工商 )
