Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[2nd sub-PR of #117] handle other trec topic format #176

Merged
merged 1 commit into from
Aug 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions capreolus/data/topics.dummy.for-style-testing.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<top>
<topics>
<top lang='en'>
<num>301</num>
<title>Dummy doc</title>
<desc>A dummy doc</desc>
<narr>The doc of the dummies</narr>
<top>


<top>
<topics>
<top lang='en'>
<num>302
<title>title of Dummy doc 302</title>
<desc>desc of dummy doc 302</desc>
<narr>narr of The doc of the dummies 302</narr>
<top>


<top>

<num> Number: 303
<title> title of Dummy doc 303

<desc> Description:
Description of dummy doc 303

<narr> Narrative:
Narrative of the dummies doc 303

</top>
18 changes: 18 additions & 0 deletions capreolus/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from capreolus import constants
from capreolus.utils.trec import load_trec_topics

# Looked up from capreolus' shared `constants` under the "PACKAGE_PATH" key.
# NOTE(review): it is joined with "/" in the test below, so presumably a pathlib.Path — confirm.
PACKAGE_PATH = constants["PACKAGE_PATH"]


def test_trec_topic_loader():
    """Load the dummy topic file and check every parsed field.

    The fixture holds three topics written in different layouts (fully closed
    tags, an unclosed ``<num>`` tag, and the "Number:/Description:/Narrative:"
    layout); each must yield clean title, desc and narr strings.
    """
    topic_file = PACKAGE_PATH / "data" / "topics.dummy.for-style-testing.txt"
    loaded = load_trec_topics(topic_file)
    print(loaded)

    expected_titles = {
        "301": "Dummy doc",
        "302": "title of Dummy doc 302",
        "303": "title of Dummy doc 303",
    }
    expected_descs = {
        "301": "A dummy doc",
        "302": "desc of dummy doc 302",
        "303": "Description of dummy doc 303",
    }
    expected_narrs = {
        "301": "The doc of the dummies",
        "302": "narr of The doc of the dummies 302",
        "303": "Narrative of the dummies doc 303",
    }

    assert loaded["title"] == expected_titles
    assert loaded["desc"] == expected_descs
    assert loaded["narr"] == expected_narrs
43 changes: 36 additions & 7 deletions capreolus/utils/trec.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ def load_ntcir_topics(fn):
def load_trec_topics(queryfn):
title, desc, narr = defaultdict(list), defaultdict(list), defaultdict(list)

def clean_line(line, tag_name, unwanted_tokens=None):
    """Strip ``<tag_name>`` markup from a topic line and split it into tokens.

    Args:
        line: Raw line read from the topic file.
        tag_name: Tag name without angle brackets (e.g. ``"title"``). Both
            ``<tag_name>`` and ``</tag_name>`` are removed wherever they occur.
        unwanted_tokens: A single token, or any iterable of tokens, to drop
            from the result (e.g. ``"Topic:"``). ``None`` drops nothing.

    Returns:
        The remaining whitespace-separated tokens, in their original order.

    Raises:
        TypeError: If ``unwanted_tokens`` is neither ``None``, a string, nor
            an iterable of hashable tokens.
    """
    # A bare string must be treated as ONE token, not as an iterable of characters.
    if unwanted_tokens is None:
        unwanted = frozenset()
    elif isinstance(unwanted_tokens, str):
        unwanted = frozenset((unwanted_tokens,))
    else:
        # Accept any iterable (list, set, tuple, ...) rather than asserting on
        # the concrete type; asserts are stripped when Python runs with -O.
        unwanted = frozenset(unwanted_tokens)

    without_tags = line.replace(f"<{tag_name}>", "").replace(f"</{tag_name}>", "")
    # str.split() with no argument already discards leading/trailing whitespace.
    return [token for token in without_tags.split() if token not in unwanted]

block = None
if str(queryfn).endswith(".gz"):
openf = gzip.open
Expand All @@ -49,25 +60,32 @@ def load_trec_topics(queryfn):
line = line.strip()

if line.startswith("<num>"):
# <num> Number: 700
qid = line.split()[-1]
# <num> Number: 700, or
# <num>700
# <num>700</num>
qid = line.split()[-1].replace("<num>", "").replace("</num>", "")
# no longer an int
# assert qid > 0
block = None
elif line.startswith("<title>"):
# <title> query here
title[qid].extend(line.strip().split()[1:])
# <title> query here, or
# <title>query here</title>
block = "title"
line = clean_line(line, tag_name=block, unwanted_tokens="Topic:")
title[qid].extend(line)
# TODO does this sometimes start with Topic: ?
assert "Topic:" not in line
elif line.startswith("<desc>"):
# <desc> description \n description
desc[qid].extend(line.strip().split()[1:])
# <desc> description \n description, or
# <desc>description</desc>
block = "desc"
line = clean_line(line, tag_name=block, unwanted_tokens="Description:")
desc[qid].extend(line)
elif line.startswith("<narr>"):
# same format as <desc>
narr[qid].extend(line.strip().split()[1:])
block = "narr"
line = clean_line(line, tag_name=block, unwanted_tokens="Narrative:")
narr[qid].extend(line)
elif line.startswith("</top>") or line.startswith("<top>"):
block = None
elif block == "title":
Expand Down Expand Up @@ -115,6 +133,17 @@ def load_qrels(qrelfile, qids=None, include_spam=True):
return labels


def write_qrels(labels, qrelfile):
    """Write relevance labels to ``qrelfile``, one judgment per line.

    Each line has the form ``<qid> Q0 <docid> <label>``. The parent directory
    of ``qrelfile`` is created first when the path has a directory component.

    Args:
        labels: Nested mapping of qid -> docid -> label.
        qrelfile: Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(qrelfile)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)

    with open(qrelfile, "w") as out:
        out.writelines(
            f"{qid} Q0 {docid} {label}\n"
            for qid, judgments in labels.items()
            for docid, label in judgments.items()
        )


def document_to_trectxt(docno, txt):
s = f"<DOC>\n<DOCNO> {docno} </DOCNO>\n"
s += f"<TEXT>\n{txt}\n</TEXT>\n</DOC>\n"
Expand Down