In [19]:
from tqdm import tqdm
qid2line = defaultdict(list)
qrel = defaultdict(list)
with open("collections/msmarco-doc/msmarco-doctrain-qrels.tsv") as f:
    for line in f:
        topicid, _, docid, rel = line.strip().split(' ')
        assert rel == "1", line.split(' ')
        qrel[topicid].append(docid)

with open('runs/msmarco-doc/run.msmarco-doc.bm25-tuned.topics.msmarco-doc.train.txt') as f:
    for line in tqdm(f):
        topicid, _, _, _, _, _ = line.strip().split()
        if topicid in qrel:
            qid2line[topicid].append(line.strip())

with open('runs/msmarco-doc/run.train.small.tsv','w') as f:
    for topicid,lines in tqdm(qid2line.items()):
        f.write('\n'.join(lines)+'\n')

366971587it [08:13, 743924.55it/s]
100%|██████████| 367008/367008 [00:33<00:00, 10878.52it/s]


In [21]:
import csv
import random
import gzip
import os
from collections import defaultdict
from tqdm import tqdm

# The query string for each topicid is querystring[topicid]
querystring = {}
with gzip.open("collections/msmarco-doc/msmarco-doctrain-queries.tsv.gz", 'rt', encoding='utf8') as f:
    tsvreader = csv.reader(f, delimiter="\t")
    for [topicid, querystring_of_topicid] in tsvreader:
        querystring[topicid] = querystring_of_topicid

# In the corpus tsv, each docid occurs at offset docoffset[docid]
docoffset = {}
with gzip.open("collections/msmarco-doc/msmarco-docs-lookup.tsv.gz", 'rt', encoding='utf8') as f:
    tsvreader = csv.reader(f, delimiter="\t")
    for [docid, _, offset] in tsvreader:
        docoffset[docid] = int(offset)

# For each topicid, the list of positive docids is qrel[topicid]
qrel = {}
with gzip.open("collections/msmarco-doc/msmarco-doctrain-qrels.tsv.gz", 'rt', encoding='utf8') as f:
    tsvreader = csv.reader(f, delimiter=" ")
    for [topicid, _, docid, rel] in tsvreader:
        assert rel == "1"
        if topicid in qrel:
            qrel[topicid].append(docid)
        else:
            qrel[topicid] = [docid]

def generate_triples(outfile):
    """Generates triples comprising:
    - Query: The current topicid and query string
    - Pos: One of the positively-judged documents for this query
    - Rnd: Any of the top-100 documents for this query other than Pos
    Since we have the URL, title and body of each document, this gives us ten columns in total:
    topicid, query, posdocid, posurl, postitle, posbody, rnddocid, rndurl, rndtitle, rndbody
    outfile: The filename where the triples are written
    triples_to_generate: How many triples to generate
    """

    qid2pos = defaultdict(list)
    qid2neg = defaultdict(list)

    with gzip.open("collections/msmarco-doc/msmarco-doctrain-top100.gz", 'rt', encoding='utf8') as top100f,\
    open(outfile, 'w', encoding="utf8") as out:
        for line in top100f:
            [topicid, _, unjudged_docid, rank, _, _] = line.split()

            assert topicid in querystring
            assert topicid in qrel
            assert unjudged_docid in docoffset

            if unjudged_docid in qrel[topicid]:
                qid2pos[topicid].append(unjudged_docid)
            else:
                qid2neg[topicid].append(unjudged_docid)
    
        for topicid, pos_list in tqdm(qid2pos.items()):
            for negative_docid in qid2neg[topicid]:
                for positive_docid in pos_list:
                    out.write(topicid + "\t" +
                              positive_docid + "\t" +
                              negative_docid + "\n")


generate_triples("triples.tsv")

100%|██████████| 271184/271184 [00:11<00:00, 24368.48it/s]


In [2]:
import gzip
import json
import os
import spacy
from tqdm import tqdm
import re

f_corpus = gzip.open('collections/msmarco-doc/msmarco-docs.tsv.gz', mode='rt')
total_doc = 0
f_jsonl = open(f'collections/msmarco-doc/collection_jsonl/msmarco-doc_0.jsonl','w')
for line in tqdm(f_corpus):
    if total_doc % 100000 == 0 and total_doc != 0:
        f_jsonl.close()
        f_jsonl = open(f'collections/msmarco-doc/collection_jsonl/msmarco-doc_{total_doc//100000}.jsonl','w')
    f_jsonl.write(line)
    total_doc += 1
f_corpus.close()
f_jsonl.close()

3213835it [05:28, 9784.11it/s] 


In [1]:
import gzip
import json
import os
import spacy
from tqdm import tqdm
import re
from joblib import Parallel, delayed
from glob import glob

def process(corpus_fn):
    max_length = 10
    stride = 5
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    def create_segments(doc_text, nlp, max_length, stride):
        doc_text = doc_text.strip()
    #     if len(doc_text) > 100000:
    #         print(len(doc_text))
        doc = nlp(doc_text[:10000])
        sentences = [sent.string.strip() for sent in doc.sents]
        segments = []

        for i in range(0, len(sentences), stride):
            segment = " ".join(sentences[i:i+max_length])
            segments.append(segment)
            if i + max_length >= len(sentences):
                break
        return segments
    
    total_seg = 0
    new_name = corpus_fn.replace('msmarco-doc_','msmarco-seg_')
    assert new_name != corpus_fn
    with open(corpus_fn) as f_corpus, open(new_name,'w') as f_jsonl:
        for line in tqdm(f_corpus):
            f_doc_id, doc_url, doc_title, doc_text = line.split('\t')
            segments = create_segments(doc_text, nlp, max_length, stride)
            for seg_id, segment in enumerate(segments):
                doc_seg = f'{f_doc_id}#{seg_id}'
                expanded_text = f'{doc_url} {doc_title} {segment}'
                output_dict = {'id': doc_seg, 'contents': expanded_text}
                f_jsonl.write(json.dumps(output_dict) + '\n')
                total_seg += 1
    
Parallel(n_jobs=20)([delayed(process)(fp) 
                    for fp in glob('collections/msmarco-doc/collection_jsonl/msmarco-doc_*.jsonl')])

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]