In [1]:
import numpy as np
import pandas as pd
import re
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
import _pickle as pk
from time import time

import os
from os.path import join

def read_query(filepath):
    """Parse a query XML file into a list of topics.

    Every direct ``topic`` child of the document root becomes one entry: a
    list with the whitespace-stripped text of each of that topic's direct
    child elements, in document order.

    Args:
        filepath: Path or file object accepted by ``ET.parse``.

    Returns:
        list[list[str]]: One list of field strings per topic.
    """
    tree = ET.parse(filepath)
    return [
        # An element without text content has ``text is None``; treat it as ''
        # so a single empty field does not abort the whole parse.
        [(field.text or '').strip() for field in topic.findall('*')]
        for topic in tree.findall('topic')
    ]

def query_processing(questions):
    """Turn parsed topics into character unigram / bigram term lists.

    For each topic the last field is taken, its final character is dropped,
    and the remainder is split on the enumeration comma '、'. The distinct
    single characters and the distinct adjacent character pairs of all the
    resulting phrases are collected.

    Args:
        questions: Sequence of topics, each a sequence of strings (as
            produced by ``read_query``).

    Returns:
        list[tuple[list[str], list[str]]]: ``(unigrams, bigrams)`` per topic;
        the order inside each list is the iteration order of a set.
    """
    phrase_lists = [fields[-1][:-1].split('、') for fields in questions]

    processed = []
    for phrases in phrase_lists:
        unigrams, bigrams = set(), set()
        for phrase in phrases:
            unigrams.update(set(phrase))
            # Adjacent character pairs: phrase[i] + phrase[i+1].
            bigrams.update({left + right for left, right in zip(phrase, phrase[1:])})
        processed.append((list(unigrams), list(bigrams)))
    return processed

def open_file(path):
    """Read one document XML file and return its id and body text.

    Args:
        path: Path to the document. Surrounding whitespace (for example a
            trailing newline left over from a file list) is stripped before
            the file is opened.

    Returns:
        tuple[str, str]: The text of the first ``id`` element found anywhere
        in the document, and the concatenation of the stripped text of every
        ``p`` element in document order.
    """
    tree = ET.parse(path.strip())
    id_ = tree.find('.//id').text
    # Empty ``<p/>`` elements have ``text is None``; they contribute nothing
    # instead of raising AttributeError.
    text = ''.join((para.text or '').strip() for para in tree.findall('.//p'))
    return id_, text
    
def BM25_score(d, doc_id, query, k1=1.2, b=0.75):
    """Score one document against a query with the BM25 formula.

    Args:
        d: Index object exposing ``docs`` (per-document term counts),
            ``docs_length``, ``df``, ``num_docs`` and ``avg_length``.
        doc_id: Position of the document in ``d.docs`` / ``d.docs_length``.
        query: Iterable of terms; every occurrence adds its own contribution.
        k1: Term-frequency saturation parameter.
        b: Strength of the document-length normalisation.

    Returns:
        The sum of the per-term contributions (``0`` for an empty query).
    """
    total = 0
    for token in query:
        freq = d.docs[doc_id][token]
        length = d.docs_length[doc_id]
        doc_freq = d.df[token]

        # The "+ 1" keeps the log argument above 1 (idf > 0) whenever
        # doc_freq <= num_docs.
        idf = np.log((d.num_docs - doc_freq + 0.5)/(doc_freq + 0.5) + 1)

        numerator = idf * (freq*(k1 + 1))
        denominator = freq + k1*(1 - b + b*(length / d.avg_length))
        total += numerator/denominator
    return total

def get_result_list(q_id, filelist, res):
    """Build one output row: the query id plus its ranked document ids.

    Args:
        q_id: Query identifier placed first in the row.
        filelist: Sequence of document paths, indexed by document number.
        res: Document numbers in ranked order.

    Returns:
        list: ``[q_id, ids]`` where ``ids`` is the lower-cased id of each
        document (as read by ``open_file``), joined by single spaces.
    """
    doc_ids = []
    for doc_no in res:
        doc_id, _text = open_file(filelist[doc_no])
        doc_ids.append(doc_id.lower())
    return [q_id, ' '.join(doc_ids)]

In [2]:
class Dataset:
    """In-memory retrieval index built from a model directory.

    The index is populated by ``parsing`` from the ``file-list``,
    ``vocab.all`` and ``inverted-file`` files and can be persisted with
    ``save`` / restored with ``load``.
    """

    def __init__(self):
        self.filelist = None  # document paths, indexed by doc_id
        self.vocab = None  # vocabulary entries, indexed by vocab id

        self.inverted_file = defaultdict(list) # {'a': [(doc_id, cnt), (doc_id, cnt)], 'b': []}
        self.words = Counter() # {'a': 100, 'b': 20}
        self.docs_length = list() # [10, 20, 30]
        self.avg_length = 0
        self.docs = list() # [{'a': 10, 'b': 20}, {'a': 5, 'b': 10}, {'c': 30}]
        self.num_docs = 0 # the number of documents
        self.df = Counter() # A word 'a' appears in how many documents.

        self.start_time = 0

    def parsing(self, path):
        """Build the index from the model files under ``path``.

        Reads ``model/file-list`` (one document path per line, resolved under
        ``path/CIRB010``), ``model/vocab.all`` (whitespace-separated entries
        addressed by position) and ``model/inverted-file``.

        Each inverted-file record is a header line
        ``<vocab_id_1> <vocab_id_2> <n>`` followed by ``n`` posting lines
        ``<doc_id> <count>``. A negative ``vocab_id_2`` marks a unigram;
        otherwise the term is the concatenation of both vocabulary entries.
        Records whose first vocabulary entry does not start with a word
        character are skipped. Only unigram counts add to document length.

        Args:
            path: Root directory containing ``model/`` and ``CIRB010/``.
        """
        with open(join(path, 'model/file-list'), 'r') as f:
            self.filelist = [join(path, 'CIRB010', s.strip()) for s in f.readlines()]

        # Reset every accumulator so that calling parsing() again on the same
        # instance rebuilds the index instead of double-counting.
        self.num_docs = len(self.filelist)
        self.docs_length = [0]*self.num_docs
        self.docs = [Counter() for _ in range(self.num_docs)]
        self.inverted_file = defaultdict(list)
        self.words = Counter()
        self.df = Counter()

        with open(join(path, 'model/vocab.all'), 'r') as f:
            self.vocab = f.read().split()

        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        word_start = re.compile(r'\w')

        with open(join(path, 'model/inverted-file'), 'r') as f:
            self.start_time = time()
            lines = f.readlines()

        i, end = 0, len(lines)
        while i < end:
            if i % 10 == 0:
                print('Processing .... %06.2f%%, total time: %06.2f sec.' % (100*(i+1)/end, time() - self.start_time), end='\r')
            st, nd, cnt = [int(x) for x in lines[i].split()]

            if word_start.match(self.vocab[st]):
                is_unigram = nd < 0
                w = self.vocab[st] if is_unigram else self.vocab[st] + self.vocab[nd]
                # cnt is the number of posting lines, i.e. documents with w.
                self.df[w] += cnt

                for line in lines[i+1:i+cnt+1]:
                    doc_id, w_cnt = (int(x) for x in line.split())
                    self.words[w] += w_cnt
                    self.inverted_file[w].append((doc_id, w_cnt))
                    if is_unigram:
                        self.docs_length[doc_id] += w_cnt
                    self.docs[doc_id][w] += w_cnt
            # Jump over the header and its posting lines, skipped or not.
            i += cnt + 1

        # Use this instance's lengths (not a module-level variable); an empty
        # corpus gets 0.0 instead of the nan np.mean would produce.
        self.avg_length = np.mean(self.docs_length) if self.docs_length else 0.0

    def save(self, name):
        """Pickle every instance attribute to the file ``name``."""
        with open(name, 'wb') as pf:
            pk.dump(self.__dict__, pf)

    def load(self, name):
        """Replace this instance's attributes with those pickled in ``name``.

        SECURITY: unpickling can execute arbitrary code; only load files that
        were written by ``save`` from a trusted source.
        """
        with open(name, 'rb') as pf:
            self.__dict__ = pk.load(pf)

In [3]:
if __name__ == '__main__':
    # Driver: build (or load) the index, rank documents for every train and
    # test query with BM25, and write the top results to out.csv.
    data_path = '/tmp2/r09922104/ir'
    train_path = join(data_path, 'queries/query-train.xml')
    test_path = join(data_path, 'queries/query-test.xml')

    # NOTE: despite its name, ``save = True`` LOADS the previously pickled
    # index; ``False`` re-parses the corpus and overwrites the pickle.
    save = False

    # Keep the module-level name ``d``; code outside this cell may refer to it.
    d = Dataset()
    if save:
        d.load('/tmp2/r09922104/data/data.pkl')
    else:
        d.parsing(data_path)
        d.save('/tmp2/r09922104/data/data.pkl')

    questions = read_query(train_path) + read_query(test_path)
    queries = query_processing(questions)

    result = []
    start = time()
    print()

    # A real index name is used instead of ``_``: assigning to ``_`` in
    # IPython hides the automatic "last output" variable.
    for q_idx, (uni, bi) in enumerate(queries):
        print('Processing %02d / %02d, total time: %06.2f sec.' % (q_idx+1, len(queries), time() - start), end='\r')

        # Candidate documents: any document containing at least one of the
        # query's bigrams.
        candidate = set()
        for word in bi:
            # .get() avoids the defaultdict inserting an empty posting list
            # for every bigram that is not in the index.
            candidate |= {doc_id for doc_id, _cnt in d.inverted_file.get(word, ())}
        candidate = list(candidate)

        # Build the term list once per query rather than once per candidate.
        terms = uni + bi
        scores = [BM25_score(d, doc_id, terms) for doc_id in candidate]
        rank = np.argsort(scores)
        # argsort is ascending: take the last 100 and reverse for best-first.
        res = [candidate[i] for i in rank[-100:][::-1]]
        result.append(get_result_list('%03d' % (q_idx+1), d.filelist, res))

    pd.DataFrame(result).to_csv('out.csv', header=['query_id','retrieved_docs'], index=False)

Processing .... 100.00%, total time: 244.66 sec.
Processing 30 / 30, total time: 197.26 sec.