Ранжирование, основанное на BM25F с двумя зонами - заголовок и основной текст

In [1]:
from multiprocessing import Pool
import numpy as np
import os
from collections import defaultdict

### Parsing

In [2]:
from pymystem3 import Mystem
import Stemmer
# Shared Mystem instance; Steming() calls its lemmatize() on the input text.
stemmer_ru = Mystem()
# Shared stemmer built for the 'english' algorithm; Steming() calls its
# stemWords() on the space-separated tokens.
stemmer_en = Stemmer.Stemmer('english')
import re


def Steming(text):
    """Lemmatize and stem ``text`` and return it as a space-joined string.

    The text is passed through ``stemmer_ru.lemmatize`` and the returned
    pieces are concatenated; every run of non-word characters is then
    collapsed into a single space, the result is split on spaces and the
    tokens are passed through ``stemmer_en.stemWords``.

    Args:
        text: the string to process.

    Returns:
        The tokens returned by ``stemWords`` joined with single spaces.
        Empty tokens are not filtered here; callers drop them.
    """
    lemmatized = ''.join(stemmer_ru.lemmatize(text))
    # Raw string: "\W" inside a plain literal is an invalid escape sequence.
    lemmatized = re.sub(r"\W+", ' ', lemmatized)
    return " ".join(stemmer_en.stemWords(lemmatized.split(" ")))

def Normalize_text(text):
    """Prepare raw text for tokenization.

    Steps, in order:
      1. join words hyphenated across a line break (``"-\\n"`` is removed);
      2. replace every run of non-word characters with one space;
      3. surround every run of digits with spaces so numbers become
         separate tokens.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.  It may contain doubled, leading or trailing
        spaces; callers are expected to drop empty tokens after splitting.
    """
    text = text.replace("-\n", "")
    # Raw strings: "\W" and "\d" in plain literals are invalid escape
    # sequences.
    text = re.sub(r"\W+", ' ', text)
    # The capturing group makes re.split keep the digit runs in the result.
    return " ".join(re.split(r"(\d+)", text))

def Normalize_word(word):
    """Return ``word`` with 'ё' replaced by 'е', underscores removed and
    surrounding whitespace stripped."""
    return word.replace(u'ё', u'е').replace('_', '').strip()

In [3]:
from bs4 import BeautifulSoup

def is_visible(element):
    """Tell whether a BeautifulSoup text node belongs to the visible page text.

    A node is treated as invisible when its parent is the document root,
    ``<head>``, ``<title>``, ``<style>`` or ``<script>``, or when the node
    itself is an HTML comment.

    Args:
        element: a text node as returned by ``findAll(text=True)``.

    Returns:
        ``True`` for visible text, ``False`` otherwise.
    """
    # Imported locally so the function does not rely on an extra
    # module-level import.
    from bs4 import Comment

    if element.parent.name in ('[document]', 'head', 'title', 'style', 'script'):
        return False
    # bs4 hands comments over as Comment nodes whose text has no "<!--"
    # markers, and str() of the encoded bytes is the "b'...'" repr, so a
    # regex anchored at "<!" can never match; check the node type instead.
    return not isinstance(element, Comment)
    
def Extract(text):
    """Turn raw text into a list of normalized, stemmed, non-empty words.

    The text goes through Normalize_text and Steming, is split on single
    spaces, and every token is cleaned with Normalize_word; tokens that end
    up empty are dropped.
    """
    stemmed = Steming(Normalize_text(text))
    cleaned = (Normalize_word(token) for token in stemmed.split(" "))
    return [token for token in cleaned if token != '']

def CleanFiles(urls, inputs, output, files):
    """Convert raw downloaded pages into stemmed title/body text files.

    Every input file is read as: page URL on the first line, HTML on the
    remaining lines.  The HTML is lower-cased, the ``<title>`` content and
    the visible text are extracted and run through Extract, and the result
    is written to ``<output>/<urls[url]>``: processed title on the first
    line, processed visible text after it.

    Args:
        urls: mapping from page URL to the id used as the output file name.
        inputs: directory containing the raw files.
        output: directory that receives the cleaned files.
        files: file names, relative to ``inputs``, to process.

    Documents whose output file already exists are skipped.

    Raises:
        KeyError: if a page URL is missing from ``urls``.
        ValueError: if the processed text does not contain the title/body
            marker exactly once.
    """
    title_pattern = re.compile(r'<title>(.*?)</title>', re.DOTALL)
    for name in files:
        with open(os.path.join(inputs, name), "r", encoding="utf-8", errors='replace') as source:
            url = source.readline().rstrip("\n")
            text = source.read().lower()
        # Use a fresh variable for the target path: overwriting ``output``
        # would make every later file nest under the previous one.
        target = os.path.join(output, str(urls[url]))
        if os.path.exists(target):
            # Already cleaned on an earlier run; keep going with the rest.
            continue
        title_match = title_pattern.search(text)
        title = title_match.group(1) if title_match is not None else ''
        # Pad tags with spaces so words from adjacent elements do not fuse.
        text = text.replace("<", " <").replace(">", "> ")
        text = " ".join(filter(is_visible, BeautifulSoup(text).findAll(text=True)))
        # The page text was lower-cased above, so the upper-case marker
        # cannot collide with a word from the document.
        text = " ".join(Extract(title + " " + "SPLITTER" + " " + text))
        title, text = text.split("SPLITTER")
        # Open the output only once the content is ready, so a failure above
        # does not leave an empty file that later runs would skip.
        with open(target, 'w', encoding='utf-8') as cleaned:
            cleaned.write(title + "\n")
            cleaned.write(text)

### TFs

In [4]:
class Lemmas:
    """Corpus-level counters for one word."""

    def __init__(self):
        # Total number of occurrences accumulated for the word.
        self.counter = 0
        # Number of documents the word was seen in.
        self.docs = 0

def GenLemmStat(inputs, output, filenames):
    """Write per-document word position statistics.

    Each input file ``<inputs>/<name>`` holds the title on its first line
    and the body text after it, both as space-separated words.  For every
    document a file ``<output>/<name>`` is written with one line per
    distinct lower-cased word::

        <word>\\t<number of occurrences>\\t<space-separated positions>

    followed by a last line (without trailing newline) holding the total
    number of recorded occurrences.  Title words are recorded at position 0;
    body words are numbered 1, 2, 3, ... with empty tokens ignored.

    Args:
        inputs: directory with the cleaned documents.
        output: directory that receives the statistics files.
        filenames: names of the documents to process.

    Documents whose output file already exists are skipped.
    """
    for name in filenames:
        input_path = inputs + "/" + name
        output_path = output + "/" + name
        if os.path.exists(output_path):
            # Already generated on an earlier run; keep going with the rest.
            continue
        with open(input_path, 'r', encoding='utf-8') as input_file:
            title = input_file.readline().rstrip("\n")
            inner_text = input_file.read()

        words_dict = defaultdict(list)
        for word in title.split(' '):
            if word != '':
                words_dict[word.lower()].append(0)
        # Number body words from 1 so that position 0 always means "title",
        # however many empty tokens the split produces.
        body_words = (word for word in inner_text.split(' ') if word != '')
        for position, word in enumerate(body_words, start=1):
            words_dict[word.lower()].append(position)

        length = 0
        records = []
        for word, positions in words_dict.items():
            length += len(positions)
            records.append(
                word + "\t" + str(len(positions)) + "\t"
                + " ".join(str(position) for position in positions) + "\n"
            )
        with open(output_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(records)
            out_file.write(str(length))
        
def MakeStat(input_dir, out_filename, filenames):
    """Aggregate per-document statistics into corpus-wide word statistics.

    Every file ``<input_dir>/<name>`` is read as tab-separated records
    ``<word>\\t<count>\\t...`` followed by one final line that is ignored
    here.  The output file gets one line per word::

        <word>\\t<documents containing it>\\t<total occurrences>

    followed by a last line (without trailing newline) holding the sum of
    all occurrences.

    Args:
        input_dir: directory with the per-document statistics files.
        out_filename: path of the corpus statistics file to write.
        filenames: names of the per-document files to aggregate.
    """
    corpus_dict = defaultdict(Lemmas)
    for name in filenames:
        # Context manager: the handle is released even if a record below
        # turns out to be malformed.
        with open(input_dir + "/" + name, 'r', encoding='utf-8') as tf_file:
            # The final line is the document total, not a word record.
            records = tf_file.read().splitlines()[:-1]
        for record in records:
            fields = record.split('\t')
            stats = corpus_dict[fields[0]]
            stats.counter += int(fields[1])
            stats.docs += 1

    corpus_size = 0
    with open(out_filename, 'w', encoding='utf-8') as out_file:
        for word, stats in corpus_dict.items():
            out_file.write(word + "\t" + str(stats.docs) + "\t" + str(stats.counter) + "\n")
            corpus_size += stats.counter
        out_file.write(str(corpus_size))

### Preprocess .dats and count tfs

In [5]:
# Default number of chunks produced by GetSplit and of processes in StartPool.
WORKERS = 4
# Upper bound of the document ids: the pipeline iterates range(1, URLS + 1).
URLS = 38114

def GetSplit(filenames, workers=WORKERS):
    """Split ``filenames`` into ``workers`` contiguous, near-equal chunks.

    Args:
        filenames: sliceable sequence to partition.
        workers: number of chunks to produce.

    Returns:
        A list of ``workers`` slices of ``filenames`` that together cover the
        whole sequence in order; some may be empty when there are fewer
        items than workers.
    """
    # Builtin ``int``: the ``np.int`` alias was removed in NumPy 1.24.
    splits = np.linspace(0, len(filenames), workers+1, dtype=int)
    return [filenames[splits[i]:splits[i+1]] for i in range(workers)]

def StartPool(func, args, workers=WORKERS):
    """Run ``func(*item)`` for every item of ``args`` in a process pool.

    Args:
        func: callable executed in the worker processes.
        args: iterable of argument tuples, one per call.
        workers: number of worker processes.

    Blocks until all calls have finished.  Results are discarded; an
    exception raised by a call propagates to the caller.
    """
    proc_pool = Pool(workers)
    try:
        proc_pool.starmap(func, args)
    finally:
        # Release the worker processes even when a task raised, and wait for
        # them to exit so none are left behind.
        proc_pool.close()
        proc_pool.join()

In [6]:
# Build the url -> id mapping from "<id>\t<url>" lines.  The context manager
# closes the file even if a line cannot be parsed.
with open("data/urls.numerate.txt", 'r', encoding="utf8") as f:
    urls = [line.split('\t') for line in f.read().splitlines()]
urls = {fields[1]: fields[0] for fields in urls}

# Collect every raw document as "<date dir>/<file name>", in sorted order.
dates = sorted(os.listdir('./data/content'))
print(dates)
filenames = []
for date in dates:
    for doc in sorted(os.listdir('./data/content/' + date)):
        filenames.append(date + '/' + doc)

# Step 1: clean the raw pages in parallel, one chunk of files per worker.
splits = GetSplit(filenames)
StartPool(CleanFiles, [(urls, 'data/content/', 'data/clean_content', splits[i]) for i in range(WORKERS)])

# Step 2: per-document word statistics; cleaned files are named by id 1..URLS.
splits = GetSplit([str(i) for i in range(1, URLS+1)])
StartPool(GenLemmStat, [('data/clean_content', 'data/tf_content', splits[i]) for i in range(WORKERS)])

# Step 3: aggregate the per-document files into corpus-wide statistics.
MakeStat('data/tf_content', 'data/statistics.txt', [str(i) for i in range(1, URLS+1)])

['20170702', '20170704', '20170707', '20170708', '20170710', '20170711', '20170717', '20170726']


### Scoring

In [7]:
# NOTE(review): presumably the average document length of the corpus; the
# visible scoring code never reads it - confirm whether Score should use it.
AVG_DOC_LEN = 13196

In [8]:
class Query:
    """A query: its words and the ids of the documents to rank for it."""

    def __init__(self):
        # Starts as an empty string; ProcessQueries replaces it with the
        # list of cleaned query words.
        self.words = ''
        # Document ids, appended in the order they are read.
        self.docs = []

def GetIDF(words, words_stat, total_len):
    """Return one weight per query word, in the order of ``words``.

    The weight is ``log(total_len / counter)`` where ``counter`` is read
    from ``words_stat[word]``; words whose counter is 0 get ``0.0``.
    """
    def weight(term):
        occurrences = words_stat[term].counter
        return np.log(total_len / occurrences) if occurrences != 0 else 0.0

    return [weight(term) for term in words]

def GetTF(ids):
    """Load the word positions of one document.

    Reads ``data/tf_content/<ids>``, whose lines are
    ``<word>\\t<count>\\t<space-separated positions>`` followed by one final
    line holding an integer.

    Args:
        ids: document id; it is converted with ``str`` to form the file name.

    Returns:
        A tuple ``(positions, total)`` where ``positions`` is a
        ``defaultdict(list)`` mapping each word to its list of integer
        positions and ``total`` is the integer from the last line.
    """
    # Context manager: the handle is released even if reading fails.
    with open('data/tf_content/'+str(ids), 'r', encoding='utf-8') as doc_file:
        lines = doc_file.read().splitlines()
    document_dict = defaultdict(list)
    for line in lines[:-1]:
        line_parts = line.split('\t')
        document_dict[line_parts[0]] = [int(i) for i in line_parts[2].split(" ")]
    return document_dict, int(lines[-1])

def Score(words, words_idf, document_dict, document_len):
    """Score one document against the query words.

    For every word, with ``f`` the number of recorded positions, the
    contribution is ``idf * (f / (f + 1 + 0.001 * document_len) + bonus)``
    where ``bonus`` is 1 if position 0 is among the word's positions and 0
    otherwise.  The contributions are summed in query order.
    """
    total = 0
    for index, term in enumerate(words):
        positions = document_dict[term]
        frequency = len(positions)
        saturation = frequency / (frequency + 1 + 0.001 * document_len)
        title_bonus = int(0 in positions)
        total += words_idf[index] * (saturation + title_bonus)
    return total

def Scoring(query, words_stat, total_len):
    """Return the score of every document in ``query.docs``, in that order.

    The statistics of all documents are loaded with GetTF first, then the
    query word weights are computed once with GetIDF and each document is
    scored with Score.
    """
    loaded = [GetTF(doc_id) for doc_id in query.docs]
    idf_weights = GetIDF(query.words, words_stat, total_len)
    return [
        Score(query.words, idf_weights, positions, doc_len)
        for positions, doc_len in loaded
    ]

### Process queries

In [9]:
def CleanWords(text):
    """Turn query text into a list of cleaned, lower-cased, non-empty words.

    The text goes through Normalize_text and Steming, is split on single
    spaces, and every token is passed through Normalize_word and
    lower-cased; tokens that end up empty are dropped.
    """
    stemmed = Steming(Normalize_text(text))
    cleaned = (Normalize_word(token).lower() for token in stemmed.split(" "))
    return [token for token in cleaned if token != '']

def ProcessQueries():
    """Load the queries and the documents to rank for each of them.

    ``data/queries.numerate.txt`` is read as ``<query id>\\t<query text>``
    lines; the text is cleaned with CleanWords.  The sample submission file
    is read as ``<query id>,<document id>`` lines after a header line that
    is skipped.

    Returns:
        A ``defaultdict(Query)`` keyed by integer query id, with ``words``
        set to the cleaned query words and ``docs`` holding the integer
        document ids in file order.
    """
    queries = defaultdict(Query)
    # Context managers: both handles are released even if a line fails to
    # parse.
    with open("data/queries.numerate.txt", 'r', encoding='utf-8') as f:
        for query in f.read().splitlines():
            fields = query.split('\t')
            queries[int(fields[0])].words = CleanWords(fields[1])
    with open('data/sample.technosphere.ir1.textrelevance.submission.txt', 'r', encoding='utf-8') as f:
        # The first line is the CSV header.
        for line in f.read().splitlines()[1:]:
            fields = line.split(",")
            queries[int(fields[0])].docs.append(int(fields[1]))
    return queries

In [10]:
# Load the corpus statistics: "<word>\t<docs>\t<counter>" lines followed by
# one final line holding the corpus size.
words_stat = defaultdict(Lemmas)
with open('data/statistics.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()
for line in lines[:-1]:
    line = line.split('\t')
    words_stat[line[0]].docs, words_stat[line[0]].counter = int(line[1]), int(line[2])
# Parse the corpus size once instead of once per query.
total_len = int(lines[-1])

queries_dict = ProcessQueries()

# Context manager: the submission is flushed and closed even if scoring a
# query raises.
with open('./submission.txt', 'w') as submission:
    submission.write("QueryId,DocumentId\n")
    for query_id, query in queries_dict.items():
        scores = Scoring(query, words_stat, total_len)
        # Highest score first.
        for idx in np.argsort(scores)[::-1]:
            submission.write(str(query_id) + "," + str(query.docs[idx]) + "\n")