Необходимо положить `data.tar.gz` в директорию `data`, а потом подменить `eval.py` на исправленный вариант.

In [None]:
!tar -xvzf data/data.tar.gz

In [1]:
!head -30 data/cran.all.1400

.I 1
.T
experimental investigation of the aerodynamics of a
wing in a slipstream .
.A
brenckman,m.
.B
j. ae. scs. 25, 1958, 324.
.W
experimental investigation of the aerodynamics of a
wing in a slipstream .
  an experimental study of a wing in a propeller slipstream was
made in order to determine the spanwise distribution of the lift
increase due to slipstream at different angles of attack of the wing
and at different free stream to slipstream velocity ratios .  the
results were intended in part as an evaluation basis for different
theoretical treatments of this problem .
  the comparative span loading curves, together with
supporting evidence, showed that a substantial part of the lift increment
produced by the slipstream was due to a /destalling/ or
boundary-layer-control effect .  the integrated remaining lift
increment, after subtracting this destalling lift, was found to agree
well with a potential flow theory .
  an empirical evaluation of the destalling ef

In [2]:
import os
import sys
import abc
import math
import functools
import string
from itertools import chain
from operator import itemgetter

from typing import (
    Optional,
    Generator,
    List
)

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import numpy as np
import pandas as pd
from pandas import DataFrame

from tqdm import tqdm


# English stop-word set from NLTK; tokens found in it are dropped during
# token filtering.
stop_words = set(stopwords.words('english'))
# Shared normalisers: each kept token is lemmatized first, then stemmed.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# All data files live under BASE_DIR.
BASE_DIR = "data"
# Document collection and query file, both in the ".I/.T/.A/.B/.W" marker format.
TEXTS_FILE = os.path.join(BASE_DIR, "cran.all.1400")
QUERIES_FILE = os.path.join(BASE_DIR, "cran.qry")
# NOTE(review): not referenced in the visible cells; presumably read by
# eval.py as the ground truth -- verify.
CORRECT_ANSWERS_FILE = os.path.join(BASE_DIR, "test.qrel_clean")
# The ranking cell writes its "<query> <document>" predictions to this file.
PREDICTION_FILE = os.path.join(BASE_DIR, "train.qrel_clean")
# NOTE(review): not referenced in the visible cells; the 1400 presumably
# matches the number of documents in TEXTS_FILE -- verify.
NUMBER_TEXTS = 1400

In [3]:
class Text:
    """One document record, with one string slot per field marker.

    Slots (each holds ``None`` until a parser fills it in):
        i: raw text collected under the ``.I`` marker.
        t: raw text collected under the ``.T`` marker.
        a: raw text collected under the ``.A`` marker.
        b: raw text collected under the ``.B`` marker.
        w: raw text collected under the ``.W`` marker.
    """

    __slots__ = ["i", "t", "a", "b", "w"]

    def __init__(self):
        # Start with every field unset; values are assigned later via setattr.
        for field_name in ("i", "t", "a", "b", "w"):
            setattr(self, field_name, None)

        
class Query:
    """One query record, with one string slot per field marker.

    Slots (each holds ``None`` until a parser fills it in):
        i: raw text collected under the ``.I`` marker.
        w: raw text collected under the ``.W`` marker.
    """

    __slots__ = ["i", "w"]

    def __init__(self):
        # Both fields start unset; values are assigned later via setattr.
        self.i, self.w = None, None


def _read_file(filepath):
    # type: (str) -> str
    
    file = open(filepath)
    yield from file
        

def _parse(gen, cls, line_starts):
    # type: (Generator[str]) -> Text
    
    def set_current_state(line):
        nonlocal current_state
        
        for i, (s, _) in enumerate(line_starts):
            if line.startswith(s):
                current_state = i
    
    def yield_text():
        t = cls()
        for i, (_, s) in enumerate(line_starts):
            setattr(t, s, ''.join(text_lists[i]))
        
        return t
    
    text_lists = [[] for _ in range(len(line_starts))]
    current_state = -1

    for line in chain(gen, ['.I']):
        set_current_state(line)
        
        if current_state == 0:
            if any(text_lists):
                yield yield_text()
            text_lists = [[] for _ in range(len(line_starts))]
        
        text_lists[current_state].append(line)


def get_texts_gen(filepath):
    # type: (str) -> Generator[Text, None, None]
    """Return a lazy stream of ``Text`` records parsed from ``filepath``.

    The file is read line by line and split on the ``.I``, ``.T``, ``.A``,
    ``.B`` and ``.W`` markers, with ``.I`` starting each new record.

    :param filepath: path of the document collection file.
    :return: generator of ``Text`` objects.
    """
    field_markers = [('.I', 'i'), ('.T', 't'), ('.A', 'a'), ('.B', 'b'), ('.W', 'w')]
    return _parse(_read_file(filepath), Text, field_markers)


def get_queries_gen(filepath):
    # type: (str) -> Generator[Query, None, None]
    """Return a lazy stream of ``Query`` records parsed from ``filepath``.

    The file is read line by line and split on the ``.I`` and ``.W``
    markers, with ``.I`` starting each new record.

    :param filepath: path of the query file.
    :return: generator of ``Query`` objects.
    """
    field_markers = [('.I', 'i'), ('.W', 'w')]
    return _parse(_read_file(filepath), Query, field_markers)

In [4]:
class ItemTokens:
    """Normalised tokens of one field of a record.

    Attributes:
        i: the ``i`` attribute copied from the source record.
        tokens: list of normalised tokens, in source order.

    The tokens are materialised once at construction, so an instance can be
    iterated any number of times (a bare generator would be silently empty
    on the second pass).
    """

    __slots__ = ["i", "tokens"]

    # Single-letter field markers that survive punctuation stripping
    # (".I" becomes " I") and must not be indexed as words.
    _MARKER_TOKENS = frozenset(["I", "T", "A", "B", "W"])
    # One-pass translation table: every ASCII punctuation character -> space.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))

    def __init__(self, text, attr):
        """Tokenise ``getattr(text, attr)``.

        :param text: record object exposing ``i`` and the field ``attr``.
        :param attr: name of the field to tokenise.
        :raises AttributeError: when ``text`` has no attribute ``attr``
            (raised by ``getattr``; unlike an ``assert`` this check also
            survives ``python -O``).
        """
        self.i = text.i
        self.tokens = list(self._filter(self._clean(getattr(text, attr))))

    def _clean(self, s):
        """Return ``s`` with every ASCII punctuation character replaced by a space."""
        return s.translate(self._PUNCT_TO_SPACE)

    def _filter(self, s):
        """Yield the lemmatized-then-stemmed tokens of ``s``.

        Dropped tokens: English stop words, the single-letter field markers,
        tokens that are substrings of ``string.punctuation``, and tokens made
        only of digits.
        """
        return (
            stemmer.stem(lemmatizer.lemmatize(t))
            for t in word_tokenize(s)
            if t not in stop_words
            and t not in self._MARKER_TOKENS
            and t not in string.punctuation
            and not t.isdigit()
        )

    def __iter__(self):
        # A fresh iterator per call keeps the object re-iterable.
        return iter(self.tokens)

    
def get_item_tokens_gen(texts, attr):
    # type: (Iterable, str) -> Generator[ItemTokens, None, None]
    """Lazily wrap the ``attr`` field of each record in an ``ItemTokens``.

    :param texts: iterable of record objects (documents or queries).
    :param attr: name of the record field to tokenise.
    :return: generator producing one ``ItemTokens`` per record, in order.
    """
    return (ItemTokens(record, attr) for record in texts)

In [5]:
class InvIndex:
    """Inverted index over tokenised documents, stored as a pandas DataFrame.

    ``df`` has a ``(doc_id, token)`` MultiIndex and a single float32 column
    ``count`` holding how many times ``token`` occurs in document ``doc_id``.
    ``doc_id`` is the 0-based position of the document in the input stream.

    Lookup results are memoised in a per-instance dictionary. Unlike
    ``functools.lru_cache`` on methods, this does not keep the instance alive
    in a class-level cache and does not evict entries while scoring walks
    over more documents than a fixed cache size.
    """

    def __init__(self, text_tokens_gen):
        """Build the index.

        :param text_tokens_gen: iterable of token iterables, one per document.
        """
        columns = ["doc_id", "token", "count"]
        index = ["doc_id", "token"]

        def get_part_df():
            # One small frame per document: a row per token occurrence,
            # collapsed into per-token counts.
            for doc_id, text_tokens in tqdm(enumerate(text_tokens_gen)):
                data = [(doc_id, token, 1) for token in text_tokens]
                df = DataFrame(data, columns=columns).groupby(by=index).sum()
                yield df

        self.df = pd.concat(get_part_df())
        self.df["count"] = self.df["count"].astype(np.float32)
        self._cache = {}

    def _memo(self, key, compute):
        """Return the cached value for ``key``, computing it on first use."""
        # setdefault keeps this working on instances created without __init__.
        cache = vars(self).setdefault("_cache", {})
        if key not in cache:  # membership test: None is a valid cached value
            cache[key] = compute()
        return cache[key]

    def get_n(self, t=None):
        """Return the summed ``count`` of token ``t`` over all documents.

        With ``t=None`` the sum runs over every token of every document.
        Returns ``0`` when ``t`` is not in the index.

        NOTE(review): these are occurrence totals, not numbers of documents;
        confirm that this is what the scoring formula is meant to use.
        """
        def compute():
            try:
                if t is None:
                    return self.df["count"].sum()
                return self.df.loc[(slice(None), t), :]["count"].sum()
            except KeyError:
                return 0

        return self._memo(("n", t), compute)

    def get_f(self, t, doc_id=None):
        """Return the index rows for token ``t``.

        :param t: token to look up.
        :param doc_id: when given, restrict the lookup to that document.
        :return: the ``(doc_id, t)`` row when ``doc_id`` is given, otherwise
            the frame of all rows whose token is ``t``; ``None`` when the
            lookup raises ``KeyError``.
        """
        def compute():
            try:
                if doc_id is not None:
                    return self.df.loc[(doc_id, t), :]
                return self.df.loc[(slice(None), t), :]
            except KeyError:
                return None

        return self._memo(("f", t, doc_id), compute)

    def get_l(self, doc_id=None):
        """Return a document length measured in indexed tokens.

        :param doc_id: document whose summed ``count`` is returned; with
            ``None`` the mean of the per-document sums is returned.
        :return: the length, or ``None`` when ``doc_id`` is not in the index.
        """
        def compute():
            try:
                if doc_id is not None:
                    return self.df.loc[doc_id, :]["count"].sum()
                # Group on the index level and sum only ``count`` so the
                # string ``token`` column is never aggregated.
                return self.df.groupby(level="doc_id")["count"].sum().mean()
            except KeyError:
                return None

        return self._memo(("l", doc_id), compute)

In [6]:
class RankedList(metaclass=abc.ABCMeta):
    """Abstract interface of a ranking function.

    The metaclass is passed with Python 3 syntax so that ``abstractmethod``
    is enforced: instantiating this class, or a subclass that does not
    override ``__call__``, raises ``TypeError``. (A ``__metaclass__`` class
    attribute is ignored by Python 3.)
    """

    @abc.abstractmethod
    def __call__(self, q, inv_index):
        """Rank the indexed documents for query ``q``.

        :param q: iterable of query tokens.
        :param inv_index: index object queried for statistics.
        :raises NotImplementedError: when a subclass delegates to this base
            implementation.
        """
        raise NotImplementedError


class RSVRankedList(RankedList):
    """Score documents with a BM25-style retrieval status value (RSV).

    For every query token ``t`` and every document ``d`` containing it::

        idf = log(1 + (N - Nt + 0.5) / (Nt + 0.5))
        tf  = ftd * (k1 + 1) / (k1 * ((1 - b) + b * Ld / L) + ftd)
        rsv[d] += idf * tf

    where ``N = inv_index.get_n()``, ``Nt = inv_index.get_n(t)``, ``ftd`` is
    the ``count`` of ``t`` in ``d``, ``Ld = inv_index.get_l(d)`` and
    ``L = inv_index.get_l()``.

    NOTE(review): ``N`` and ``Nt`` are whatever ``inv_index.get_n`` returns;
    confirm they carry the intended meaning (document counts vs. token
    occurrence totals) for the IDF term.
    """

    def __init__(self, k1, b, top_k=10):
        """
        :param k1: term-frequency saturation parameter.
        :param b: length-normalisation weight (multiplies ``Ld / L``).
        :param top_k: maximum number of results returned per query;
            defaults to 10.
        """
        self.k1 = k1
        self.b = b
        self.top_k = top_k

    def __call__(self, q, inv_index):
        """Return up to ``top_k`` ``(doc_id, score)`` pairs, best score first.

        :param q: iterable of query tokens; a repeated token contributes once
            per occurrence.
        :param inv_index: object providing ``get_n``, ``get_f`` and ``get_l``.
        :return: list of ``(doc_id, score)`` tuples sorted by descending score.
        """
        rsv = {}
        N = inv_index.get_n()
        # The average document length does not depend on the token or the
        # document, so it is looked up once per query.
        L = inv_index.get_l()

        for t in q:
            F = inv_index.get_f(t)
            if F is None:
                continue  # token absent from the index: nothing to score

            Nt = inv_index.get_n(t)
            idf = math.log(1.0 + (N - Nt + 0.5) / (Nt + 0.5))

            for index, row in F.iterrows():
                doc_id, ftd = index[0], row["count"]
                Ld = inv_index.get_l(doc_id)
                tf = ftd * (self.k1 + 1.) / (self.k1 * ((1. - self.b) + self.b * Ld / L) + ftd)
                rsv[doc_id] = rsv.get(doc_id, 0) + idf * tf

        return sorted(rsv.items(), key=itemgetter(1), reverse=True)[:self.top_k]


In [12]:
# Ranking function with its k1 / b parameters.
rsv = RSVRankedList(k1=1.2, b=0.75)

# Document side: parse -> tokenise the 'w' field -> build the inverted index.
texts_gen = get_texts_gen(TEXTS_FILE)
text_tokens_gen = get_item_tokens_gen(texts_gen, 'w')
inv_index = InvIndex(text_tokens_gen)

# Query side: the same parse/tokenise pipeline over the query file.
queries_gen = get_queries_gen(QUERIES_FILE)
query_tokens_gen = get_item_tokens_gen(queries_gen, 'w')

# One "<query number> <document number>" line per ranked hit. Both numbers
# are 1-based positions in their input files.
with open(PREDICTION_FILE, "w") as fout:
    for query_no, query in tqdm(enumerate(query_tokens_gen, start=1)):
        for doc_pos, _ in rsv(query, inv_index):
            fout.write("{} {}\n".format(query_no, doc_pos + 1))

1400it [00:12, 113.70it/s]
225it [03:28,  1.08it/s]


In [13]:
!cd data && python3 eval.py

mean precision: 0.29155555555555557
mean recall: 0.42536748678214453
mean F-measure: 0.3459736864354289
MAP@10: 0.3569053966854231


In [9]:
# Dump the (doc_id, token) -> count table of the inverted index to CSV.
inv_index.df.to_csv("data/df.csv")

In [None]:
# Show the ten (doc_id, token) rows with the highest per-document count.
inv_index.df.sort_values(by="count", ascending=False).head(10)