In [1]:
# Unpack the data archive into ./data (verbose listing is shown below).
!tar -xvzf data/data.tar.gz

data/
data/eval.py
data/cran.qry
data/qrel_clean
data/cran.all.1400


In [2]:
# Peek at the raw corpus: records start with ".I <n>" followed by the
# ".T", ".A", ".B" and ".W" section markers, each on its own line.
!head -30 data/cran.all.1400

.I 1
.T
experimental investigation of the aerodynamics of a
wing in a slipstream .
.A
brenckman,m.
.B
j. ae. scs. 25, 1958, 324.
.W
experimental investigation of the aerodynamics of a
wing in a slipstream .
  an experimental study of a wing in a propeller slipstream was
made in order to determine the spanwise distribution of the lift
increase due to slipstream at different angles of attack of the wing
and at different free stream to slipstream velocity ratios .  the
results were intended in part as an evaluation basis for different
theoretical treatments of this problem .
  the comparative span loading curves, together with
supporting evidence, showed that a substantial part of the lift increment
produced by the slipstream was due to a /destalling/ or
boundary-layer-control effect .  the integrated remaining lift
increment, after subtracting this destalling lift, was found to agree
well with a potential flow theory .
  an empirical evaluation of the destalling ef

In [3]:
import os
import sys
import gc
from itertools import (
    chain
)

from typing import (
    Optional,
    Generator,
    List
)

import spacy
from spacy.tokenizer import Tokenizer

import pandas as pd
from pandas import DataFrame

from tqdm import tqdm


# Load the spaCy pipeline registered under the name "en".
# NOTE(review): the "en" shortcut name is not available in spaCy 3+, where a
# full package name such as "en_core_web_sm" is required - confirm against the
# installed spaCy version.
nlp = spacy.load("en")
# Tokenizer built from the vocabulary only (no prefix/suffix/infix rules are
# passed here). NOTE(review): presumably this splits on whitespace only, so
# punctuation stays attached to words - confirm this is intended.
tokenizer = Tokenizer(nlp.vocab)

# Locations of the files unpacked from data/data.tar.gz.
BASE_DIR = "data"
TEXTS_FILE = os.path.join(BASE_DIR, "cran.all.1400")
QUERIES_FILE = os.path.join(BASE_DIR, "cran.qry")
# The archive ships "qrel_clean" (see the tar listing); the previous value
# "grel_clean" pointed at a file that does not exist.
CORRECT_ANSWERS_FILE = os.path.join(BASE_DIR, "qrel_clean")
# Single-argument join: resolves to "prediction" in the working directory.
PREDICTION_FILE = os.path.join("prediction")
NUMBER_TEXTS = 1400

In [4]:
class Text:
    """One corpus record, with one attribute per section marker.

    The attributes ``i``, ``t``, ``a``, ``b`` and ``w`` are named after the
    ``.I``, ``.T``, ``.A``, ``.B`` and ``.W`` markers of the corpus file
    (presumably id, title, author, bibliography and body text - verify
    against the collection's documentation). Each one is meant to hold a
    ``str`` and starts out as ``None``.
    """

    __slots__ = ["i", "t", "a", "b", "w"]

    def __init__(self):
        # Every slot starts empty; the parser fills them in afterwards.
        for field_name in Text.__slots__:
            setattr(self, field_name, None)


def _read_file(filepath):
    # type: (str) -> str
    
    file = open(filepath)
    yield from file


def _parse_text(gen):
    # type: (Generator[str]) -> Text
    
    def set_current_state(line):
        nonlocal current_state
        
        for i, (s, _) in enumerate(line_starts):
            if line.startswith(s):
                current_state = i
    
    def yield_text():
        t = Text()
        for i, (_, s) in enumerate(line_starts):
            setattr(t, s, ''.join(text_lists[i]))
        
        return t
    
    line_starts = [('.I', 'i'), ('.T', 't'), ('.A', 'a'), ('.B', 'b'), ('.W', 'w')]
    text_lists = [[] for _ in range(len(line_starts))]
    current_state = -1

    for line in chain(gen, ['.I']):
        set_current_state(line)
        
        if current_state == 0:
            if any(text_lists):
                yield yield_text()
            text_lists = [[] for _ in range(len(line_starts))]
        
        text_lists[current_state].append(line)
    
    return texts


def get_texts_gen(filename):
    # type: (str) -> Generator[Text]
    """Return the records parsed from ``filename``.

    Feeds the lines produced by ``_read_file`` straight into ``_parse_text``
    and hands back whatever the parser returns.
    """
    return _parse_text(_read_file(filename))


# Lazy pipeline: nothing is read from TEXTS_FILE until `texts` is iterated,
# and being a generator it can be consumed only once.
texts = get_texts_gen(TEXTS_FILE)

In [5]:
class TextTokens:
    """Lemma stream for one field of a text record.

    ``i`` is copied from the source record; ``tokens`` is a generator over
    the ``lemma_`` of every token the module-level ``tokenizer`` produces
    for the chosen field. Iterating the object iterates that generator, so
    the lemmas can be consumed only once.
    """

    __slots__ = ["i", "tokens"]

    def __init__(self, text, attr):
        self.i = text.i

        # The requested field must exist on the record.
        assert hasattr(text, attr)

        field_value = getattr(text, attr)
        self.tokens = self._clean(field_value)

    def _clean(self, s):
        # Tokenise right away; the lemmas themselves are produced lazily.
        doc = tokenizer(s)
        return (token.lemma_ for token in doc)

    def __iter__(self):
        return self.tokens

    
def get_text_tokens_gen(texts, attr):
    # type: (List[Text], str) -> Generator[TextTokens]
    """Lazily wrap every record of ``texts`` in a ``TextTokens`` for ``attr``.

    Records are converted one at a time as the returned generator is
    consumed.
    """
    return (TextTokens(record, attr) for record in texts)


# Token streams over the 'w' field of every record; still lazy and
# single-pass, because it is a generator built on top of `texts`.
text_tokens_gen = get_text_tokens_gen(texts, 'w')

In [6]:
class InvIndex:
    """Inverted index of token counts per document.

    ``df`` is a DataFrame indexed by ``(doc_id, token)`` with a single
    ``count`` column holding how many times ``token`` occurs in the
    document. ``doc_id`` is the zero-based position of the document in the
    input stream (from ``enumerate``), not the record's own identifier.
    """

    def __init__(self, text_tokens_gen):
        """Build the index by consuming ``text_tokens_gen`` once.

        :param text_tokens_gen: iterable of documents, each an iterable of
            tokens.
        """
        columns = ["doc_id", "token", "count"]
        index = ["doc_id", "token"]

        def get_part_df():
            # One small aggregated frame per document keeps memory bounded.
            for doc_id, text_tokens in tqdm(enumerate(text_tokens_gen)):
                data = [(doc_id, token, 1) for token in text_tokens]
                if not data:
                    # A document without tokens contributes no rows.
                    continue
                yield DataFrame(data, columns=columns).groupby(by=index).sum()

        parts = list(get_part_df())
        if parts:
            self.df = pd.concat(parts)
        else:
            # pd.concat raises on an empty sequence; build an empty index
            # with the same layout instead.
            empty_index = pd.MultiIndex.from_arrays([[], []], names=index)
            self.df = DataFrame({"count": []}, index=empty_index, dtype="int64")

    def get_n(self, t=None):
        """Return the total occurrence count, overall or for one token.

        :param t: token to count; ``None`` counts every token occurrence in
            the collection.
        :return: the count as an ``int``; ``0`` for a token that is not in
            the index (a ``.loc`` lookup would raise ``KeyError`` instead).
        """
        if t is None:
            return int(self.df["count"].sum())

        # A boolean mask on the "token" level tolerates unknown tokens.
        token_mask = self.df.index.get_level_values("token") == t
        return int(self.df.loc[token_mask, "count"].sum())

    def get_f(self):
        """Placeholder, not implemented yet; returns ``None``."""
        pass

# Building the index consumes `text_tokens_gen`. Because that is a one-shot
# generator, re-running only this cell would see an exhausted generator;
# re-run the cells that create `texts` and `text_tokens_gen` first.
inv_index = InvIndex(text_tokens_gen)

1400it [00:07, 198.93it/s]


In [16]:
# Probe for a token that is absent from the index: this label-based .loc
# lookup raises KeyError instead of returning a count of 0 (output below).
inv_index.df.loc[pd.IndexSlice[:, "asdfasdf"], :]["count"].sum()

KeyError: 'asdfasdf'

In [None]:
def RSV(q, inv_index):
    """Placeholder that is not implemented yet.

    Accepts ``q`` and ``inv_index`` but ignores both and returns ``None``.
    """
    return None