In [1]:
!tar -xvzf data/data.tar.gz

data/
data/eval.py
data/cran.qry
data/qrel_clean
data/cran.all.1400


In [2]:
!head -30 data/cran.all.1400

.I 1
.T
experimental investigation of the aerodynamics of a
wing in a slipstream .
.A
brenckman,m.
.B
j. ae. scs. 25, 1958, 324.
.W
experimental investigation of the aerodynamics of a
wing in a slipstream .
  an experimental study of a wing in a propeller slipstream was
made in order to determine the spanwise distribution of the lift
increase due to slipstream at different angles of attack of the wing
and at different free stream to slipstream velocity ratios .  the
results were intended in part as an evaluation basis for different
theoretical treatments of this problem .
  the comparative span loading curves, together with
supporting evidence, showed that a substantial part of the lift increment
produced by the slipstream was due to a /destalling/ or
boundary-layer-control effect .  the integrated remaining lift
increment, after subtracting this destalling lift, was found to agree
well with a potential flow theory .
  an empirical evaluation of the destalling ef

In [3]:
import os
import sys
from itertools import (
    chain
)

from typing import (
    Optional,
    List
)

import spacy
from spacy.tokenizer import Tokenizer

from pandas import DataFrame

from tqdm import tqdm


nlp = spacy.load("en")
tokenizer = Tokenizer(nlp.vocab)

BASE_DIR = "data"
ALL_DATA_FILE = os.path.join(BASE_DIR, "cran.all.1400")
NUMBER_TEXTS = 1400

In [4]:
class Text:
    __slots__ = ["i", "t", "a", "b", "w"]
    
    def __init__(self):
        self.i = None # type: str
        self.t = None # type: str
        self.a = None # type: str
        self.b = None # type: str
        self.w = None # type: str


def _read_file(filepath):
    # type: (str) -> Optional[List[str]]
    
    with open(filepath, "r") as fin:
        lines = fin.readlines()
        return lines
    
    return None


def _parse_text(lines):
    # type: (str) -> List[Text]
    
    def set_current_state(line):
        nonlocal current_state
        
        for i, (s, _) in enumerate(line_starts):
            if line.startswith(s):
                current_state = i
    
    def add_text():
        nonlocal texts

        t = Text()
        for i, (_, s) in enumerate(line_starts):
            setattr(t, s, ''.join(text_lists[i]))
        
        texts.append(t)
    
    line_starts = [('.I', 'i'), ('.T', 't'), ('.A', 'a'), ('.B', 'b'), ('.W', 'w')]
    text_lists, texts = [[] for _ in range(len(line_starts))], []
    current_state = -1

    for line in chain(lines, ['.I']):
        set_current_state(line)
        
        if current_state == 0:
            if any(text_lists):
                add_text()
            texts_lines = [[] for _ in range(len(line_starts))]
        
        text_lists[current_state].append(line)
    
    return texts


def get_texts(filename):
    # type: (str) -> List[Text]

    data = _read_file(filename)
    
    assert data != None
    
    texts = _parse_text(data)
    return texts


texts = get_texts(ALL_DATA_FILE)
assert len(texts) == NUMBER_TEXTS

In [5]:
class TextTokens:
    __slots__ = ["i", "tokens"]
    
    def __init__(self, text, attr):
        self.i = text.i
        
        assert hasattr(text, attr)
        
        self.tokens = self._clean(getattr(text, attr))
    
    def _clean(self, s):
        return (t.lemma_ for t in tokenizer(s))
    
    def __iter__(self):
        return self.tokens

    
def get_text_tokens_gen(texts, attr):
    # type: (List[Text], str) -> List[Tokens]
    
    text_tokens = (TextTokens(text, attr) for text in texts)
    return text_tokens


text_tokens_gen = get_text_tokens_gen(texts, 'w')

In [None]:
class InvIndex:
    def __init__(self, text_tokens_gen):
        data = [(token, doc_id, 1) for doc_id, text_tokens in tqdm(enumerate(text_tokens_gen), miniters=40) for token in text_tokens]
        self.df = DataFrame(data, columns=["token", "doc_id", "count"])

inv_index = InvIndex(text_tokens_gen)

821it [04:43,  2.90it/s]