In [None]:
import os
import json
import logging
from urllib import parse
from bs4 import BeautifulSoup
from collections import defaultdict

import scrapy
from scrapy.spiders import Spider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess

# Locations shared by the rest of the notebook.
BASE_URL = "https://ru.wikipedia.org"  # base against which listed URLs are resolved
BASE_DIR = "data"                      # directory holding inputs and outputs

# Input list of "<id>,<url>" rows and the JSON file the crawl writes to.
URLS_FILE = os.path.join(BASE_DIR, "urlid.csv")
SCRAPY_RESULT_FILE = os.path.join(BASE_DIR, "docs.json")

# Keep records emitted under the 'scrapy' logger from propagating to the
# handlers of ancestor loggers.
logging.getLogger('scrapy').propagate = False

In [None]:
def get_urls(filename: str, full=False, base_url=None):
    """Yield the URLs listed in a ``<id>,<url>`` text file.

    Every non-blank line is split on its first comma into an id and a URL, and
    the URL is resolved against ``base_url`` with ``urllib.parse.urljoin`` (so
    relative paths become absolute while absolute URLs are kept as they are).

    Args:
        filename: path of the file to read.
        full: when true, yield ``(idx, url)`` tuples instead of bare URLs.
            ``idx`` is the text before the first comma, returned as a string.
        base_url: base used to resolve relative URLs. ``None`` (the default)
            means the module-level ``BASE_URL``.

    Yields:
        ``url`` strings, or ``(idx, url)`` tuples when ``full`` is true.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    if base_url is None:
        base_url = BASE_URL

    with open(filename) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record and would otherwise break the unpacking below.
                continue
            idx, url = line.split(",", 1)
            url = parse.urljoin(base_url, url)
            yield (idx, url) if full else url


class CustomSpider(Spider):
    """Spider that fetches every URL listed in ``URLS_FILE``.

    Each response is turned into one item of the form
    ``{'url': ..., 'title': ..., 'snippet': ...}``.
    """

    name = "custom_spider"

    def start_requests(self):
        """Yield one request per URL from ``URLS_FILE``, handled by ``parse``."""
        for url in get_urls(URLS_FILE):
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract the page title and paragraph text from ``response``.

        The title is the text of ``<h1 id="firstHeading">`` and the snippet is
        the text of all ``<p>`` elements inside ``<div id="mw-content-text">``,
        joined with single spaces. The two fields are extracted independently:
        a missing heading no longer discards an available snippet. Extraction
        stays best-effort: a field that cannot be found is returned as ``''``
        and the problem is logged instead of being silently swallowed.

        Args:
            response: the downloaded page.

        Returns:
            dict with the keys ``'url'``, ``'title'`` and ``'snippet'``.
        """
        title = ""
        snippet = ""

        try:
            soup = BeautifulSoup(response.body, "lxml")
        except Exception:
            # Keep the crawl going, but leave a trace of why this item is empty.
            self.logger.warning("Could not parse %s", response.url, exc_info=True)
        else:
            # find() returns None when nothing matches, so check before use.
            heading = soup.find("h1", {"id": "firstHeading"})
            if heading is not None:
                title = heading.text

            div_content = soup.find("div", {"id": "mw-content-text"})
            if div_content is not None:
                snippet = " ".join(p.text for p in div_content.find_all("p"))

            if heading is None or div_content is None:
                self.logger.warning(
                    "Incomplete page %s (heading found: %s, content found: %s)",
                    response.url, heading is not None, div_content is not None,
                )

        return {'url': response.url, 'title': title, 'snippet': snippet}

In [None]:
# Crawl with CustomSpider and export the scraped items as JSON to
# SCRAPY_RESULT_FILE.
# NOTE(review): FEED_FORMAT / FEED_URI look like the legacy feed-export
# settings; newer Scrapy releases configure this through FEEDS. Confirm
# against the installed Scrapy version.
process = CrawlerProcess(settings={
    'FEED_URI': SCRAPY_RESULT_FILE,
    'FEED_FORMAT': 'json',
})
process.crawl(CustomSpider)
process.start()

In [None]:
# Load the crawl output. The context manager closes the file handle, which a
# bare json.load(open(...)) would leave open.
with open(SCRAPY_RESULT_FILE) as fin:
    data_json = json.load(fin)

In [None]:
# Number of items loaded into data_json.
len(data_json)

In [None]:
class Doc:
    """Slot-based record describing one document.

    Every field starts out as ``None`` and is expected to be assigned later.

    Fields (types as intended by the author):
        id      -- int
        url     -- str
        title   -- str
        snippet -- str
    """

    __slots__ = ("id", "url", "title", "snippet")

    def __init__(self):
        # Initialise all slots so attribute reads never hit an unset slot.
        self.id = self.url = self.title = self.snippet = None


class Query:
    """Slot-based record describing one query.

    Both fields start out as ``None`` and are expected to be assigned later.

    Fields (types as intended by the author):
        id   -- int
        text -- str
    """

    __slots__ = ("id", "text")

    def __init__(self):
        # Initialise all slots so attribute reads never hit an unset slot.
        self.id = self.text = None


def get_doc_gen():
    """Yield a ``Doc`` for every item stored in ``SCRAPY_RESULT_FILE``.

    Each item's ``'url'``, ``'title'`` and ``'snippet'`` are copied onto the
    ``Doc``; its ``id`` is looked up by URL in the listing from ``URLS_FILE``.

    Yields:
        Doc: one per scraped item, in file order. ``id`` holds the identifier
        exactly as read from ``URLS_FILE`` (a string).

    Raises:
        KeyError: if an item's URL does not appear in ``URLS_FILE``.
    """
    # get_urls(full=True) yields (idx, url) pairs; build the url -> idx lookup.
    url_to_id = {url: idx for idx, url in get_urls(URLS_FILE, full=True)}

    # json.load needs a file object, not a path.
    with open(SCRAPY_RESULT_FILE) as fin:
        json_data = json.load(fin)

    for item in json_data:
        d = Doc()
        d.url = item['url']
        d.title = item['title']
        d.snippet = item['snippet']
        # NOTE(review): the lookup relies on the scraped URL matching the
        # listed one exactly; presumably redirects would break this - confirm.
        d.id = url_to_id[d.url]
        yield d

In [None]:
class ItemTokens:
    """Token stream built from one attribute of an item.

    ``i`` is copied from the item's own ``i`` attribute. ``tokens`` is a
    generator of processed tokens, so it can be consumed only once.
    """

    __slots__ = ["i", "tokens"]

    def __init__(self, text, attr):
        """Read ``text.i`` and tokenise the string stored in ``text.<attr>``."""
        self.i = text.i

        assert hasattr(text, attr)

        raw = getattr(text, attr)
        cleaned = self._clean(raw)
        self.tokens = self._filter(cleaned)

    def _clean(self, s):
        """Return ``s`` with each ASCII punctuation mark and digit replaced by a space."""
        blank_out = str.maketrans(
            {ch: " " for ch in string.punctuation + string.digits}
        )
        return s.translate(blank_out)

    def _filter(self, s):
        """Tokenise ``s`` and return a generator of lemmatised, stemmed tokens.

        A token is dropped when it is in ``stop_words``, is one of the
        single-letter markers below, or is shorter than two characters.
        """
        stop_tokens = ("I", "T", "A", "B", "W")
        # Tokenise up front; only the per-token processing is deferred.
        words = word_tokenize(s)
        return (
            stemmer.stem(lemmatizer.lemmatize(w))
            for w in words
            if not (w in stop_words or w in stop_tokens or len(w) < 2)
        )

    def __iter__(self):
        """Hand out the stored token generator itself."""
        return self.tokens

    
def get_item_tokens_gen(texts, attr):
    """Lazily wrap every element of ``texts`` in an ``ItemTokens``.

    Args:
        texts: iterable of items; each is passed to ``ItemTokens`` as ``text``.
        attr: name of the attribute of each item that should be tokenised.

    Returns:
        A generator producing one ``ItemTokens(item, attr)`` per element, built
        only when the generator is advanced.
    """
    return (ItemTokens(item, attr) for item in texts)

In [None]:
class InvIndex:
    """Token counts per document, held in a single DataFrame.

    ``self.df`` is indexed by ("doc_id", "token") and has one float32 column,
    "count", holding how often the token occurs in that document.

    NOTE(review): ``doc_id`` is the position of a token sequence in the input
    (taken from ``enumerate``), not an id carried by the items - confirm that
    this is what callers expect.

    NOTE: the lookup methods are wrapped in ``functools.lru_cache``; ``self``
    is part of the cache key, so results are memoised per (instance, arguments).
    """

    def __init__(self, text_tokens_gen):
        """Count tokens for every sequence yielded by ``text_tokens_gen``."""
        key_cols = ["doc_id", "token"]
        all_cols = ["doc_id", "token", "count"]

        def get_part_df():
            # One small frame per document: a row per occurrence, collapsed to
            # one row per distinct token by summing the 1s.
            for doc_id, text_tokens in tqdm(enumerate(text_tokens_gen)):
                rows = [(doc_id, token, 1) for token in text_tokens]
                yield DataFrame(rows, columns=all_cols).groupby(by=key_cols).sum()

        self.df = pd.concat(get_part_df())
        self.df["count"] = self.df["count"].astype(np.float32)

    @functools.lru_cache(maxsize=256)
    def get_n(self, t=None):
        """Sum of counts: over the whole index, or over token ``t`` only.

        Returns 0 when the lookup raises ``KeyError``.
        """
        try:
            selected = self.df if t is None else self.df.loc[(slice(None), t), :]
            return selected["count"].sum()
        except KeyError:
            return 0

    @functools.lru_cache(maxsize=256)
    def get_f(self, t, doc_id=None):
        """Rows for token ``t``: in every document, or only in ``doc_id``.

        Returns ``None`` when the lookup raises ``KeyError``.
        """
        key = (slice(None), t) if doc_id is None else (doc_id, t)
        try:
            return self.df.loc[key, :]
        except KeyError:
            return None

    @functools.lru_cache(maxsize=256)
    def get_l(self, doc_id=None):
        """Total count of document ``doc_id``, or the mean per-document total.

        Returns ``None`` when the lookup raises ``KeyError``.
        """
        try:
            if doc_id is None:
                per_doc = self.df.reset_index().groupby("doc_id").sum()
                return per_doc["count"].mean()

            return self.df.loc[doc_id, :]["count"].sum()
        except KeyError:
            return None