In [20]:
import os
import json
import logging
import functools
from urllib import parse
from bs4 import BeautifulSoup
from collections import defaultdict

from tqdm import tqdm

import pandas as pd
import numpy as np

import scrapy
from scrapy.spiders import Spider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess

logging.getLogger('scrapy').propagate = False


BASE_DIR = "data"
URLS_FILE = os.path.join(BASE_DIR, "urlid.csv")

SCRAPY_RESULT_FILE = os.path.join(BASE_DIR, "docs.json")
BASE_URL = "https://ru.wikipedia.org"

In [2]:
def get_urls(filename: str, full=False):
    with open(filename) as fin:
        for line in fin:
            idx, url = line.strip().split(",", 1)
            url = parse.urljoin(BASE_URL, url)
            yield (idx, url) if full else url


class CustomSpider(Spider):
    name = "custom_spider"
    
    def start_requests(self):
        start_urls = get_urls(URLS_FILE)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)
    
    def parse(self, response):
        try:
            soup = BeautifulSoup(response.body, "lxml")
            title = soup.find("h1", {"id": "firstHeading"}).text
            div_content = soup.find("div", {"id": "mw-content-text"})
            snippet = " ".join(p.text for p in div_content.find_all("p"))
        except:
            pass
        
        return {'url': response.url, 'title': locals().get('title', ''), 'snippet': locals().get('snippet', '')}

In [3]:
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': SCRAPY_RESULT_FILE
})
process.crawl(CustomSpider)
process.start()

In [22]:
class Doc:
    __slots__ = ("id", "url", "title", "snippet")
    
    def __init__(self):
        self.id = None # type: int
        self.url = None # type: str
        self.title = None # type: str
        self.snippet = None # type: str


class Query:
    __slots__ = ("id", "text")
    
    def __init__(self):
        self.id = None # type: int
        self.text = None # type: str


def get_texts_gen():
    urls = {url: idx for url, idx in get_urls(URLS_FILE, full=True)}
    json_data = json.load(open(SCRAPY_RESULT_FILE))
    for doc in json_data:
        d = Doc()
        d.url = doc['url']
        d.title = doc['title']
        d.snippet = doc['snippet']
        d.id = urls[d.url]
        yield d

In [7]:
class ItemTokens:
    __slots__ = ["id", "tokens"]
    
    def __init__(self, text, attr):
        self.id = text.id
        self.tokens = self._filter(self._clean(getattr(text, attr)))
    
    def _clean(self, s):
        for t in chain(string.punctuation, string.digits):
            s = s.replace(t, " ")
        return s
    
    def _filter(self, s):
        return (stemmer.stem(lemmatizer.lemmatize(t)) for t in word_tokenize(s) 
                if (t not in stop_words) and (len(t) >= 2))
    
    def __iter__(self):
        return self.tokens

    
def get_item_tokens_gen(texts, attr):
    # type: (List[Text], str) -> Generator[TextTokens]
    
    item_tokens = (ItemTokens(text, attr) for text in texts)
    return item_tokens

In [9]:
class InvIndex:
    def __init__(self, text_tokens_gen):
        columns = ["doc_id", "token", "count"]
        index = ["doc_id", "token"]
        
        def get_part_df():
            for doc_id, text_tokens in tqdm(enumerate(text_tokens_gen)):
                data = [(doc_id, token, 1) for token in text_tokens]
                df = DataFrame(data, columns=columns).groupby(by=index).sum()
                yield df
        
        self.df = pd.concat(get_part_df())
        self.df["count"] = self.df["count"].astype(np.float32)
    
    @functools.lru_cache(maxsize=256, typed=False)
    def get_n(self, t=None):
        try:
            if t is None:
                return self.df["count"].sum()

            return self.df.loc[(slice(None), t), :]["count"].sum()
        except KeyError:
            return 0
    
    @functools.lru_cache(maxsize=256, typed=False)
    def get_f(self, t, doc_id=None):
        try:
            if doc_id is not None:
                return self.df.loc[(doc_id, t), :]
            
            return self.df.loc[(slice(None), t), :]
        except KeyError:
            pass
    
    @functools.lru_cache(maxsize=256, typed=False)
    def get_l(self, doc_id=None):
        try:
            if doc_id is not None:
                return self.df.loc[doc_id, :]["count"].sum()
            
            return self.df.reset_index().groupby("doc_id").sum()["count"].mean()
        except KeyError:
            pass

In [11]:
class RSVRankedList:
    """По формуле из дз"""
    def __init__(self, k1, b):
        self.k1 = k1
        self.b = b
    
    def __call__(self, q, inv_index):
        rsv = {}
        N = inv_index.get_n()
        
        for t in q:
            Nt = inv_index.get_n(t)
            F = inv_index.get_f(t)
            idf = math.log(1.0 + (N - Nt + 0.5) / (Nt + 0.5))
            
            if F is not None:
                for index, row in F.iterrows():
                    doc_id, ftd = index[0], row["count"]
                    Ld, L = inv_index.get_l(doc_id), inv_index.get_l()
                    tf = ftd * (self.k1 + 1.) / (self.k1 * ((1. - self.b) + self.b * Ld / L) + ftd)
                    rsv[doc_id] = rsv.get(doc_id, 0) + idf * tf
        return sorted(rsv.items(), key=itemgetter(1), reverse=True)[:10]

In [24]:
texts_gen = get_texts_gen()
text_tokens_gen = get_item_tokens_gen(texts_gen, "title")

# queries_gen = get_queries_gen(QUERIES_FILE)
# query_tokens_gen = get_item_tokens_gen(queries_gen, "w")

rsv = RSVRankedList(k1=1.2, b=0.75)
inv_index = InvIndex(text_tokens_gen)
# with open(PREDICTION_FILE, "w") as fout:
#     for query_id, query in tqdm(enumerate(query_tokens_gen)):
#         ranked_list = rsv(query, inv_index)
#         for doc_id, _ in ranked_list:
#             fout.write("{} {}\n".format(query_id + 1, doc_id + 1))



0it [00:00, ?it/s][A[A

KeyError: 'https://ru.wikipedia.org/wiki/%D0%92%D0%B5%D1%80%D1%81%D0%B0%D1%87%D0%B5'