In [None]:
from gensim.corpora import Dictionary
import tqdm
import sys
from gensim import corpora
from gensim import corpora, models, similarities
import numpy as np
import logging
from pymystem3 import Mystem
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

# Name of the current experiment; it selects the ../result/<TRY_NAME>/ directory
# that the dictionary, corpus and ranking files are written to.
TRY_NAME = "WORDS_TITLE_BODY"

# Text-normalisation tools shared by apply_to_str.
lemmatizer = Mystem()
stemmer = SnowballStemmer('russian', ignore_stopwords=True)

# Russian stop words plus the bare newline / space tokens (presumably emitted by
# the lemmatizer as separate tokens -- TODO confirm).
stop = stopwords.words('russian') + ["\n", " "]

# Dictionary that the indexing loop fills; prune_at=None is passed through to gensim.
dct = Dictionary(prune_at=None)

def apply_to_str (string):
    """Convert raw text into a list of stemmed tokens without stop words.

    The text is lemmatized with the module-level ``lemmatizer``; every token
    that appears in the module-level ``stop`` list is dropped; each remaining
    token is passed through the module-level ``stemmer``.
    """
    return [stemmer.stem (token)
            for token in lemmatizer.lemmatize (string)
            if token not in stop]

# First pass over the documents: feed each one to the dictionary.
with open ("../data/docs.tsv") as docs_file:
    for raw_line in tqdm.tqdm (docs_file, total=582167):
        columns = raw_line.decode ("utf-8").lower ().strip ().split ("\t")
        # The first column is skipped; all remaining columns are merged into one text.
        text = ' '.join (columns[1:])
        dct.add_documents ([apply_to_str (text)], prune_at=None)

# Persist the dictionary so later cells can reload it.
dct.save ("../result/{}/dict.dct".format (TRY_NAME))

# Log at INFO level, prefixing each record with its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

def doc_reader ():
    """Yield one bag-of-words vector per line of ``../data/docs.tsv``.

    Each line is decoded as UTF-8, lower-cased, stripped and split on tabs.
    The first column is skipped, the remaining columns are joined with spaces,
    normalised with ``apply_to_str`` and converted with ``dct.doc2bow``.
    Being a generator, it streams the file instead of loading it into memory.
    """
    with open ("../data/docs.tsv") as fin:
        # The line index was never used, so the file is iterated directly.
        for line in fin:
            text_columns = line.decode ("utf-8").lower ().strip ().split ("\t")[1:]
            yield dct.doc2bow (apply_to_str (' '.join (text_columns)))
            
# Stream the bag-of-words vectors produced by doc_reader into an MmCorpus file.
corpus_path = '../result/{}/corpus.mm'.format (TRY_NAME)
corpora.MmCorpus.serialize (corpus_path, doc_reader ())

import gensim as gs
import math
import numpy as np
import tqdm

# Reload the dictionary and corpus written by the indexing cell.
dct_title = gs.corpora.Dictionary.load ("../result/{}/dict.dct".format (TRY_NAME))
corpus = gs.corpora.MmCorpus('../result/{}/corpus.mm'.format (TRY_NAME))

# Constants used by the scoring formula below: PARAM_K1 and PARAM_B appear in
# the term-frequency / length normalisation, EPSILON scales the replacement
# value for negative idfs.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25

# One pass over the corpus: the length of a document is the sum of its term counts.
doc_lengths = [sum (dict (doc).itervalues ()) for doc in tqdm.tqdm (corpus)]
corp_size = len (doc_lengths)
corp_size_words = sum (doc_lengths)

# Average document length, used to normalise term frequencies.
avgdl = float (corp_size_words) / corp_size

# Inverse document frequency of every dictionary term:
#   idf = log(N - df + 0.5) - log(df + 0.5)
# where N is the number of documents and df the term's document frequency.
idfs = {}

summ_idf = 0

for word_id, freq in dct_title.dfs.iteritems ():
    idfs[word_id] = math.log(corp_size - freq + 0.5) - math.log(freq + 0.5)
    summ_idf += idfs[word_id]

average_idf = float (summ_idf) / len (dct_title.dfs)

# A term that occurs in more than half of the documents gets a negative idf;
# such values are replaced by a fixed fraction (EPSILON) of the average idf.
# The loop variable is deliberately not called `id`: at notebook top level that
# name would stay bound afterwards and shadow the builtin id() in later cells.
negative_idf_floor = EPSILON * average_idf
for word_id in idfs.iterkeys ():
    if idfs[word_id] < 0:
        idfs[word_id] = negative_idf_floor

# Read the queries: column 0 is the integer query id, column 1 the query text.
# Each query is stored as a {term_id: count} dict built with the same
# normalisation (apply_to_str) and dictionary as the documents.
q_ids = []
queries = []
with open ("../data/queries.tsv") as queries_file:
    for raw_query in queries_file:
        fields = raw_query.strip ().decode ("utf-8").lower ().split ("\t")
        q_ids.append (int (fields[0]))
        query_bow = dct_title.doc2bow (apply_to_str (fields[1]))
        queries.append (dict (query_bow))

q_size = len (queries)

# result[q_i, doc_i] holds the score of document doc_i for query q_i.
result = np.zeros ((q_size, corp_size))

# The set of term ids of a query does not depend on the document, so it is
# built once per query instead of once per (document, query) pair.
query_terms = [set (q.keys ()) for q in queries]

# total uses the measured corpus size rather than a hard-coded document count,
# so the progress bar stays correct for any corpus.
for doc_i, doc in tqdm.tqdm (enumerate (corpus), total=corp_size):
    doc = dict (doc)
    doc_keys = set (doc)
    doc_len = sum (doc.itervalues ())
    # Length-normalisation term of the denominator; it depends only on the
    # document, so it is computed once here rather than for every matching word.
    length_norm = PARAM_K1 * (1 - PARAM_B + PARAM_B * doc_len / avgdl)
    for q_i, q_terms in enumerate (query_terms):
        score = 0
        # Only terms present in both the query and the document contribute.
        for word in q_terms & doc_keys:
            tf = doc[word]
            score += idfs[word] * tf * (PARAM_K1 + 1) / (tf + length_norm)
        result[q_i, doc_i] = score

# Write the five best-scoring documents of every query as "QueryId,DocumentId" rows.
# NOTE(review): the DocumentId written here is the document's position in the
# corpus, not a value read from docs.tsv -- confirm that this is the expected id.
with open ("../result/result_{}.csv".format (TRY_NAME), "w") as fout:
    fout.write ("QueryId,DocumentId\n")

    for q_num, qid in tqdm.tqdm (enumerate (q_ids)):
        # argsort is ascending: the last five indices are the highest scores,
        # and reversing them puts the best document first.
        for doc in np.argsort (result[q_num, :])[-5:][::-1]:
            fout.write ("{},{}\n".format (qid, doc))

# Store the full score matrix; the with-block guarantees the file handle is
# flushed and closed (it used to be opened inline and never closed).
with open ("../result/{}/ranking.npy".format (TRY_NAME), "wb") as ranking_file:
    np.save (ranking_file, result)


In [1]:
from gensim.corpora import Dictionary
import tqdm
import sys
from gensim import corpora
from gensim import corpora, models, similarities
import numpy as np
import logging
from pymystem3 import Mystem
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from multiprocessing import Pool


def apply_to_str (string, lemmatizer, stemmer, stop):
    """Convert raw text into a list of stemmed tokens without stop words.

    string     -- text to normalise
    lemmatizer -- object whose ``lemmatize(text)`` returns the tokens of the text
    stemmer    -- object whose ``stem(token)`` returns the stemmed token
    stop       -- container of tokens to drop (tested with ``in``)

    The tools are passed in explicitly so that each worker process can use
    its own instances.
    """
    return [stemmer.stem (token)
            for token in lemmatizer.lemmatize (string)
            if token not in stop]

def process_file (fname):
    """Normalise every text column of one input shard.

    Reads ``../data/<fname>`` line by line (UTF-8, tab-separated), keeps the
    first column as is, replaces every other column with the space-joined
    result of ``apply_to_str`` and writes the line, UTF-8 encoded, to
    ``../temp/<fname>_result``. Every output line keeps a trailing tab before
    the newline. Progress is printed every 1000 lines.

    The lemmatizer, stemmer and stop words are created inside the function so
    that each worker process calling it gets its own instances.
    """
    lemmatizer = Mystem()
    stemmer = SnowballStemmer('russian', ignore_stopwords=True)
    # A frozenset gives constant-time membership tests; the stop words are
    # checked once per token, so scanning a list here was needlessly slow.
    stop = frozenset (stopwords.words('russian') + ["\n", " "])
    with open ('../data/' + fname) as f:
        with open ('../temp/' + fname + "_result", "w") as fout:
            for i, line in enumerate (f):
                splited = line.decode ("utf-8").lower ().strip ().split ("\t")
                columns = [splited[0]]
                columns.extend (' '.join (apply_to_str (s, lemmatizer, stemmer, stop))
                                for s in splited[1:])
                # Same layout as before: columns separated by tabs, plus a
                # trailing tab in front of the newline.
                res_str = "\t".join (columns) + "\t\n"
                fout.write (res_str.encode ('utf-8'))
                if i % 1000 == 0:
                    # Parenthesised single expression: prints the same text on Python 2.
                    print (fname + " at " + str (i))

In [None]:
# Names of the twelve input shards, 'xaa' through 'xal'.
files = ['xa' + suffix for suffix in 'abcdefghijkl']

In [None]:
# Worker pool with twelve processes.
pool = Pool (processes=12)

In [None]:
# Run process_file once for every shard name in `files`, distributing the calls
# over the workers of `pool`.
pool.map (process_file, files)