In [1]:
from collections import Counter
import math

In [2]:
# Token frequency tables: one Counter per query log.
web_q, im_q = Counter(), Counter()

# NOTE(review): no explicit encoding is passed to open(), so decoding relies on
# the platform default -- confirm the actual encoding of both files.
with open('./web_queries.txt', 'r') as web_fin, open('./image_queries.txt', 'r') as im_fin:
    # The web log is consumed completely before the image log.
    for token_counts, query_file in ((web_q, web_fin), (im_q, im_fin)):
        for raw_line in query_file:
            # Drop the newline, then count every whitespace-separated token.
            token_counts.update(raw_line.replace('\n', '').split())


In [3]:
# Number of distinct tokens counted in the web and image query logs.
print(len(web_q), len(im_q))

87801 22102


In [28]:
%%time

# Keep only purely alphabetic image-query tokens, dropping the literal token 'id'.
# A token that passes isalpha() contains no whitespace, so it is used as the key as-is.
im_q_filtered = {
    token: count
    for token, count in im_q.items()
    if token != 'id' and token.isalpha()
}

CPU times: user 34.2 ms, sys: 2.64 ms, total: 36.8 ms
Wall time: 37 ms


In [29]:
%%time

# Keep only purely alphabetic web-query tokens, dropping the literal token 'id'.
# A token that passes isalpha() contains no whitespace, so it is used as the key as-is.
web_q_filtered = {
    token: count
    for token, count in web_q.items()
    if token != 'id' and token.isalpha()
}

CPU times: user 113 ms, sys: 3.05 ms, total: 116 ms
Wall time: 117 ms


In [30]:
# Vocabulary sizes after filtering (image first, then web).
print(len(im_q_filtered), len(web_q_filtered))

# Total token occurrences kept in each filtered table.
web_sum, im_sum = (
    sum(counts.values()) for counts in (web_q_filtered, im_q_filtered)
)
print(web_sum, im_sum)

20487 77547
381276 60424


In [31]:
# Sample query used to try out p_is_image in a later cell.
input_q = 'рабочий стол 604ч604'

In [93]:
def p_is_image(sz, bias=0.2):
    """Score how image-like a query is from per-token count ratios.

    For every whitespace-separated token of ``sz`` that is present in both
    ``im_q_filtered`` and ``web_q_filtered`` the natural log of
    ``image count / web count`` is accumulated. The counts are used raw (they
    are not normalised by corpus totals). The sum is divided by the total
    number of tokens in the query -- tokens missing from either table add
    nothing to the sum but still count in the denominator -- and ``bias`` is
    added to the result.

    Despite the name, the returned value is an unbounded log-ratio score, not
    a probability.

    Args:
        sz: Query string.
        bias: Constant added to the averaged log-ratio. Defaults to 0.2, the
            value that used to be hard-coded in the function body.

    Returns:
        ``bias`` plus the averaged log-ratio, or ``0`` (without the bias) when
        the query contains no tokens.
    """
    # Tokenise once instead of re-splitting for the loop, the check and the mean.
    words = sz.split()
    if not words:
        # Preserve the historical result for empty / whitespace-only queries.
        return 0

    log_ratio_sum = 0
    for word in words:
        if word in im_q_filtered and word in web_q_filtered:
            log_ratio_sum += math.log(im_q_filtered[word] / web_q_filtered[word])
    return bias + log_ratio_sum / len(words)

In [94]:
# Score of the sample query defined in input_q.
print(p_is_image(input_q))

0.08635780434313561


In [95]:
# (query, score) pairs for every line of the web and image query logs.
res_w, res_im = [], []

with open('./web_queries.txt', 'r') as web_fin, open('./image_queries.txt', 'r') as im_fin:
    # The web log is scored completely before the image log.
    for results, query_file in ((res_w, web_fin), (res_im, im_fin)):
        stripped = (raw.replace('\n', '') for raw in query_file)
        results.extend((query, p_is_image(query)) for query in stripped)

In [96]:
# Web queries ordered by ascending score; res_w itself is left unsorted.
res_w_sort = list(res_w)
res_w_sort.sort(key=lambda scored: scored[1])

In [97]:
# Image queries ordered by ascending score; res_im itself is left unsorted.
res_im_sort = list(res_im)
res_im_sort.sort(key=lambda scored: scored[1])

In [98]:
# Nine highest-scoring web queries except the very last one: the slice stops
# before index -1, and that final element is printed in the following cell.
res_w_sort[-10:-1]

[('грамота юному натуралисту', 0.3917880483011873),
 ('перламутровый лоскут', 0.4027325540540822),
 ('дворцовая архитектура ', 0.45541281188299537),
 ('щавель раскраска', 0.5128529498822063),
 ('коляска анмар марсель', 0.526943084337242),
 ('польт птахів', 0.5465735902799727),
 ('дорожній рух картинки', 0.5540261817785512),
 ('siberian', 0.6054651081081643),
 ('архитектура', 0.7108256237659907)]

In [99]:
# Highest- and lowest-scoring web queries (res_w_sort is ascending by score).
print(res_w_sort[-1], res_w_sort[0])

('AMARAYIN SORER', 0.8931471805599454) ('сима ленд каталог', -6.249088176707172)


In [100]:
# Highest- and lowest-scoring image queries (res_im_sort is ascending by score).
print(res_im_sort[-1], res_im_sort[0])

('щавель', 0.8931471805599454) ('погода', -5.7914645471079815)


In [171]:
# A query is predicted to be an image query when its score exceeds this value.
threshold = -0.5

with open('./web_queries.txt', 'r') as web_fin, open('./image_queries.txt', 'r') as im_fin:
    # One boolean per query: True when the scorer flags it as an image query.
    web_flags = [p_is_image(raw.replace('\n', '')) > threshold for raw in web_fin]
    im_flags = [p_is_image(raw.replace('\n', '')) > threshold for raw in im_fin]

# Web queries form the negative class, image queries the positive class.
fp = sum(web_flags)
tn = len(web_flags) - fp
tp = sum(im_flags)
fn = len(im_flags) - tp

In [172]:
# Confusion-matrix counts: true positives, false positives, false negatives, true negatives.
print(tp, fp, fn, tn)

10103 33894 7897 66106


In [173]:
# Metrics derived from the confusion-matrix counts. Every ratio falls back to
# 0.0 when its denominator is zero (e.g. a threshold so high that no query is
# predicted positive) instead of raising ZeroDivisionError.
total = tp + fp + fn + tn
accuracy = (tp + tn) / total if total else 0.0
print('Accuracy is {}'.format(accuracy))

pres = tp / (tp + fp) if (tp + fp) else 0.0
rec = tp / (tp + fn) if (tp + fn) else 0.0
# NOTE(review): the label below is misspelled ('Precion'); it is kept verbatim
# so the recorded cell output still matches -- fix it on the next re-run.
print('Precion is {}'.format(pres))
print('Recall is {}'.format(rec))

# F1 is the harmonic mean of precision and recall; 0.0 when both are zero.
f1 = 2 * (pres * rec) / (pres + rec) if (pres + rec) else 0.0
print('F1 is {}'.format(f1))

Accuracy is 0.6458389830508474
Precion is 0.22962929290633452
Recall is 0.5612777777777778
F1 is 0.3259189960804555
