In [2]:
import json

import pandas as pd
import numpy as np
from tqdm import tqdm

import cPickle

In [3]:
def read_json(line):
    d = json.loads(line)
    user_facts = []
    for f in d['facts']:
        fid = f['fid'] - 1
        ts = f['ts']
        if ts > 1000000000000000:
            user_facts.append((fid, ts / 1000))
        else:
            user_facts.append((fid, ts))
    return user_facts

In [4]:
with open('tmp/df_train.bin', 'rb') as f:
    df_train = cPickle.load(f)

train_users = set(df_train.user_1) | set(df_train.user_2) 
train_idx = sorted(train_users)

components = []
uid_to_others = {}
for _, group in tqdm(df_train.groupby('component')):
    users = set(group.user_1) | set(group.user_2)
    components.append(users)
    for uid in users:
        uid_to_others[uid] = users - {uid}



In [5]:
np.random.seed(2)

num_components = df_train.component.max()
component_idx = np.arange(0, num_components)
np.random.shuffle(component_idx)

split = num_components / 2
print component_idx[:split].shape, component_idx[split:].shape
fold1_comps = set(component_idx[:split])
fold2_comps = set(component_idx[split:])

fold1 = df_train[df_train.component.isin(fold1_comps)]
fold1_users = set(fold1.user_1) | set(fold1.user_2)
fold2 = df_train[df_train.component.isin(fold2_comps)]
fold2_users = set(fold2.user_1) | set(fold2.user_2)

len(fold1_users), len(fold2_users)

(30000,) (30000,)


(120679, 120050)

In [6]:
TRAIN_1 = 1
TRAIN_2 = 2
TEST = 3

def user_fold(uid):
    if uid in fold1_users:
        return TRAIN_1
    if uid in fold2_users:
        return TRAIN_2
    return TEST

## ES

In [7]:
with open('tmp/df_urls.bin', 'rb') as f:
    df_urls = cPickle.load(f)
df_urls.head()

Unnamed: 0,domain,address,param,title
0,6N6J,6KKH 3FI,3HZT5,
1,FFNE,AR4 O78Q O78R,6XUD,
2,BC6L,30DFF,,
3,KAV3,MI 7FWM0,,
4,7D7H,6N5M 6JQ0 ZNMS 6MEA,,


In [25]:
from elasticsearch import Elasticsearch, helpers
es_host = '172.17.0.2'
es = Elasticsearch(host=es_host)

In [26]:
# es.indices.delete(index='user')

{u'acknowledged': True}

In [27]:
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Mapping, String, Nested, Integer, Boolean
from elasticsearch_dsl import analyzer, tokenizer

whitespace_analyzer = analyzer('whitespace_analyzer', tokenizer=tokenizer('whitespace'))
con = connections.create_connection(host=es_host)

mapping = Mapping('user_log')

fact = Nested(multi=True, include_in_parent=True)
fact.field('domain', String(analyzer=whitespace_analyzer, ))
fact.field('address', String(analyzer=whitespace_analyzer))
fact.field('param', String(analyzer=whitespace_analyzer))
fact.field('title', String(analyzer=whitespace_analyzer))

mapping.field('fact', fact)
mapping.field('fold', Integer(index='not_analyzed'))

mapping.save('user')

In [29]:
import itertools 

def chunk_iterator(iterator, size):
    while 1:
        batch = list(itertools.islice(iterator, size))
        if batch:
            yield batch
        else:
            break

In [32]:
uid_idx = 0

with open('../data/facts.json', 'r') as fact_file:
    lines = iter(fact_file)

    for chunk in tqdm(chunk_iterator(lines, 100)):
        actions = []

        for line in chunk:
            log = read_json(line)
            facts, _ = zip(*log)
            facts = df_urls.iloc[list(facts)]
            user = {
                'fact': facts.to_dict(orient='records'), 
                'fold': user_fold(uid_idx),
            }
            action = {'_id': uid_idx, '_index': 'user', '_type': 'user_log', '_source': user}
            actions.append(action)
            uid_idx = uid_idx + 1

        helpers.bulk(es, actions)



In [33]:
def find_similar(uid, limit=10):
    query = {
        'query': {
            'filtered': {
                'query': {
                    'more_like_this': {
                        'like': {
                            '_index': 'user',
                            '_type': 'user_log',
                            '_id': uid,
                        },
                        'max_query_terms': 10,
                        'fields': ['fact.domain', 'fact.address', 'fact.param', 'fact.title^2'],
                    }
                }
            }
        },
        'filter': {
            'bool': {
                'must': [{
                    'term': {
                        'fold': user_fold(uid),
                    },
                }],
                
            }
        },
        'fields': ['_id'],
        'size': limit,
    }

    res = es.search(index='user', doc_type='user_log', body=query)
    hits = res['hits']['hits']
    return [(int(d['_id']), d['_score']) for d in hits]

In [34]:
np.random.seed(1)
random_users = set(np.random.choice(list(train_users), size=500))

In [102]:
retrieved = 0
relevant = 0

for uid in tqdm(random_users):
    similar = find_similar(uid, limit=25)
    others_found = {u for (u, _) in similar}
    others_truth = uid_to_others[uid]
    retrieved = retrieved + len(others_found & others_truth) 
    relevant = relevant + len(others_truth)

retrieved, relevant



(1244, 2059)

ES Baseline

In [109]:
with open('tmp/train_test_users.bin', 'rb') as f:
    _, test_idx = cPickle.load(f)
    test_idx = list(test_idx)

In [116]:
test_pairs = []

for uid in tqdm(test_idx):
    similar = find_similar(uid, limit=25)
    test_pairs.extend((uid, u, s) for (u, s) in similar)



In [121]:
test_pairs = sorted(test_pairs, key=lambda (a, b, s): s, reverse=True)

In [123]:
with open('tmp/idx_to_uid.bin', 'rb') as f:
    idx_to_uid = cPickle.load(f)

In [125]:
count = 215307
with open('submission.txt','w') as f_out:
    seen = set()
    for uid1, uid2, score in test_pairs:
        if (uid1, uid2) in seen or (uid2, uid1) in seen:
            continue
        seen.add((uid1, uid2))

        f_out.write("%s,%s\n" % (idx_to_uid[uid1], idx_to_uid[uid2]))
        count = count - 1
        if count <= 0:
            break

In [None]:
!zip -r submission.txt.zip submission.txt