In [1]:
import numpy as np
import pandas as pd
import json
import scipy.sparse as sp
import os
import re
from itertools import chain
import random

In [35]:
os.chdir('C:/projects/itmo/text-anal/')
random.seed(69)
MIN_HASH_SIZE = 64

In [3]:
def load_documents():
    docs = pd.read_csv('data/Tweets.csv')
    return list(enumerate(docs['text'].values.astype(str)))

In [4]:
documents = load_documents()

In [5]:
documents[:4]

[(0, ' I`d have responded, if I were going'),
 (1, ' Sooo SAD I will miss you here in San Diego!!!'),
 (2, 'my boss is bullying me...'),
 (3, ' what interview! leave me alone')]

In [6]:
def tokenize(documents):
    sep = re.compile(r'[\W_]+')
    wnsplit = re.compile(r'[`\'"-=]')
    result = []
    for doc_id, doc in documents:
        doc = re.sub(wnsplit, '', doc)
        result.append((doc_id, [word for word in re.split(sep, doc.lower()) if len(word) != 0]))
    return result

In [7]:
doc2terms = tokenize(documents)

In [8]:
doc2terms[:4]

[(0, ['id', 'have', 'responded', 'if', 'i', 'were', 'going']),
 (1,
  ['sooo', 'sad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san', 'diego']),
 (2, ['my', 'boss', 'is', 'bullying', 'me']),
 (3, ['what', 'interview', 'leave', 'me', 'alone'])]

In [36]:
bag_of_words = sorted(set(chain.from_iterable(terms for doc_id, terms in doc2terms)))
bag_of_words.append('--unknown--')

In [10]:
bag_of_words[:4]

['a', 'aa', 'aaa', 'aaaa']

In [11]:
len(bag_of_words)

27838

In [37]:
def make_shufled(terms_size, count):
    def shuffled(lst):
        lst = list(lst)
        random.shuffle(lst)
        return lst
    return [ shuffled(range(terms_size)) for _ in range(count) ]

In [38]:
shufled_indexes = make_shufled(len(bag_of_words), MIN_HASH_SIZE)

| name | value |
| --- | --- |
shufled_indexes | list(indexes) 
indexes | list(index) 
index | int \[0, len(bag_of_words)\]

In [113]:
word2id = {
    word: i for i, word in enumerate(bag_of_words)
}

In [30]:
# doc2word = np.zeros((len(bag_of_words), len(doc2terms)))

# for doc_id, terms in doc2terms:
#     for term in terms:
#         word_index = word2id[term]
#         doc2word[word_index, doc_id] = 1

In [70]:
from numba import njit, typed

In [122]:
np_sh_ind = np.array(shufled_indexes)

def minhash(doc_terms):
    terms = doc_terms if len(doc_terms) != 0 else ['--unknown--']
    word_indexes = list({word2id[word] if word in word2id else word2id['--unknown--'] for word in terms})
    return minhash_jit(typed.List(word_indexes), np_sh_ind)

@njit
def minhash_jit(word_indexes, shufled_indexes):
    h = np.zeros(len(shufled_indexes), dtype=np.int64)
    
    for i, indexes in enumerate(shufled_indexes):
        for j, index in enumerate(indexes):
            if index in word_indexes:
                h[i] = j
                break
        else:
            h[i] = len(indexes)
    return h


In [124]:
doc2minhash = [(doc_id, minhash(terms)) for doc_id, terms in doc2terms]

doc2minhash[:4]

[(0,
  array([ 1187,  5663,  7537,  1305,  1364,  2901,  1503,  4328,  6420,
          2216,  1255,   949,  8683,  9545,  7280,  7375, 11167,  2763,
          1241,  3230,  9234,  5429,  5271,  4385,   357,  4465,  8283,
          2352,  3660,  7953,  2593, 14961,  3478,  1456,   427,  1572,
          7307,  1008,  2718,  1822,  7352,   586, 13349,   489,  9224,
           647,  3420,  1646,  3155,  7159,  3643,  1651,    12,  3319,
          2307,  4117,  7992,  1781,   563,   684, 15769,  5572,  1811,
           479], dtype=int64)),
 (1,
  array([  145,  2274,  4429,  2769,  1364,  3004,  1546,  5071,   227,
          5334,   614,   949,  1440,  2274,   508,   435,   649,  2445,
         10445,  1198,   375,  1740,  2602,  6216,   357,  3362,   464,
          2255,  1662,   564,  1143,  1988,  2621,  1718,  1181,   324,
          4794,  2565,  4749,  4082,  7415,   172,  1774,  2361,  2264,
          3335,  4789,   898, 10010,  2558,  3775,  1177,  3062,  3976,
           249,   476,

In [126]:
from sklearn.metrics import accuracy_score as accuracy

In [130]:
accuracy(doc2minhash[0][1], doc2minhash[1][1])

0.046875

In [168]:
def similar(doc, boundary=0.2):
    terms = tokenize([(0, doc)])[0][1]
    print(terms)
    doc_hash = minhash(terms)
    
    for doc_id, other_hash in doc2minhash:
        acc = accuracy(doc_hash, other_hash)
        if acc > boundary:
            yield (doc_id, acc)
            
def print_similar_docs(doc):
    print('ORIGINAL:', doc)
    print('----------------------------')
    for doc_id, acc in sorted(similar(doc), key=lambda x: -x[1]):
        print('{0:<6} - {1:.2%} : {2}'.format(doc_id, acc, documents[doc_id][1]))
    print('----------------------------')


In [169]:
print_similar_docs('i want to go to music tonight but i lost my voice.')

ORIGINAL: i want to go to music tonight but i lost my voice.
----------------------------
['i', 'want', 'to', 'go', 'to', 'music', 'tonight', 'but', 'i', 'lost', 'my', 'voice']
13     - 100.00% : i want to go to music tonight but i lost my voice.
8077   - 42.19% : I Want To Go See Up. But I Don`t Want To Go With My Mom And Brother.
26688  - 37.50% : I have to work tonight  But I get my paycheck!
5379   - 35.94% :  I want to go with you !  But I`m tierd....
17507  - 35.94% : don`t want to go to work tonight
3300   - 34.38% : I lost my voice
19481  - 34.38% : I love my daddy, but I don`t want to go to his house today.
25026  - 34.38% : Where`s my bus? I want to go home!
27162  - 34.38% : Who`s going out tonight? I want to go out  ****
7656   - 32.81% : i need to read  fic again, but i lost it
9329   - 32.81% :  we r the lost troopers. But I want to know
11462  - 32.81% :  but but but. I want steak
15355  - 32.81% : I want to go see Up
27286  - 32.81% :  Yaay! I was supposed to go, but I 