In [110]:
import numpy as np
import pandas as pd
import os
import glob
from pathlib import Path
from scipy.spatial import distance
import json
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
# Root folder holding the phase-2 outputs; the vector files live in its `task0b` subfolder.
input_dir = 'phase2_outputs/'
mode = 'tfidf'  # alternative value noted by the author: 'tf'
# Build the pattern from `input_dir` rather than repeating the path literal, so
# changing the folder in one place is enough.
file_list = glob.glob(os.path.join(input_dir, 'task0b', '{}_*.txt'.format(mode)))

In [47]:
# Each file holds a JSON string that itself contains a JSON document, hence the
# two decoding passes. `Path.read_text` opens and closes the file for us, so no
# file handles are left dangling. Files are read in sorted-name order.
data = [json.loads(json.loads(Path(fname).read_text())) for fname in sorted(file_list)]

In [50]:
# Sanity check: number of entries in the first loaded vector.
len(data[0])

5953

In [80]:
# Number of random projection directions == number of bits in each bucket id.
# Kept in one place so the projection width and the bit weights cannot drift apart.
n_bits = 8
vecs = np.random.randn(len(data[0]), n_bits)
# One sign bit per (item, direction): True when the projection is non-negative.
bin_vec = np.array(data).dot(vecs) >= 0
# Bit weights, most significant first: 2**(n_bits-1), ..., 2, 1.
exponents = np.array([2**i for i in range(n_bits - 1, -1, -1)])
# Read each row of sign bits as a binary number -> one integer bucket id per item.
bin_vec.dot(exponents)

array([ 88, 249, 115, 207, 229, 253, 131,  47,  42, 197,  39,  11,  58,
       132, 175, 235, 112, 141, 213, 179, 124, 103,  56,  77, 178, 172,
       245, 189, 174,  83, 123, 216, 135, 184, 111, 204, 239, 171, 126,
       204,  75, 169, 175, 189,  61, 252,   0, 120,  41, 215,  13, 249,
       202, 235, 142, 232, 204, 127, 189,  56, 119, 235, 228, 218, 130,
       168, 170, 203,  98, 234, 221, 206, 237,   7, 227, 230, 175, 169,
        53,  43,   8, 183, 128, 107, 174, 250, 140,  45, 202, 188, 111,
        25,  50])

In [74]:
# Bucket id of the first item: its sign bits weighted by the powers of two in `exponents`.
bin_vec[0].dot(exponents)

45394

In [224]:
class LSH:
    """Random-projection locality-sensitive hashing index over vectors stored on disk.

    Every file matching ``<input_dir>/<mode>_*.txt`` is read as one vector
    (a JSON string that itself contains a JSON list). ``train`` builds ``L``
    hash tables; each table projects the vectors onto ``k`` random Gaussian
    directions and reads the ``k`` sign bits as an integer bucket id.
    ``query`` collects the items sharing a bucket with the query point in any
    table and ranks them by cosine similarity.

    Args:
        L: number of hash tables to build.
        k: number of random directions (bits) per table.
        mode: file-name prefix selecting which vector files to load.
        input_dir: directory searched for the vector files.
        seed: optional integer seed for the random directions. ``None``
            (the default) draws from NumPy's global random state.
    """

    def __init__(self, L, k, mode, input_dir, seed=None):
        self.L= L
        self.k = k
        self.mode = mode
        self.seed = seed
        self.data = None        # (n_files, n_words) array, filled by load_data()
        self.hash_tables = []   # one {'random_vectors', 'table'} dict per table
        self.n_words = None     # vector length, filled by load_data()
        # Sorted so that row indices map to files in a stable order.
        self.file_list = sorted(glob.glob('{}/{}_*.txt'.format(input_dir, mode)))
        self.idx_2_file = {i:fname for i,fname in enumerate(self.file_list)}
        self.file_2_idx = {fname:i for i,fname in enumerate(self.file_list)}
        self._reset_rng()

    def _reset_rng(self):
        """(Re)create the random source used for the projection directions."""
        # Without a seed, fall back to NumPy's global generator so unseeded use
        # behaves as before (and still honours an external np.random.seed call).
        if self.seed is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(self.seed)

    def get_random_vectors(self):
        """Return an ``(n_words, k)`` matrix of standard-normal projection directions."""
        return self._rng.randn(self.n_words, self.k)

    def load_data(self):
        """Read every file in ``file_list`` into ``self.data`` and set ``self.n_words``.

        Raises:
            IndexError: if no files matched the pattern (there is no first row).
        """
        vectors = []
        for fname in self.file_list:
            # Context manager closes each file promptly instead of leaking handles.
            with open(fname, 'r') as handle:
                vectors.append(json.loads(json.load(handle)))
        self.data = np.array(vectors)
        self.n_words = len(self.data[0])

    def binary_2_integer(self, binary_vectors):
        """Interpret the last axis of a boolean array as ``k``-bit integers (MSB first)."""
        exponents = np.array([2**i for i in range(self.k - 1, -1, -1)])
        return binary_vectors.dot(exponents)

    def hash_data(self, data, random_vectors):
        """Return the bucket id(s) of ``data`` under the given projection directions.

        A 2-D ``data`` yields one id per row; a 1-D ``data`` yields a single id.
        """
        # A bit is set when the projection onto that direction is non-negative.
        sign_bits = data.dot(random_vectors) >= 0
        return self.binary_2_integer(sign_bits)

    def train(self):
        """Load the vectors and build ``L`` hash tables from scratch.

        Calling this again replaces the existing tables rather than appending
        to them; with a seed set, the rebuilt tables are identical.
        """
        self.load_data()
        self._reset_rng()
        self.hash_tables = []
        for _ in range(self.L):
            random_vectors = self.get_random_vectors()
            bucket_ids = self.hash_data(self.data, random_vectors)
            table = defaultdict(list)
            for idx, bucket in enumerate(bucket_ids):
                table[bucket].append(idx)
            self.hash_tables.append({'random_vectors': random_vectors, 'table': table})

    def query(self, data_point, max_results):
        """Return the stored items most similar to ``data_point``.

        Args:
            data_point: 1-D vector of length ``n_words``.
            max_results: maximum number of items to return.

        Returns:
            dict with ``n_buckets`` (tables probed), ``n_candidates`` (distinct
            items found in the probed buckets), ``scores`` (cosine similarities,
            descending) and ``retrieved_files`` (file names in the same order).
            When no candidate is found, ``scores`` and ``retrieved_files`` are empty.
        """
        data_point = np.asarray(data_point)
        candidates = set()
        n_buckets = 0
        for hash_table in self.hash_tables:
            bucket = self.hash_data(data_point, hash_table['random_vectors'])
            n_buckets += 1
            # .get() instead of [] so a miss does not insert an empty bucket
            # into the defaultdict on every query.
            candidates.update(hash_table['table'].get(bucket, ()))

        # Sorted for a deterministic candidate order (set iteration order is not guaranteed).
        retrieval = sorted(candidates)
        if not retrieval:
            # cosine_similarity rejects an empty candidate matrix; report "nothing found".
            return {'n_buckets': n_buckets,
                    'n_candidates': 0,
                    'scores': np.array([]),
                    'retrieved_files': []
                   }

        sim_scores = cosine_similarity(np.expand_dims(data_point, 0), self.data[retrieval]).ravel()
        # Positions into `retrieval`, best match first, truncated to max_results.
        best = sim_scores.argsort()[::-1][:max_results]
        return {'n_buckets': n_buckets,
                'n_candidates': len(retrieval),
                'scores': sim_scores[best],
                'retrieved_files': [self.idx_2_file[retrieval[d]] for d in best]
               }
            

In [225]:
# Index configuration: L=8 hash tables, k=4 sign bits per bucket id, over the `tfidf_*.txt` files.
lsh = LSH(L=8, k=4, mode='tfidf', input_dir='phase2_outputs/task0b')

In [226]:
# Load the vectors from disk and build the hash tables.
lsh.train()

In [227]:
query_idx = 33
print("Query file: ", lsh.idx_2_file[query_idx])
# Query with the index's own copy of the vector so the row is guaranteed to
# belong to `lsh.idx_2_file[query_idx]`, independent of the global `data` list.
lsh.query(lsh.data[query_idx], max_results=10)

Query file:  phase2_outputs/task0b/tfidf_vectors_251.txt


{'n_buckets': 8,
 'n_candidates': 60,
 'scores': array([1.        , 0.28351168, 0.27296928, 0.26864773, 0.25547429,
        0.25533848, 0.25117153, 0.24767286, 0.24667712, 0.24310759]),
 'retrieved_files': ['phase2_outputs/task0b/tfidf_vectors_251.txt',
  'phase2_outputs/task0b/tfidf_vectors_249.txt',
  'phase2_outputs/task0b/tfidf_vectors_561.txt',
  'phase2_outputs/task0b/tfidf_vectors_257.txt',
  'phase2_outputs/task0b/tfidf_vectors_261.txt',
  'phase2_outputs/task0b/tfidf_vectors_018.txt',
  'phase2_outputs/task0b/tfidf_vectors_007.txt',
  'phase2_outputs/task0b/tfidf_vectors_562.txt',
  'phase2_outputs/task0b/tfidf_vectors_013.txt',
  'phase2_outputs/task0b/tfidf_vectors_250.txt']}