## Top N Similar Items


For a n*n matrix
    find the similarity of first element with every other element

Will results in first row being completely done 
and also first column of all other rows filled

similarly repeat with remaining. Don't compute with what has already being done.

While adding we can limit the number of similarity to top n by having simple conditions.


In [71]:
import concurrent.futures as parallel
import functools
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.metrics.pairwise import cosine_similarity as pairwise_cosine_similarity
import numpy as np
import time
import sys
from pprint import pprint

In [72]:
def cosine_similarity(x,y):
    return 1 - paired_cosine_distances(x,y)

class Limited_Similarity:
    
    def __init__(self,matrix,limit=10):
        self.matrix = matrix
        self.limit = limit
        self.no_rows = self.matrix.shape[0]

        self.prepare_result_table() # location for storing results
        self.perform() # Perform similarity
    
    def speed_cosine(self,i):
        return cosine_similarity([self.matrix[self.primary_vector,:]],[self.matrix[i,:]])[0]
    
    def prepare_result_table(self):
        self.result = {}
        
        for i in range(self.no_rows):
            self.result[i] = []
        # data will be stored in form of tuple (index, similarity) in list

    
    def index_of_insertion(self, index, value):
        iterations = len(self.result[index])
        for i in range(iterations):
            if self.result[index][i][1] >= value:
                continue
            return i
        return iterations
    # find the location where we can put this
    # the result is always sorted
    
    def put_output_in_results(self,index, result):
        for i in range(len(result)):
            j_index = i+index+1 # if row i is 0, other items will start at i + 1
            #also
            
            index_to_insert_at = self.index_of_insertion(index, result[i])
            
            
            # don't insert if it is beyond limit
            if not index_to_insert_at >= self.limit:
                self.result[index].insert(index_to_insert_at, (j_index, result[i])) # insert at correct place so that list remains sorted
                
                if len(self.result[index]) >= self.limit: # remove extra columns
                    del self.result[index][-1]
            
            index_to_insert_at = self.index_of_insertion(j_index, result[i])
        
            if not index_to_insert_at > self.limit:
                self.result[j_index].insert(index_to_insert_at, (index, result[i]))
                if len(self.result[j_index]) > self.limit:
                    del self.result[j_index][-1]
    
    def perform(self):
        for i in range(self.no_rows):
            self.primary_vector = i # select a vector which will be *** with other vecotrs
            result = map(self.speed_cosine, [j for j in range(i+1,self.no_rows)]) # perform the operation
            self.put_output_in_results(i, list(result))
            #print(i,list(result))
        

In [73]:
from numpy import random
vector = random.randn(100,4)

In [74]:
a = Limited_Similarity(vector,10)
# 643 ms ± 49.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# pprint(a.result)

In [75]:
sys.getsizeof(a.result)

4704

In [76]:
%%timeit -n 1
Limited_Similarity(vector,10)

632 ms ± 42.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
%%timeit -n 1
b = pairwise_cosine_similarity(vector)

250 µs ± 80.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [84]:
%%timeit -n 1

import sys

def full_calculation_similarity(vector, limit):
    b = pairwise_cosine_similarity(vector)

    sorted_similarity = []
    for i in range(b.shape[0]):
        x = vector[i,:]
        sorted_similarity.append(np.argsort(x))
    return sorted_similarity

b = full_calculation_similarity(vector, 10)
print(sys.getsizeof(b))
## print(b)

912
912
912
912
912
912
912
1.02 ms ± 212 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
b

[array([1, 2, 3, 0]),
 array([2, 1, 0, 3]),
 array([3, 0, 2, 1]),
 array([2, 3, 1, 0]),
 array([1, 3, 0, 2]),
 array([3, 1, 0, 2]),
 array([3, 1, 2, 0]),
 array([3, 0, 1, 2]),
 array([1, 0, 2, 3]),
 array([1, 3, 2, 0]),
 array([0, 1, 3, 2]),
 array([2, 1, 0, 3]),
 array([1, 3, 0, 2]),
 array([2, 0, 3, 1]),
 array([1, 0, 2, 3]),
 array([1, 3, 0, 2]),
 array([0, 3, 1, 2]),
 array([2, 1, 3, 0]),
 array([1, 2, 0, 3]),
 array([0, 3, 1, 2]),
 array([1, 0, 2, 3]),
 array([1, 0, 2, 3]),
 array([2, 3, 1, 0]),
 array([1, 3, 2, 0]),
 array([3, 0, 1, 2]),
 array([0, 2, 3, 1]),
 array([3, 1, 2, 0]),
 array([0, 1, 3, 2]),
 array([3, 0, 2, 1]),
 array([3, 2, 1, 0]),
 array([1, 3, 2, 0]),
 array([0, 2, 3, 1]),
 array([1, 0, 2, 3]),
 array([2, 1, 3, 0]),
 array([0, 3, 1, 2]),
 array([2, 0, 3, 1]),
 array([2, 0, 3, 1]),
 array([1, 2, 3, 0]),
 array([3, 0, 1, 2]),
 array([1, 0, 2, 3]),
 array([2, 0, 3, 1]),
 array([1, 2, 0, 3]),
 array([3, 1, 0, 2]),
 array([1, 3, 2, 0]),
 array([3, 0, 1, 2]),
 array([1,

# Additional
## Parallel Similarity Finding


For a n*n matrix
    find the similarity of first element with every other element

Will results in first row being completely done 
and also first column of all other rows filled

similarly repeat with remaining. Don't compute with what has already being done.

While adding we can limit the number of similarity to top n by having simple conditions.

Also, if we need to find

[](https://wikimedia.org/api/rest_v1/media/math/render/svg/a71c4add4abded66efd42b202c76f6a59944a587)

in cosine similarity we need to find the dot product between 2 arrays for numerator 

we need transpose of a vector and that can also be cached for each itteration

norm for the vectors is a single number and can easily be cached aka @functools.lru_cache()
