In [2]:
import pandas as pd
from datetime import datetime
from random import randint
from tqdm import tqdm
import numpy as np
from numpy.linalg import norm
from operator import itemgetter

## Part 1 : MinHash

In [23]:
new_data = pd.read_pickle('processed.pkl')

## HASH FUNCTIONS

In [9]:
def hash_date(value):
    return int(pd.Timestamp(value).timestamp())

In [10]:
def hash_string(value,p):
    m =2^32 -1
    return sum([ ord(x)*(p^i) for i,x in enumerate(value)])%m

In [11]:
def hash_float(value):
    integer,decimal  = str(value).split('.')
    return int(integer+decimal)

## MINHASH

In [6]:
def minHash(customer,p,permutations):
    vec  = [0 for i in range(len(customer)) ]
    for i,val in enumerate(customer):
        if not isinstance(val,int):
            if isinstance(val,datetime): 
                val  = hash_date(val)
            elif isinstance(val,str): 
                val = hash_string(val,p)
            elif isinstance(val,float): 
                val = hash_float(val)

        mini = float('inf')
        for perm_vals in permutations:
            a,b = perm_vals
            output = (a*val +b)%p

            if(mini > output):
                mini  = output

        vec[i] = output
    return vec
    

### MinHash applied to kaggle dataset

In [25]:
p  = 4993
max_val = 2^32-1
N  = 10
permutations = [ (randint(0,max_val),randint(0,max_val)) for i in range(N)]
new_data['minhash']  = [minHash([*x[1]],p,permutations) for x in tqdm(new_data.iterrows())]

0it [00:31, ?it/s]
0it [00:45, ?it/s]
985322it [03:44, 4386.19it/s]


### Read Query.csv

In [24]:
query  = pd.read_pickle('query.pkl')

### MinHash applied to query dataset

In [26]:
query['minhash']  = [minHash([*x[1]],p,permutations) for x in tqdm(query.iterrows())]

46it [00:00, 3374.40it/s]


### Check similarity

In [4]:
def cosine_sim(A,B):
    cosine = np.dot(A,B)/(norm(A)*norm(B))
    return cosine

In [15]:
def find_best_neighbour(query):
    norm_q_hash  = np.array(query.minhash)/max(query.minhash)
    best_score = max([(cosine_sim(norm_q_hash,(np.array(x)/max(x))),i) for i,x in enumerate(new_data.minhash)],key=itemgetter(0))
    return best_score


In [35]:
neighbours = []
for i in range(len(query)):
    if i in query.index:
        res = find_best_neighbour(query.loc[i])
        neighbours.append(res[1])

#### Query dataset

In [37]:
display(query)


Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),minhash
0,1978-07-27,M,DELHI,94695.61,2016-02-09,140310,65.0,"[1166, 103, 63, 3388, 3240, 90, 1530]"
1,1992-06-11,M,PANCHKULA,7584.09,2016-02-09,120214,6025.0,"[2012, 103, 43, 4739, 3240, 3843, 3363]"
2,1991-08-14,M,PATNA,7180.6,2016-10-08,221732,541.5,"[3199, 103, 123, 4084, 4372, 451, 4243]"
3,1987-03-01,M,CHENNAI,56847.75,2016-08-29,144138,1000.0,"[4419, 103, 53, 2468, 925, 3419, 163]"
4,1995-04-01,M,GURGAON,84950.13,2016-09-25,233309,80.0,"[3377, 103, 23, 4244, 1629, 1382, 3030]"
5,1981-10-01,M,WORLD TRADE CENTRE BANGALORE,23143.95,2016-11-09,192906,303.0,"[1691, 103, 73, 1418, 1138, 1785, 365]"
6,1976-09-20,F,CHITTOOR,15397.8,2016-08-28,92633,20.0,"[3538, 43, 193, 1959, 714, 2648, 2023]"
7,1991-10-04,M,MOHALI,426.3,2016-02-08,203754,50.0,"[3974, 103, 233, 2709, 3029, 419, 30]"
8,1990-03-19,M,MOHALI,4609.34,2016-08-26,184015,300.0,"[4802, 103, 233, 824, 292, 2749, 65]"
9,1970-12-19,M,SERAMPORE,6695988.46,2016-08-27,144030,299.0,"[4393, 103, 83, 1008, 503, 2339, 4958]"


#### Best neighbours in the kaggle datset based on the cosine similarity of  minhash  vector

In [38]:
display(new_data.loc[neighbours])

Unnamed: 0,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR),minhash
701472,1978-07-27,M,DELHI,94695.61,2016-02-09,140310,65.0,"[1166, 103, 63, 3388, 3240, 90, 1530]"
684212,1992-06-11,M,PANCHKULA,7584.09,2016-02-09,120214,6025.0,"[2012, 103, 43, 4739, 3240, 3843, 3363]"
296431,1991-08-14,M,PATNA,7180.6,2016-10-08,221732,541.5,"[3199, 103, 123, 4084, 4372, 451, 4243]"
601262,1987-03-01,M,CHENNAI,56847.75,2016-08-29,144138,1000.0,"[4419, 103, 53, 2468, 925, 3419, 163]"
10841,1995-04-01,M,GURGAON,84950.13,2016-09-25,233309,80.0,"[3377, 103, 23, 4244, 1629, 1382, 3030]"
880492,1981-10-01,M,WORLD TRADE CENTRE BANGALORE,23143.95,2016-11-09,192906,303.0,"[1691, 103, 73, 1418, 1138, 1785, 365]"
581328,1976-09-20,F,CHITTOOR,15397.8,2016-08-28,92633,20.0,"[3538, 43, 193, 1959, 714, 2648, 2023]"
74329,1991-10-04,M,MOHALI,426.3,2016-02-08,203754,50.0,"[3974, 103, 233, 2709, 3029, 419, 30]"
563650,1990-03-19,M,MOHALI,4609.34,2016-08-26,184015,300.0,"[4802, 103, 233, 824, 292, 2749, 65]"
547486,1970-12-19,M,SERAMPORE,6695988.46,2016-08-27,144030,299.0,"[4393, 103, 83, 1008, 503, 2339, 4958]"
