In [4]:
import pandas as pd
import numpy as np
import collections
import os
from datetime import datetime

# Finding Similar Documents

## 1.1 Set up the data

In [2]:
# Read the transactions file that sits in the current working directory
# and preview its first rows.
data_path = os.getcwd() + '/bank_transactions.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,10/1/94,F,JAMSHEDPUR,17819.05,2/8/16,143207,25.0
1,T2,C2142763,4/4/57,M,JHAJJAR,2270.69,2/8/16,141858,27999.0
2,T3,C4417068,26/11/96,F,MUMBAI,17874.44,2/8/16,142712,459.0
3,T4,C5342380,14/9/73,F,MUMBAI,866503.21,2/8/16,142714,2060.0
4,T5,C9031234,24/3/88,F,NAVI MUMBAI,6714.43,2/8/16,181156,1762.5


In [3]:
# Show the dtype pandas inferred for every column of the loaded frame.
df.dtypes

TransactionID               object
CustomerID                  object
CustomerDOB                 object
CustGender                  object
CustLocation                object
CustAccountBalance         float64
TransactionDate             object
TransactionTime              int64
TransactionAmount (INR)    float64
dtype: object

## 1.2 Fingerprint Hashing

### Min Hashing

In [None]:

def MinHash(shingle_matrix, n_permutations) :
    """Build the min-hash signature matrix of a shingle matrix.

    For each of ``n_permutations`` random row permutations, the signature of
    a customer (column) is the index of the first non-zero row of that column
    in the permuted shingle matrix.

    Parameters
    ----------
    shingle_matrix : np.ndarray, shape (n_shingles, n_docs)
        Entry ``[s, d]`` is non-zero when document ``d`` contains shingle ``s``.
    n_permutations : int
        Number of random permutations, i.e. number of signature rows.

    Returns
    -------
    np.ndarray of int, shape (n_permutations, n_docs)
        The signature matrix. A column that contains no shingle at all has no
        "first non-zero row"; it receives the sentinel value ``n_shingles``
        (one past the last valid row index) in every signature row.
    """
    n_shingles = shingle_matrix.shape[0]
    n_customers = shingle_matrix.shape[1]

    # Pre-allocate the result filled with the "no shingle" sentinel; rows are
    # overwritten below for every column that does contain a shingle.
    signature_matrix = np.full((n_permutations, n_customers), n_shingles, dtype=int)
    if n_shingles == 0 :
        # argmax over an empty axis is undefined; nothing to hash anyway.
        return signature_matrix

    for row in range(n_permutations) :
        permutation = getPermutation(n_shingles)
        permuted_shingle_matrix = permuteMatrix(shingle_matrix, permutation)

        present = np.asarray(permuted_shingle_matrix) != 0
        # argmax on a boolean array returns the position of the first True
        # along the axis, i.e. the first non-zero row of each column.
        first_hit = present.argmax(axis=0)
        # argmax also returns 0 for all-False columns, so only trust it where
        # the column really holds at least one shingle.
        has_shingle = present.any(axis=0)
        signature_matrix[row, has_shingle] = first_hit[has_shingle]

    return signature_matrix
    

def getPermutation(n_shingles) :
    """Draw a random ordering of the shingle row indices.

    Returns an array of length ``n_shingles`` holding every integer from 0 to
    ``n_shingles - 1`` exactly once, shuffled with NumPy's global random state.
    """
    row_indices = range(n_shingles)
    return np.random.permutation(row_indices)

def permuteMatrix(shingle_matrix, permutation) :
    """Return a copy of the shingle matrix with its rows permuted.

    Row ``i`` of ``shingle_matrix`` is moved to row ``permutation[i]`` of the
    result. The input matrix is left untouched.

    Parameters
    ----------
    shingle_matrix : np.ndarray, shape (n_shingles, n_docs)
    permutation : array-like of int, length n_shingles
        Destination row index for each source row.

    Returns
    -------
    np.ndarray with the same shape and dtype as ``shingle_matrix``.

    Raises
    ------
    ValueError
        If ``permutation`` does not have one entry per row of
        ``shingle_matrix``. Continuing would return a partially permuted
        matrix, so the mismatch is reported as an error rather than printed.
    """
    if len(permutation) != shingle_matrix.shape[0] :
        raise ValueError(
            "Permutation error: number of shingles (%d) != length of permutation (%d)"
            % (shingle_matrix.shape[0], len(permutation))
        )

    destination_rows = np.asarray(permutation, dtype=int)

    permuted_shingle_matrix = np.copy(shingle_matrix)
    # Single fancy-index assignment: result[permutation[i]] = shingle_matrix[i]
    # for every i, equivalent to looping over enumerate(permutation).
    permuted_shingle_matrix[destination_rows] = shingle_matrix

    return permuted_shingle_matrix
    
    



In [None]:

def LSH(signature_matrix, b, r) :
    """Hash customers into buckets band by band (locality sensitive hashing).

    The signature matrix is cut into ``b`` horizontal bands. Within each band,
    customers whose signature slices are identical end up in the same bucket.

    Parameters
    ----------
    signature_matrix : np.ndarray, shape (n_permutations, n_docs)
    b : int
        Number of bands.
    r : int
        Number of rows in each band. The split itself is done with
        ``np.array_split(signature_matrix, b)``, which yields ``b`` bands of
        (nearly) equal height, so ``r`` is not used to slice; it is kept for
        interface compatibility.

    Returns
    -------
    collections.defaultdict(set)
        Maps ``(band_index, band_signature)`` to the set of customer column
        indices sharing that slice, where ``band_signature`` is a tuple of the
        customer's signature values in that band. The band index is part of
        the key so that equal slices in different bands do not share a bucket.
    """
    buckets = collections.defaultdict(set)

    bands = np.array_split(signature_matrix, b, axis=0)
    n_customers = signature_matrix.shape[1]

    for band_index, band in enumerate(bands) :
        for customer in range (n_customers) :
            # A tuple (not a list) is required because dict keys must be hashable.
            band_signature = tuple(band[:, customer].tolist())
            buckets[(band_index, band_signature)].add(customer)

    return buckets
    

## 1.3 Locality Sensitive Hashing

# 2. Grouping Customers Together!

# Command Line Question

# Algorithmic Question 