In [481]:
import numpy
import math

# LSH signature generation using random projection
def get_signature(user_vector, rand_proj):
    res = 0
    for p in (rand_proj):
        res = res << 1
        val = numpy.dot(p, user_vector)
        if val >= 0:
            res |= 1
    return res


In [14]:
# get number of '1's in binary
# running time: O(# of '1's)
def nnz(num):
    if num == 0:
        return 0
    res = 1
    num = num & (num-1)
    while num:
        res += 1
        num = num & (num-1)
    return res


In [15]:
# angular similarity using definitions
# http://en.wikipedia.org/wiki/Cosine_similarity
def angular_similarity(a,b):
    dot_prod = numpy.dot(a,b)
    sum_a = sum(a**2) **.5
    sum_b = sum(b**2) **.5
    cosine = dot_prod/sum_a/sum_b # cosine similarity
    theta = math.acos(cosine)
    return 1.0-(theta/math.pi)


In [301]:
def similarity(a,b):
    from sklearn.metrics.pairwise import cosine_similarity
    sim = cosine_similarity(a,b)
    return sim

In [306]:
def cosine_sim(text1, text2):
    if(type(text1) != str or type(text2)!=str):
        text1 = ' '.join(list(map(lambda x:str(x),text1)))
        text2 = ' '.join(list(map(lambda x:str(x),text2)))
    from sklearn.feature_extraction.text import TfidfVectorizer
    try:
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform([text1])
        return ((tfidf * tfidf.T).A)[0,1]
    except:
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform([text1, text2])
        return ((tfidf * tfidf.T).A)[0,1]

In [302]:
dim = 200 # number of dimensions per data
d = 2**10 # number of bits per signature

nruns = 24 # repeat times

avg = 0
for run in range(nruns):
    user1 = numpy.random.randn(dim)
    user2 = numpy.random.randn(dim)
    randv = numpy.random.randn(d, dim)
    r1 = get_signature(user1, randv)
    r2 = get_signature(user2, randv)
    xor = r1^r2x
    true_sim, hash_sim = (angular_similarity(user1, user2), (d-nnz(xor))/float(d))
    diff = abs(hash_sim-true_sim)/true_sim
    avg += diff
    print ('true %.4f, hash %.4f, diff %.4f' % (true_sim, hash_sim, diff))
print ('avg diff' , avg / nruns)

true 0.5141, hash 0.5283, diff 0.0277
true 0.4992, hash 0.5010, diff 0.0036
true 0.4674, hash 0.4756, diff 0.0176
true 0.5268, hash 0.5166, diff 0.0194
true 0.4802, hash 0.4980, diff 0.0372
true 0.4927, hash 0.4883, diff 0.0089
true 0.5150, hash 0.5166, diff 0.0031
true 0.5154, hash 0.5146, diff 0.0014
true 0.4830, hash 0.4893, diff 0.0131
true 0.5139, hash 0.5283, diff 0.0280
true 0.5462, hash 0.5469, diff 0.0012
true 0.5165, hash 0.4951, diff 0.0414
true 0.4894, hash 0.5156, diff 0.0536
true 0.5016, hash 0.5000, diff 0.0033
true 0.5147, hash 0.5137, diff 0.0020
true 0.5213, hash 0.5352, diff 0.0265
true 0.5401, hash 0.5527, diff 0.0234
true 0.5347, hash 0.5439, diff 0.0173
true 0.4817, hash 0.4619, diff 0.0410
true 0.5298, hash 0.5078, diff 0.0414
true 0.4654, hash 0.4600, diff 0.0117
true 0.4898, hash 0.5000, diff 0.0209
true 0.5219, hash 0.5234, diff 0.0030
true 0.4640, hash 0.4805, diff 0.0356
avg diff 0.020096744202322676


In [112]:
import pandas as pd
sample_data = pd.read_csv("csv_example_messy_input.csv")
sample_data.groupby('Address').head()

Unnamed: 0,Id,Source,Site name,Address,Zip,Phone,Fax,Program Name,Length of Day,IDHS Provider ID,...,Executive Director,Center Director,ECE Available Programs,NAEYC Valid Until,NAEYC Program Id,Email Address,Ounce of Prevention Description,Purple binder service type,Column,Column2
0,0,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
1,1,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
2,2,CPS_Early_Childhood_Portal_scrape.csv,National Louis University - Dr. Effie O. Elli...,10 S Kedzie Ave,,5339011.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
3,3,CPS_Early_Childhood_Portal_scrape.csv,National Louis University - Dr. Effie O. Elli...,10 S Kedzie Ave,,5339011.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
4,4,CPS_Early_Childhood_Portal_scrape.csv,Board Trustees-City Colleges of Chicago - Oli...,10001 S Woodlawn Ave,,2916100.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
5,5,CPS_Early_Childhood_Portal_scrape.csv,Board Trustees-City Colleges of Chicago - Oli...,10001 S Woodlawn Ave,,2916100.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
6,6,CPS_Early_Childhood_Portal_scrape.csv,Easter Seals Society of Metropolitan Chicago ...,1001 W Roosevelt Rd,,9395115.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
7,7,CPS_Early_Childhood_Portal_scrape.csv,Easter Seals Society of Metropolitan Chicago ...,1001 W Roosevelt Rd,,9395115.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
8,8,CPS_Early_Childhood_Portal_scrape.csv,Hull House Association - Uptown Head Start / ...,1020 W Bryn Mawr Ave,,7695753.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
9,9,CPS_Early_Childhood_Portal_scrape.csv,Hull House Association - Child Dev. Central O...,1030 W Van Buren St,,9068600.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,


In [304]:
fields =['Site name','Address','Zip','Phone','Program Name']
data = sample_data[fields]
data_np = data.fillna(0).as_matrix()
a= ' '.join(list(map(lambda x:str(x),data_np[0]))).strip()
b= ' '.join(list(map(lambda x:str(x),data_np[1]))).strip()
# b=b.replace("Ave","Avenue")
print(a,"\n",b)

Salvation Army - Temple / Salvation Army 1 N Ogden Ave  0.0 2262649.0 Child Care 
 Salvation Army - Temple / Salvation Army 1 N Ogden Ave  0.0 2262649.0 Child Care


In [320]:
from sklearn.feature_extraction.text import TfidfVectorizer
try:
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([a,])
    print( ((tfidf * tfidf.T).A)[0,1] )
except:
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([a, b])
    print( ((tfidf * tfidf.T).A)[0,1] )
user1,user2 = tfidf.toarray()
randv = numpy.random.randn(d, len(user1))
r1 = get_signature(user1, randv)
r2 = get_signature(user2, randv)
xor = r1^r2
true_sim, hash_sim = (angular_similarity(user1, user2), (d-nnz(xor))/float(d))
print(true_sim,hash_sim)

1.0
1.0 1.0


In [623]:

#data selection
fields =['Site name','Address','Zip','Phone','Program Name']
data = sample_data[fields]
#data as matrix
data_np = data.fillna(0).as_matrix()

a= ' '.join(list(map(lambda x:str(x),data_np[0]))).strip()
b= ' '.join(list(map(lambda x:str(x),data_np[1]))).strip()

# randv = numpy.random.randn(d, len(user1))
# r1 = get_signature(user1, randv)
# r2 = get_signature(user2,randv)

In [1]:
def toString(a):
    return ' '.join(list(map(lambda x:str(x),a))).strip()

In [480]:
def toVec(text):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer=CountVectorizer()
    tfidf = vectorizer.fit_transform([text])
    return tfidf.toarray().ravel()


In [382]:
def lsh_sim(xor,d=2**10):
    return (d-nnz(xor))/float(d)

In [380]:
d = 2**10

In [484]:
user_str = toString(data_np[26])
user1 = toVec(user_str)
user_str = toString(data_np[1])
user2 = toVec(user_str)
randv  = numpy.random.randn(d, len(user1))
r1 = get_signature(user1,randv)
r2 = get_signature(user2,randv)
xor = r1 ^ r2 


ValueError: shapes (17,) and (8,) not aligned: 17 (dim 0) != 8 (dim 0)

In [514]:
count =0
unique  = dict()
randv  = numpy.random.randn(d, 2**8)
for user1 in data_np:
    user_str = toString(user1)
    u1 = toVec(user_str)
#     user_str = toString(user2)
#     u2 = toVec(user_str)
#     npad = abs(len(u1)-len(u2))
#     if(len(u1)<len(u2)):
#         u1 = numpy.pad(u1, pad_width=npad, mode='constant', constant_values=0)[npad:]
#     else:
#         u2 = numpy.pad(u2, pad_width=npad, mode='constant', constant_values=0)[npad:]

    r1 = get_signature(u1,randv[:,:u1.shape[0]])
#     r2 = get_signature(u2,randv)
#     xor = r1 ^ r2 
    unique[r1]=user_str
    for h,i in unique.items():
        
        count+=1

In [77]:
"""LSH working algorithm kordedup"""
# del(lsh)
import pandas as pd
from datasketch import MinHash,MinHashLSH
import random 
unique=dict()
duplicate = dict()
sales = pd.read_excel("sales.xls")
lsh = MinHashLSH(threshold=0.9, num_perm=128)
m_list = list()
count = 0 
import time


random_dups = random.randrange(1500,2000)
#genrating random duplications 
dups = [random.randrange(1,sales.shape[0]) for i in range(random_dups)]
ptr= sales.shape[0]
for dup in dups:
    sales.loc[ptr]= sales.iloc[dup]
    ptr+=1
#finding duplicates in sales data
print("Strating Deduplication using ")
start = time.time()
for i in sales.as_matrix():
    s =  set(toString(i).split())
    m = MinHash(num_perm=128)
    for d in s:
        m.update(d.encode('utf8'))
    m_list.append(m)
    try:
        # try creating an lsh inde
        
        lsh.insert("PLN"+str(count), m)
        result = lsh.query(m_list[-1]
        if(list(l))>1):
            unique["PLN"+str(count)]=s
    except:
        print(lsh.keys())
        raise
    count+=1
#calculate total time required
end = time.time()
results = list()
for i in m_list:
    i_sorted = sorted(lsh.query(i))
    if i_sorted not in results:
        results.append(i_sorted)
print("Total Size of elements",sales.size)
# print("Items similar to m1 ", results)
print("Time taken to find duplicates",end-start)
dup_records = sum(list(map(lambda x:len(x)-1,results)))
print("Duplicate records are",dup_records)

Strating Deduplication using 


TypeError: 'dict' object is not callable

In [63]:
import numpy as np
results = np.array(results).ravel()

In [72]:
for key in results:
    print(unique[key[0]])

{'Storage', 'Base', 'Eldon', '3', 'Nunavut', 'Large', 'Business', '00:00:00', '38.94', 'Office', '6', 'stackable', 'platinum', '35.0', 'Muhammed', 'Small', '261.54', 'Organization', 'shelf,', '0.8', 'for', 'Air', 'Box', '&', 'storage', '0.04', 'Regular', '2010-10-13', '2010-10-20', '1', 'MacIntyre', 'Supplies', 'Low', '-213.25'}


In [637]:
# sales_pd = pd.read_excel("sales.xls")
# dups = [random.randrange(1,sales_pd.shape[0]) for i in range(100000)]
# ptr= sales_pd.shape[0]
# for dup in dups:
#     sales_pd.loc[ptr]= sales_pd.iloc[dup]
#     ptr+=1
sales_pd.size

787773

In [626]:
import random
from datetime import datetime
mfgs = ["A","B","C","D","E","F"]
zones = ["S","W","E","N","C"]
dims = ["Small","Medium","Large"]
price = lambda :random.randrange(10000,100000)
idim = lambda : random.randrange(2)
qty = lambda : random.randrange(100,10000)
wt = lambda : random.randrange(10,100000)
imfg = lambda :random.randrange(len(mfgs))
iz = lambda :random.randrange(len(zones))
description = "This is iron"
zone = lambda : ranomd.randrange(5)
fields = ["PLN", "Item","Price", "Dim","Qty", "Mfg", "Type","Weight", "Description","Zone","TimeStamp"]
iron = pd.DataFrame(columns=fields)
row =["IR","IRON", price(),dims[idim()],qty(),mfgs[imfg()],"Iron",wt(),description,zones[iz()],datetime.now()]
for i in range(1,400):
    row =["IR","IRON", price(),dims[idim()],qty(),mfgs[imfg()],"Iron",wt(),description,zones[iz()],datetime.now()]
    row[0]+=str(i)
    row[1]+="_"+str(i)
    iron.loc[i]=row


In [627]:
iron.head()

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
1,IR1,IRON_1,53647.0,Small,5767.0,F,Iron,13151.0,This is iron,W,2017-04-02 10:25:09.735899
2,IR2,IRON_2,84195.0,Small,1507.0,F,Iron,94090.0,This is iron,N,2017-04-02 10:25:09.742013
3,IR3,IRON_3,22194.0,Medium,5774.0,C,Iron,61260.0,This is iron,C,2017-04-02 10:25:09.748446
4,IR4,IRON_4,24681.0,Medium,6307.0,C,Iron,18211.0,This is iron,C,2017-04-02 10:25:09.753438
5,IR5,IRON_5,30868.0,Small,9413.0,E,Iron,23494.0,This is iron,W,2017-04-02 10:25:09.757538


In [235]:
#generate fake glass
import random
from datetime import datetime
item_type = "Glass"
mfgs = ["A","B","C","D","E","F"]
zones = ["S","W","E","N","C"]
dims = ["Small","Medium","Large"]
price = lambda :random.randrange(10000,100000)
idim = lambda : random.randrange(2)
qty = lambda : random.randrange(100,10000)
wt = lambda : random.randrange(10,100000)
imfg = lambda :random.randrange(len(mfgs))
iz = lambda :random.randrange(len(zones))
description = "This is glass"
zone = lambda : ranomd.randrange(5)
fields = ["PLN", "Item","Price", "Dim","Qty", "Mfg", "Type","Weight", "Description","Zone","TimeStamp"]
glass = pd.DataFrame(columns=fields)
# row =["IR",, price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
for i in range(400,800):
    row =["IR",item_type.upper(), price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
    row[0]+=str(i)
    row[1]+="_"+str(i)
    glass.loc[i]=row
glass.head()

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
400,IR400,GLASS_400,88614.0,Medium,1702.0,E,Glass,90305.0,This is glass,N,2017-04-01 23:42:58.305771
401,IR401,GLASS_401,12387.0,Small,1216.0,D,Glass,67988.0,This is glass,W,2017-04-01 23:42:58.309612
402,IR402,GLASS_402,10394.0,Small,1173.0,B,Glass,47138.0,This is glass,N,2017-04-01 23:42:58.314130
403,IR403,GLASS_403,47909.0,Medium,8876.0,D,Glass,76945.0,This is glass,N,2017-04-01 23:42:58.318025
404,IR404,GLASS_404,56653.0,Medium,1715.0,F,Glass,30443.0,This is glass,C,2017-04-01 23:42:58.323777


In [236]:
#generate fake accessories
import random
from datetime import datetime
item_type = "Accessories"
mfgs = ["A","B","C","D","E","F"]
zones = ["S","W","E","N","C"]
dims = ["Small","Medium","Large"]
price = lambda :random.randrange(10000,100000)
idim = lambda : random.randrange(2)
qty = lambda : random.randrange(100,10000)
wt = lambda : random.randrange(10,100000)
imfg = lambda :random.randrange(len(mfgs))
iz = lambda :random.randrange(len(zones))
description = "This is Accessories"
zone = lambda : ranomd.randrange(5)
fields = ["PLN", "Item","Price", "Dim","Qty", "Mfg", "Type","Weight", "Description","Zone","TimeStamp"]
acc = pd.DataFrame(columns=fields)
# row =["IR",, price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
for i in range(800,1200):
    row =["IR",item_type.upper(), price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
    row[0]+=str(i)
    row[1]+="_"+str(i)
    acc.loc[i]=row
acc.head()

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
800,IR800,ACCESSORIES_800,74026.0,Medium,460.0,A,Accessories,39265.0,This is Accessories,E,2017-04-01 23:43:03.528691
801,IR801,ACCESSORIES_801,68057.0,Medium,4176.0,C,Accessories,10607.0,This is Accessories,C,2017-04-01 23:43:03.532984
802,IR802,ACCESSORIES_802,69291.0,Medium,8024.0,E,Accessories,43205.0,This is Accessories,E,2017-04-01 23:43:03.537726
803,IR803,ACCESSORIES_803,80871.0,Medium,977.0,C,Accessories,74885.0,This is Accessories,C,2017-04-01 23:43:03.541878
804,IR804,ACCESSORIES_804,35471.0,Small,3007.0,D,Accessories,57058.0,This is Accessories,E,2017-04-01 23:43:03.545581


In [200]:
glass.head()

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
400,IR400,GLASS,76739.0,Small,7730.0,D,Glass,80807.0,This is glass,N,2017-04-01 19:06:14.382292
401,IR401,GLASS,49340.0,Small,2477.0,F,Glass,32729.0,This is glass,C,2017-04-01 19:06:14.386162
402,IR402,GLASS,12082.0,Small,3327.0,F,Glass,1171.0,This is glass,W,2017-04-01 19:06:14.390628
403,IR403,GLASS,57122.0,Small,6541.0,B,Glass,92409.0,This is glass,N,2017-04-01 19:06:14.394786
404,IR404,GLASS,37134.0,Small,6707.0,B,Glass,21873.0,This is glass,N,2017-04-01 19:06:14.399479


In [237]:
def dump_to_json(file_name,data):
    import json
    with open(file_name, 'w') as outfile:
        json.dump(data, outfile,indent=4)

In [228]:
iron.TimeStamp= iron.TimeStamp.apply(lambda x:str(x))
iron_matrix = iron.as_matrix()
iron_list = [dict(list(zip(fields,item))) for item in iron_matrix]
dump_to_json("iron.json",iron_list)

In [229]:
glass.TimeStamp= glass.TimeStamp.apply(lambda x:str(x))
glass_matrix = glass.as_matrix()
glass_list = [dict(list(zip(fields,item))) for item in glass_matrix]
dump_to_json("glass.json",glass_list)

In [231]:
acc.TimeStamp= acc.TimeStamp.apply(lambda x:str(x))
acc_matrix = acc.as_matrix()
acc_list = [dict(list(zip(fields,item))) for item in acc_matrix]
dump_to_json("acc.json",acc_list)

In [233]:
#Json read test
pd.read_json("acc.json").head()

Unnamed: 0,Description,Dim,Item,Mfg,PLN,Price,Qty,TimeStamp,Type,Weight,Zone
0,This is Accessories,Medium,ACCESSORIES_800,B,IR800,66655,6448,2017-04-01 19:23:39.845296,Accessories,3888,W
1,This is Accessories,Medium,ACCESSORIES_801,D,IR801,36823,4545,2017-04-01 19:23:39.849148,Accessories,65984,S
2,This is Accessories,Small,ACCESSORIES_802,D,IR802,22695,4510,2017-04-01 19:23:39.853290,Accessories,1882,W
3,This is Accessories,Small,ACCESSORIES_803,C,IR803,73840,2665,2017-04-01 19:23:39.857320,Accessories,5684,S
4,This is Accessories,Medium,ACCESSORIES_804,F,IR804,21273,7435,2017-04-01 19:23:39.864052,Accessories,96653,C
