In [13]:
import numpy
import math

# LSH signature generation using random projection
def get_signature(user_vector, rand_proj):
    res = 0
    for p in (rand_proj):
        res = res << 1
        val = numpy.dot(p, user_vector)
        if val >= 0:
            res |= 1
    return res


In [14]:
# get number of '1's in binary
# running time: O(# of '1's)
def nnz(num):
    if num == 0:
        return 0
    res = 1
    num = num & (num-1)
    while num:
        res += 1
        num = num & (num-1)
    return res


In [15]:
# angular similarity using definitions
# http://en.wikipedia.org/wiki/Cosine_similarity
def angular_similarity(a,b):
    dot_prod = numpy.dot(a,b)
    sum_a = sum(a**2) **.5
    sum_b = sum(b**2) **.5
    cosine = dot_prod/sum_a/sum_b # cosine similarity
    theta = math.acos(cosine)
    return 1.0-(theta/math.pi)


In [55]:
def similarity(a,b):
    from sklearn.metrics.pairwise import cosine_similarity
    sim = cosine_similarity(a,b)
    return sim


In [91]:
def cosine_sim(text1, text2):
    if(type(text1) != str or type(text2)!=str):
        text1 = ' '.join(list(map(lambda x:str(x),text1)))
        text2 = ' '.join(list(map(lambda x:str(x),text2)))
    from sklearn.feature_extraction.text import TfidfVectorizer
    try:
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform([text1, text2])
        return ((tfidf * tfidf.T).A)[0,1]
    except:
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform([text1, text2])
        return ((tfidf * tfidf.T).A)[0,1]

In [99]:
dim = 200 # number of dimensions per data
d = 2**10 # number of bits per signature

nruns = 24 # repeat times

avg = 0
for run in range(nruns):
    user1 = sorted(numpy.random.randn(dim))
    user2 = sorted(numpy.random.randn(dim))
    randv = numpy.random.randn(d, dim)
    r1 = get_signature(user1, randv)
    r2 = get_signature(user2, randv)
    xor = r1^r2
    true_sim, hash_sim = (angular_similarity(user1, user2), (d-nnz(xor))/float(d))
    diff = abs(hash_sim-true_sim)/true_sim
    avg += diff
    print ('true %.4f, hash %.4f, diff %.4f' % (true_sim, hash_sim, diff))
print ('avg diff' , avg / nruns)

TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'

In [97]:
for i,j in zip(user1,user2):
    print(i,j)

-2.99585988336 -2.30076030264
-2.88131902946 -2.14486452695
-2.54749919662 -1.52245026645
-2.20500747856 -1.48321182274
-2.06955238331 -1.40175718052
-1.99912383084 -1.39696441919
-1.93859714633 -1.35485155212
-1.91388816948 -1.32573198102
-1.78730220605 -1.3018910537
-1.72058483352 -1.22997143583
-1.65242233559 -1.21332633557
-1.58563874533 -1.19831047765
-1.58251757984 -1.12018550001
-1.54258714165 -1.08866129207
-1.51482097182 -1.03574632892
-1.48536654417 -0.987350371999
-1.46804677233 -0.9744353514
-1.44650781007 -0.96948711988
-1.38730531221 -0.950909847907
-1.34086111023 -0.929073231537
-1.32612006101 -0.91055343593
-1.29729867293 -0.894859419843
-1.28401981491 -0.861893029495
-1.27271547038 -0.84337951497
-1.26828700384 -0.841734337926
-1.2668762187 -0.818987468665
-1.22025848401 -0.797023253764
-1.21645949545 -0.793913777465
-1.03601808399 -0.789850590725
-1.02808098051 -0.773239201555
-0.995899102892 -0.742995737832
-0.985914086406 -0.741104999596
-0.980710522582 -0.728489882

In [112]:
import pandas as pd
sample_data = pd.read_csv("csv_example_messy_input.csv")
sample_data.groupby('Address').head()

Unnamed: 0,Id,Source,Site name,Address,Zip,Phone,Fax,Program Name,Length of Day,IDHS Provider ID,...,Executive Director,Center Director,ECE Available Programs,NAEYC Valid Until,NAEYC Program Id,Email Address,Ounce of Prevention Description,Purple binder service type,Column,Column2
0,0,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
1,1,CPS_Early_Childhood_Portal_scrape.csv,Salvation Army - Temple / Salvation Army,1 N Ogden Ave,,2262649.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
2,2,CPS_Early_Childhood_Portal_scrape.csv,National Louis University - Dr. Effie O. Elli...,10 S Kedzie Ave,,5339011.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
3,3,CPS_Early_Childhood_Portal_scrape.csv,National Louis University - Dr. Effie O. Elli...,10 S Kedzie Ave,,5339011.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
4,4,CPS_Early_Childhood_Portal_scrape.csv,Board Trustees-City Colleges of Chicago - Oli...,10001 S Woodlawn Ave,,2916100.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
5,5,CPS_Early_Childhood_Portal_scrape.csv,Board Trustees-City Colleges of Chicago - Oli...,10001 S Woodlawn Ave,,2916100.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
6,6,CPS_Early_Childhood_Portal_scrape.csv,Easter Seals Society of Metropolitan Chicago ...,1001 W Roosevelt Rd,,9395115.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
7,7,CPS_Early_Childhood_Portal_scrape.csv,Easter Seals Society of Metropolitan Chicago ...,1001 W Roosevelt Rd,,9395115.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
8,8,CPS_Early_Childhood_Portal_scrape.csv,Hull House Association - Uptown Head Start / ...,1020 W Bryn Mawr Ave,,7695753.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,
9,9,CPS_Early_Childhood_Portal_scrape.csv,Hull House Association - Child Dev. Central O...,1030 W Van Buren St,,9068600.0,,Child Care,EXTENDED DAY,,...,,,,,,,,,,


In [144]:
fields =['Site name','Address','Zip','Phone','Program Name']
data = sample_data[fields]
data_np = data.fillna(0).as_matrix()
a= ' '.join(list(map(lambda x:str(x),data_np[0]))).strip()
b= ' '.join(list(map(lambda x:str(x),data_np[1]))).strip()
b=b.replace("Ave","Avenue")
print(a,"\n",b)

KeyError: "['Timestamp'] not in index"

In [122]:
cosine_sim(a,b)

0.8680942667801238

In [219]:
# Generate Fake Data 
import random
from datetime import datetime
mfgs = ["A","B","C","D","E","F"]
zones = ["S","W","E","N","C"]
dims = ["Small","Medium","Large"]
price = lambda :random.randrange(10000,100000)
idim = lambda : random.randrange(2)
qty = lambda : random.randrange(100,10000)
wt = lambda : random.randrange(10,100000)
imfg = lambda :random.randrange(len(mfgs))
iz = lambda :random.randrange(len(zones))
description = "This is iron"
zone = lambda : ranomd.randrange(5)
fields = ["PLN", "Item","Price", "Dim","Qty", "Mfg", "Type","Weight", "Description","Zone","TimeStamp"]
iron = pd.DataFrame(columns=fields)
row =["IR","IRON", price(),dims[idim()],qty(),mfgs[imfg()],"Iron",wt(),description,zones[iz()],datetime.now()]
for i in range(1,400):
    row =["IR","IRON", price(),dims[idim()],qty(),mfgs[imfg()],"Iron",wt(),description,zones[iz()],datetime.now()]
    row[0]+=str(i)
    row[1]+="_"+str(i)
    iron.loc[i]=row


In [220]:
iron.head()

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
1,IR1,IRON_1,97234.0,Medium,5798.0,C,Iron,96041.0,This is iron,W,2017-04-01 19:23:22.965843
2,IR2,IRON_2,59211.0,Medium,6930.0,C,Iron,30525.0,This is iron,S,2017-04-01 19:23:22.969981
3,IR3,IRON_3,32979.0,Small,9401.0,B,Iron,5219.0,This is iron,C,2017-04-01 19:23:22.974053
4,IR4,IRON_4,92052.0,Small,3023.0,A,Iron,36487.0,This is iron,E,2017-04-01 19:23:22.980320
5,IR5,IRON_5,62218.0,Small,8982.0,F,Iron,50165.0,This is iron,W,2017-04-01 19:23:22.985774


In [218]:
#generate fake glass
import random
from datetime import datetime
item_type = "Glass"
mfgs = ["A","B","C","D","E","F"]
zones = ["S","W","E","N","C"]
dims = ["Small","Medium","Large"]
price = lambda :random.randrange(10000,100000)
idim = lambda : random.randrange(2)
qty = lambda : random.randrange(100,10000)
wt = lambda : random.randrange(10,100000)
imfg = lambda :random.randrange(len(mfgs))
iz = lambda :random.randrange(len(zones))
description = "This is glass"
zone = lambda : ranomd.randrange(5)
fields = ["PLN", "Item","Price", "Dim","Qty", "Mfg", "Type","Weight", "Description","Zone","TimeStamp"]
glass = pd.DataFrame(columns=fields)
# row =["IR",, price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
for i in range(400,800):
    row =["IR",item_type.upper(), price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
    row[0]+=str(i)
    row[1]+="_"+str(i)
    glass.loc[i]=row
glass

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
400,IR400,GLASS_400,20675.0,Medium,6354.0,F,Glass,27467.0,This is glass,C,2017-04-01 19:23:02.894599
401,IR401,GLASS_401,80112.0,Medium,8416.0,A,Glass,39.0,This is glass,S,2017-04-01 19:23:02.899597
402,IR402,GLASS_402,53007.0,Small,4224.0,B,Glass,83020.0,This is glass,E,2017-04-01 19:23:02.903676
403,IR403,GLASS_403,39935.0,Small,7709.0,D,Glass,36057.0,This is glass,W,2017-04-01 19:23:02.907965
404,IR404,GLASS_404,81078.0,Small,8573.0,D,Glass,36558.0,This is glass,W,2017-04-01 19:23:02.914874
405,IR405,GLASS_405,81263.0,Medium,2191.0,B,Glass,7247.0,This is glass,S,2017-04-01 19:23:02.920800
406,IR406,GLASS_406,15498.0,Small,7302.0,F,Glass,25284.0,This is glass,E,2017-04-01 19:23:02.926014
407,IR407,GLASS_407,43233.0,Small,8535.0,E,Glass,93853.0,This is glass,E,2017-04-01 19:23:02.929767
408,IR408,GLASS_408,39157.0,Medium,7108.0,F,Glass,75795.0,This is glass,E,2017-04-01 19:23:02.933273
409,IR409,GLASS_409,62074.0,Medium,4398.0,B,Glass,18335.0,This is glass,C,2017-04-01 19:23:02.936735


In [221]:
#generate fake accessories
import random
from datetime import datetime
item_type = "Accessories"
mfgs = ["A","B","C","D","E","F"]
zones = ["S","W","E","N","C"]
dims = ["Small","Medium","Large"]
price = lambda :random.randrange(10000,100000)
idim = lambda : random.randrange(2)
qty = lambda : random.randrange(100,10000)
wt = lambda : random.randrange(10,100000)
imfg = lambda :random.randrange(len(mfgs))
iz = lambda :random.randrange(len(zones))
description = "This is Accessories"
zone = lambda : ranomd.randrange(5)
fields = ["PLN", "Item","Price", "Dim","Qty", "Mfg", "Type","Weight", "Description","Zone","TimeStamp"]
acc = pd.DataFrame(columns=fields)
# row =["IR",, price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
for i in range(800,1200):
    row =["IR",item_type.upper(), price(),dims[idim()],qty(),mfgs[imfg()],item_type,wt(),description,zones[iz()],datetime.now()]
    row[0]+=str(i)
    row[1]+="_"+str(i)
    acc.loc[i]=row
acc.head()

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
800,IR800,ACCESSORIES_800,66655.0,Medium,6448.0,B,Accessories,3888.0,This is Accessories,W,2017-04-01 19:23:39.845296
801,IR801,ACCESSORIES_801,36823.0,Medium,4545.0,D,Accessories,65984.0,This is Accessories,S,2017-04-01 19:23:39.849148
802,IR802,ACCESSORIES_802,22695.0,Small,4510.0,D,Accessories,1882.0,This is Accessories,W,2017-04-01 19:23:39.853290
803,IR803,ACCESSORIES_803,73840.0,Small,2665.0,C,Accessories,5684.0,This is Accessories,S,2017-04-01 19:23:39.857320
804,IR804,ACCESSORIES_804,21273.0,Medium,7435.0,F,Accessories,96653.0,This is Accessories,C,2017-04-01 19:23:39.864052


In [200]:
glass.head()

Unnamed: 0,PLN,Item,Price,Dim,Qty,Mfg,Type,Weight,Description,Zone,TimeStamp
400,IR400,GLASS,76739.0,Small,7730.0,D,Glass,80807.0,This is glass,N,2017-04-01 19:06:14.382292
401,IR401,GLASS,49340.0,Small,2477.0,F,Glass,32729.0,This is glass,C,2017-04-01 19:06:14.386162
402,IR402,GLASS,12082.0,Small,3327.0,F,Glass,1171.0,This is glass,W,2017-04-01 19:06:14.390628
403,IR403,GLASS,57122.0,Small,6541.0,B,Glass,92409.0,This is glass,N,2017-04-01 19:06:14.394786
404,IR404,GLASS,37134.0,Small,6707.0,B,Glass,21873.0,This is glass,N,2017-04-01 19:06:14.399479


In [222]:
def dump_to_json(file_name,data):
    import json
    with open(file_name, 'w') as outfile:
        json.dump(data, outfile,indent=4)

In [228]:
iron.TimeStamp= iron.TimeStamp.apply(lambda x:str(x))
iron_matrix = iron.as_matrix()
iron_list = [dict(list(zip(fields,item))) for item in iron_matrix]
dump_to_json("iron.json",iron_list)

In [229]:
glass.TimeStamp= glass.TimeStamp.apply(lambda x:str(x))
glass_matrix = glass.as_matrix()
glass_list = [dict(list(zip(fields,item))) for item in glass_matrix]
dump_to_json("glass.json",glass_list)

In [230]:
acc.TimeStamp= acc.TimeStamp.apply(lambda x:str(x))
acc_matrix = acc.as_matrix()
acc_list = [dict(list(zip(fields,item))) for item in acc_matrix]
dump_to_json("acc.json",acc_list)