In [10]:
from functools import reduce
from itertools import combinations
from collections import defaultdict

### Association rules

In [116]:
# Toy market-basket data: every basket is stored as a frozenset of item labels.
baskets = [
    frozenset(items)
    for items in (
        ("A", "C", "D"),
        ("B", "C", "E"),
        ("A", "C", "E"),
        ("F", "E"),
    )
]

def support( baskets, comb = 2, support=2 ):
    """Count how many baskets contain each itemset of size ``comb``.

    Parameters
    ----------
    baskets : iterable of iterables
        Transactions. Each basket is treated as a set of hashable items, so
        sets, frozensets, tuples and lists are all accepted.
    comb : int
        Size of the itemsets to count (2 means pairs).
    support : int
        Minimum number of containing baskets for an itemset to be returned.

    Returns
    -------
    dict
        Maps each ``frozenset`` itemset to the number of baskets containing
        it, keeping only itemsets whose count is >= ``support``. Itemsets
        that occur in no basket are never listed, even when ``support`` is 0.
        An empty ``baskets`` yields an empty dict.
    """
    counts = defaultdict(int)
    for basket in baskets:
        # Only itemsets drawn from this basket can be contained in it, so
        # enumerate per basket rather than over all combinations of all items.
        for itemset in combinations(set(basket), comb):
            counts[frozenset(itemset)] += 1
    return {itemset: n for itemset, n in counts.items() if n >= support}

def interest( baskets, I, j):
    """Interest of the association rule ``I -> j``.

    interest = confidence(I -> j) - (fraction of baskets containing j)

    Parameters
    ----------
    baskets : iterable of iterables
        Transactions; each is treated as a set of items.
    I : iterable
        Items of the rule's antecedent.
    j : str or iterable
        The consequent. A string is taken as ONE item label; any other
        iterable is taken as a collection of items.

    Returns
    -------
    float
        A rule whose items never co-occur has confidence 0, so its interest
        is minus the fraction of baskets containing ``j``.

    Raises
    ------
    KeyError
        If ``I`` is contained in no basket (the confidence is undefined).
    """
    baskets = [frozenset(basket) for basket in baskets]
    I = frozenset(I)
    # frozenset("milk") would split the label into characters, so wrap strings.
    j = frozenset([j]) if isinstance(j, str) else frozenset(j)
    rule_items = I | j

    supp_of_I = sum(1 for basket in baskets if I <= basket)
    if supp_of_I == 0:
        # KeyError is what a lookup of an unseen itemset raised before, so the
        # exception type is kept for existing callers.
        raise KeyError(f"itemset {set(I)!r} does not occur in any basket")
    supp_of_Ij = sum(1 for basket in baskets if rule_items <= basket)
    supp_of_j = sum(1 for basket in baskets if j <= basket)
    return (supp_of_Ij/supp_of_I)-(supp_of_j/len(baskets))

def confidence(baskets, I, j):
    """Confidence of the association rule ``I -> j``.

    confidence = (baskets containing I and j) / (baskets containing I)

    Parameters
    ----------
    baskets : iterable of iterables
        Transactions; each is treated as a set of items.
    I : iterable
        Items of the rule's antecedent.
    j : str or iterable
        The consequent. A string is taken as ONE item label; any other
        iterable is taken as a collection of items.

    Returns
    -------
    float
        0.0 when ``I`` occurs but never together with ``j``.

    Raises
    ------
    KeyError
        If ``I`` is contained in no basket (the confidence is undefined).
    """
    baskets = [frozenset(basket) for basket in baskets]
    I = frozenset(I)
    # frozenset("milk") would split the label into characters, so wrap strings.
    j = frozenset([j]) if isinstance(j, str) else frozenset(j)
    rule_items = I | j

    supp_of_I = sum(1 for basket in baskets if I <= basket)
    if supp_of_I == 0:
        # KeyError is what a lookup of an unseen itemset raised before, so the
        # exception type is kept for existing callers.
        raise KeyError(f"itemset {set(I)!r} does not occur in any basket")
    supp_of_Ij = sum(1 for basket in baskets if rule_items <= basket)
    return supp_of_Ij/supp_of_I

In [117]:
# Frequent pairs: 2-item sets contained in at least 2 baskets (the default threshold).
support(baskets, 2)

{frozenset({'C', 'E'}): 2, frozenset({'A', 'C'}): 2}

In [49]:
# NOTE(review): the items "m", "b" and "c" do not occur in the `baskets` defined
# earlier in this notebook (those hold only A-F), so on a fresh kernel this cell
# raises KeyError. The recorded output must come from a basket list that is no
# longer in the notebook - TODO restore that data in a cell before this one.
interest(baskets, ("m", "b"), "c"), confidence(baskets, ("m", "b"), "c")

(-0.125, 0.5)

### Min hashing

In [112]:
def getMinHashSignature(table, hashes):
    """Compute the MinHash signature matrix of a 0/1 characteristic matrix.

    Parameters
    ----------
    table : list of lists
        Characteristic matrix; ``table[i][j]`` is 0 when row ``i`` is absent
        from column ``j`` and any other value when it is present.
    hashes : sequence of callables
        Hash functions, each applied to the row index ``i``.

    Returns
    -------
    str
        One line per hash function, each the ``str()`` of a list holding, for
        every column, the smallest hash value over the rows present in that
        column. A column with no present rows keeps ``inf``. An empty
        ``table`` yields one empty list per hash function.
    """
    n_rows = len(table)
    # An empty table has no first row to measure; treat it as zero columns.
    n_cols = len(table[0]) if table else 0

    # Hash every row index once per function instead of once per cell.
    row_hashes = [ [ func(i) for i in range(n_rows) ] for func in hashes ]

    signature = [ [float('inf')]*n_cols for _ in row_hashes ]
    for i, row in enumerate(table):
        for j in range(n_cols):
            # Membership does not depend on the hash function, so test it once.
            if row[j] == 0: continue
            for k, hashed in enumerate(row_hashes):
                signature[k][j] = min( signature[k][j], hashed[i] )

    return "\n".join(map(str, signature))


# Characteristic matrix: 5 rows by 4 columns; a 1 marks that the row is present in the column.
table = [ [1,0,0,1],
          [0,0,1,0],
          [0,1,0,1],
          [1,0,1,1],
          [0,0,1,0]]

# Two hash functions of the row index, both taken modulo 5 (the number of rows).
hashes = [ lambda x: (x+1)%5, lambda x: (3*x + 1)%5 ]

# The function returns an already formatted string, hence print() rather than a bare expression.
print(getMinHashSignature(table, hashes))

[1, 3, 0, 1]
[0, 2, 0, 0]


### Locality-Sensitive Hashing

[https://www.desmos.com/calculator/lzzvfjiujn](https://www.desmos.com/calculator/lzzvfjiujn)

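Effect of the banding parameters (LSH over MinHash signatures):

As the number of bands $b$ increases and the number of rows per band $r$ decreases:

* the similarity threshold $\left(\frac{1}{b}\right)^{\frac{1}{r}}$ decreases,
* false negatives decrease,
* false positives increase.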


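Analysis of the banding technique (two columns whose signatures agree in any given row with probability $t$, split into $b$ bands of $r$ rows each):
* Probability that all $r$ rows of a given band are equal = $t^r$
* Probability that at least one row of a given band differs = $1-t^r$
* Probability that no band has all of its rows equal = $(1-t^r)^b$
* Probability that at least one band has all of its rows equal (the pair becomes a candidate) = $1-(1-t^r)^b$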

In [113]:
def calculate_probabilites( t, r, b ):
    """Return the four banding probabilities for row-agreement probability ``t``.

    With ``r`` rows per band and ``b`` bands the tuple is, in order:
    ``t**r``, ``1 - t**r``, ``(1 - t**r)**b`` and ``1 - (1 - t**r)**b``.
    """
    band_matches = t**r
    band_differs = 1 - band_matches
    no_band_matches = band_differs**b
    return band_matches, band_differs, no_band_matches, 1 - no_band_matches

def getThreshold(b,r):
    """Return ``(1/b) ** (1/r)`` for ``b`` bands of ``r`` rows each."""
    base = 1/b
    exponent = 1/r
    return base**exponent

# Banding probabilities with r = 5 rows per band and b = 20 bands, for t = 0.8 and then t = 0.3.
print(calculate_probabilites(0.8, 5, 20))
print(calculate_probabilites(0.3, 5, 20))

(0.3276800000000001, 0.6723199999999999, 0.0003560578905207767, 0.9996439421094793)
(0.0024299999999999994, 0.99757, 0.9525057408750288, 0.04749425912497118)


In [114]:
# Same arguments as the second print in the previous cell, shown here as the cell's value.
calculate_probabilites( 0.3, 5, 20 )

(0.0024299999999999994, 0.99757, 0.9525057408750288, 0.04749425912497118)

### Pearson Similarity

In [97]:
def mean(x):
    """Arithmetic mean of ``x``; an empty ``x`` raises ZeroDivisionError."""
    total = sum(x)
    return total/len(x)
def pearsons_similarity( a,b ):
    """Pearson correlation of two rating vectors over their co-rated positions.

    A value of -1 marks "not rated". Only positions rated in BOTH vectors are
    used, both for the means and for the sums.

    Returns
    -------
    float or int
        ``num / den``, or 0 when the vectors share no co-rated position or
        when either vector has zero variance over the co-rated positions.
    """
    common = [ (ai, bi) for ai, bi in zip(a,b) if ai != -1 and bi != -1 ]
    if not common:
        # Nothing to correlate; dividing by the empty count would raise ZeroDivisionError.
        return 0

    avg_a = sum(ai for ai, _ in common)/len(common)
    avg_b = sum(bi for _, bi in common)/len(common)

    num,den1,den2 = 0,0,0
    for ai, bi in common:
        num += (ai-avg_a)*(bi-avg_b)
        den1 += (ai-avg_a)**2
        den2 += (bi-avg_b)**2
    den = ( den1**0.5 * den2**0.5 )
    if den==0: return 0
    return num/den

# Two rating vectors over the same four items; an unrated item is encoded as -1.
U1 = [4, -1, 5, 5]
U2 = [2,  1, 3, 5]
pearsons_similarity(U1,U2)

0.7559289460184546

### User based CF predictions

In [99]:
def avg_rating(U):
    """Mean of the entries of ``U`` that are not the -1 "unrated" marker.

    Raises ZeroDivisionError when ``U`` holds no rated entry.
    """
    rated = [ u for u in U if u != -1 ]
    return sum(rated)/len(rated)
            
def user_based_cf( table, coord = (0,1), 
                  neighbors = -1, steps=False ):
    """Predict a rating with user-based collaborative filtering.

    prediction = avg(u) + sum(sim(u, v) * (r[v][i] - avg(v))) / sum(|sim(u, v)|)

    where ``v`` runs over the other users that rated item ``i`` and every
    average leaves out column ``i``.

    Parameters
    ----------
    table : list of lists
        ``table[user][item]`` ratings; -1 marks an unrated item.
    coord : tuple
        ``(u, i)``: the user and the item to predict.
    neighbors : int
        Number of most-similar users to use; -1 uses all of them.
    steps : bool
        When true, print the intermediate averages and similarities.

    Returns
    -------
    float
        The predicted rating. If no usable neighbour exists (nobody else
        rated item ``i``, or every similarity is 0) the user's own average
        is returned.
    """
    u, i = coord

    def avg_without_target(ratings):
        # Column i is left out of every user's average.
        return avg_rating(ratings[:i]+ratings[i+1:])

    # Candidate neighbours: every other user that has rated item i.
    raters = [ ui for ui in range(len(table)) if ui != u and table[ui][i] != -1 ]

    avg_r = { u: avg_without_target(table[u]) }
    for ui in raters:
        avg_r[ui] = avg_without_target(table[ui])

    if steps: print("Avg for each user", avg_r)

    similarities = { ui: pearsons_similarity( table[ui], table[u] ) for ui in raters }

    if steps: print("Similarity with each user", similarities)

    # Most similar first; ties on similarity fall back to the larger user index.
    ranked = sorted( ( (sim, ui) for ui, sim in similarities.items() ), reverse=True )
    if neighbors!=-1:
        ranked = ranked[:neighbors]

    num, den = 0, 0
    for sim, ui in ranked:
        num += (table[ui][i] - avg_r[ui])*sim
        den += abs(sim)
    if den == 0:
        # No neighbour carries any weight, so only the user's baseline is left.
        return avg_r[u]
    return avg_r[u] + (num/den)


# Ratings indexed as table[user][item]; -1 marks an unrated item.
table = [ [4, -1, 5, 5],
          [4,  2, 1, -1],
          [3, -1, 2, 4],
          [4, 4, -1, -1],
          [2, 1, 3,  5]
        ]

# Predict user 0's rating for item 1, using every user that rated item 1.
user_based_cf(table, coord=(0,1))

3.946914190211329

In [104]:
# A second ratings matrix (table[user][item], -1 = unrated); it rebinds the name `table`.
table = [[2,1,-1,3],
        [3,-1,5,2],
        [-1,4,2,3],
        [5,3,1,-1]]

# Predict user 1's rating for item 1.
user_based_cf( table, (1,1) )

3.3333333333333335

### Item Based CF predictions

In [94]:
def pearsons_similarity_all_ratings( a,b ):
    """Pearson-style similarity centred on each vector's mean over ALL its ratings.

    A value of -1 marks "not rated". Each vector's mean is taken over every
    position that vector rates; the numerator and denominators are summed
    only over positions rated in both vectors.

    Returns
    -------
    float or int
        ``num / den``, or 0 when either vector has no rating at all or when
        the denominator is 0.
    """
    pairs = list(zip(a,b))
    rated_a = [ ai for ai, _ in pairs if ai != -1 ]
    rated_b = [ bi for _, bi in pairs if bi != -1 ]
    if not rated_a or not rated_b:
        # A vector without ratings has no mean; dividing by its empty count
        # would raise ZeroDivisionError.
        return 0

    avg_a = sum(rated_a)/len(rated_a)
    avg_b = sum(rated_b)/len(rated_b)

    num,den1,den2 = 0,0,0
    for ai, bi in pairs:
        if ai==-1 or bi==-1: continue
        num += (ai-avg_a)*(bi-avg_b)
        den1 += (ai-avg_a)**2
        den2 += (bi-avg_b)**2
    den = ( den1**0.5 * den2**0.5 )
    if den==0: return 0
    return num/den

def pearsons_similarity_common_ratings( a,b ):
    """Pearson correlation computed only over positions rated in both vectors.

    A value of -1 marks "not rated". Means, numerator and denominators all
    use just the co-rated positions.

    Returns
    -------
    float or int
        ``num / den``, or 0 when the vectors share no co-rated position or
        when the denominator is 0.
    """
    common = [ (ai, bi) for ai, bi in zip(a,b) if ai != -1 and bi != -1 ]
    if not common:
        # No overlap: dividing by the empty count would raise ZeroDivisionError.
        return 0

    cnt = len(common)
    avg_a = sum(ai for ai, _ in common)/cnt
    avg_b = sum(bi for _, bi in common)/cnt

    num,den1,den2 = 0,0,0
    for ai, bi in common:
        num += (ai-avg_a)*(bi-avg_b)
        den1 += (ai-avg_a)**2
        den2 += (bi-avg_b)**2
    den = ( den1**0.5 * den2**0.5 )
    if den==0: return 0
    return num/den

# The same two rating vectors under both variants: means over all ratings vs. over co-rated positions only.
pearsons_similarity_all_ratings( [2,3,-1,5], [1,-1,4,3] ),pearsons_similarity_common_ratings( [2,3,-1,5], [1,-1,4,3] )

(0.7657048647896112, 1.0)

In [109]:
def item_based_cf( table, coord = (1,1), neighbors=-1, steps=False ):
    """Predict a rating with item-based collaborative filtering.

    prediction = sum(sim(i, k) * r[u][k]) / sum(|sim(i, k)|)

    where ``k`` runs over the items user ``u`` has rated, other than ``i``.

    Parameters
    ----------
    table : list of lists
        ``table[user][item]`` ratings; -1 marks an unrated item.
    coord : tuple
        ``(u, i)``: the user and the item to predict.
    neighbors : int
        Number of most-similar rated items to use; -1 uses all of them.
    steps : bool
        When true, print the similarity of item ``i`` to every other item.

    Returns
    -------
    float
        The similarity-weighted average of the user's ratings.

    Raises
    ------
    ZeroDivisionError
        If the user rated no other item, or every selected similarity is 0.
    """
    u, i = coord
    total_items = len(table[0])

    target_ratings = [ row[i] for row in table ]
    similarity = []
    for item in range(total_items):
        if item == i: continue
        item_ratings = [ row[item] for row in table ]
        similarity.append( (pearsons_similarity_all_ratings(target_ratings, item_ratings), item) )
    similarity.sort(reverse=True)

    if steps: print("Similarity :",similarity)

    # An item the user has not rated holds the -1 placeholder, which must not
    # enter the weighted sum as if it were a rating. Filter before truncating
    # so that `neighbors` counts usable items only.
    similarity = [ (sim, item) for sim, item in similarity if table[u][item] != -1 ]
    if neighbors != -1:
        similarity = similarity[:neighbors]

    num, den = 0, 0
    for sim, item in similarity:
        num += table[u][item] * sim
        den += abs(sim)

    return num/den

# Ratings indexed as table[user][item]; -1 marks an unrated item.
table = [ [2,1,-1,3],
          [3,-1,5,2],
          [-1,4,2,3],
          [5,3,1,-1]]

# Uses the default coord=(1,1): predict user 1's rating for item 1 from the 2 most similar items.
item_based_cf(table, neighbors=2)    

2.3697811937369306