Cosine Similarity, Correlation, Euclidean Distance, Jaccard Similarity, Jaccard Distance

In [1]:
###Introduction to Data Mining by Tan, Steinbach & Kumar
###Chapter 2 Problem 19
###Author: Brenda S. Izquierdo

import numpy as np
import math as mt

###Helper Functions
def vector_addition(w):
    return np.sum(list(np.array(w)))

def vector_square(w):
    return np.sum(list(np.array(w)*np.array(w)))
   
def dot_product(x, y):
    return np.sum(list(np.array(x)*np.array(y)))

def vector_square_sqrt(w):
    return mt.sqrt(vector_square(w))


###Measures    
def cosine_similarity(x, y):
    num = dot_product(x, y)
    dem = vector_square_sqrt(x)*vector_square_sqrt(y)
    return num/dem

def pearson_correlation(x, y):
    num = len(x)*dot_product(x, y) - vector_addition(x)*vector_addition(y)
    dem = mt.sqrt((len(x)*vector_square(x) - mt.pow(vector_addition(x), 2))*
                  (len(y)*vector_square(y) - mt.pow(vector_addition(y), 2)))
    if(mt.isnan(num/dem)):
        return "0/0"
    else:
        return num/dem

def euclidean(x, y):
     return mt.sqrt(np.sum(np.square(list(np.array(x)-np.array(y)))))

def jaccard_similarity(x, y):
    num = set(x) - (set(x) - set(y))
    dem = set(x+y)
    return len(num)/len(dem)

def jaccard_distance(x, y):
    return 1 - jaccard_similarity(x, y)
    

In [2]:

###Vectors
x = [[1,1,1,1], [0,1,0,1], [0,-1,0,1], [1,1,0,1,0,1], [2,-1,0,2,0,-3]]
y = [[2,2,2,2], [1,0,1,0], [1,0,-1,0], [1,1,1,0,0,1], [-1,1,-1,0,0,-1]]

for i in range(0, len(x)):
    num_exercise = "[" + str(i) + "]"
    print(num_exercise, "For x =", x[i], "and y =", y[i])
    print("Cosine Similarity:", cosine_similarity(x[i], y[i]))
    print("Pearson Correlation:", pearson_correlation(x[i], y[i]))
    print("Euclidean Distance:", euclidean(x[i], y[i]))
    print("Jaccard Similarity:", jaccard_similarity(x[i], y[i]))
    print("Jaccard Distance:", jaccard_distance(x[i], y[i]), "\n\n")


[0] For x = [1, 1, 1, 1] and y = [2, 2, 2, 2]
Cosine Similarity: 1.0
Pearson Correlation: 0/0
Euclidean Distance: 2.0
Jaccard Similarity: 0.0
Jaccard Distance: 1.0 


[1] For x = [0, 1, 0, 1] and y = [1, 0, 1, 0]
Cosine Similarity: 0.0
Pearson Correlation: -1.0
Euclidean Distance: 2.0
Jaccard Similarity: 1.0
Jaccard Distance: 0.0 


[2] For x = [0, -1, 0, 1] and y = [1, 0, -1, 0]
Cosine Similarity: 0.0
Pearson Correlation: 0.0
Euclidean Distance: 2.0
Jaccard Similarity: 1.0
Jaccard Distance: 0.0 


[3] For x = [1, 1, 0, 1, 0, 1] and y = [1, 1, 1, 0, 0, 1]
Cosine Similarity: 0.75
Pearson Correlation: 0.25
Euclidean Distance: 1.4142135623730951
Jaccard Similarity: 1.0
Jaccard Distance: 0.0 


[4] For x = [2, -1, 0, 2, 0, -3] and y = [-1, 1, -1, 0, 0, -1]
Cosine Similarity: 0.0
Pearson Correlation: 0.0
Euclidean Distance: 4.69041575982343
Jaccard Similarity: 0.4
Jaccard Distance: 0.6 




