In [107]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


In [108]:
def cosine_similarity(v1,v2):
    # compute cosine similarity of vectors v1 to v2: (v1 dot v1)/(||v1||*||v2||)'
    #   (the L2-normalized dot product of vectors)
    from math import sqrt
    
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]
        y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y

    return sumxy/sqrt(sumxx*sumyy)

In [109]:
def euclidean_distance(v1, v2):
    # compute and return euclidean distance between vectors v1 and v2
    from math import sqrt
    
    to_sum = []
    i = 0
    for x,y in zip(v1, v2):
        i += 1
        to_sum.append((x-y)**2)

    return(sqrt(sum(to_sum)))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
lines = ['tom smith 124', 'thomas borgstadt', 'justin riley', 'mike harris', 'larry bird', 'miles davis', 'owen corning', 'tom borgstadt 124', 'tina fay']

v = TfidfVectorizer(binary=False, use_idf = False)

# fit a vector model to text lines
v_m = v.fit_transform(lines)                          

# convert the vector model to an array for ease of looping below
#    the index of the 'lines' list above will refer to to the vector values of the same line
#    so the vector for line[x] will be in v_m_array[x]
v_m_array = v_m.toarray()

# establish save area for most similar lines
cos_most_sim_lines = [-999,0,0]
euc_most_sim_lines = [999,0,0]
    # value 1 = similarity value
    # value 2 = line index
    # value 3 = compared line index

# this code will compare both line1 to line2 and line2 to line1 which in both cases yields the same 
# so to minimize coding this is okay
for i in range(len(v_m_array)):
    print 'primary: ', lines[i]
    for j in range(len(v_m_array)):
        if j != i:      # don't compare a document to itself, skip if so
            c = cosine_similarity(v_m_array[i], v_m_array[j])
            e = euclidean_distances(v_m_array[i], v_m_array[j])
            print '   compare: ', lines[j], ' cos:', c , ' euc:', e
            if c > cos_most_sim_lines[0]:
                cos_most_sim_lines[0] = c   # save most similar
                cos_most_sim_lines[1] = i   # line index
                cos_most_sim_lines[2] = j   # 2nd line index
            if e < euc_most_sim_lines[0]:
                euc_most_sim_lines[0] = e   # retain most similar
                euc_most_sim_lines[1] = i   # line index
                euc_most_sim_lines[2] = j   # 2nd line index

print '\nMost similar lines with cosine similarity of: ', cos_most_sim_lines[0]
print '=>', lines[cos_most_sim_lines[1]]
print '=>', lines[cos_most_sim_lines[2]]

print '\nMost similar lines with euclidean distance of: ', euc_most_sim_lines[0]
print '=>', lines[euc_most_sim_lines[1]]
print '=>', lines[euc_most_sim_lines[2]]

    

primary:  tom smith 124
   compare:  thomas borgstadt  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  justin riley  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  mike harris  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  larry bird  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  miles davis  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  owen corning  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  tom borgstadt 124  cos: [[ 0.66666667]]  euc: [[ 0.81649658]]
   compare:  tina fay  cos: [[ 0.]]  euc: [[ 1.41421356]]
primary:  thomas borgstadt
   compare:  tom smith 124  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  justin riley  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  mike harris  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  larry bird  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  miles davis  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  owen corning  cos: [[ 0.]]  euc: [[ 1.41421356]]
   compare:  tom borgstadt 124  cos: [[ 0.40824829]]  euc: [[ 1.08788943]]


