In [1]:
import gensim.models
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import numpy.linalg as LA

In [2]:
def load_vector(filename):
    '''Load the word vector of the filenmae'''
    data = {}
    with open(filename, "r") as f:
        for l in f:
            words = l.strip().split()
            word = words[0]
            res = np.array([float(item) for item in words[1:]])
            data[word] = res
    return data


In [3]:
def load_wordsim353(filename):
    ''' load the wordsim 353 test
    the test format:
    worda wordb score'''
    words = []
    with open(filename) as f:
        for l in f:
            words.append(l.strip().split())
    test_words = [(item[0], item[1]) for item in words]
    answer = [float(item[2]) for item in words]
    return test_words, answer


In [4]:
vector_filename = "./glove.6B/glove.6B.50d.txt"

word2vector = load_vector(vector_filename)

In [5]:
print (len(word2vector))

400000


In [8]:
print (type(word2vector))

<class 'dict'>


In [9]:
test_filename = "./wordsim353.txt"

test_words, answer = load_wordsim353(test_filename)

In [15]:
print (len(test_words))

353


In [71]:
def similarity(word1, word2, word2vector):
    '''get the cosine similarity of the word1 and word2'''
    if word1 in word2vector:
        vector1 = word2vector[word1]
    else:
        print ("key do not exist:" + word1)
        return 0
        
    if word2 in word2vector:
        vector2 = word2vector[word2]
    else:
        print ("key do not exist:" + word2)
        return 0
        
    res = np.dot(vector1, vector2) / (LA.norm(vector1) * LA.norm(vector2))
    res = float(res)
    return res

In [72]:
def get_score(test_words, word2vector):
    '''test the word2vector using test_words'''
    return [similarity(word1, word2, word2vector) for word1, word2 in test_words]

In [73]:
my_answer = get_score(test_words, word2vector)

key do not exist:CD
key do not exist:Jerusalem
key do not exist:Jerusalem
key do not exist:Maradona
key do not exist:Arafat
key do not exist:Arafat
key do not exist:Arafat
key do not exist:Freud
key do not exist:FBI
key do not exist:FBI
key do not exist:Mars
key do not exist:Mars
key do not exist:Wednesday
key do not exist:Japanese
key do not exist:Harvard
key do not exist:OPEC
key do not exist:OPEC
key do not exist:Mexico


In [60]:
from functools import cmp_to_key

def get_rank(score):
    '''Given a list and give out the rank result of the list'''
    tmp = [(i, item) for i, item in enumerate(score)]
    # tmp = sorted(tmp, cmp=lambda x, y: -cmp_to_key(x[1], y[1]))
    tmp = sorted(tmp, key=cmp_to_key(lambda x, y: (x[1] > y[1]) - (x[1] < y[1])))
#     tmp = sorted(tmp, key=cmp_to_key(lambda x, y: x[1] > y[1]))
    print(type(tmp))
    print(len(tmp))

    res = [0 for i in range(len(tmp))]
    for i, (pos, score) in enumerate(tmp):
        res[pos] = i
    return res

In [61]:
answer_rank = get_rank(answer)

<class 'list'>
353


In [62]:
print (type(answer_rank))
print (len(answer_rank))

<class 'list'>
353


In [63]:
print (answer_rank)

[209, 243, 352, 254, 271, 267, 139, 175, 259, 210, 250, 215, 163, 148, 225, 197, 212, 97, 140, 231, 301, 13, 11, 6, 17, 201, 70, 7, 255, 305, 280, 347, 1, 0, 329, 149, 202, 324, 274, 14, 351, 331, 343, 213, 198, 264, 206, 275, 32, 320, 245, 164, 207, 290, 306, 244, 105, 128, 325, 307, 49, 12, 153, 219, 286, 37, 339, 340, 349, 336, 346, 337, 342, 350, 334, 261, 233, 246, 187, 174, 38, 90, 141, 109, 23, 89, 87, 20, 50, 8, 51, 9, 2, 24, 3, 4, 321, 345, 344, 312, 266, 241, 326, 281, 220, 136, 55, 294, 295, 232, 216, 226, 103, 131, 143, 302, 227, 217, 251, 195, 184, 311, 276, 238, 204, 130, 258, 323, 299, 303, 297, 304, 291, 292, 142, 73, 41, 200, 33, 16, 160, 196, 218, 31, 43, 65, 25, 239, 110, 21, 145, 252, 240, 18, 114, 115, 211, 161, 223, 314, 96, 44, 132, 310, 262, 95, 178, 193, 29, 26, 332, 91, 124, 66, 47, 308, 176, 168, 191, 78, 34, 48, 133, 268, 52, 10, 56, 199, 100, 67, 84, 194, 85, 144, 150, 265, 40, 228, 127, 188, 289, 108, 111, 117, 229, 60, 28, 146, 247, 309, 98, 118, 113, 203

In [76]:
my_rank = get_rank(my_answer)

<class 'list'>
353


In [77]:
print (my_rank)

[239, 237, 352, 263, 220, 335, 265, 322, 262, 341, 320, 272, 339, 295, 332, 223, 268, 116, 238, 311, 346, 185, 5, 43, 47, 90, 58, 99, 233, 247, 225, 349, 1, 29, 327, 53, 170, 6, 7, 75, 33, 8, 348, 343, 229, 119, 9, 10, 11, 267, 284, 96, 144, 286, 48, 351, 72, 87, 260, 297, 126, 84, 128, 318, 317, 120, 288, 312, 336, 122, 329, 0, 301, 338, 264, 275, 49, 134, 178, 186, 2, 52, 105, 24, 180, 4, 145, 77, 118, 124, 242, 93, 51, 31, 46, 23, 230, 350, 194, 278, 261, 147, 248, 209, 68, 199, 175, 113, 62, 38, 79, 141, 27, 56, 89, 334, 155, 59, 158, 115, 159, 12, 171, 165, 337, 188, 259, 153, 197, 253, 183, 212, 298, 88, 182, 76, 41, 246, 44, 40, 82, 85, 28, 34, 3, 26, 25, 104, 69, 54, 63, 92, 152, 106, 228, 127, 208, 205, 13, 14, 217, 15, 16, 330, 102, 154, 266, 174, 55, 17, 342, 191, 108, 184, 67, 319, 250, 314, 18, 95, 97, 107, 177, 169, 202, 35, 137, 241, 274, 138, 234, 296, 214, 333, 211, 279, 179, 129, 73, 103, 161, 215, 81, 37, 323, 276, 39, 45, 94, 19, 70, 206, 227, 156, 304, 257, 167, 30

In [78]:
def get_corr(listA, listB):
    '''get teh correlation of the listA and List B'''
    x_bar = 0.0
    y_bar = 0.0
    for a, b in zip(listA, listB):
        x_bar += a
        y_bar += b
    x_bar *= (1. / float(len(listA)))
    y_bar *= (1. / float(len(listB)))
    res1 = 0.0
    res2 = 0.0
    res3 = 0.0
    for x, y in zip(listA, listB):
        res1 += ((x - x_bar) * (y - y_bar))
        res2 += ((x - x_bar) * (x - x_bar))
        res3 += ((y - y_bar) * (y - y_bar))
    return res1/np.sqrt(res2 * res3)

In [79]:
result = get_corr(my_rank, answer_rank)

In [80]:
print (result)

0.41870637764789587
