In [1]:
import io
import spacy
import pandas as pd
import tqdm
import numpy as np

In [2]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    # n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm.tqdm(fin):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array([float(x) for x in tokens[1:]])
    return data

In [3]:
embeddings = load_vectors('embeddings/glove.6B.300d.txt')

400000it [00:46, 8604.26it/s]


In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
eval_set = pd.read_csv('test_set.tsv', sep='\t', header=None)

In [7]:
from numba import jit

@jit(nopython=True)
def cosine_similarity_numba(u:np.ndarray, v:np.ndarray):
    assert(u.shape[0] == v.shape[0])
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i]*v[i]
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta

In [8]:
with open('intermediate_results/test_desm_scores_glove.tsv', 'w') as f:
    for _, (q_id, q_str, doc_id, doc_str) in tqdm.tqdm(eval_set.iterrows()):
        query_doc = nlp(q_str)
        doc_score = np.zeros(300)
        document_doc = nlp(doc_str)
        total_additions = 0
        for doc_token in document_doc:
            if doc_token.text in embeddings:
                doc_score += embeddings[doc_token.text]
                total_additions += 1
        doc_score /= total_additions
        query_doc_score = 0
        total_additions = 0
        for query_token in query_doc:
            if query_token.text in embeddings:
                query_emb = embeddings[query_token.text]
                query_doc_score += cosine_similarity_numba(query_emb, doc_score)
                total_additions += 1
        query_doc_score /= total_additions
        f.write(f'{q_id}\t{doc_id}\t{query_doc_score}\n')

20000it [1:18:42,  4.23it/s]
