# Centroid Embeddings + Document Retrieval

In [264]:
import re
import json
import pandas as pd
from pprint import pprint
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
from numpy import dot
from numpy.linalg import norm
import operator
import gensim

In [3]:
# Load word vectors from a binary word2vec-format file into a KeyedVectors object.
# NOTE(review): the global `model` is not referenced by any later cell visible
# here (centroide() has its own `model` parameter) — confirm it is still needed.
model_path = 'BioWordVec_PubMed_MIMICIII_d200.vec.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [4]:
# Load the JSON file holding the queries and their retrieved documents.
queried_doc = 'johan_tests/queried_docs.json'
# Pin the encoding: without it, decoding depends on the platform's default
# locale and can fail or corrupt non-ASCII text (e.g. on Windows).
with open(queried_doc, 'r', encoding='utf-8') as f:
    queries_docs = json.load(f)

In [270]:
# Alias only: q_results is the same object as queries_docs (no copy is made).
q_results = queries_docs

In [271]:
def clean(text):
    """Normalize raw text: trim, lowercase, then drop digits and symbols.

    Args:
        text: The string to normalize.

    Returns:
        The trimmed, lowercased string with every digit removed and then
        every character that is not ``a-z``, a digit, or whitespace removed.
        Inner whitespace is left untouched.
    """
    digit_pattern = r'[0-9]'
    lowered = text.strip().lower()
    # First pass removes the digits; second pass removes punctuation/symbols.
    without_digits = re.sub(digit_pattern, '', lowered)
    return re.sub(r'[^a-z0-9\s]', '', without_digits)

In [272]:
def tokenize(text):
    """Split text into one flat list of word tokens.

    The text is segmented with ``sent_tokenize`` and every segment is passed
    to ``word_tokenize``; the per-sentence tokens are concatenated in order.
    (Original author's note: the data set held 2593 queries in total.)

    Args:
        text: The string to tokenize.

    Returns:
        A single list with the word tokens of all sentences.
    """
    return [
        token
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
    ]

In [273]:
def extract_unique_doc_info(results):
    """Collect one cleaned, tokenized record per distinct document id.

    Args:
        results: Iterable of dicts; the first value of each dict must hold a
            ``'documents'`` mapping of document id to document info.

    Returns:
        Dict mapping document id to ``{'title', 'abstract', 'score'}``.
        Title and abstract are the token lists from ``tokenize(clean(...))``;
        the score is copied unchanged. The first occurrence of an id wins.
        Documents whose info is a plain string, or whose abstract is ``''``
        or ``None``, are skipped.
    """
    collected = {}
    for result in results:
        documents = list(result.values())[0]['documents']
        for doc_id, info in documents.items():
            # Guard clauses, checked in this order so that string-valued
            # entries are never indexed by key.
            if doc_id in collected:
                continue
            if isinstance(info, str):
                continue
            if info['abstract'] == '' or info['abstract'] is None:
                continue
            collected[doc_id] = {
                'title': tokenize(clean(info['title'])),
                'abstract': tokenize(clean(info['abstract'])),
                'score': info['score'],
            }
    return collected

In [274]:
def filter_useful_results(results, unique_docs):
    """Rebuild every query result, keeping only documents in ``unique_docs``.

    Args:
        results: Iterable of dicts whose first item is
            ``query_id -> {'question': str, 'documents': {doc_id: ...}}``.
        unique_docs: Mapping of document id to its processed record (a dict),
            e.g. the return value of ``extract_unique_doc_info``.

    Returns:
        A list with one ``{query_id: {'question': tokens, 'documents': {...}}}``
        dict per input result, in input order. ``'question'`` is
        ``tokenize(clean(question))``. ``'documents'`` holds, in the result's
        own document order, a shallow copy of the ``unique_docs`` record of
        every document id present in both mappings.
    """
    useful_results = []
    for result in results:
        result_id, payload = list(result.items())[0]
        # Iterate the result's own documents instead of a set intersection:
        # set order of string ids changes between interpreter runs, which
        # made the document order (and anything positional) non-reproducible.
        # Each query gets its own copy of the record so that per-query fields
        # written later (centroids, similarities) are not shared between
        # queries that retrieved the same document.
        kept_docs = {
            doc_id: dict(unique_docs[doc_id])
            for doc_id in payload['documents']
            if doc_id in unique_docs
        }
        useful_results.append({
            result_id: {
                'question': tokenize(clean(payload['question'])),
                'documents': kept_docs,
            }
        })
    return useful_results

In [275]:
def extract_unique_title_abstracts(results):
    """Flatten the documents of all results into aligned, de-duplicated lists.

    Args:
        results: Iterable of dicts; the first value of each dict must hold a
            ``'documents'`` mapping of document id to a record with
            ``'title'``, ``'abstract'`` and ``'score'`` keys.

    Returns:
        A tuple ``(titles, abstracts, doc_ids, scores)`` of four lists of
        equal length. Index ``k`` of every list refers to the same document;
        documents appear in order of first occurrence and only once.
    """
    # A set gives O(1) membership tests; the previous list-based check made
    # the whole pass quadratic in the number of documents.
    seen = set()
    titles, abstracts, doc_ids, scores = [], [], [], []
    for result in results:
        result_docs = list(result.values())[0]['documents']
        for doc_id, doc in result_docs.items():
            if doc_id in seen:
                continue
            seen.add(doc_id)
            doc_ids.append(doc_id)
            titles.append(doc['title'])
            abstracts.append(doc['abstract'])
            scores.append(doc['score'])
    return titles, abstracts, doc_ids, scores

In [276]:
def extract_unique_questions(results):
    """Return the ``'question'`` field of every result, in input order.

    Despite the name, no de-duplication is performed: the output has exactly
    one entry per input result.

    Args:
        results: Iterable of dicts; the first value of each dict must have a
            ``'question'`` key.

    Returns:
        List of the question values, one per result.
    """
    return [list(result.values())[0]['question'] for result in results]

In [277]:
def bio_w2vec(q_tok, t_tok, a_tok):
    """Train three independent Word2Vec models with identical settings.

    Args:
        q_tok: Tokenized questions (one token list per question).
        t_tok: Tokenized document titles.
        a_tok: Tokenized document abstracts.

    Returns:
        Tuple ``(model_q, model_d_t, model_d_a)``: the models trained on the
        questions, titles and abstracts respectively.
    """
    # Same hyper-parameters for all three corpora.
    shared = dict(window=5, vector_size=200, min_count=1, workers=3, sg=0)
    trainer = gensim.models.word2vec.Word2Vec
    query_model = trainer(q_tok, **shared)
    title_model = trainer(t_tok, **shared)
    abstract_model = trainer(a_tok, **shared)
    return (query_model, title_model, abstract_model)

In [278]:
def centroide(tokens, model):
    """Compute one centroid (mean word vector) per tokenized text.

    Args:
        tokens: Sequence of token lists, one list per text.
        model: Object exposing word vectors either through a ``wv`` attribute
            (e.g. a gensim ``Word2Vec``) or directly (e.g. ``KeyedVectors``).
            The vectors must support ``key in vectors``, ``vectors[key]`` and
            a ``vector_size`` attribute.

    Returns:
        List of 1-D arrays aligned with ``tokens``: entry ``k`` is the mean of
        the vectors of the in-vocabulary tokens of ``tokens[k]``. A text with
        no in-vocabulary token yields a zero vector of length ``vector_size``.
    """
    # Use the arguments that were passed in. The function must not read the
    # notebook globals, otherwise title/abstract calls silently return
    # question centroids. The model is only read here, never (re)trained.
    keyed_vectors = getattr(model, 'wv', model)
    centroids = []
    for doc_tokens in tokens:
        # Skip unknown tokens explicitly instead of swallowing every error,
        # so one missing word cannot make a text inherit another's vectors.
        vectors = [keyed_vectors[tok] for tok in doc_tokens if tok in keyed_vectors]
        if vectors:
            centroids.append(np.mean(vectors, axis=0))
        else:
            # Keep the output aligned with `tokens` even for empty texts.
            centroids.append(np.zeros(keyed_vectors.vector_size, dtype=np.float32))
    return centroids

In [279]:
def get_qry_ids(useful_results):
    """Return the first key of every result dict, in input order.

    Args:
        useful_results: Sequence of non-empty dicts whose first key is the
            query id.

    Returns:
        List with one query id per result.
    """
    return [list(entry.keys())[0] for entry in useful_results]

In [280]:
def _safe_cosine(u, v):
    """Return the cosine similarity of two vectors, or 0.0 if either is all zeros."""
    denominator = norm(u) * norm(v)
    if denominator == 0:
        return 0.0
    return dot(u, v) / denominator


def cosine_sim(cemb_dict):
    """Add query/document cosine similarities to every document record.

    Args:
        cemb_dict: Sequence of single-element lists. Element ``[0]`` of each
            is a dict with a ``'centroid_q'`` vector and a ``'documents'``
            mapping whose records carry ``'centroid_d_t'`` and
            ``'centroid_d_a'`` vectors.

    Returns:
        List of the inner dicts (element ``[0]`` of every entry), in input
        order. Each document record is updated in place with
        ``'cosine_sim_title'`` and ``'cosine_sim_abstract'``.
    """
    cembs_dict_total = []
    for wrapped in cemb_dict:
        entry = wrapped[0]
        # Read the centroids stored on the entry itself rather than from
        # notebook globals: this keeps query and document vectors paired
        # correctly and makes the function depend only on its argument.
        query_centroid = entry['centroid_q']
        for doc in entry['documents'].values():
            doc['cosine_sim_title'] = _safe_cosine(query_centroid, doc['centroid_d_t'])
            doc['cosine_sim_abstract'] = _safe_cosine(query_centroid, doc['centroid_d_a'])
        cembs_dict_total.append(entry)
    return cembs_dict_total

In [None]:
# Build the processed per-document records, then rebuild every query result
# so it only references documents that have such a record.
unique_docs = extract_unique_doc_info(q_results)
useful_results = filter_useful_results(q_results, unique_docs)
# Four aligned lists: index k of each refers to the same document.
titles_tok, abstracts_tok, docs_ids, doc_scores = extract_unique_title_abstracts(useful_results)
# One tokenized question per entry of useful_results, in the same order.
questions_tok = extract_unique_questions(useful_results)

In [None]:
# Train one Word2Vec model per corpus (questions, titles, abstracts).
model_q,model_d_t,model_d_a = bio_w2vec(questions_tok,titles_tok,abstracts_tok)
# Persist the three models to disk ...
model_q.save('NFC_Word2Vec_query_model.bin')
model_d_t.save('NFC_Word2Vec_doc_title_model.bin')
model_d_a.save('NFC_Word2Vec_doc_abstract_model.bin')
# ... and load them back under separate names.
model_query = Word2Vec.load("NFC_Word2Vec_query_model.bin")
model_doc_t = Word2Vec.load("NFC_Word2Vec_doc_title_model.bin")
model_doc_a = Word2Vec.load("NFC_Word2Vec_doc_abstract_model.bin")

In [None]:
# Call centroide() once per corpus/model pair; the three results are consumed
# by the cells below.
cent_q = centroide(questions_tok,model_query)
cent_d_t = centroide(titles_tok,model_doc_t)
cent_d_a = centroide(abstracts_tok,model_doc_a)

In [None]:
# Query ids, in the same order as the entries of useful_results.
qry_ids_list = get_qry_ids(useful_results)

In [None]:
# Attach the centroids to every query entry and to each of its documents.
# cent_d_t / cent_d_a are built from titles_tok / abstracts_tok, which are
# aligned with docs_ids, so a document's centroid must be looked up through
# its id; its position inside one query's document list is unrelated.
doc_index = {doc_id: pos for pos, doc_id in enumerate(docs_ids)}
cembs_dict = []
for result, query_centroid in zip(useful_results, cent_q):
    # Keep the single-element list wrapper: cosine_sim() indexes it with [0].
    wrapped = list(result.values())
    entry = wrapped[0]
    entry['centroid_q'] = query_centroid
    for doc_id, doc in entry['documents'].items():
        pos = doc_index[doc_id]
        doc['centroid_d_t'] = cent_d_t[pos]
        doc['centroid_d_a'] = cent_d_a[pos]
    cembs_dict.append(wrapped)

In [None]:
# One dict per query; its document records are expected to carry the
# 'cosine_sim_title' / 'cosine_sim_abstract' fields read by the next cell.
total_dictionary = cosine_sim(cembs_dict)

In [None]:
# Combine each document's stored score with both cosine similarities into a
# single relevance value: score * (cos_abstract + cos_title).
doc_rel = {}
for i, query_id in enumerate(qry_ids_list):
    doc_rel[query_id] = {}
    documents = total_dictionary[i]['documents']
    for doc_id, doc in documents.items():
        combined = doc['score'] * (doc['cosine_sim_abstract'] + doc['cosine_sim_title'])
        doc_rel[query_id][str(doc_id)] = combined


### Output: dictionary mapping each query id to the ids of its relevant documents, together with their similarity scores