In [1]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import scispacy
from scispacy.linking import EntityLinker
import numpy as np
import csv
import sys
import os
import math
from collections import Counter

In [4]:
outcome = "pmv"  # Switch to los (length of stay) or mortality for other outcomes

# NOTE(security): pickle.load can execute arbitrary code while unpickling, so these
# files must come from a trusted source. Every file is opened in a `with` block so
# its handle is closed as soon as loading finishes.

# Loading MeSH terms for documents from outcome-specific literature collection
with open('../data/mesh-terms/{}_mesh_terms.pkl'.format(outcome), 'rb') as pkl_file:
    doc_tags = pickle.load(pkl_file)

# Loading MeSH terms for EHRs for patient cohort
with open('../data/mesh-terms/mimic_ehr_mesh_terms.pkl', 'rb') as pkl_file:
    mention_info = pickle.load(pkl_file)

# Note that the cohort for PMV prediction is smaller than other outcomes
# So we need to filter out patients for whom PMV information is not available
ids2keep = None
if outcome == 'pmv':
    with open('../data/pmv_ids.pkl', 'rb') as pkl_file:
        ids2keep = pickle.load(pkl_file)

# Reformat EHR MeSH term data: flatten the per-sentence mentions of every EHR
# into a single list of terms, keyed by the same ID as in mention_info.
ehr_tags = {}
for file in mention_info:
    # Skip EHRs outside the outcome-specific cohort (only when a filter was loaded)
    if ids2keep is not None and file not in ids2keep:
        continue
    ehr_mesh_terms = []
    for sent in mention_info[file]:
        for mention in mention_info[file][sent]:
            # Mentions without a 'mesh_ids' entry contribute no terms
            if 'mesh_ids' not in mention:
                continue
            for pair in mention['mesh_ids']:
                # Keep only the first element of each entry
                # (presumably the MeSH ID; verify against the file that produces it)
                ehr_mesh_terms.append(pair[0])
    ehr_tags[file] = ehr_mesh_terms

In [5]:
# Build the MeSH vocabulary for the TF-IDF vectors: only terms that occur both in
# the literature collection and in the EHRs are kept.
literature_terms = {tag[0] for tags in doc_tags.values() for tag in tags}
ehr_terms = {term for terms in ehr_tags.values() for term in terms}
mesh_vocab = literature_terms.intersection(ehr_terms)
print('MeSH vocabulary size: {}'.format(len(mesh_vocab)))
# Map every term to a vector index, following the set's iteration order
mesh_vocab = {term: index for index, term in enumerate(mesh_vocab)}

MeSH vocabulary size: 8177


In [6]:
# Construct TF-IDF vectors for both outcome-specific literature and EHRs


def _term_frequency_vectors(term_lists, vocab, doc_freq):
    """Build raw term-count vectors for one collection of documents.

    Parameters
    ----------
    term_lists : dict
        Maps a document ID to the list of terms found in that document.
    vocab : dict
        Maps each vocabulary term to its index in the vector.
    doc_freq : collections.Counter
        Updated in place: every in-vocabulary term is counted once for each
        document that contains it.

    Returns
    -------
    dict
        Maps each document ID to a list of ``len(vocab)`` term counts.
    """
    vectors = {}
    for doc_id, terms in term_lists.items():
        in_vocab = [term for term in terms if term in vocab]
        # A set, so a term repeated within one document is counted only once
        doc_freq.update(set(in_vocab))
        counts = [0] * len(vocab)
        for term in in_vocab:
            counts[vocab[term]] += 1
        vectors[doc_id] = counts
    return vectors


def _apply_idf(vectors, idf_vector):
    """Return a new dict with every count vector scaled element-wise by idf_vector."""
    return {
        doc_id: [tf * idf for tf, idf in zip(vector, idf_vector)]
        for doc_id, vector in vectors.items()
    }


# Term frequency computation (document frequencies are accumulated over both collections)
doc_freq = Counter()
ehr_vectors = _term_frequency_vectors(ehr_tags, mesh_vocab, doc_freq)
article_vectors = _term_frequency_vectors(
    # Literature entries are sequences whose first element is the term
    {doc_id: [tag[0] for tag in tags] for doc_id, tags in doc_tags.items()},
    mesh_vocab,
    doc_freq,
)

# Incorporating IDF computation
num_docs = len(doc_tags) + len(ehr_tags)
# From here on doc_freq holds IDF weights, log(N / document frequency), not raw counts
doc_freq = {k: math.log(num_docs / float(v)) for k, v in doc_freq.items()}
doc_freq_vector = [1] * len(mesh_vocab)
for term, index in mesh_vocab.items():
    doc_freq_vector[index] = doc_freq[term]
ehr_vectors = _apply_idf(ehr_vectors, doc_freq_vector)
article_vectors = _apply_idf(article_vectors, doc_freq_vector)

In [7]:
# Stack the per-document TF-IDF vectors into one matrix per collection (EHRs and
# literature), remembering which ID belongs to which row.
# Working on matrices speeds up the cosine similarity computation.
ehr_items = list(ehr_vectors.items())
ehr_ids = [ehr_id for ehr_id, _ in ehr_items]
ehr_matrix = np.vstack([vector for _, vector in ehr_items])

article_items = list(article_vectors.items())
article_ids = [article_id for article_id, _ in article_items]
article_matrix = np.vstack([vector for _, vector in article_items])

In [8]:
# Computing cosine similarities and identifying top ranked documents
TOP_K = 1000                 # number of highest-scoring articles kept per EHR
SINGLE_PASS_MAX_ROWS = 8000  # with fewer EHRs than this, one similarity call is used
CHUNK_ROWS = 6000            # EHR rows per slice when processing in slices


def _rank_articles(sim_block, row_ids, col_ids, top_k):
    """Pick the top_k most similar columns for every row of a similarity block.

    Parameters
    ----------
    sim_block : numpy.ndarray
        2-D array of similarities; row i belongs to row_ids[i] and column j
        to col_ids[j].
    row_ids : list
        IDs of the rows of sim_block.
    col_ids : list
        IDs of the columns of sim_block.
    top_k : int
        Maximum number of columns kept per row.

    Returns
    -------
    dict
        Maps each row ID to a list of (column ID, similarity) tuples ordered by
        decreasing similarity.
    """
    # Negate so that the ascending argsort yields the highest similarities first
    order = np.argsort(-1 * sim_block)[:, :top_k]
    scores = np.take_along_axis(sim_block, order, axis=-1)
    # Indices stay integers here; no round trip through a float array is needed
    return {
        row_id: [(col_ids[j], score) for j, score in zip(index_row, score_row)]
        for row_id, index_row, score_row in zip(row_ids, order.tolist(), scores.tolist())
    }


similarities = []
ranked_pairs = {}
num_ehrs = ehr_matrix.shape[0]
if num_ehrs < SINGLE_PASS_MAX_ROWS:
    similarities = cosine_similarity(ehr_matrix, article_matrix)
    ranked_pairs.update(_rank_articles(similarities, ehr_ids, article_ids, TOP_K))
else:
    # Work on slices of EHR rows so that only one slice of the similarity
    # matrix is held at a time
    for start in range(0, num_ehrs, CHUNK_ROWS):
        print('Computing similarities for slice starting at index {}'.format(start))
        end = min(start + CHUNK_ROWS, num_ehrs)
        cur_similarities = cosine_similarity(ehr_matrix[start:end, :], article_matrix)
        ranked_pairs.update(
            _rank_articles(cur_similarities, ehr_ids[start:end], article_ids, TOP_K)
        )

In [9]:
# Store ranked results from sparse retriever
# The `with` block guarantees the file is flushed and closed once the dump completes.
with open('../data/{}_sparse_ranked_docs.pkl'.format(outcome), 'wb') as out_file:
    pickle.dump(ranked_pairs, out_file)