In [49]:
from __future__ import division
import argparse
import numpy as np
import scipy as sp
from scipy.spatial.distance import cosine
import gzip

# Embedding for Medical Concept

Results from paper
<b> Y. Choi, Y. Chiu, D. Sontag. <a href="http://cs.nyu.edu/~dsontag/papers/ChoiChiuSontag_AMIA_CRI16.pdf">Learning Low-Dimensional Representations of Medical Concepts</a>. Published in Proceedings of the AMIA Summit on Clinical Research Informatics (CRI), 2016.</b>

 `claims_codes_hs_300.txt.gz`: Embeddings of ICD-9 diagnosis and procedure codes, NDC medication codes, and LOINC laboratory codes, derived from a large claims dataset from 2005 to 2013 for roughly 4 million people.



### Load the CUI, ICD9, NDC, CPT, and LOINC mappings

In [94]:
# Load the claims-derived CUI embeddings from the gzipped text file.
# For each line, the token before the first space becomes the key and the
# rest of the line is kept as the raw space-separated vector string.
cuifile = 'claims_cuis_hs_300.txt.gz'
cui_embeddings = {}
with gzip.open(cuifile, 'r') as infile:
    for raw_line in infile.readlines():
        # mode 'r' yields bytes, so decode before parsing
        name, _space, embedding = raw_line.decode('utf8').strip().partition(' ')
        cui_embeddings[name] = embedding

In [None]:
ie = 'claims_codes_hs_300.txt'

ndc_to_embeddings = {}
loinic_to_embedding = {}
cpt_to_embedding = {}
icd9_to_embedding = {}

# First character of the code name -> (destination dict, number of leading
# characters to drop from the name before using it as the key).
_routes = {
    'N': (ndc_to_embeddings, 2),
    'L': (loinic_to_embedding, 2),
    'C': (cpt_to_embedding, 2),
    'I': (icd9_to_embedding, 4),
}

with open(ie, 'r') as infile:
    for row in infile.readlines():
        name, _space, embedding = row.strip().partition(' ')
        route = _routes.get(name[0])
        if route is not None:
            table, prefix_len = route
            # the vector is stored as its raw space-separated string
            table[name[prefix_len:]] = embedding


In [91]:
# Report how many codes of each vocabulary have an embedding.
for label, table in (
    ('lenght of LOINIC dictionary: ', loinic_to_embedding),
    ('lenght of ICD9 dictionary: ', icd9_to_embedding),
    ('lenght of NDC dictionary: ', ndc_to_embeddings),
    ('lenght of CPT dictionary: ', cpt_to_embedding),
):
    print(label, len(table))

lenght of LOINIC dictionary:  3093
lenght of ICD9 dictionary:  14039
lenght of NDC dictionary:  21565
lenght of CPT dictionary:  11746


In [97]:
#### Convert the NDC and ICD-9 codes to CUIs

In [53]:
#load the mapping files
def _load_ingredient_mapping(path):
    """Read a space-separated mapping file into a dict.

    The first token of every line becomes the key; the remaining tokens on
    that line become the value, as a list of strings.

    Parameters
    ----------
    path : str
        Location of the mapping file.

    Returns
    -------
    dict
        ``{first_token: [remaining tokens...]}`` with one entry per line.
    """
    mapping = {}
    with open(path, 'r') as infile:
        for row in infile:
            eles = row.strip().split(' ')
            mapping[eles[0]] = eles[1:]
    return mapping


# using parents with smallest number of child
ingredient_to_ndcs = _load_ingredient_mapping('eval/ingredient_ndcs.txt')

# using all parents --- this is the method of converting NDCs to CUIs
# that we are using for this paper. The below are used in the paper.
ingredient_to_all_ndcs = _load_ingredient_mapping('eval/ingredient_all_ndcs.txt')
        
        
def get_icd9_cui_mappings():
    """Build CUI <-> ICD-9 lookup tables from ``eval/cui_icd9.txt``.

    The file is read as pipe-delimited rows. Only rows whose field 11 equals
    ``'ICD9CM'`` are used; field 0 is taken as the CUI and field 10 as the
    ICD-9 code. Rows with an empty code, or a code containing ``'-'``, are
    ignored. Blank or truncated rows (fewer than 12 fields) are skipped
    instead of raising ``IndexError``.

    Only the first accepted code is kept for each CUI. If several CUIs share
    a code, the last accepted CUI wins in ``icd9_to_cui``.

    Returns
    -------
    tuple of (dict, dict)
        ``(cui_to_icd9, icd9_to_cui)``, both mapping str -> str.
    """
    cui_to_icd9 = {}
    icd9_to_cui = {}
    with open('eval/cui_icd9.txt', 'r') as infile:
        # iterate lazily; there is no need to hold the whole file in memory
        for row in infile:
            ele = row.strip().split('|')
            # blank/short rows have no source or code column to inspect
            if len(ele) < 12 or ele[11] != 'ICD9CM':
                continue
            cui = ele[0]
            icd9 = ele[10]
            if cui not in cui_to_icd9 and icd9 != '' and '-' not in icd9:
                cui_to_icd9[cui] = icd9
                icd9_to_cui[icd9] = cui
    return cui_to_icd9, icd9_to_cui


In [89]:
ie = 'claims_codes_hs_300.txt'

# `icd9_to_cui` is needed below to re-key ICD-9 embeddings by CUI. Build it
# here so this cell also runs on a fresh kernel.
cui_to_icd9, icd9_to_cui = get_icd9_cui_mappings()

ndc_to_embeddings = {}
icd9_cui_to_embeddings = {}
loinic_to_embedding = {}
cpt_to_embedding = {}
icd9_to_embedding = {}
with open(ie, 'r') as infile:
    for row in infile:
        eles = row.strip().split(' ')
        name = eles[0]
        # keep the vector as its raw space-separated string
        embedding = ' '.join(eles[1:])
        # startswith() is safe on blank lines, where name[0] would raise
        if name.startswith('N'):
            ndc_to_embeddings[name[2:]] = embedding
        elif name.startswith('L'):
            loinic_to_embedding[name[2:]] = embedding
        elif name.startswith('C'):
            cpt_to_embedding[name[2:]] = embedding
        elif name.startswith('I'):
            # the first four characters are dropped to get the bare ICD-9 code
            icd9 = name[4:]
            icd9_to_embedding[icd9] = embedding
            if icd9 in icd9_to_cui:
                cui = icd9_to_cui[icd9]
                icd9_cui_to_embeddings[cui] = embedding


In [92]:
# Collapse NDC-level vectors into one vector per ingredient by taking the
# element-wise mean over every NDC of that ingredient that has an embedding.
ndc_embeddings = []
for ingredient, ndcs in ingredient_to_all_ndcs.items():
    embeddings = [
        np.array(ndc_to_embeddings[ndc].split(' '), dtype=float)
        for ndc in ndcs
        if ndc in ndc_to_embeddings
    ]
    if not embeddings:
        # none of this ingredient's NDCs has an embedding
        print ('not found')
        continue
    embedding = np.mean(np.array(embeddings), axis=0)
    ndc_embeddings.append((ingredient, embedding))

not found
not found
not found
not found
not found
not found
not found


In [None]:
#write the CUI embeddings
outfilename = 'claims_cuis_hs_300.txt'

# Take the vector length from the data being written, not from whatever
# `embedding` happens to hold after earlier cells.
if ndc_embeddings:
    embedding_dim = ndc_embeddings[0][1].shape[0]
elif icd9_cui_to_embeddings:
    # ICD-9 derived vectors are stored as space-separated strings
    embedding_dim = len(next(iter(icd9_cui_to_embeddings.values())).split(' '))
else:
    embedding_dim = 0

with open(outfilename, 'w') as outfile:
    # header line: "<number of vectors> <dimension>"
    outfile.write('%s %s\n' % (len(ndc_embeddings) + len(icd9_cui_to_embeddings), embedding_dim))
    # ingredient vectors are arrays: format each component to 6 decimals
    for (ingredient, embedding) in ndc_embeddings:
        outfile.write('%s ' % (ingredient))
        outfile.write(''.join('%.6f ' % (value) for value in embedding))
        outfile.write('\n')
    # ICD-9 derived vectors are already strings: write them through unchanged
    for cui in icd9_cui_to_embeddings:
        outfile.write('%s %s\n' % (cui, icd9_cui_to_embeddings[cui]))


### Map of the Concept ID (CUI) to ICD 
Similar mapping for the types of ['MSH', 'MTH', 'MEDLINEPLUS', 'NCI_FDA', 'NCI_NICHD', 'NCI', 'CST', 'MDR',
 'ICPC2P', 'ICPC2ICD10ENG', 'CCPSS', 'COSTAR', 'DXP', 'WHO', 'LCH', 'RCD', 'RCDSY',
'SNM', 'OMIM', 'LNC', 'SNOMEDCT_US', 'LCH_NW', 'HPO', 'NANDA-I','NOC', 'ICD9CM', 'BI',
 'CHV', 'MEDCIN', 'SNMI', 'ICPC2EENG', 'ICF-CY', 'ICF', 'ICD10AM', 'ICD10CM', 'ICD10',
 'SNOMEDCT_VET','CCS','CCS_10', 'NCI_NCI-GLOSS']

In [113]:
# Load the claims-derived CUI embeddings: key is the text before the first
# space, value is the remainder of the line (the vector, still a string).
cuifile = 'claims_cuis_hs_300.txt.gz'
cui_embeddings = {}
with gzip.open(cuifile, 'r') as infile:
    for raw_line in infile.readlines():
        # the gzip handle returns bytes in 'r' mode
        text = raw_line.decode('utf8').strip()
        name, _space, embedding = text.partition(' ')
        cui_embeddings[name] = embedding

In [122]:
#generate the mapping of the CUI and ICD10CM
# cui_to_icd10cm: CUI -> first ICD10CM code seen for it (English rows only)
# icd10cm_to_cui: ICD10CM code -> list of CUIs whose first code it is
cui_to_icd10cm = {}
icd10cm_to_cui = {}
# Stream the file line by line instead of loading it all with readlines().
# NOTE(review): encoding pinned to UTF-8 on the assumption that this is a
# standard UMLS release file -- confirm.
with open('MRCONSO.RRF', 'r', encoding='utf-8') as infile:
    for row in infile:
        datum = row.strip().split('|')
        # blank or truncated rows cannot be indexed at [13]; skip them
        if len(datum) < 14:
            continue
        cui, language, source, code = datum[0], datum[1], datum[11], datum[13]
        if language == 'ENG' and source == 'ICD10CM' and cui not in cui_to_icd10cm:
            cui_to_icd10cm[cui] = code
            # a CUI reaches this point once, so it cannot already be in the list
            icd10cm_to_cui.setdefault(code, []).append(cui)

In [174]:
# For every ICD10CM code, average the claims-based vectors of its CUIs.
# Codes with no embedded CUI are left out of the result.
icd10cm_to_embedding = {}

for icd, cuis in icd10cm_to_cui.items():
    embeddings = [
        np.array(cui_embeddings[cui].split(' '), dtype=float)
        for cui in cuis
        if cui in cui_embeddings
    ]
    if embeddings:
        icd10cm_to_embedding[icd] = np.array(embeddings).mean(axis=0)

In [198]:
# Summarise how many vectors were loaded and how many ICD10CM codes they cover.
for label, table in (
    ('size of CUI embedding: ', cui_embeddings),
    ('size of embedding ICD10CM: ', icd10cm_to_embedding),
):
    print(label, len(table))

size of CUI embedding:  14853
size of embedding ICD10CM:  4394


# Embedding from UMLS 

`stanford_cuis_svd_300.txt.gz`: Embeddings of [UMLS](https://www.nlm.nih.gov/research/umls/) concept unique identifiers (CUIs), derived from 20 million clinical notes spanning 19 years of data from Stanford Hospital and Clinics, using a  [data set](http://datadryad.org/resource/doi:10.5061/dryad.jp917) released in a [paper](http://www.nature.com/articles/sdata201432) by Finlayson, LePendu & Shah.



In [158]:
# Load the Stanford SVD embeddings. Keys are the first token of each line
# (translated to CUIs further down); values are the raw vector strings.
svdfile = 'stanford_cuis_svd_300.txt.gz'
svd_embeddings = {}
with gzip.open(svdfile, 'r') as infile:
    for raw_line in infile.readlines():
        # bytes -> str before splitting off the identifier
        name, _space, embedding = raw_line.decode('utf8').strip().partition(' ')
        svd_embeddings[name] = embedding
        
#Read in the concept_to_CUI map
# Each line is "<concept>\t<CUI>". The value is stored as a list holding the
# CUI text with any trailing carriage return / newline removed.
concept_to_CUI_hdr = 'eval/2b_concept_ID_to_CUI.txt'
concept_to_CUI_map = {}
# the context manager guarantees the file handle is closed after reading
with open(concept_to_CUI_hdr, 'r') as f:
    for line in f:
        fields = line.split('\t')
        # skip blank or malformed lines that have no CUI column
        if len(fields) < 2:
            continue
        concept = fields[0]
        CUI = fields[1].split('\r')[0].splitlines()
        concept_to_CUI_map[concept] = CUI

# Re-key the SVD vectors by CUI, dropping concepts that have no CUI mapping.
# The first element of the mapped list is used as the CUI.
cui_svd_embeddings = {
    concept_to_CUI_map[concept_id][0]: vector_text
    for concept_id, vector_text in svd_embeddings.items()
    if concept_id in concept_to_CUI_map
}

In [184]:
# Mean SVD vector per ICD10CM code, taken over the CUIs that have an SVD
# embedding; codes with none are omitted.
icd10cm_svd_embedding = {}

for icd, cuis in icd10cm_to_cui.items():
    embeddings = []
    for cui in cuis:
        vector_text = cui_svd_embeddings.get(cui)
        if vector_text is None:
            continue
        embeddings.append(np.array(vector_text.split(' '), dtype=float))
    if embeddings:
        icd10cm_svd_embedding[icd] = np.mean(np.array(embeddings), axis=0)

In [197]:
# Summarise the SVD embedding table and its ICD10CM coverage.
for label, table in (
    ('size of CUI embedding: ', svd_embeddings),
    ('size of embedding ICD10CM: ', icd10cm_svd_embedding),
):
    print(label, len(table))

size of CUI embedding:  22706
size of embedding ICD10CM:  3063


# Embedding  from Semantic Similarity

Results generated from paper: 

 `DeVine_etal_200.txt.gz`: Embeddings of UMLS CUIs learned by [De Vine et al. CIKM '14](http://dl.acm.org/citation.cfm?id=2661974), derived from 348,566 medical journal abstracts (courtesy of the authors).


In [147]:
# Load the De Vine et al. embeddings: first token of each line is the key,
# the rest of the line is kept as the raw vector string.
file = 'DeVine_etal_200.txt.gz'
cui200_embeddings = {}
with gzip.open(file, 'r') as infile:
    for raw_line in infile.readlines():
        # decode the bytes produced by the gzip reader
        text = raw_line.decode('utf8').strip()
        name, _space, embedding = text.partition(' ')
        cui200_embeddings[name] = embedding

In [193]:
# Mean De Vine vector per ICD10CM code, over the CUIs present in
# `cui200_embeddings`; codes without any such CUI are skipped.
icd10cm_DeVine_embedding = {}

for icd, cuis in icd10cm_to_cui.items():
    embeddings = [
        np.array(cui200_embeddings[cui].split(' '), dtype=float)
        for cui in cuis
        if cui in cui200_embeddings
    ]
    if len(embeddings) == 0:
        continue
    icd10cm_DeVine_embedding[icd] = np.mean(np.array(embeddings), axis=0)

In [196]:
# Summarise the De Vine embedding table and its ICD10CM coverage.
for label, table in (
    ('size of CUI embedding: ', cui200_embeddings),
    ('size of embedding ICD10CM: ', icd10cm_DeVine_embedding),
):
    print(label, len(table))

size of CUI embedding:  52103
size of embedding ICD10CM:  3995


# Check the similarity 

In [199]:
# ICD10CM codes that have both a claims-based and an SVD-based vector,
# in the iteration order of `icd10cm_to_embedding`.
icd_ls = [code for code in icd10cm_to_embedding if code in icd10cm_svd_embedding]

In [203]:
# scipy's `cosine` is the cosine *distance* (1 - cosine similarity), so
# values near 1 mean the two vectors are close to orthogonal.
for code in icd_ls[:20]:
    claims_vector = icd10cm_to_embedding[code]
    svd_vector = icd10cm_svd_embedding[code]
    print(cosine(claims_vector, svd_vector))

1.0281133579317911
0.9653097554607961
0.8929884075824519
1.0250823382441114
1.0351952733075056
1.0134070325694438
0.9385475067068678
0.9235336549321366
0.8525211368621229
1.0208316444842074
0.9565962753014222
1.0471509774815773
1.0578278027958175
0.9626119194801128
1.0699642739287951
1.058134600588414
0.972127745445609
0.9520061647935075
1.0166671695986844
0.9633936535634939
