In [1]:
import pickle
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import copy
import faiss
import gc
import subprocess

In [2]:
# Read the tab-separated AUI table and order it by its '0' column, largest first.
# NOTE(review): the resulting row order presumably has to line up with the pickled
# vector shards attached later in the notebook - confirm before changing the sort.
print('Loading Strings')

sorted_umls_df = pd.read_csv(
    '/data/Bodenreider_UMLS_DL/Interns/Bernal/sorted_umls2020_auis.csv',
    sep='\t',
    index_col=0,
).sort_values('0', ascending=False)

Loading Strings


In [3]:
# Load the (original, new) pair stored in the UMLS 2020 update pickle.
# The context manager closes the file handle as soon as loading finishes.
# NOTE(security): pickle.load can run arbitrary code - only use on trusted files.
with open('aui_string_map_UMLS2020_update.p', 'rb') as fp:
    original_umls_2020, new_umls_2020 = pickle.load(fp)

# First element of every entry in the original collection, kept as a set for
# fast membership tests (presumably the AUI of each entry - TODO confirm).
original_auis = {entry[0] for entry in original_umls_2020}

In [4]:
# Load the two synonym lookups (indexed by AUI further down in the notebook).
# Each file is opened in a context manager so its handle is closed right after loading.
# NOTE(security): pickle.load can run arbitrary code - only use on trusted files.
with open('new_umls_synonym_aui_dict.p', 'rb') as fp:
    synonym_dict = pickle.load(fp)

with open('original_umls_synonym_aui_dict.p', 'rb') as fp:
    umls2020AA_synonym_dict = pickle.load(fp)

In [5]:
# For every AUI in sorted_umls_df build two parallel lists:
#   new          - True when the AUI is absent from original_auis
#   synonym_list - that AUI's synonyms, restricted to members of original_auis
new = []
synonym_list = []

for aui in tqdm(sorted_umls_df.auis):

    is_new = aui not in original_auis
    new.append(is_new)

    # AUIs missing from the original set are looked up in synonym_dict,
    # all others in umls2020AA_synonym_dict.
    synonyms = synonym_dict[aui] if is_new else umls2020AA_synonym_dict[aui]

    # A distinct name (syn_aui) is used so the outer loop variable is not shadowed.
    synonym_list.append([syn_aui for syn_aui in synonyms if syn_aui in original_auis])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8951355/8951355 [01:00<00:00, 149058.79it/s]


In [6]:
# Attach the per-row flag and the filtered synonym lists as new columns.
for column_name, column_values in (('2020AB?', new), ('2020AA_synonyms', synonym_list)):
    sorted_umls_df[column_name] = column_values

In [7]:
# Non-null counts of every column, split by the '2020AB?' flag.
rows_by_release = sorted_umls_df.groupby('2020AB?')
rows_by_release.count()

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms
2020AB?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,8521220,8521200,8521220,8521220
True,430135,430133,430135,430135


In [8]:
# Embedding model identifier; vectors_name is reused below both as a directory /
# file-name component and as the DataFrame column that stores the vectors.
model_name = 'sapbert'
vectors_name = model_name + '_vecs'

In [9]:
# Load the pickled vector shards and stack them into one array.
# NOTE(security): pickle.load can run arbitrary code - only use on trusted files.
NUM_VECTOR_SHARDS = 167  # shard files are numbered 0 .. NUM_VECTOR_SHARDS - 1

vec_shards = []
for i in range(NUM_VECTOR_SHARDS):
    shard_path = '/data/Bodenreider_UMLS_DL/Interns/Bernal/{}/umls2020_{}_{}.p'.format(vectors_name, vectors_name, i)
    # Close each shard's handle as soon as it is read instead of leaking 167 open files.
    with open(shard_path, 'rb') as fp:
        vec_shards.append(pickle.load(fp))

vecs = np.vstack(vec_shards)

In [10]:
# Store one vector (one row of vecs) per DataFrame row, matched by position.
sorted_umls_df[vectors_name] = [row_vec for row_vec in vecs]

In [11]:
# Split out the rows whose '2020AB?' flag is False.
# The mask is computed once and applied with .loc, so the filtered frame (including
# the vector column) is not materialised twice.
is_2020AA = ~sorted_umls_df['2020AB?']

# .copy() makes this an independent frame: later cells add columns to it, which on a
# chained slice would trigger pandas' SettingWithCopyWarning.
umls2020AA_df = sorted_umls_df.loc[is_2020AA, ['0','strings','auis','2020AA_synonyms']].copy()

# Stack the per-row vectors of the same rows into a single array.
umls2020AA_vecs = sorted_umls_df.loc[is_2020AA, vectors_name]
umls2020AA_vecs = np.vstack(umls2020AA_vecs)

In [12]:
# Split out the rows whose '2020AB?' flag is True.
# The mask is taken once and applied with .loc, so the filtered frame (including
# the vector column) is not materialised twice.
is_2020AB = sorted_umls_df['2020AB?']

# .copy() makes this an independent frame: later cells add columns to it, which on a
# chained slice would trigger pandas' SettingWithCopyWarning.
umls2020AB_df = sorted_umls_df.loc[is_2020AB, ['0','strings','auis','2020AA_synonyms']].copy()

# Stack the per-row vectors of the same rows into a single array.
umls2020AB_vecs = sorted_umls_df.loc[is_2020AB, vectors_name]
umls2020AB_vecs = np.vstack(umls2020AB_vecs)

In [13]:
# AUI -> string lookup for the 2020AA rows; tqdm wraps the pairs to report progress.
aui_string_pairs = zip(umls2020AA_df.auis, umls2020AA_df.strings)
umls2020AA_aui2str = {aui: string for aui, string in tqdm(aui_string_pairs)}

8521220it [00:12, 697561.55it/s]


In [14]:
# Translate each 2020AA row's synonym AUIs into strings via umls2020AA_aui2str,
# then record the strings and the number of synonyms as new columns.
synonym_strings = [
    [umls2020AA_aui2str[syn_aui] for syn_aui in syn_auis]
    for syn_auis in tqdm(umls2020AA_df['2020AA_synonyms'])
]

umls2020AA_df['synonym_strings'] = synonym_strings
umls2020AA_df['num_syms'] = list(map(len, umls2020AA_df['2020AA_synonyms']))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8521220/8521220 [00:55<00:00, 154661.53it/s]


In [15]:
# Translate each 2020AB row's 2020AA synonym AUIs into strings via umls2020AA_aui2str,
# then record the strings and the number of synonyms as new columns.
synonym_strings = [
    [umls2020AA_aui2str[syn_aui] for syn_aui in syn_auis]
    for syn_auis in tqdm(umls2020AB_df['2020AA_synonyms'])
]

umls2020AB_df['synonym_strings'] = synonym_strings
umls2020AB_df['num_syms'] = list(map(len, umls2020AB_df['2020AA_synonyms']))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 430135/430135 [00:02<00:00, 188929.76it/s]


In [17]:
# Show every 2020AA row whose string is exactly 'CEBPA'.
cebpa_mask = umls2020AA_df['strings'] == 'CEBPA'
umls2020AA_df[cebpa_mask]

Unnamed: 0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms
1259107,4,CEBPA,A10782841,"[A10798769, A29778335, A10825207, A1916225, A1...","[CCAAT/Enhancer Binding Protein Alpha, CCAAT/E...",12
3683749,4,CEBPA,A10782842,"[A10799053, A29778507, A10824555, A29774149, A...","[CEBPA wt Allele, CEBPA wt Allele, CEBPA wt Al...",8
3111674,4,CEBPA,A20716401,"[A19532504, A20752317, A6899252, A23909252, A2...","[CEBPA gene, CEBPA gene, CEBPA gene, CEBPA Gen...",15
3111675,4,CEBPA,A27940230,"[A19532504, A20752317, A6899252, A23909252, A2...","[CEBPA gene, CEBPA gene, CEBPA gene, CEBPA Gen...",15
3111677,4,CEBPA,A12035931,"[A19532504, A20752317, A6899252, A23909252, A2...","[CEBPA gene, CEBPA gene, CEBPA gene, CEBPA Gen...",15
3111676,4,CEBPA,A7638315,"[A19532504, A20752317, A6899252, A23909252, A2...","[CEBPA gene, CEBPA gene, CEBPA gene, CEBPA Gen...",15


In [18]:
# Show every 2020AB row whose string is exactly 'CEBPA'.
cebpa_mask = umls2020AB_df['strings'] == 'CEBPA'
umls2020AB_df[cebpa_mask]

Unnamed: 0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms
8586334,4,CEBPA,A31614331,"[A19532504, A20752317, A6899252, A23909252, A2...","[CEBPA gene, CEBPA gene, CEBPA gene, CEBPA Gen...",16


In [63]:
# Fraction of 2020AA rows per 'num_syms' value (first ten rows of the grouped table).
umls2020AA_df.groupby('num_syms').count().head(10).div(len(umls2020AA_df))

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings
num_syms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.280057,0.280057,0.280057,0.280057,0.280057
1,0.18203,0.18203,0.18203,0.18203,0.18203
2,0.11573,0.11573,0.11573,0.11573,0.11573
3,0.088549,0.088548,0.088549,0.088549,0.088549
4,0.075977,0.075977,0.075977,0.075977,0.075977
5,0.032708,0.032708,0.032708,0.032708,0.032708
6,0.022918,0.022918,0.022918,0.022918,0.022918
7,0.019668,0.019668,0.019668,0.019668,0.019668
8,0.015915,0.015915,0.015915,0.015915,0.015915
9,0.014465,0.014465,0.014465,0.014465,0.014465


In [65]:
# Fraction of 2020AB rows per 'num_syms' value (first ten rows of the grouped table).
umls2020AB_df.groupby('num_syms').count().head(10).div(len(umls2020AB_df))

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings
num_syms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.604596,0.604596,0.604596,0.604596,0.604596
1,0.043865,0.043865,0.043865,0.043865,0.043865
2,0.040724,0.040724,0.040724,0.040724,0.040724
3,0.034429,0.034429,0.034429,0.034429,0.034429
4,0.049666,0.049666,0.049666,0.049666,0.049666
5,0.025669,0.025669,0.025669,0.025669,0.025669
6,0.018497,0.018497,0.018497,0.018497,0.018497
7,0.014698,0.014698,0.014698,0.014698,0.014698
8,0.014528,0.014528,0.014528,0.014528,0.014528
9,0.013526,0.013526,0.013526,0.013526,0.013526


In [35]:
# Build AUI / CUI / string / semantic-type lookup tables from the 2020AB release
# files, restricted to the AUIs listed in the two evaluation pickles, plus a
# semantic-type -> semantic-group map from SemGroups.txt.

# --- MRCONSO: one {'AUI', 'CUI', 'STR'} record per line -----------------------
aui_info = []

with open('/data/Bodenreider_UMLS_DL/UMLS_VERSIONS/2020AB-ACTIVE/META/MRCONSO.RRF','r', encoding='utf-8') as fp:

    # Iterate over the handle so the file is streamed line by line instead of
    # being loaded whole with readlines().
    for line in fp:
        fields = line.split('|')
        # Field 0 is stored as CUI, field 7 as AUI and the fifth field from the end
        # as STR (assumes the standard pipe-terminated MRCONSO layout - TODO confirm).
        aui_info.append({'AUI':fields[7], 'CUI':fields[0], 'STR':fields[-5]})

# --- MRSTY: CUI -> field 3 of its line ----------------------------------------
cui2sg = {}

with open('/data/Bodenreider_UMLS_DL/UMLS_VERSIONS/2020AB-ACTIVE/META/MRSTY.RRF','r', encoding='utf-8') as fp:

    for line in fp:
        fields = line.split('|')
        # A CUI that occurs on several lines keeps only the value of its last line.
        cui2sg[fields[0]] = fields[3]

# --- Evaluation pickles --------------------------------------------------------
# NOTE(security): pickle.load can run arbitrary code - only use on trusted files.
with open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/INTERSECT_AUI2ID.PICKLE','rb') as fp:
    original_umls = pickle.load(fp)

with open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/UNIQUE_AUI2ID.PICKLE','rb') as fp:
    new_auis = pickle.load(fp)

with open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/AUI2LAYER.PICKLE','rb') as fp:
    aui_vecs = pickle.load(fp)

# Every AUI that is a key of either pickle.
all_2020_auis = set(original_umls.keys()).union(new_auis.keys())

# --- Lookup tables restricted to all_2020_auis --------------------------------
cui2aui = {}
aui2cui = {}
aui2str = {}
aui2sg = {}

cui_sg = []
cui_aui = []

for record in aui_info:
    aui = record['AUI']

    if aui not in all_2020_auis:
        continue

    cui = record['CUI']
    sg = cui2sg[cui]  # raises KeyError if the CUI was not seen in MRSTY

    cui2aui.setdefault(cui, []).append(aui)

    aui2cui[aui] = cui
    aui2str[aui] = record['STR']
    aui2sg[aui] = sg

    cui_sg.append((cui, sg))
    cui_aui.append((cui, aui))

# --- SemGroups.txt: column 3 -> column 1 --------------------------------------
semgroups = pd.read_csv('SemGroups.txt',sep='|',header=None)

# Later rows overwrite earlier ones for a repeated column-3 value, as in a plain loop.
semtype2sg = dict(zip(semgroups[3], semgroups[1]))

In [36]:
# Look up the CUI and semantic type of every 2020AA AUI, then derive the semantic group.
cuis = [aui2cui[aui] for aui in umls2020AA_df.auis]
sts = [aui2sg[aui] for aui in umls2020AA_df.auis]

umls2020AA_df['cuis'] = cuis
umls2020AA_df['sem_types'] = sts
umls2020AA_df['sem_groups'] = list(map(semtype2sg.__getitem__, sts))

In [37]:
# Look up the CUI and semantic type of every 2020AB AUI, then derive the semantic group.
cuis = [aui2cui[aui] for aui in umls2020AB_df.auis]
sts = [aui2sg[aui] for aui in umls2020AB_df.auis]

umls2020AB_df['cuis'] = cuis
umls2020AB_df['sem_types'] = sts
umls2020AB_df['sem_groups'] = list(map(semtype2sg.__getitem__, sts))

In [40]:
# Fraction of 2020AA rows falling in each semantic group, ordered by group name.
sem_group_counts = umls2020AA_df.groupby('sem_groups').count().sort_values('sem_groups')
sem_group_counts / len(umls2020AA_df)

Unnamed: 0_level_0,0,strings,auis,cuis,sem_types
sem_groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Activities & Behaviors,0.001962,0.001962,0.001962,0.001962,0.001962
Anatomy,0.053923,0.053923,0.053923,0.053923,0.053923
Chemicals & Drugs,0.186851,0.186851,0.186851,0.186851,0.186851
Concepts & Ideas,0.026475,0.026473,0.026475,0.026475,0.026475
Devices,0.014396,0.014396,0.014396,0.014396,0.014396
Disorders,0.221163,0.221163,0.221163,0.221163,0.221163
Genes & Molecular Sequences,0.045673,0.045672,0.045673,0.045673,0.045673
Geographic Areas,0.002502,0.002502,0.002502,0.002502,0.002502
Living Beings,0.258185,0.258185,0.258185,0.258185,0.258185
Objects,0.005677,0.005677,0.005677,0.005677,0.005677


In [41]:
# Fraction of 2020AB rows falling in each semantic group, ordered by group name.
sem_group_counts = umls2020AB_df.groupby('sem_groups').count().sort_values('sem_groups')
sem_group_counts / len(umls2020AB_df)

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,cuis,sem_types
sem_groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Activities & Behaviors,0.001009,0.001009,0.001009,0.001009,0.001009,0.001009,0.001009,0.001009
Anatomy,0.007372,0.007372,0.007372,0.007372,0.007372,0.007372,0.007372,0.007372
Chemicals & Drugs,0.269634,0.269634,0.269634,0.269634,0.269634,0.269634,0.269634,0.269634
Concepts & Ideas,0.022358,0.022356,0.022358,0.022358,0.022358,0.022358,0.022358,0.022358
Devices,0.009223,0.009223,0.009223,0.009223,0.009223,0.009223,0.009223,0.009223
Disorders,0.092669,0.092669,0.092669,0.092669,0.092669,0.092669,0.092669,0.092669
Genes & Molecular Sequences,0.138236,0.138233,0.138236,0.138236,0.138236,0.138236,0.138236,0.138236
Geographic Areas,0.000137,0.000137,0.000137,0.000137,0.000137,0.000137,0.000137,0.000137
Living Beings,0.379262,0.379262,0.379262,0.379262,0.379262,0.379262,0.379262,0.379262
Objects,0.005136,0.005136,0.005136,0.005136,0.005136,0.005136,0.005136,0.005136


In [70]:
# Let long cell values (e.g. synonym lists) display without truncation.
pd.options.display.max_colwidth = 5000

In [27]:
# Replace both frames' indexes with a fresh 0..n-1 range, discarding the old labels.
umls2020AA_df, umls2020AB_df = (
    frame.reset_index(drop=True) for frame in (umls2020AA_df, umls2020AB_df)
)

In [76]:
# Show the 2020AA row(s) for AUI 'A4366995'.
umls2020AA_df.loc[umls2020AA_df['auis'] == 'A4366995']

Unnamed: 0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms
2993612,4,autoimmune vasculitis,A4366995,"[A13021789, A4366994, A4366996, A4366997, A4366998, A4366993, A23080245, A13023076]","[Autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis, Immune mediated vasculitis, Autoimmune vasculitis (disorder)]",8


In [75]:
# Show the 2020AB row(s) for AUI 'A4366995'.
umls2020AB_df.loc[umls2020AB_df['auis'] == 'A4366995']

Unnamed: 0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms


In [31]:
# Dot product between one 2020AA vector and one 2020AB vector, selected by row position.
aa_vec = umls2020AA_vecs[8130780]
ab_vec = umls2020AB_vecs[428419]
aa_vec.dot(ab_vec)

176.92184

In [None]:
# Store the nearest-neighbour results as '<model>_<k>-NN_*' columns.
# NOTE(review): k, nearest_neighbors_strings, nearest_neighbors_auis and full_sort_D
# are not defined in the visible part of this notebook - confirm where they are computed.
nn_prefix = '{}_{}-NN'.format(model_name, k)

umls2020AB_df[nn_prefix + '_strings'] = nearest_neighbors_strings
umls2020AB_df[nn_prefix + '_auis'] = nearest_neighbors_auis
umls2020AB_df[nn_prefix + '_dist'] = list(full_sort_D)

In [None]:
# Store recall_array under the '<model>_<k>-NN_recall' column.
# NOTE(review): k and recall_array are not defined in the visible part of this notebook.
recall_column = '{}_{}-NN_recall'.format(model_name, k)
umls2020AB_df[recall_column] = recall_array

In [None]:
# Persist the annotated 2020AB frame. The context manager guarantees the file is
# flushed and closed once the dump finishes, rather than relying on garbage collection.
# NOTE(review): k is not defined in the visible part of this notebook.
output_path = '/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_{}.{}-NN_DataFrame.p'.format(model_name, k)

with open(output_path, 'wb') as fp:
    pickle.dump(umls2020AB_df, fp)