In [2]:
import pickle
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import copy

pd.set_option('precision',2)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [3]:
umls2020AB_df = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_2000-NN_DataFrame.p','rb'))

In [3]:
recall_array = list(umls2020AB_df['sapbert_400-NN_recall'].values)

In [4]:
pd.DataFrame(recall_array).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,170077.0,170077.0,170077.0,170077.0,170077.0,170077.0,170077.0,170077.0,170077.0
mean,0.195745,0.437299,0.531481,0.711214,0.7641,0.808504,0.855611,0.878045,0.892669
std,0.274749,0.392385,0.39839,0.368424,0.345396,0.31999,0.290707,0.274055,0.260342
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.083333,0.142857,0.40625,0.588235,0.733333,0.875,0.96,1.0
50%,0.083333,0.285714,0.5,1.0,1.0,1.0,1.0,1.0,1.0
75%,0.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
nearest_neighbors_auis = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Bernal/lex_lm_2000-NN.p','rb'))
nearest_neighbors_dist = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Bernal/lex_lm_2000-NN_dist.p','rb'))
nearest_neighbors_auis = [auis for auis in nearest_neighbors_auis]

original_umls_2020, new_umls_2020 = pickle.load(open('aui_string_map_UMLS2020_update.p','rb'))

new_umls_2020 = [x[0] for x in new_umls_2020]
new_umls_2020 = pd.DataFrame(new_umls_2020,columns=['auis'])
new_umls_2020['lexlm_2000-NN_auis']  = nearest_neighbors_auis
new_umls_2020['lexlm_2000-NN_dist']  = list(nearest_neighbors_dist)

umls2020AB_df = umls2020AB_df.merge(new_umls_2020,on='auis',how='inner')

query_synonym_auis = list(umls2020AB_df['2020AA_synonyms'])
nearest_neighbors_auis = umls2020AB_df['lexlm_2000-NN_auis']

#Calculating Recall @ 1,5,10,50,100
recall_array = []
# closest_dist_true = []
# closest_dist_false = []

for true_syn, top100 in tqdm(zip(query_synonym_auis, nearest_neighbors_auis)):
    
    true_syn = set(true_syn)
    
    if len(true_syn) > 0:
        recalls = []

        for n in [1,5,10,50,100,200,500,1000,2000]:

            topn = set(top100[:n])
            true_pos = topn.intersection(true_syn)

            recalls.append(len(true_pos)/len(true_syn))

        recall_array.append(recalls)
#         closest_dist_true.append([top100_dist[0], np.mean(top100_dist)])
    else:
        recalls = []

        recall_array.append(recalls)
#         closest_dist_false.append([top100_dist[0], np.mean(top100_dist)])

umls2020AB_df['lexlm_2000-NN_recall'] = recall_array

In [14]:
aui_info = []

with open('/data/Bodenreider_UMLS_DL/UMLS_VERSIONS/2020AB-ACTIVE/META/MRCONSO.RRF','r') as fp:
    
    for line in fp.readlines():
        line = line.split('|')
        cui = line[0]
        aui = line[7]
        string = line[-5]
        
        aui_info.append({'AUI':aui, 'CUI':cui, 'STR':string})
        
cui2sg = {}

with open('/data/Bodenreider_UMLS_DL/UMLS_VERSIONS/2020AB-ACTIVE/META/MRSTY.RRF','r') as fp:
    
    for line in fp.readlines():
        line = line.split('|')
        cui = line[0]
        sg = line[3]
        cui2sg[cui] = sg
        
original_umls = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/INTERSECT_AUI2ID.PICKLE','rb'))
new_auis = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/UNIQUE_AUI2ID.PICKLE','rb'))

aui_vecs  = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/AUI2LAYER.PICKLE','rb'))

all_2020_auis = set(original_umls.keys()).union(new_auis.keys())

cui2aui = {}
aui2cui = {}
aui2str = {}
aui2sg = {}

cui_sg = []
cui_aui = []

for tup in aui_info:
    aui = tup['AUI']
    
    if aui in all_2020_auis:        
        cui = tup['CUI']
        string = tup['STR']
        sg = cui2sg[cui]

        auis = cui2aui.get(cui, [])
        auis.append(aui)
        cui2aui[cui] = auis

        aui2cui[aui] = cui
        aui2str[aui] = string
        aui2sg[aui] = sg

        cui_sg.append((cui, sg))
        cui_aui.append((cui, aui))
        
semgroups = pd.read_csv('SemGroups.txt',sep='|',header=None)

semtype2sg = {}

for i, row in semgroups.iterrows():
    
    st = row[3]
    sg = row[1]
    
    semtype2sg[st] = sg
    
cuis = []
sts = []

for aui in umls2020AB_df.auis:
    
    cuis.append(aui2cui[aui])
    sts.append(aui2sg[aui])
    
umls2020AB_df['cuis'] = cuis
umls2020AB_df['sem_types'] = sts
umls2020AB_df['sem_groups'] = [semtype2sg[st] for st in sts]

In [27]:
100*pd.DataFrame(list(umls2020AB_df['sapbert_400-NN_recall'].values)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0
mean,19.57,43.73,53.15,71.12,76.41,80.85,85.56,87.8,89.27
std,27.47,39.24,39.84,36.84,34.54,32.0,29.07,27.41,26.03
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,8.33,14.29,40.62,58.82,73.33,87.5,96.0,100.0
50%,8.33,28.57,50.0,100.0,100.0,100.0,100.0,100.0,100.0
75%,25.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [28]:
100*pd.DataFrame(list(umls2020AB_df[(umls2020AB_df.num_syms > 0) & (umls2020AB_df['0'] < 20)]['sapbert_400-NN_recall'].values)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,15338200.0,15338200.0,15338200.0,15338200.0,15338200.0,15338200.0,15338200.0,15338200.0,15338200.0
mean,20.71,45.61,55.02,72.33,77.15,81.03,85.06,87.05,88.46
std,28.1,39.42,39.72,36.46,34.39,32.22,29.72,28.25,26.93
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,10.0,16.67,46.15,61.54,75.0,86.67,93.75,100.0
50%,9.09,33.33,50.0,100.0,100.0,100.0,100.0,100.0,100.0
75%,25.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [29]:
100*pd.DataFrame(list(umls2020AB_df['lexlm_2000-NN_recall'].values)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0
mean,9.83,21.81,28.0,41.78,46.52,50.75,55.85,58.98,62.17
std,20.97,32.75,37.01,43.77,45.01,45.73,46.19,45.96,45.48
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,3.42,22.22,34.25,58.82,81.82,90.91,98.98
75%,9.09,33.33,50.0,100.0,100.0,100.0,100.0,100.0,100.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [30]:
100*pd.DataFrame(list(umls2020AB_df[(umls2020AB_df.num_syms > 0) & (umls2020AB_df['0'] < 100)]['lexlm_2000-NN_recall'].values)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,16979600.0,16979600.0,16979600.0,16979600.0,16979600.0,16979600.0,16979600.0,16979600.0,16979600.0
mean,9.83,21.83,28.03,41.82,46.56,50.77,55.86,58.97,62.17
std,20.97,32.76,37.02,43.78,45.02,45.74,46.19,45.97,45.48
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,3.42,23.08,34.25,58.93,82.14,91.3,98.98
75%,9.09,33.33,50.0,100.0,100.0,100.0,100.0,100.0,100.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [31]:
pd.set_option('max_rows',200)

In [32]:
100*umls2020AB_df[(umls2020AB_df.num_syms > 0)].groupby('sem_groups').count().sort_values('0',ascending=False)

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,sapbert_400-NN_strings,sapbert_400-NN_auis,sapbert_400-NN_dist,sapbert_400-NN_recall,num_syms,lexlm_2000-NN_auis,lexlm_2000-NN_dist,cuis,sem_types,lexlm_2000-NN_recall
sem_groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Chemicals & Drugs,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600,6654600
Genes & Molecular Sequences,5361300,5361200,5361300,5361300,5361300,5361300,5361300,5361300,5361300,5361300,5361300,5361300,5361300,5361300,5361300
Disorders,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300,1760300
Living Beings,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200,1634200
Procedures,669300,669300,669300,669300,669300,669300,669300,669300,669300,669300,669300,669300,669300,669300,669300
Physiology,256800,256800,256800,256800,256800,256800,256800,256800,256800,256800,256800,256800,256800,256800,256800
Concepts & Ideas,238100,238000,238100,238100,238100,238100,238100,238100,238100,238100,238100,238100,238100,238100,238100
Anatomy,170200,170200,170200,170200,170200,170200,170200,170200,170200,170200,170200,170200,170200,170200,170200
Devices,158800,158800,158800,158800,158800,158800,158800,158800,158800,158800,158800,158800,158800,158800,158800
Objects,52900,52900,52900,52900,52900,52900,52900,52900,52900,52900,52900,52900,52900,52900,52900


In [33]:
at1_sb_recall = []
at1_lm_recall = []

for i,row in tqdm(umls2020AB_df.iterrows()):
    recalls = row['sapbert_400-NN_recall']

    if len(recalls) > 0:
        at1_sb_recall.append(recalls)
    else:
        at1_sb_recall.append([None for i in [1,5,10,50,100,200,500,1000,2000]])

    recalls = row['lexlm_2000-NN_recall']

    if len(recalls) > 0:
        at1_lm_recall.append(recalls)
    else:
        at1_lm_recall.append([None for i in [1,5,10,50,100,200,500,1000,2000]])

430135it [01:56, 3688.48it/s] 


In [34]:
for index,n in tqdm(enumerate([1,5,10,50,100,200,500,1000,2000])): 
    umls2020AB_df['R@{}_SB'.format(n)] = np.array(at1_sb_recall)[:,index]   
    umls2020AB_df['R@{}_LM'.format(n)] = np.array(at1_lm_recall)[:,index]

9it [00:20,  2.31s/it]


In [35]:
umls2020AB_df

Unnamed: 0,0,strings,auis,2020AA_synonyms,synonym_strings,sapbert_400-NN_strings,sapbert_400-NN_auis,sapbert_400-NN_dist,sapbert_400-NN_recall,num_syms,...,R@100_SB,R@100_LM,R@200_SB,R@200_LM,R@500_SB,R@500_LM,R@1000_SB,R@1000_LM,R@2000_SB,R@2000_LM
0,1104,TRIFOLIUM PRATENSE FLOWER 3 [hp_X] in 1 mL / C...,A31798620,[],[],[TRIFOLIUM PRATENSE FLOWER 3 [hp_X] in 1 mL / ...,"[A26440340, A30927295, A23841760, A27180770, A...","[0.0, 0.0, 50.141663, 50.141663, 50.141663, 53...",[],0,...,,,,,,,,,,
1,865,LILIUM LANCIFOLIUM WHOLE FLOWERING 15 [hp_X] i...,A31791529,[],[],[LILIUM LANCIFOLIUM WHOLE FLOWERING 4 [hp_X] i...,"[A31203927, A30914006, A30914978, A29259366, A...","[48.458344, 48.458344, 68.00198, 71.98563, 79....",[],0,...,,,,,,,,,,
2,769,GARLIC 3 [hp_X] in 1 mL / AVENA SATIVA FLOWERI...,A32332182,[],[],[GARLIC 3 [hp_X] in 1 mL / AVENA SATIVA FLOWER...,"[A24842418, A28676601, A27390230, A28931492, A...","[0.0, 0.0, 28.652817, 53.046173, 53.046173, 53...",[],0,...,,,,,,,,,,
3,667,SUS SCROFA PITUITARY GLAND 6 [hp_X] in 1 mL / ...,A31622453,[],[],[BOS TAURUS PITUITARY GLAND 6 [hp_X] in 1 mL /...,"[A28532041, A21259528, A28576950, A29277909, A...","[44.42482, 64.85918, 80.25903, 80.25903, 80.25...",[],0,...,,,,,,,,,,
4,617,CHELIDONIUM MAJUS WHOLE 3 [hp_X] in 1 mL / LOB...,A31758369,[],[],[CHELIDONIUM MAJUS 3 [hp_X] in 1 mL / LOBELIA ...,"[A29974671, A27388758, A28558165, A23060867, A...","[43.67552, 43.67552, 50.00624, 50.00624, 50.00...",[],0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430130,3,vincristine,A31682108,"[A0491552, A18576062, A20895977, A22722488, A2...","[vincristine, vincristine, vincristine, vincri...","[VINCRISTINE, Vincristine, vinCRIStine, vincri...","[A8436833, A27055498, A4370610, A0491552, A299...","[0.00033569336, 0.00033569336, 0.00033569336, ...","[0.021739130434782608, 0.10869565217391304, 0....",46,...,0.67,1.00,0.87,1.00,0.96,1.00,0.96,1.00,0.96,1.00
430131,3,uridine,A31751560,"[A0491264, A18613289, A19986797, A26682704, A2...","[uridine, uridine, uridine, uridine, uridine, ...","[uridine, uridine, Uridine, Uridine, uridine, ...","[A19986797, A0491264, A18400855, A0130568, A31...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.037037037037037035, 0.18518518518518517, 0....",27,...,0.81,0.93,0.93,0.93,0.93,0.93,1.00,0.93,1.00,1.00
430132,3,Vagina,A31590388,"[A0131175, A0131181, A15368227, A18406922, A23...","[Vagina, Vagina, Vagina, Vagina, Vagina, Vagin...","[Vagina, Vagina, vagina, vagina, vagina, Vagin...","[A7561556, A19047065, A18613341, A23921079, A0...","[9.1552734e-05, 9.1552734e-05, 9.1552734e-05, ...","[0.037037037037037035, 0.14814814814814814, 0....",27,...,0.93,0.89,0.96,0.89,0.96,0.89,1.00,0.89,1.00,0.89
430133,3,SAV,A11924706,"[A20745789, A6908930, A23821695, A23809878, A2...","[SAV1 gene, SAV1 gene, WW domain-containing ad...","[SAVA, SAVA, SAVI, SAVI, SAV1, SAV1, SAV1 gene...","[A24379526, A23790121, A27927357, A24568869, A...","[41.30989, 41.30989, 53.73889, 53.73889, 87.26...","[0.0, 0.09090909090909091, 0.36363636363636365...",11,...,0.45,0.73,0.45,0.91,0.55,0.91,0.55,1.00,0.55,1.00


In [36]:
for n in [1,5,10,50,100,200,500,1000,2000]:
    umls2020AB_df['R@{}_oracle'.format(n)] = [None if sb == None else max(sb,lm) for lm, sb in zip(umls2020AB_df['R@{}_LM'.format(n)],umls2020AB_df['R@{}_SB'.format(n)])]

In [37]:
100*umls2020AB_df.filter(regex='R@.*_oracle').describe()

Unnamed: 0,R@1_oracle,R@5_oracle,R@10_oracle,R@50_oracle,R@100_oracle,R@200_oracle,R@500_oracle,R@1000_oracle,R@2000_oracle
count,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0,17007700.0
mean,20.14,46.2,56.77,75.6,80.69,84.77,88.93,90.83,92.09
std,27.54,39.1,39.44,35.04,32.35,29.52,26.32,24.58,23.17
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,10.2,19.23,56.25,74.07,84.21,98.77,100.0,100.0
50%,9.09,33.33,58.33,100.0,100.0,100.0,100.0,100.0,100.0
75%,25.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [110]:
sb_sem_group_analysis = []
lm_sem_group_analysis = []
oracle_sum_group_analysis = []

for sg, g in umls2020AB_df.groupby('sem_groups'):
    recalls = pd.DataFrame(list(g['sapbert_400-NN_recall'].values)).agg(['mean']).values
    sb_sem_group_analysis.append((sg, len(g)) + tuple(recalls[0]))

    recalls = pd.DataFrame(list(g['lexlm_2000-NN_recall'].values)).agg(['mean']).values
    lm_sem_group_analysis.append((sg, len(g)) + tuple(recalls[0]))
    
    oracle_sum_group_analysis.append((sg,len(g)) + tuple(g.filter(regex='R@.*_oracle').mean()))

In [111]:
recall_cols = ['R@{}'.format(n) for n in [1,5,10,50,100,200,500,1000,2000]]
sb_sem_group_analysis = pd.DataFrame(sb_sem_group_analysis, columns=['SemGroup','NumAuis'] + recall_cols).sort_values('NumAuis',ascending=False)
lm_sem_group_analysis = pd.DataFrame(lm_sem_group_analysis, columns=['SemGroup','NumAuis'] + recall_cols).sort_values('NumAuis',ascending=False)
oracle_sum_group_analysis = pd.DataFrame(oracle_sum_group_analysis, columns=['SemGroup','NumAuis'] + recall_cols).sort_values('NumAuis',ascending=False)

In [112]:
sb_sem_group_analysis = sb_sem_group_analysis[['SemGroup','NumAuis','R@1','R@100','R@2000']].merge(lm_sem_group_analysis[['SemGroup','R@1','R@100','R@2000']],on='SemGroup',suffixes=['_SAPBERT','_LexLM']).sort_values('R@100_SAPBERT',ascending=False)
sb_sem_group_analysis

Unnamed: 0,SemGroup,NumAuis,R@1_SAPBERT,R@100_SAPBERT,R@2000_SAPBERT,R@1_LexLM,R@100_LexLM,R@2000_LexLM
5,Physiology,10621,0.32,0.96,0.99,0.06,0.26,0.4
4,Procedures,20710,0.3,0.93,0.98,0.16,0.5,0.63
9,Objects,2209,0.26,0.92,0.97,0.16,0.61,0.7
10,Phenomena,680,0.38,0.9,0.99,0.16,0.41,0.59
6,Concepts & Ideas,9617,0.3,0.88,0.96,0.21,0.45,0.54
13,Occupations,86,0.39,0.88,0.99,0.11,0.42,0.55
3,Disorders,39860,0.24,0.87,0.96,0.14,0.59,0.72
0,Living Beings,163134,0.45,0.86,0.94,0.14,0.45,0.55
1,Chemicals & Drugs,115979,0.19,0.85,0.98,0.12,0.63,0.82
14,Geographic Areas,59,0.39,0.84,1.0,0.19,0.56,0.89


In [114]:
sb_sem_group_analysis.merge(oracle_sum_group_analysis[['SemGroup','R@1','R@100','R@2000']],on='SemGroup',suffixes=['','_Oracle'])[['SemGroup','NumAuis','R@1_SAPBERT','R@1','R@100_SAPBERT','R@100']]

Unnamed: 0,SemGroup,NumAuis,R@1_SAPBERT,R@1,R@100_SAPBERT,R@100
0,Physiology,10621,0.32,0.33,0.96,0.96
1,Procedures,20710,0.3,0.31,0.93,0.94
2,Objects,2209,0.26,0.28,0.92,0.94
3,Phenomena,680,0.38,0.39,0.9,0.91
4,Concepts & Ideas,9617,0.3,0.3,0.88,0.89
5,Occupations,86,0.39,0.39,0.88,0.9
6,Disorders,39860,0.24,0.25,0.87,0.92
7,Living Beings,163134,0.45,0.46,0.86,0.9
8,Chemicals & Drugs,115979,0.19,0.2,0.85,0.88
9,Geographic Areas,59,0.39,0.39,0.84,0.84


In [42]:
validation_df = []

for sg, g in umls2020AB_df.groupby('sem_groups'):
    validation_df.append(g.sample(int(len(g)*0.2), random_state=np.random.RandomState(42)))

validation_df = pd.concat(validation_df)

In [43]:
100*pd.DataFrame(list(validation_df['lexlm_2000-NN_recall'].values)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0
mean,9.92,22.03,28.14,41.82,46.48,50.74,55.9,59.09,62.28
std,21.06,32.96,37.15,43.8,45.04,45.74,46.18,45.95,45.47
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,3.42,22.41,34.25,59.09,81.82,91.67,99.52
75%,10.0,33.33,50.0,100.0,100.0,100.0,100.0,100.0,100.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [44]:
100*pd.DataFrame(list(validation_df['sapbert_400-NN_recall'].values)).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0,3402400.0
mean,19.67,43.98,53.35,71.28,76.5,80.8,85.51,87.77,89.23
std,27.52,39.36,39.95,36.81,34.56,32.14,29.2,27.51,26.11
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,8.33,14.49,41.18,60.0,73.91,87.5,96.15,100.0
50%,8.33,28.57,50.0,100.0,100.0,100.0,100.0,100.0,100.0
75%,25.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [61]:
pd.set_option('max_colwidth',2000)

In [54]:
errors = validation_df[validation_df['R@100_SB'] == 0.0]

In [117]:
errors['top10'] = [s[:10] for s in errors['sapbert_400-NN_strings']]
errors['top10_auis'] = [s[:10] for s in errors['sapbert_400-NN_auis']]
errors['top10_dist'] = [s[:10] for s in errors['sapbert_400-NN_dist']]
errors['top100'] = [s[:100] for s in errors['sapbert_400-NN_strings']]
errors['top100_dist'] = [s[:100] for s in errors['sapbert_400-NN_dist']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [67]:
errors.columns

Index(['0', 'strings', 'auis', '2020AA_synonyms', 'synonym_strings',
       'sapbert_400-NN_strings', 'sapbert_400-NN_auis', 'sapbert_400-NN_dist',
       'sapbert_400-NN_recall', 'num_syms', 'lexlm_2000-NN_auis',
       'lexlm_2000-NN_dist', 'cuis', 'sem_types', 'sem_groups',
       'lexlm_2000-NN_recall', 'R@1_SB', 'R@1_LM', 'R@5_SB', 'R@5_LM',
       'R@10_SB', 'R@10_LM', 'R@50_SB', 'R@50_LM', 'R@100_SB', 'R@100_LM',
       'R@200_SB', 'R@200_LM', 'R@500_SB', 'R@500_LM', 'R@1000_SB',
       'R@1000_LM', 'R@2000_SB', 'R@2000_LM', 'R@1_oracle', 'R@5_oracle',
       'R@10_oracle', 'R@50_oracle', 'R@100_oracle', 'R@200_oracle',
       'R@500_oracle', 'R@1000_oracle', 'R@2000_oracle', 'top10', 'top100',
       'top100_dist'],
      dtype='object')

In [78]:
errors.groupby('sem_groups').count()

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,sapbert_400-NN_strings,sapbert_400-NN_auis,sapbert_400-NN_dist,sapbert_400-NN_recall,num_syms,...,R@10_oracle,R@50_oracle,R@100_oracle,R@200_oracle,R@500_oracle,R@1000_oracle,R@2000_oracle,top10,top100,top100_dist
sem_groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Activities & Behaviors,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
Anatomy,23,23,23,23,23,23,23,23,23,23,...,23,23,23,23,23,23,23,23,23,23
Chemicals & Drugs,271,271,271,271,271,271,271,271,271,271,...,271,271,271,271,271,271,271,271,271,271
Concepts & Ideas,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
Devices,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
Disorders,85,85,85,85,85,85,85,85,85,85,...,85,85,85,85,85,85,85,85,85,85
Genes & Molecular Sequences,2005,2004,2005,2005,2005,2005,2005,2005,2005,2005,...,2005,2005,2005,2005,2005,2005,2005,2005,2005,2005
Living Beings,288,288,288,288,288,288,288,288,288,288,...,288,288,288,288,288,288,288,288,288,288
Objects,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
Organizations,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [118]:
for i,g in errors.groupby('sem_groups'):
    if len(g) > 80:
        print(i)
        display(g[['auis','strings','2020AA_synonyms','synonym_strings','num_syms','top10','top10_auis']][:20])

Chemicals & Drugs


Unnamed: 0,auis,strings,2020AA_synonyms,synonym_strings,num_syms,top10,top10_auis
348900,A32289507,"Cep170 protein, rat","[A7165417, A6816219, A7170412]","[KAB1 protein, rat, rKAB1 protein, rat, KARP-1 binding protein 1, rat]",3,"[Cep350 protein, rat, centrosomal protein 350, rat, Cend1 protein, rat, coilin protein, rat, coiled-coil domain-containing protein 172, rat, Coil protein, rat, Ccd1 protein, rat, Ccdc47 protein, rat, Ccdc80 protein, rat, centrosome BRCA2-interacting protein, rat]","[A24313756, A24311949, A24315555, A11276853, A24127512, A11281055, A29479453, A31501178, A31495777, A23629597]"
352404,A31749366,Sterculia resin,"[A15973569, A17442291, A20001592, A16002845]","[karaya gum allergenic extract, karaya gum allergenic extract, karaya gum allergenic extract, karaya gum allergenic extract (medication)]",4,"[Sterculia (plant), Sterculia gum (substance), Product containing Sterculia gum (medicinal product), Sterculia gum-containing product, Sterculia gum, Sterculias, Sterculia, sterculia, Sterculia, sterculia]","[A13026242, A3719217, A29526841, A30264938, A3073379, A16995171, A0119512, A18662827, A23867343, A28574616]"
18469,A32288556,"2-oxo-4-hydroxy-4-carboxy-5-ureidoimidazoline decarboxylase, zebrafish",[A12985185],"[OHCU decarboxylase, zebrafish]",1,"[2-oxo-4-hydroxy-4-carboxy-5-ureidoimidazoline decarboxylase, Klebsiella pneumoniae, 2-oxo-4-hydroxy-4-carboxy-5-ureidoimidazoline decarboxylase, Arabidopsis, 2-oxo-4-hydroxy-4-carboxy-5-ureidoimidazoline decarboxylase activity, UREIDOIMIDAZOLINE (2-OXO-4-HYDROXY-4-CARBOXY-5-) DECARBOXYLASE, ureidoimidazoline (2-oxo-4-hydroxy-4-carboxy-5-) decarboxylase, 2-oxo-4-hydroxy-4-carboxy--5-ureidoimidazoline, 2-oxo-4-hydroxy-4-carboxy--5-ureidoimidazoline decarboxylase, mouse, 1-carbamoyl-2-oxo-4,5-dihydroxyimidazolidine, N1-carbamoyl-4,5-dihydroxy-2-oxoimidazolidine, 2-(5-hydrazinocarbonyl-2-oxazolyl)-5,6-dimethoxybenzothiazole]","[A18898708, A13386629, A19349780, A24610322, A23816669, A10911408, A10911407, A18900137, A12999132, A0844213]"
14024,A31791852,ALCOHOL 141.6 mL in 177 mL TOPICAL LIQUID [Garrison Brothers Distillery Hand Sanitizer],"[A23551760, A24136321, A28749498]","[ethanol 80 % Topical Gel, ethanol 80 % Topical Gel, ALCOHOL 0.8 mL in 1 mL TOPICAL GEL [GOJO Instant Hand Sanitizer]]",3,"[ALCOHOL 26.36 g in 44 mL TOPICAL LIQUID [Love You Berry Much], ALCOHOL 146.94 mL in 237 mL TOPICAL LIQUID [skin protectant Original], Alcohol Wintergreen, topical liquid, ALCOHOL 146.94 mL in 237 mL TOPICAL LIQUID [skin protectant Sweet Pea], ALCOHOL 146.94 mL in 237 mL TOPICAL LIQUID [skin protectant Lavender], ALCOHOL 100 mL in 62 mL TOPICAL LIQUID, ALCOHOL 74.4 mL in 120 mL TOPICAL LIQUID [skin protectant], ALCOHOL 26.36 g in 44 mL TOPICAL LIQUID [Citrus Sunshine], ALCOHOL 26.36 g in 44 mL TOPICAL LIQUID [Foxy Citrus], ALCOHOL 17.57 g in 30 mL TOPICAL LIQUID [Rainbow Berry Hand Sanitizer]]","[A27860389, A27344081, A1529280, A27343207, A27344080, A30279727, A27343208, A27861359, A27864245, A29473767]"
127332,A31733188,ALCOHOL 141.6 mL in 177 mL TOPICAL LIQUID,"[A24113008, A27177344, A27862675, A27859742]","[ethanol 80 % Topical Solution, ethanol 80 % Topical Solution, ALCOHOL 800 mg in 1 mL TOPICAL LIQUID [Amoveo 80%], ALCOHOL 80 mL in 100 mL TOPICAL SOLUTION [Alcare Extra Hand Sanitizer]]",4,"[ALCOHOL 146.94 mL in 237 mL TOPICAL LIQUID [skin protectant Original], ALCOHOL 100 mL in 62 mL TOPICAL LIQUID, ALCOHOL 74.4 mL in 120 mL TOPICAL LIQUID [skin protectant], ALCOHOL 146.94 mL in 237 mL TOPICAL LIQUID [skin protectant Lavender], ALCOHOL 146.94 mL in 237 mL TOPICAL LIQUID [skin protectant Sweet Pea], Alcohol Wintergreen, topical liquid, ALCOHOL 26.36 g in 44 mL TOPICAL LIQUID [Love You Berry Much], ALCOHOL 17.98 mL in 29 mL TOPICAL LIQUID [FRESH SCENT HAND SANITIZER], ALCOHOL 17.98 mL in 29 mL TOPICAL LIQUID [FRESH SCENT HAND SANITIZER], ALCOHOL 17.98 mL in 29 mL TOPICAL LIQUID [FRESH SCENT HAND SANITIZER]]","[A27344081, A30279727, A27343208, A27344080, A27343207, A1529280, A27860389, A30271156, A30273038, A30272269]"
65201,A32276077,"Vitamin E 450 MG Oral Capsule, Liquid Filled [LEADER VITAMIN E]","[A29223282, A29260547, A29186259, A29223283, A29260548, A29272814, A29204610, A29272815, A29204611, A29971370, A29965721, A30080534, A28231545, A28244420, A28218481, A28237931, A29967633, A29965722, A15542036, A8451223, A9057365, A9448281, A9452244, A9448313, A9452230, A9448305]","[vitamin E 1,000 unit ORAL CAPSULE, vitamin E 1,000 unit ORAL CAPSULE, vitamin E@1,000 unit@ORAL@CAPSULE, vitamin E acetate 1,000 unit ORAL CAPSULE, vitamin E acetate 1,000 unit ORAL CAPSULE, vitamin E acetate@1,000 unit@ORAL@CAPSULE, vitamin E mixed 1,000 unit ORAL CAPSULE, vitamin E mixed 1,000 unit ORAL CAPSULE, vitamin E mixed@1,000 unit@ORAL@CAPSULE, Premier Value Vitamin E Mixed 1000units Softgel, Radiance Vitamin E Mixed 1000units Natural Softgel, Walgreens Finest Vitamin E Blend 1000units Softgel, Vitamin E 1000 IU Oral Capsule, Liquid Filled [PHARMASSURE VITAMIN E], Vitamin E 1000 IU Oral Capsule, Liquid Filled [E1000 MIXED], Vitamin E 1000 IU Oral Capsule, Liquid Filled [VITAMIN E1000 DL-ALPHA], Vitamin E 1000 IU Oral Capsule, Liquid Filled [E1000], Vitamin E 1,000IU Oral capsule, liquid filled, Vitamin E 1000 IU Oral Capsule, Liquid Filled [VITAMIN E-1000 AVPAK], VITAMIN E 1000 UNT CAP, VITAMIN E 1000UNT CAP, Vitamin E 1000 IU Oral Capsule, Liquid Filled, Vitamin E Complex, 1000 intl units oral capsule, vitamin E with mixed tocopherols 1000 intl units oral capsule, Vitamin E, with mixed tocopherols 1000 intl units oral capsule, vitamin E 1000 intl units oral capsule, Vitamin E, 1000 intl units oral capsule]",26,"[Vitamin E 400 IU Oral Capsule, Liquid Filled [LEADER VITAMIN E], Vitamin E 400 IU Oral Capsule, Liquid Filled [VITAMIN E MIXED TOCOPHEROL], Vitamin E 400 IU Oral Capsule, Liquid Filled [NATURE'S BLEND VITAMIN E], Vitamin E 400 IU Oral Capsule, Liquid Filled [E-GEMS PLUS], Vitamin E 400 IU Oral Capsule, Liquid Filled [E-GEMS], Vitamin E 400 IU Oral Capsule, Liquid Filled [E-GEMS ELITE], Vitamin E 400 IU Oral Capsule, Liquid Filled, Vitamin E 400 IU Oral Capsule, Liquid Filled [VITAMIN E WATER SOLUBLE], Vitamin E 400 IU Oral Capsule, Liquid Filled [PHARMASSURE VITAMIN E], Vitamin E 400 IU Oral Capsule, Liquid Filled [AQUA GEM-E]]","[A28927625, A28231408, A28231405, A28216557, A28228199, A28244264, A9057372, A28211878, A28231406, A28234700]"
6871,A31670573,aluminum hydroxide 40 MG/ML / magnesium hydroxide 40 MG/ML / simethicone 4 MG/ML Oral Suspension [Ri-Mox Plus],"[A18931913, A27793605]","[Ri-Mox Plus (aluminum hydroxide 40 MG/ML / magnesium hydroxide 40 MG/ML / simethicone 4 MG/ML ) Oral Suspension, Ri-Mox 200 MG / 200 MG / 20 MG in 5 mL Oral Suspension]",2,"[Aluminum Hydroxide 40 MG/ML / Magnesium Hydroxide 40 MG/ML / Simethicone 5 MG/ML Oral Suspension, Aluminum Hydroxide 80 MG/ML / Magnesium Hydroxide 80 MG/ML / Simethicone 6 MG/ML Oral Suspension, Aluminum Hydroxide 80 MG/ML / Magnesium Hydroxide 80 MG/ML / Simethicone 12.8 MG/ML Oral Suspension, Aluminum Hydroxide 80 MG/ML / Magnesium Hydroxide 80 MG/ML / Simethicone 5 MG/ML Oral Suspension, Aluminum Hydroxide 80 MG/ML / Magnesium Hydroxide 80 MG/ML / Simethicone 8 MG/ML Oral Suspension, Almacone (aluminum hydroxide 40 MG/ML / magnesium hydroxide 40 MG/ML / simethicone 4 MG/ML) Oral Suspension, aluminum hydroxide 400 MG / magnesium hydroxide 400 MG / simethicone 40 MG in 5 mL Oral Suspension, aluminum hydroxide 400 MG / magnesium hydroxide 400 MG / simethicone 40 MG per 5 ML Oral Suspension, Mi-Acid (aluminum hydroxide 40 MG / magnesium hydroxide 40 MG / simethicone 4 MG) per ML Oral Suspension, ALOH/MGOH/SIMTH XTRA STRENGTH LIQ]","[A15007225, A15007226, A15834894, A15554722, A15554721, A18757356, A27802503, A27343197, A19628705, A18028704]"
193355,A31794202,HYDROCORTISONE 10 mg in 1 g TOPICAL CREAM,"[A18724125, A20211169, A15528004, A31520200, A20994269, A29962738, A22646124, A23071185, A28870634, A29182756, A29275589, A29195134, A29559155, A29569435, A30300224, A18733457, A24041246, A8445245, A10096324, A18728886, A20834445, A30270935, A31213037, A30081463, A31214960]","[hydrocortisone acetate 10 MG/ML Topical Cream, hydrocortisone acetate 10 MG/ML Topical Cream, HYDROCORTISONE ACETATE 1% CREAM, HYDROCORTISONE ACETATE 10 mg in 1 g TOPICAL CREAM [HYDROCORTISONE], HYDROCORTISONE ACETATE 10 mg in 1 g TOPICAL CREAM [HYDROCORTISONE], HYDROCORTISONE ACETATE 1 g in 100 g TOPICAL CREAM [KAISER PERMANENTE Hydrocortisone], HYDROCORTISONE ACETATE 10 mg in 1 g TOPICAL CREAM [MedPride], HYDROCORTISONE ACETATE 10 mg in 1 g TOPICAL CREAM, Exederm for Eczema & Dermatitis, 1% topical cream, hydrocortisone acetate 1 % TOPICAL CREAM (GRAM), hydrocortisone acetate 1 % TOPICAL CREAM (GRAM), hydrocortisone acetate@1 %@TOPICAL@CREAM (GRAM), Hydrocortisone acetate 10 mg/g cutaneous cream, Product containing precisely hydrocortisone acetate 10 milligram/1 gram conventional release cutaneous cream (clinical drug), HYDROCORTISONE ACETATE 10 mg in 1 g TOPICAL CREAM [Ancalima Hydrocortisone Cream], hydrocortisone acetate 1 % Topical Cream, hydrocortisone acetate 1 % Topical Cream, HYDROCORTISONE ACETATE 1% CREAM,TOP, Hydrocortisone Acetate 1% Topical application Cream, HYDROCORTISONE ACETATE 1 g in 100 g TOPICAL CREAM [Hydrocortisone], HYDROCORTISONE ACETATE 1 g in 100 g TOPICAL CREAM [Hydrocortisone], HYDROCORTISONE ACETATE 1 g in 100 g TOPICAL CREAM [Hydrocortisone], HYDROCORTISONE ACETATE 1 g in 100 g TOPICAL CREAM [Hydrocortisone], Hydrocortisone Acetate 1 g in 100 g TOPICAL CREAM [Hydrocortisone], HYDROCORTISONE ACETATE 1 g in 100 g TOPICAL CREAM [HYDROCORTISONE]]",25,"[Hydrocortisone 10 mg in 1 g TOPICAL CREAM, Hydrocortisone 10 mg in 1 g TOPICAL CREAM, HYDROCORTISONE 10 mg in 1 g TOPICAL CREAM, HYDROCORTISONE 10 mg in 1 g TOPICAL CREAM, HYDROCORTISONE 10 mg in 1 g TOPICAL CREAM, HYDROCORTISONE 10 mg in 1 g TOPICAL CREAM, HYDROCORTISONE 10 mg in 1 g TOPICAL CREAM, Hydrocortisone 10 mg in 1 g TOPICAL CREAM, HYDROCORTISONE 10 mg in 1 g TOPICAL CREAM, hydrocortisone 10 mg in 1 g TOPICAL CREAM]","[A26944938, A20172660, A24228257, A27759590, A27759434, A31249370, A31085761, A26454011, A29964607, A31247808]"
933,A32088567,vit-C 60 MG / cholecalciferol 0.01 MG / folate 0.3 MG / vit-B3 13.5 MG / riboflavin 1.2 MG / sodium fluoride 2.2 MG / vit-B1 1.05 MG / vitamin A 0.75 MG / vitamin B12 0.0045 MG / vitamin B6 1.05 MG / vitamin E 15 UNT Chewable Tablet [MVC-Fluoride],"[A28248897, A28262296, A22724472, A17040941, A27755868]","[Ascorbic Acid/Cyanocobalamin/Folic Acid/Niacin/Pyridoxine/Riboflavin/Sodium Fluoride/Thiamine/Vitamin A/Vitamin D/Vitamin E 60 MG-0.0045 MG-0.3 MG-13.5 MG-1.05 MG-1.2 MG-1 MG-1.05 MG-2500 IU-400 IU-15 IU Oral Tablet, Chewable [MVC-FLUORIDE], Sodium Fluoride 1 mg / Vitamin A 2500 [iU] / Ascorbic Acid 60 mg / Vitamin D 400 [iU] / .ALPHA.-TOCOPHEROL 15 [iU] / Thiamine 1.05 mg / Riboflavin 1.2 mg / Niacin 13.5 mg / Pyridoxine 1.05 mg / Folic Acid 0.3 mg / Cyanocobalamin 4.5 ug ORAL TABLET, CHEWABLE [MVC-Fluoride], MVC-Fluoride (ascorbic acid 60 MG / folic Acid 0.3 MG / niacin 13.5 MG / riboflavin 1.2 MG / sodium fluoride 2.2 MG (equivalent to fluoride 1 MG) / thiamine 1.05 MG / vitamin A 2500 UNT / vitamin B 12 0.0045 MG / vitamin B6 1.05 MG / vitamin D 400 UNT / vitamin E 15 UNT) Chewable Tablet, MVC-Fluoride, Multiple Vitamins with Fluoride 1 mg oral tablet, chewable, Sodium Fluoride 1 mg / Vitamin A Palmitate 2500 [iU] / Ascorbic Acid 60 mg / Cholecalciferol 400 [iU] / .Alpha.-Tocopherol Acetate 15 [iU] / Thiamine Mononitrate 1.05 mg / Riboflavin 1.2 mg / Niacinamide 13.5 mg / Pyridoxine Hydrochloride 1.05 mg / Folic Acid 0.3 mg / cyanocobalamin 4.5 ug ORAL TABLET, CHEWABLE [MVC]]",5,"[VITAMIN A PALMITATE 1200 [iU] / ASCORBIC ACID 60 mg / CHOLECALCIFEROL 600 [iU] / .ALPHA.-TOCOPHEROL ACETATE, D- 15 [iU] / PYRIDOXINE HYDROCHLORIDE 1.5 mg / FOLIC ACID 200 ug / CYANOCOBALAMIN 4 ug / SODIUM FLUORIDE 0.25 mg ORAL TABLET, CHEWABLE [Quflora Gummies], ASCORBIC ACID 60 mg / CHOLECALCIFEROL 1000 [iU] / .ALPHA.-TOCOPHEROL ACETATE, DL- 10 [iU] / PYRIDOXINE HYDROCHLORIDE 26 mg / FOLIC ACID 400 ug / CYANOCOBALAMIN 13 ug / BIOTIN 280 ug / CALCIUM CARBONATE 80 mg / FERROUS ASPARTO GLYCINATE 9 mg / IRON 9 mg / POTASSIUM IODIDE 150 ug / MAGNESIUM OXIDE 25 mg / DOCONEXENT 350 mg / LOWBUSH BLUEBERRY 25 mg / LEVOMEFOLIC ACID 600 ug ORAL CAPSULE, GELATIN COATED [Prenate Mini], Vitamin A 1100 [iU] / Ascorbic Acid 60 mg / Vitamin D 1000 [iU] / .Alpha.-Tocopherol 20 [iU] / Thiamine Mononitrate 1.6 mg / Riboflavin 1.8 mg / Niacin 15 mg / Pyridoxine Hydrochloride 2.5 mg / Folic Acid 1 mg / Cyanocobalamin 25 ug / Iodine 150 ug / Iron 90 mg / Magnesium 20 mg / Zinc 25 mg / Copper 2 mg / Doconexent 200 mg ORAL CAPSULE, LIQUID FILLED [Vitafol Fe Plus Prenatal Supplement], ascorbic acid 60 mg / cholecalciferol 600 [iU] / .alpha.-tocopherol acetate, dl- 30 [iU] / thiamine mononitrate 3 mg / riboflavin 3.4 mg / niacinamide 20 mg / pyridoxine hydrochloride 25 mg / folic acid 400 ug / 5-methyltetrahydrofolic acid 600 ug / cyanocobalamin 12 ug / biotin 300 ug / calcium pantothenate 10 mg / calcium carbonate 150 mg / ferrous bisglycinate hydrochloride 30 mg / potassium iodide 150 ug / zinc glycinate 15 mg / copper 2 mg ORAL TABLET [Prenatal Multivitamin], VITAMIN A ACETATE 1200 [iU] / ASCORBIC ACID 60 mg / CHOLECALCIFEROL 400 [iU] / .ALPHA.-TOCOPHEROL, DL- 15 [iU] / THIAMINE HYDROCHLORIDE 1.2 mg / RIBOFLAVIN 1.3 mg / NIACINAMIDE 5 mg / PYRIDOXINE HYDROCHLORIDE 1.5 mg / LEVOMEFOLIC ACID 108 ug / FOLIC ACID 100 ug / CYANOCOBALAMIN 4 ug / MAGNESIUM OXIDE 15 mg / CUPRIC SULFATE 1 mg / SODIUM FLUORIDE 1 mg ORAL TABLET, CHEWABLE [Quflora Pediatric], VITAMIN A ACETATE 1200 [iU] / ASCORB...","[A28620519, A24653491, A26441206, A20963690, A23554360, A23550133, A23553746, A27386511, A27387065, A27388019]"
41530,A31798042,ALCOHOL 40 mL in 50 mL EXTRACORPOREAL GEL [Forkids Hand Sanitizer],"[A23551760, A24136321, A28749498]","[ethanol 80 % Topical Gel, ethanol 80 % Topical Gel, ALCOHOL 0.8 mL in 1 mL TOPICAL GEL [GOJO Instant Hand Sanitizer]]",3,"[ALCOHOL 33 mL in 53 mL TOPICAL GEL [Instant Hand Sanitizer - Original], ALCOHOL 33 mL in 53 mL TOPICAL GEL [Instant Hand Sanitizer - Original], ALCOHOL 33 mL in 53 mL TOPICAL GEL [Instant Hand Sanitizer - Original], ALCOHOL 36.7 mL in 59.2 mL TOPICAL GEL [2-Pack Hand Sanitizer], ALCOHOL 36.7 mL in 59.2 mL TOPICAL GEL [2-Pack Hand Sanitizer], ALCOHOL 17.78 mL in 28 mL TOPICAL GEL [SnugZ Beaded Hand Sanitizer Gel], ALCOHOL 331.1 mL in 473 mL TOPICAL GEL [Antimicrobial Hand Sanitizer], ALCOHOL 165.9 mL in 237 mL TOPICAL GEL [Antimicrobial Hand Sanitizer], ALCOHOL 165.9 mL in 237 mL TOPICAL GEL [Antimicrobial Hand Sanitizer], ALCOHOL 36.58 mL in 59 mL TOPICAL GEL [Forever Hand Sanitizer Hand Sanitizer]]","[A22726551, A22725817, A22727264, A22645203, A31215342, A29374111, A31559354, A31560330, A31561003, A26518918]"


Disorders


Unnamed: 0,auis,strings,2020AA_synonyms,synonym_strings,num_syms,top10,top10_auis
429483,A31604611,Necrosis,"[A0796227, A20276720, A25728945, A3071221, A10834905, A18651715, A25736261, A18633084, A24366040, A23940832, A18595910, A3116844, A18595909, A3201975, A18651716, A18577261, A3707470]","[Skin necrosis, Skin necrosis, Skin necrosis, Skin necrosis, Skin necrosis, necrosis skin, Necrosis skin, skin necrosis, Skin Necrosis, Skin Necrosis, necrosis of skin, Cutaneous necrosis, cutaneous necrosis, Sloughing of skin, skin sloughing, sloughing skin, Skin necrosis (disorder)]",17,"[Necrosis, necrosis, necrosis, necrosis, NECROSIS, Necrosis, Necrosis, necrosis, Necrosis, necrosis]","[A16764872, A14268273, A4386906, A23949206, A25764445, A0090503, A29586163, A18667298, A2884468, A16507762]"
231188,A32292026,"Lymphofollicular Granuloma, Eosinophilic","[A15565644, A17341514, A18612256, A25696334, A2970972, A18612257, A15569571, A3528915]","[Kimura Disease, Kimura disease, kimura disease, Kimura's disease, Kimura's disease, kimura's disease, Kimura's Disease, Kimura's disease (disorder)]",8,"[granuloma; eosinophilic, eosinophilic; granuloma, eosinophilic granuloma (diagnosis), Granuloma, Eosinophilic, granuloma eosinophilic, Eosinophilic Granuloma, Eosinophilic granuloma, eosinophilic granuloma, eosinophilic granuloma, Eosinophilic granuloma]","[A4418496, A4411607, A13869335, A26655006, A18591214, A27924969, A2929893, A14074782, A0478607, A17850571]"
428340,A31610834,II,"[A15567547, A15834642, A15563537, A15559639, A29169869, A15563536, A15559638, A15559640, A15567548]","[ISS Stage II Plasma Cell Myeloma, ISS Stage II Plasma Cell Myeloma, Multiple Myeloma Stage II, Stage II Multiple Myeloma, ISS Stage II Multiple Myeloma/Plasma Cell Myeloma, ISS Stage II Multiple Myeloma, International Staging System Stage II Plasma Cell Myeloma, Stage II Plasma Cell Myeloma, International Staging System Stage II Multiple Myeloma]",9,"[II, II, II, II, II, II, II, II, II, Ii]","[A18130567, A10769123, A24102669, A19042285, A31198103, A20242995, A26709182, A15556545, A19292859, A20236776]"
378985,A32294105,"Glomerulosclerosis, Nodular","[A31530005, A31078041, A23375627, A31057835, A23366347, A31535266, A31532834]","[Intracapillary glomerulosclerosis due to diabetes mellitus, Intracapillary glomerulosclerosis due to diabetes mellitus, Diabetic intracapillary glomerulosclerosis, Diabetic intracapillary glomerulosclerosis, diabetic intracapillary glomerulosclerosis (diagnosis), Intracapillary glomerulosclerosis of kidney due to diabetes mellitus, Intracapillary glomerulosclerosis of kidney due to diabetes mellitus (disorder)]",7,"[Nodular glomerulosclerosis, Nodular glomerulosclerosis, Nodular glomerulosclerosis, Nodular glomerulosclerosis, nodular glomerulosclerosis (diagnosis), Nodular glomerulosclerosis (morphologic abnormality), nephrotic syndrome due to nodular glomerulosclerosis, nephrotic syndrome due to nodular glomerulosclerosis (diagnosis), Nodular diabetic glomerulosclerosis, Nodular diabetic glomerulosclerosis]","[A3049446, A6839827, A23376815, A25711506, A23388706, A3583896, A13773525, A13547067, A3049442, A6839828]"
262054,A32319979,Immunoglobulin A vasculitis (disorder),"[A26609656, A1372290, A25687031, A18630766, A18668023, A17011008, A8342177, A26598828, A26663518, A0420960, A15228811, A19287955, A25743520, A9333042, A16955988, A16994528, A26679749, A16997265, A16997266, A16997264, A17008289, A16994527, A16991894, A16997267, A17000020, A16994526, A17002793, A17812399, A18612315, A18686423, A1372288, A25760391, A25770472, A9342243, A14065565, A0108032, A0108033, A0114460, A17687438, A26679748, A1372291, A26620438, A25686817, A4449804, A0389122, A0389123, A17863359, A8352171, A25716978, A18668021, A13659869, A4449343, A0450282, A26631411, A8342174, A17812400, A25703798, A4449805, A12007227, A25691788, A2922934, A18649453, A4455428, A0450283, A26658203, A18574971, A2967070, A18668022, A12985359, A25686818, A25733548, A14022313, A15309966, A25683440, A25772108, A18668024, A2955294, A8370234, A2989161, A3072970, A18612317, A3316255, A18612316, A3029954, A18696629, A1306986, A18678168, A25691790, A25743519, A18556570, A25702126, A18686422, A18649454, A4406319, A4449802, A4406330, A13612388, A13529530, A14121748]","[Henoch-Schoenlein Purpura, Henoch-Schoenlein Purpura, Schoenlein-Henoch purpura, henoch schoenlein purpura, henoch-schoenlein purpura, Purpura, Henoch-Schonlein, Purpura, Schonlein-Henoch, Purpura, Schonlein-Henoch, Henoch-Schonlein Purpura, Henoch-Schonlein purpura, Henoch-Schonlein purpura, Henoch-Schonlein purpura, Henoch-Schonlein purpura, Henoch-Schonlein purpura, Henoch-Schonlein purpura, Henoch Schonlein Purpuras, Henoch Schonlein Purpura, Henoch-Schonlein Purpuras, Purpura, Henoch Schonlein, Purpura, Schonlein Henoch, Purpuras, Henoch Schonlein, Purpuras, Henoch-Schonlein, Purpuras, Schonlein-Henoch, Schonlein Purpura, Henoch, Schonlein Purpuras, Henoch, Schonlein-Henoch Purpuras, Schonlein-Henoch Purpura, Purpura Henoch(-Schönlein), henoch schonlein purpura, henoch-schonlein purpura, Henoch Schoenlein Purpura, Henoch-Schönlein Purpura, Henoch-Schönlein Purpura, Purpura;Henoch-Schonlein, Henoch Schonlein purpura, Purpura, Schoenlein Henoch, Purpura, Schoenlein-Henoch, Schoenlein Henoch Purpura, Schoenlein-Henoch Purpura, Schoenlein-Henoch Purpura, Purpura, Henoch-Schoenlein, Allergic Purpura, Purpura allergic, purpura; allergic, Allergic purpura, Allergic purpura, Allergic purpura, Allergic purpura, Allergic purpura, allergic purpura, allergic purpura, allergic; purpura, Purpura, Allergic, Anaphylactoid Purpura, Purpura, anaphylactoid, Purpura anaphylactoid, Purpura anaphylactoid, purpura; anaphylactoid, Anaphylactoid purpura, Anaphylactoid purpura, Anaphylactoid purpura, anaphylactoid purpura, anaphylactoid; purpura, Purpura, Anaphylactoid, Henoch Purpura, henoch purpura, Henoch's purpura, henoch's purpura, Purpura, Henoch, Purpura vascular allergic, Allergic vascular purpura, allergic vascular purpura, vascular allergic purpura, Anaphylactic vascular purpura, HSP, hsp, Autoimmune purpura, Purpura, autoimmune, Acute vascular purpura, Spring fever, spring fever, HSP - Henoch-Schonlein purpura, henoch-schoenlein purpura (HSP), Henoch-Schoenlein vasculiti...",99,"[Immunoglobulin A-associated vasculitis, IgA vasculitis, Vasculitis, Pauci-Immune, Pauci Immune Vasculitis, Autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis, autoimmune vasculitis]","[A31206906, A31206755, A16998333, A16995534, A13021789, A4366998, A4366995, A4366994, A4366996, A4366997]"
419314,A31609021,MiNEN,"[A31066401, A31072426, A31055577]","[Mixed neuroendocrine-non neuroendocrine neoplasm, Mixed neuroendocrine-non neuroendocrine neoplasm (morphologic abnormality), MiNEN - mixed neuroendocrine-non neuroendocrine neoplasm]",3,"[mine, MINE, MINE, Minerin, MINE-ESHAP, mines, Minetia, Miner, miner, Miner]","[A18555635, A4361235, A7589964, A1996629, A31190114, A18667129, A20546161, A24833154, A18672726, A3045293]"
278662,A31789856,SARS-CoV-2 infection,[A31548810],[Disease caused by 2019 novel coronavirus],1,"[SARS-CoV infection, SARS-CoV-2, SARS-CoV, SARS-CoV, SARS-CoV, sars-cov, SARS coronavirus Tor2, SARS (Disease), coronavirus sars-associated, coronavirus SARS]","[A7881734, A31564425, A26651188, A7875079, A26710744, A18626035, A7881714, A23869464, A27163034, A18700177]"
419576,A31609005,BSNS,"[A31545618, A31538194]","[Biphenotypic sinonasal sarcoma, Biphenotypic sinonasal sarcoma (morphologic abnormality)]",2,"[BSN, BSN, BSN, BSND, BSND, BSND, BSND, BSND, BSND, BCNS]","[A15568744, A12012666, A20769110, A17466722, A29162779, A26951193, A29144585, A20751166, A29171851, A10773997]"
363346,A32290146,Coenurosis,"[A0044721, A0044722, A0044727, A17824912, A23868171, A24094035, A25742738, A27375967, A2880388, A7569882, A8340201, A18590822, A4402057, A15222973, A0044718, A18646617, A8354352, A2961284, A3511392, A0682439, A17786407, A0682438, A3037478, A3037479, A3037477, A26577983, A26589369, A25734540, A3379673, A15242315]","[Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, Cysticercosis, cysticercosis, cysticercosis, cysticercosis, Cysticercoses, cysticercoses, Cysticerciasis, Cysticerciasis, Infection by tapeworm larvae, Cysticercosis, unspecified, Cysticercosis, unspecified, Cysticercosis, unspecified, Larval tapeworm infection, Larval teniasis, Larval taeniasis, Infection caused by tapeworm larvae, Infection caused by tapeworm larvae (disorder), Cystercercosis, Cysticercosis (disorder), cysticercosis (diagnosis)]",30,"[coenurosis, Coenurosis, coenurosis, coenurosis, Coenurosis, Coenurosis (disorder), coenurosis (diagnosis), Coenuriasis, Coenuriasis, coenuriasis]","[A15176257, A2958885, A18553520, A4397616, A17862929, A3361792, A15242147, A2880081, A7757386, A18646445]"
336394,A31590489,Independent for Controlling Bowels,"[A3326968, A16339126, A2930694, A27923354, A3129300, A3103713, A27931198, A30274116]","[Bowels: fully continent (finding), Bowels: fully continent (finding), Fecal continence, Fecal Continence, Faecal continence, Bowels: fully continent, Continent, Bowels always continent]",8,"[LT ADL goals bowel management independently, short-term: bowel management independently, long-term: bowel management independently, ST ADL goals bowel management independently, patient managed bowel independently, patient managed bowel independently (physical finding), Continence independent, long-term ADL goals: bowel management independently, short-term ADL goals: bowel management independently, long-term ADL goals: bowel management independently (treatment)]","[A13676186, A15281630, A15258187, A13676182, A17152813, A17260478, A3371307, A17252783, A17235875, A17224954]"


Genes & Molecular Sequences


Unnamed: 0,auis,strings,2020AA_synonyms,synonym_strings,num_syms,top10,top10_auis
421583,A31812696,PUMB1,"[A20768283, A6898370, A29753786, A24365246, A24598574, A11936380, A24569500, A20714410, A11936382, A11950692, A26967323, A24388617, A20768282, A24373134, A27938731, A11950693, A24565412, A24577685, A20750312]","[ALDH1A1 gene, ALDH1A1 gene, ALDH1A1 Gene, ALDH1A1 Gene, ALDEHYDE DEHYDROGENASE 1, ALDH1, RETINAL DEHYDROGENASE 1, RALDH1, RALDH1, ALDEHYDE DEHYDROGENASE 1 FAMILY, MEMBER A1, aldehyde dehydrogenase 1 family member A1, Aldehyde Dehydrogenase 1 Family, Member A1 Gene, ALDH1A1, ALDH1A1, ALDH1A1, ALDH1A1, ALDH, LIVER CYTOSOLIC, ACETALDEHYDE DEHYDROGENASE 1, retinaldehyde dehydrogenase 1]",19,"[PUMB1, PUM1, PUM1, PUM1 gene, PUM1 gene, PUM2, PUM2, PUMH1, PUM2 gene, PUM2 gene]","[A24368807, A11996679, A20779809, A20761872, A6907383, A12024969, A20797694, A20797692, A20708104, A6907384]"
367697,A31801197,H3F3AP4,[A20791474],[p13],1,"[H3F3AP1, H3F3CP, H3F3A, H3F3A, H3F3A, H3F3C, H3F3C, H3F3C, H3F3C, H3F3B]","[A20773667, A20719648, A12029004, A21408221, A31191527, A24568940, A30126063, A30124983, A31181429, A31191010]"
424816,A31851290,OSRC,"[A18284400, A20815960, A6904242, A18604498, A29760479, A7588550, A20690509, A23811250, A24335231, A24333416, A31556270, A27919651, A24572748, A20815959, A27934313, A4358491, A7587073, A20270084, A10816040, A18660259]","[RB1 gene, RB1 gene, RB1 gene, rb1 gene, RB1 Gene, RB1 Gene, RB, prepro-retinoblastoma-associated protein, PPP1R130, protein phosphatase 1, regulatory subunit 130, RB TRANSCRIPTIONAL COREPRESSOR 1, RB transcriptional corepressor 1, p105-Rb, RB1, RB1, RB1, RB1, RB1, Retinoblastoma 1 (Including Osteosarcoma) Gene, retinoblastoma 1 (RB1) gene]",20,"[OSRC, OSR1, OSR1, Osr1, OSR1, OSPR, OSR1 gene, OSR1 gene, OSR2, OSR2]","[A11959899, A11980891, A24571429, A21246015, A20706796, A7656950, A20689080, A6903333, A13286846, A20796359]"
138263,A31825440,transfer RNA lysine 19 (anticodon UUU),"[A24333996, A24338874, A24333995, A30285700]","[TRK-TTT6-1 gene, TRK-TTT6-1 gene, TRK-TTT6-1, tRNA-Lys-TTT-6-1]",4,"[tRNA-Ala (AGC) 19-1, tRNA-Ala-AGC-19-1, T cell receptor alpha joining 19 (non-functional), tRNA LYSINE 1 (UUU), TRANSFER RNA LYSINE 1, small Cajal body-specific RNA 19, Small Cajal Body-Specific RNA 19, TRN-GTT19-1 gene, TRN-GTT19-1 gene, TRA-AGC19-1 gene]","[A30284332, A30284333, A20730156, A24570248, A24620102, A20765556, A29751648, A24337579, A24338782, A24332593]"
295857,A31846326,chromosome 1 open reading frame 171,"[A20695161, A11717102, A26967216, A20730878, A24630503, A13293306, A20712850, A13291255]","[TYW3 gene, TYW3 gene, tRNA-yW synthesizing protein 3 homolog, FLJ40918, tRNA-YW-SYNTHESIZING PROTEIN 3, S. CEREVISIAE, HOMOLOG OF, tRNA-WYBUTOSINE-SYNTHESIZING PROTEIN 3, S. CEREVISIAE, HOMOLOG OF, TYW3, TYW3]",8,"[chromosome 1 open reading frame 167, coiled-coil domain containing 171, CHROMOSOME 1 OPEN READING FRAME 177, protein phosphatase 1, regulatory subunit 171, transmembrane protein 171, CHROMOSOME 1 OPEN READING FRAME 168, CHROMOSOME 1 OPEN READING FRAME 172, CHROMOSOME 1 OPEN READING FRAME 170, C1orf167, C1orf167 gene]","[A20769207, A20734048, A26955811, A24332709, A20819721, A31080116, A26951875, A24593754, A20679833, A11713044]"
298680,A31826969,chromosome 10 open reading frame 3,"[A20734399, A11713594, A29757911, A21408551, A21412534, A26971021, A20680830, A20788196, A21418494, A27932544, A11965943, A11951584, A20806263, A20770212]","[CEP55 gene, CEP55 gene, CEP55 Gene, CEP55 Gene, Centrosomal Protein 55kDa Gene, centrosomal protein 55, FLJ10540, CEP55, CEP55, CEP55, CEP55, CENTROSOMAL PROTEIN, 55-KD, CT111, cancer/testis antigen 111]",14,"[Chromosome 10 Open Reading Frame 3 Gene, C10orf3, CHROMOSOME 10 OPEN READING FRAME 2, CHROMOSOME 10 OPEN READING FRAME 4, CHROMOSOME 10 OPEN READING FRAME, CHROMOSOME 10 OPEN READING FRAME 10, CHROMOSOME 20 OPEN READING FRAME 3, CHROMOSOME 10 OPEN READING FRAME 12, CHROMOSOME 18 OPEN READING FRAME 3, C10ORF2]","[A24364617, A21398606, A28266165, A24580318, A24597379, A26954071, A24618582, A29369703, A30186236, A11936543]"
396589,A31843094,C14orf43,[A20805488],[LSR68],1,"[C15ORF43, C12orf43, C12orf43 gene, C12orf43 gene, CHROMOSOME 15 OPEN READING FRAME 43, C11orf43, C1ORF43, C1orf43, C1orf43 gene, C1orf43 gene]","[A27819051, A20733633, A20715674, A11713372, A27823368, A24384838, A28266697, A20715345, A8266358, A20769162]"
402606,A31806223,C1orf135,"[A21248432, A21257931, A21246009, A21247623, A21247624, A20769200]","[AUNIP gene, AUNIP gene, aurora kinase A and ninein interacting protein, AUNIP, AIBp, MGC2603]",6,"[C9orf135, C9orf135 gene, C9orf135 gene, chromosome 9 open reading frame 135, TMEM135 gene, TMEM135 gene, TMEM135, TMEM135, FAM135A, COILED-COIL DOMAIN-CONTAINING PROTEIN 135]","[A20680024, A20805378, A11713314, A20805377, A20729876, A11716235, A26947454, A20729875, A20790335, A31552201]"
421256,A31828850,SCA22,"[A20756809, A6914141, A26962640, A11969010, A20703090, A24613248, A20703091, A24584000, A20703089, A11925900, A11997819]","[KCND3 gene, KCND3 gene, potassium voltage-gated channel subfamily D member 3, POTASSIUM VOLTAGE-GATED CHANNEL, SHAL-RELATED SUBFAMILY, MEMBER 3, Kv4.3, Kv4.3, KSHIVB, KCND3S, KCND3, KCND3, KCND3L]",11,"[SCA22, SCA21, SCA23, SCA42, SCA25, SCA20, SCA16, SCA27, SCA27, SCA27]","[A12024865, A12010582, A11922675, A26956079, A11966772, A11952492, A26712035, A11966348, A20683098, A29142406]"
341641,A31811478,"deafness, autosomal recessive 76","[A21247513, A21255414, A21244280, A23788743, A24618515, A20787521, A23783449, A21245892, A27917629, A24568792, A23790044, A20733749, A24626752, A23788076]","[SYNE4 gene, SYNE4 gene, SYNE4, SYNE4, NESPRIN 4, Nesprin-4, NESP4, Nesp4, spectrin repeat containing nuclear envelope family member 4, NUCLEAR ENVELOPE SPECTRIN REPEAT PROTEIN 4, SPECTRIN REPEAT-CONTAINING NUCLEAR ENVELOPE PROTEIN 4, FLJ36445, CHROMOSOME 19 OPEN READING FRAME 46, C19ORF46]",14,"[DEAFNESS, AUTOSOMAL RECESSIVE 76, DEAFNESS, AUTOSOMAL RECESSIVE 76, Deafness, Autosomal Recessive 77, DEAFNESS, AUTOSOMAL RECESSIVE 77, DEAFNESS, AUTOSOMAL RECESSIVE 77, DEAFNESS, AUTOSOMAL RECESSIVE 74, DEAFNESS, AUTOSOMAL RECESSIVE 74, DEAFNESS, AUTOSOMAL RECESSIVE 77 (disorder), DEAFNESS, AUTOSOMAL RECESSIVE 86, DEAFNESS, AUTOSOMAL RECESSIVE 86]","[A24623308, A23786760, A20973483, A17462583, A24585971, A19289116, A24631565, A18475178, A24602611, A21238745]"


Living Beings


Unnamed: 0,auis,strings,2020AA_synonyms,synonym_strings,num_syms,top10,top10_auis
134399,A31990087,Euphorbia subgen. Athymalus,[A23660598],[Rhizanthium],1,"[Athymoris, Athyma, Athelia sp., Athrycia, Atheniella, Euphorbia epithymoides, Atholus, Athemus, Athymia, athymia]","[A20454937, A12825691, A29068751, A26911438, A30614850, A17570733, A24524017, A12825183, A28681604, A1303475]"
35445,A32065899,Phyllostachys nigra var. henonis x Sasa veitchii,"[A9256069, A19192830]","[Hibanobambusa tranquillans, x Phyllosasa tranquillans]",2,"[Phyllostachys kwangsiensis x Phyllostachys bambusoides, Phyllostachys pubescens x Phyllostachys bambusoides, Phyllostachys edulis x Phyllostachys bambusoides, Phyllostachys sp. JID-2010, Phyllostachys sp. 2 SL-2018, Phyllostachys sp. KD-2017, Phyllostachys sp. PC-2007, Phyllostachys nigra, Lindsaea ensifolia x Odontosoria chinensis, Potamogeton strictifolius x Potamogeton zosteriformis]","[A17493499, A17593161, A17493500, A19146444, A30674231, A29034482, A20435353, A9273888, A29043468, A12855711]"
287091,A32066187,Allolevivirus subgroup IV,[A28153152],[Escherichia virus FI],1,"[Allolevivirus, Allolevivirus, Alloleviviruses, Brevidensovirus, Brevidensovirus, Brevidensovirus, Brevidensovirus, Iotatorquevirus, Aveparvovirus, Vidavervirus]","[A0363120, A0363121, A0363122, A3886147, A7812783, A2028728, A2028729, A17595368, A24467470, A30663394]"
226809,A31925723,unclassified SP6-like viruses,[A30730068],[unclassified Zindervirus],1,"[unclassified Schizotequatrovirus, unclassified Sripuvirus, unclassified Viunavirus, unclassified Tlsvirus, unclassified Prymnesiovirus, unclassified Unaquatrovirus, unclassified Spbetalikevirus, unclassified L5-like viruses, unclassified Spounavirinae, unclassified Spumavirus]","[A30544321, A30714372, A30717618, A29069711, A9300722, A30461614, A26815082, A30636425, A19177306, A28988380]"
384647,A32001859,Ossaea humilis,[A12883811],[Leandra humilis],1,"[Sutherlandia humilis, Gymnodia humilis, Oreocarya humilis, Neomarica humilis, Colostethus humilis, Agaricia humilis, Podagrostis humilis, Ferulago humilis, Stemodiopsis humilis, Kalanchoe humilis]","[A19196663, A24405326, A21355261, A21321716, A15018091, A12826960, A29036088, A12915187, A7128837, A28000368]"
331255,A31899817,Sheetfish iridovirus,[A2141808],[European catfish virus],1,"[Seabass iridovirus, Snakehead iridovirus, Synodus indicus iridovirus, Red seabream iridovirus, Orbiculate batfish iridovirus, Singapore grouper iridovirus, Japanese sea bass iridovirus, Common ponyfish iridovirus, Bluegill sunfish iridovirus, Sturgeon iridovirus]","[A16567490, A29019409, A16579930, A23671016, A24562521, A9284446, A29127033, A16588577, A29041903, A30456732]"
380717,A31938425,Agave amica,[A8585167],[Polianthes tuberosa],1,"[Agaves, Agaves, Agave (organism), Agathon, agathon, Catocala amica, Agave americana (organism), Agave, Agave, agave]","[A23863045, A17008585, A3524221, A2692013, A18626091, A19221202, A3524231, A2027345, A2027346, A18692733]"
352190,A31948521,Emberiza unicolor,"[A23355911, A23094620, A23149210]","[Phrygilus unicolor, Phrygilus unicolor (organism), Plumbeous sierra finch]",3,"[Eurydera unicolor, Epuraea unicolor, Spreo unicolor, Lenzites unicolor, Telamona unicolor, Aerodramus unicolor, Haplospiza unicolor, Haplospiza unicolor, Diagramma unicolor, Striglia unicolor]","[A2709601, A24447484, A19245186, A28896927, A2207225, A23339665, A2149679, A23348649, A26725225, A28894793]"
207820,A32002437,Kyllinga macrocephala,"[A27947775, A28105969]","[Cyperus richardii, Kyllinga bulbosa]",2,"[Carphochaete macrocephala, Cypholophus macrocephalus, Caecosagitta macrocephala, Leucopaxillus macrocephalus, Luciobrama macrocephalus, Pundamilia macrocephala, Conocybe macrocephala, Kryptopterus macrocephalus, Thurnia macrocephala, Streptoglossa macrocephala]","[A16518116, A17520761, A19142937, A24416361, A7110097, A20422064, A21344648, A9259150, A2209422, A27961921]"
193796,A31901079,unclassified Cjwunalikevirus,[A30451395],[unclassified Kostyavirus],1,"[unclassified Kleczkowskavirus, unclassified Kappatorquevirus, unclassified Coetzeevirus, unclassified Jwalphavirus, unclassified Uetakevirus, unclassified Gordtnkvirus, unclassified Phicbkvirus, unclassified Wphvirus, unclassified Cepunavirus, unclassified Eclunavirus]","[A30549341, A29082024, A30562881, A30661884, A30385009, A30672826, A28009592, A30612233, A30474956, A30729705]"


In [141]:
validation_df.columns

Index(['0', 'strings', 'auis', '2020AA_synonyms', 'synonym_strings',
       'sapbert_400-NN_strings', 'sapbert_400-NN_auis', 'sapbert_400-NN_dist',
       'sapbert_400-NN_recall', 'num_syms', 'lexlm_2000-NN_auis_x', 'cuis',
       'sem_groups', 'lexlm_2000-NN_recall', 'R@1_SB', 'R@1_LM', 'R@5_SB',
       'R@5_LM', 'R@10_SB', 'R@10_LM', 'R@50_SB', 'R@50_LM', 'R@100_SB',
       'R@100_LM', 'R@200_SB', 'R@200_LM', 'R@500_SB', 'R@500_LM', 'R@1000_SB',
       'R@1000_LM', 'R@2000_SB', 'R@2000_LM', 'R@1_oracle', 'R@5_oracle',
       'R@10_oracle', 'R@50_oracle', 'R@100_oracle', 'R@200_oracle',
       'R@500_oracle', 'R@1000_oracle', 'R@2000_oracle', 'fps', 'sb_fps',
       'lexlm_2000-NN_auis_y', 'lexlm_2000-NN_dist', 'lm_fps'],
      dtype='object')

In [142]:
distance_at_100 = []

for i,row in validation_df.iterrows():
    distances = row['sapbert_400-NN_dist']
    
    distance_at_100.append(distances[100])    
    
validation_df['sb_dist@100'] = distance_at_100

distance_100_threshold = validation_df[validation_df['num_syms'] > 0]['sb_dist@100'].mean()

In [143]:
#Calculating Recall & # of False Positives @ Distance Threshold in Validation Set

nearest_neighbors_auis = umls2020AB_df['sapbert_400-NN_auis']
nearest_neighbors_dist = umls2020AB_df['sapbert_400-NN_dist']

recall_array = []
fps = []

for true_syn, top_auis, top_dist in tqdm(zip(query_synonym_auis, nearest_neighbors_auis, nearest_neighbors_dist)):
    
    if len(true_syn) > 0:
        true_syn = set(true_syn)

        n = len(np.where(top_dist < distance_100_threshold)[0])

        topn = set(top_auis[:n])
        true_pos = topn.intersection(true_syn)

        recall_array.append(len(true_pos)/len(true_syn))
        fps.append(n-len(true_pos))
    else:
        recalls = None

        recall_array.append(recalls)

        n = len(np.where(top_dist < distance_100_threshold)[0])
        fps.append(n)

umls2020AB_df['sb_fps'] = fps
umls2020AB_df[umls2020AB_df['num_syms'] == 0].sb_fps.mean(),umls2020AB_df[umls2020AB_df['num_syms'] > 0].sb_fps.mean()

430135it [00:05, 73235.44it/s] 


(263.7152058386975, 166.56143393874538)

In [144]:
pd.DataFrame(recall_array).describe()

Unnamed: 0,0
count,170077.0
mean,0.8
std,0.33
min,0.0
25%,0.75
50%,1.0
75%,1.0
max,1.0


In [145]:
distance_at_100 = []

for i,row in validation_df.iterrows():
    distances = row['lexlm_2000-NN_dist']
    
    distance_at_100.append(distances[100])    
    
validation_df['lm_dist@100'] = distance_at_100

distance_100_threshold = validation_df[validation_df['num_syms'] > 0]['lm_dist@100'].mean()

In [146]:
#Calculating Recall & # of False Positives @ Distance Threshold in Validation Set

nearest_neighbors_auis = umls2020AB_df['lexlm_2000-NN_auis_x']
nearest_neighbors_dist = umls2020AB_df['lexlm_2000-NN_dist']

recall_array = []
fps = []

for true_syn, top_auis, top_dist in tqdm(zip(query_synonym_auis, nearest_neighbors_auis, nearest_neighbors_dist)):
    
    if len(true_syn) > 0:
        true_syn = set(true_syn)

        n = len(np.where(top_dist < distance_100_threshold)[0])

        topn = set(top_auis[:n])
        true_pos = topn.intersection(true_syn)

        recall_array.append(len(true_pos)/len(true_syn))
        fps.append(n-len(true_pos))
    else:
        recalls = None

        recall_array.append(recalls)

        n = len(np.where(top_dist < distance_100_threshold)[0])
        fps.append(n)

umls2020AB_df['lm_fps'] = fps
umls2020AB_df[umls2020AB_df['num_syms'] == 0].lm_fps.mean(),umls2020AB_df[umls2020AB_df['num_syms'] > 0].lm_fps.mean()

430135it [00:07, 53888.36it/s]


(365.0733836298057, 268.92025964710103)

In [147]:
validation_df.columns

Index(['0', 'strings', 'auis', '2020AA_synonyms', 'synonym_strings',
       'sapbert_400-NN_strings', 'sapbert_400-NN_auis', 'sapbert_400-NN_dist',
       'sapbert_400-NN_recall', 'num_syms', 'lexlm_2000-NN_auis_x', 'cuis',
       'sem_groups', 'lexlm_2000-NN_recall', 'R@1_SB', 'R@1_LM', 'R@5_SB',
       'R@5_LM', 'R@10_SB', 'R@10_LM', 'R@50_SB', 'R@50_LM', 'R@100_SB',
       'R@100_LM', 'R@200_SB', 'R@200_LM', 'R@500_SB', 'R@500_LM', 'R@1000_SB',
       'R@1000_LM', 'R@2000_SB', 'R@2000_LM', 'R@1_oracle', 'R@5_oracle',
       'R@10_oracle', 'R@50_oracle', 'R@100_oracle', 'R@200_oracle',
       'R@500_oracle', 'R@1000_oracle', 'R@2000_oracle', 'fps', 'sb_fps',
       'lexlm_2000-NN_auis_y', 'lexlm_2000-NN_dist', 'lm_fps', 'sb_dist@100',
       'lm_dist@100'],
      dtype='object')

In [153]:
bins = []
large_bin = []

for i,g in umls2020AB_df.groupby('num_syms'):
    if i < 10:
        bins.append((i, g['R@100_SB'].mean()))
    else:
        large_bin.append(g)
        
bins.append(('10+',pd.concat(large_bin)['R@100_SB'].mean()))

In [154]:
pd.DataFrame(bins)

Unnamed: 0,0,1
0,0,
1,1,0.9
2,2,0.93
3,3,0.9
4,4,0.83
5,5,0.7
6,6,0.74
7,7,0.76
8,8,0.74
9,9,0.68


In [155]:
bins = []
large_bin = []

for i,g in umls2020AB_df.groupby('num_syms'):
    if i < 10:
        bins.append((i, g['R@100_LM'].mean()))
    else:
        large_bin.append(g)
        
bins.append(('10+',pd.concat(large_bin)['R@100_LM'].mean()))

In [156]:
pd.DataFrame(bins)

Unnamed: 0,0,1
0,0,
1,1,0.52
2,2,0.49
3,3,0.48
4,4,0.34
5,5,0.35
6,6,0.45
7,7,0.48
8,8,0.47
9,9,0.47
