In [1]:
import _pickle as pickle
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import copy
import glob
import gc
import ipdb
import time

In [2]:
pd.set_option('precision',2)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('max_rows',200)

In [3]:
full_df = []

exp_list = glob.glob('/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_*sapbert.2000-NN_DataFrame.p')

print(exp_list)

for filename in tqdm(exp_list):
    file = open(filename,'rb')
    full_df.append(pickle.load(file))
    file.close()

['/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_sapbert.2000-NN_DataFrame.p']


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [06:55<00:00, 415.16s/it]


In [4]:
umls2020AB_df = full_df[0]

In [None]:
for df in full_df[1:]:
    nn_columns = df.filter(regex='.*NN.*').columns
    for col in nn_columns:
        umls2020AB_df[col] = df[col]

In [5]:
del full_df

In [6]:
umls2020AB_df.columns

Index(['0', 'strings', 'auis', '2020AA_synonyms', 'synonym_strings',
       'num_syms', 'sapbert_2000-NN_strings', 'sapbert_2000-NN_auis',
       'sapbert_2000-NN_dist', 'sapbert_2000-NN_recall'],
      dtype='object')

In [None]:
for col in umls2020AB_df.filter(regex='lexlm.*').columns:
    umls2020AB_df.drop(col,axis=1)

In [None]:
nearest_neighbors_auis = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Bernal/lex_lm_2000-NN.p','rb'))
nearest_neighbors_dist = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Bernal/lex_lm_2000-NN_dist.p','rb'))
nearest_neighbors_auis = [auis for auis in nearest_neighbors_auis]

original_umls_2020, new_umls_2020 = pickle.load(open('aui_string_map_UMLS2020_update.p','rb'))

new_umls_2020 = [x[0] for x in new_umls_2020]
new_umls_2020 = pd.DataFrame(new_umls_2020,columns=['auis'])
new_umls_2020['lexlm_2000-NN_auis']  = nearest_neighbors_auis
new_umls_2020['lexlm_2000-NN_dist']  = list(nearest_neighbors_dist)

umls2020AB_df = umls2020AB_df.merge(new_umls_2020,on='auis',how='inner')

query_synonym_auis = list(umls2020AB_df['2020AA_synonyms'])
nearest_neighbors_auis = umls2020AB_df['lexlm_2000-NN_auis']

#Calculating Recall @ 1,5,10,50,100
recall_array = []
# closest_dist_true = []
# closest_dist_false = []

for true_syn, top100 in tqdm(zip(query_synonym_auis, nearest_neighbors_auis)):
    
    true_syn = set(true_syn)
    
    if len(true_syn) > 0:
        recalls = []

        for n in [1,5,10,50,100,200,500,1000,2000]:

            topn = set(top100[:n])
            true_pos = topn.intersection(true_syn)

            recalls.append(len(true_pos)/len(true_syn))

        recall_array.append(recalls)
#         closest_dist_true.append([top100_dist[0], np.mean(top100_dist)])
    else:
        recalls = []

        recall_array.append(recalls)
#         closest_dist_false.append([top100_dist[0], np.mean(top100_dist)])

umls2020AB_df['lexlm_2000-NN_recall'] = recall_array

In [3]:
aui_info = []

with open('/data/Bodenreider_UMLS_DL/UMLS_VERSIONS/2020AB-ACTIVE/META/MRCONSO.RRF','r') as fp:
    
    pbar = tqdm(total=15000000)
    line = fp.readline()
    
    while line:
        line = line.split('|')
        cui = line[0]
        aui = line[7]
        string = line[-5]
        scui = line[9]
        source = line[11]
        term_status = line[2]
        string_type = line[4]
        language = line[1]
        
        aui_info.append((aui,cui,string,scui,source, term_status, string_type, language))
        
        line = fp.readline()
        pbar.update(1)

15218468it [00:55, 300868.72it/s]                                                                                                      

In [4]:
cui2sg = {}

with open('/data/Bodenreider_UMLS_DL/UMLS_VERSIONS/2020AB-ACTIVE/META/MRSTY.RRF','r') as fp:
    
    for line in fp.readlines():
        line = line.split('|')
        cui = line[0]
        sg = line[3]
        cui2sg[cui] = sg
        
original_umls = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/INTERSECT_AUI2ID.PICKLE','rb'))
new_auis = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/UNIQUE_AUI2ID.PICKLE','rb'))

# aui_vecs  = pickle.load(open('/data/Bodenreider_UMLS_DL/Interns/Vishesh/eval_umls/AUI2LAYER.PICKLE','rb'))
all_2020_auis = set(original_umls.keys()).union(new_auis.keys())

In [5]:
cui2aui = {}
aui2cui = {}
aui2scui = {}
cui2preferred = {}

aui2str = {}
str2aui = {}
aui2sg = {}
scui2auis = {}

cui_sg = []
cui_aui = []

for tup in tqdm(aui_info):
    
    tup = {'AUI':tup[0], 'CUI':tup[1], 'STR':tup[2], 'SCUI':tup[3]+'|||'+tup[4], 'SOURCE':tup[4], 'PREF':(tup[5], tup[6]), 'LANG':tup[7]}

    current_time = time.time()
    
    aui = tup['AUI']
    scui = tup['SCUI']
    
    auis = scui2auis.get(scui, [])
    auis.append(aui)
    scui2auis[scui] = auis
    
    aui2scui[aui] = scui
    
    cui = tup['CUI']
    string = tup['STR']
    
    pref = tup['PREF']
    
    if pref[0] == 'P' and pref[1] == 'PF' and tup['LANG'] == 'ENG':
        cui2preferred[cui] = string

    if aui in all_2020_auis:
        scui = tup['SCUI']
        sg = cui2sg[cui]

        auis = cui2aui.get(cui, [])
        auis.append(aui)
        cui2aui[cui] = auis
        
        aui2cui[aui] = cui
        aui2str[aui] = string
        aui2sg[aui] = sg

        auis = str2aui.get(string, [])
        auis.append(aui)
        str2aui[string] = auis
        
        cui_sg.append((cui, sg))
        cui_aui.append((cui, aui))
        
        if (time.time() - current_time) > 5:
            print(tup)
        
semgroups = pd.read_csv('SemGroups.txt',sep='|',header=None)

semtype2sg = {}

for i, row in semgroups.iterrows():
    
    st = row[3]
    sg = row[1]
    
    semtype2sg[st] = sg
    
cuis = []
sts = []

for aui in umls2020AB_df.auis:
    
    cuis.append(aui2cui[aui])
    sts.append(aui2sg[aui])
    
umls2020AB_df['cuis'] = cuis
umls2020AB_df['sem_types'] = sts
umls2020AB_df['sem_groups'] = [semtype2sg[st] for st in sts]


  0%|                                                                                                     | 0/15229292 [00:00<?, ?it/s][A
15229292it [01:10, 300868.72it/s]                                                            | 4454/15229292 [00:01<59:08, 4289.90it/s][A
  0%|▏                                                                                     | 27634/15229292 [00:01<07:57, 31807.39it/s][A
  0%|▏                                                                                     | 41446/15229292 [00:03<23:22, 10832.85it/s][A
  0%|▎                                                                                     | 64962/15229292 [00:03<11:58, 21111.82it/s][A
  1%|▍                                                                                     | 87739/15229292 [00:03<07:27, 33809.23it/s][A
  1%|▌                                                                                    | 111529/15229292 [00:03<05:00, 50277.26it/s][A
  1%|▋                    

  8%|██████▋                                                                            | 1229019/15229292 [00:22<01:55, 121314.00it/s][A
  8%|██████▊                                                                            | 1245911/15229292 [00:22<01:46, 131483.19it/s][A
  8%|██████▉                                                                             | 1262190/15229292 [00:24<10:07, 22994.41it/s][A
  8%|███████                                                                             | 1278272/15229292 [00:24<07:41, 30221.70it/s][A
  9%|███████▏                                                                            | 1295028/15229292 [00:24<05:50, 39792.83it/s][A
  9%|███████▏                                                                            | 1311334/15229292 [00:24<04:33, 50953.77it/s][A
  9%|███████▎                                                                            | 1327529/15229292 [00:24<03:38, 63665.96it/s][A
  9%|███████▍              

 16%|█████████████                                                                      | 2401149/15229292 [00:34<01:01, 208512.44it/s][A
 16%|█████████████▎                                                                      | 2422071/15229292 [00:37<09:32, 22357.69it/s][A
 16%|█████████████▍                                                                      | 2441081/15229292 [00:37<07:10, 29688.47it/s][A
 16%|█████████████▌                                                                      | 2461146/15229292 [00:38<05:21, 39734.77it/s][A
 16%|█████████████▋                                                                      | 2481349/15229292 [00:38<04:03, 52303.77it/s][A
 16%|█████████████▊                                                                      | 2499515/15229292 [00:38<03:18, 64107.56it/s][A
 17%|█████████████▉                                                                      | 2519591/15229292 [00:38<02:37, 80883.21it/s][A
 17%|██████████████        

 23%|███████████████████▍                                                               | 3562234/15229292 [00:46<00:57, 202016.39it/s][A
 24%|███████████████████▌                                                               | 3584794/15229292 [00:46<00:55, 209056.95it/s][A
 24%|███████████████████▋                                                               | 3605720/15229292 [00:46<00:56, 206823.70it/s][A
 24%|███████████████████▊                                                               | 3627944/15229292 [00:46<00:54, 211400.09it/s][A
 24%|███████████████████▉                                                               | 3649102/15229292 [00:47<00:55, 209079.29it/s][A
 24%|████████████████████▏                                                               | 3670027/15229292 [00:50<09:07, 21095.66it/s][A
 24%|████████████████████▎                                                               | 3689942/15229292 [00:50<06:46, 28383.92it/s][A
 24%|████████████████████▍ 

 31%|█████████████████████████▌                                                         | 4690357/15229292 [00:59<01:13, 143534.63it/s][A
 31%|█████████████████████████▋                                                         | 4705541/15229292 [00:59<01:14, 140609.07it/s][A
 31%|█████████████████████████▋                                                         | 4720177/15229292 [00:59<01:15, 138315.56it/s][A
 31%|█████████████████████████▊                                                         | 4734402/15229292 [00:59<01:16, 136355.36it/s][A
 31%|█████████████████████████▉                                                         | 4748455/15229292 [00:59<01:16, 137503.44it/s][A
 31%|█████████████████████████▉                                                         | 4763599/15229292 [00:59<01:13, 141436.46it/s][A
 31%|██████████████████████████                                                         | 4780481/15229292 [00:59<01:09, 149305.84it/s][A
 32%|██████████████████████

 38%|███████████████████████████████▋                                                   | 5811282/15229292 [01:09<00:55, 168181.95it/s][A
 38%|███████████████████████████████▊                                                   | 5833576/15229292 [01:09<00:51, 182535.97it/s][A
 38%|███████████████████████████████▉                                                   | 5852411/15229292 [01:09<00:56, 166053.69it/s][A
 39%|███████████████████████████████▉                                                   | 5869634/15229292 [01:09<00:59, 156937.56it/s][A
 39%|████████████████████████████████                                                   | 5885781/15229292 [01:09<01:00, 154844.12it/s][A
 39%|████████████████████████████████▏                                                  | 5904372/15229292 [01:09<00:57, 163076.60it/s][A
 39%|████████████████████████████████▎                                                  | 5924705/15229292 [01:10<00:53, 174131.66it/s][A
 39%|██████████████████████

 45%|█████████████████████████████████████▎                                             | 6849741/15229292 [01:20<01:02, 133008.80it/s][A
 45%|█████████████████████████████████████▍                                             | 6871113/15229292 [01:20<00:54, 153978.54it/s][A
 45%|█████████████████████████████████████▌                                             | 6889661/15229292 [01:20<00:51, 162503.98it/s][A
 45%|█████████████████████████████████████▋                                             | 6909193/15229292 [01:20<00:48, 171596.05it/s][A
 45%|█████████████████████████████████████▊                                             | 6928680/15229292 [01:20<00:46, 178223.06it/s][A
 46%|█████████████████████████████████████▊                                             | 6948980/15229292 [01:20<00:44, 185382.12it/s][A
 46%|█████████████████████████████████████▉                                             | 6968063/15229292 [01:20<00:44, 186846.49it/s][A
 46%|██████████████████████

 52%|██████████████████████████████████████████▉                                        | 7877186/15229292 [01:31<00:58, 125547.39it/s][A
 52%|███████████████████████████████████████████                                        | 7893396/15229292 [01:31<00:54, 134429.58it/s][A
 52%|███████████████████████████████████████████                                        | 7909316/15229292 [01:31<00:53, 136238.57it/s][A
 52%|███████████████████████████████████████████▏                                       | 7926122/15229292 [01:31<00:50, 144594.43it/s][A
 52%|███████████████████████████████████████████▎                                       | 7945960/15229292 [01:31<00:45, 159266.26it/s][A
 52%|███████████████████████████████████████████▍                                       | 7962971/15229292 [01:31<00:45, 160250.19it/s][A
 52%|███████████████████████████████████████████▌                                       | 7981741/15229292 [01:31<00:43, 168027.71it/s][A
 53%|██████████████████████

 58%|████████████████████████████████████████████████▎                                  | 8868648/15229292 [01:37<00:41, 153746.78it/s][A
 58%|█████████████████████████████████████████████████▌                                   | 8884490/15229292 [01:43<11:23, 9279.70it/s][A
 58%|█████████████████████████████████████████████████                                   | 8900687/15229292 [01:43<08:08, 12957.54it/s][A

{'AUI': 'A32291825', 'CUI': 'C1955933', 'STR': 'p21-Activated Kinase', 'SCUI': 'M0505237|||MSH', 'SOURCE': 'MSH', 'PREF': ('P', 'VO'), 'LANG': 'ENG'}



 59%|█████████████████████████████████████████████████▏                                  | 8919704/15229292 [01:43<05:34, 18852.53it/s][A
 59%|█████████████████████████████████████████████████▎                                  | 8936299/15229292 [01:43<04:06, 25525.66it/s][A
 59%|█████████████████████████████████████████████████▍                                  | 8956588/15229292 [01:43<02:52, 36320.66it/s][A
 59%|█████████████████████████████████████████████████▌                                  | 8975795/15229292 [01:43<02:08, 48779.28it/s][A
 59%|█████████████████████████████████████████████████▌                                  | 8993258/15229292 [01:43<01:42, 61085.28it/s][A
 59%|█████████████████████████████████████████████████▋                                  | 9010263/15229292 [01:43<01:25, 73093.47it/s][A
 59%|█████████████████████████████████████████████████▊                                  | 9026439/15229292 [01:43<01:13, 83834.06it/s][A
 59%|█████████████████████

 64%|█████████████████████████████████████████████████████                              | 9739091/15229292 [01:49<00:38, 141463.64it/s][A
 64%|█████████████████████████████████████████████████████▏                             | 9753246/15229292 [01:49<00:38, 140679.67it/s][A
 64%|█████████████████████████████████████████████████████▎                             | 9771467/15229292 [01:49<00:35, 152907.09it/s][A
 64%|█████████████████████████████████████████████████████▎                             | 9786785/15229292 [01:49<00:37, 145027.16it/s][A
 64%|█████████████████████████████████████████████████████▍                             | 9801384/15229292 [01:49<00:39, 138333.17it/s][A
 64%|█████████████████████████████████████████████████████▍                             | 9815836/15229292 [01:49<00:38, 140064.91it/s][A
 65%|█████████████████████████████████████████████████████▌                             | 9830952/15229292 [01:49<00:37, 143229.07it/s][A
 65%|██████████████████████

{'AUI': 'A19799169', 'CUI': 'C2480759', 'STR': 'Placement @ Anatomical Regions @ Change @ Lower Arm, Right @ External @ Other Device', 'SCUI': '2W0CXY|||ICD10PCS', 'SOURCE': 'ICD10PCS', 'PREF': ('P', 'PF'), 'LANG': 'ENG'}



 66%|███████████████████████████████████████████████████████                            | 10112175/15229292 [01:57<06:14, 13647.50it/s][A
 66%|███████████████████████████████████████████████████████▏                           | 10124270/15229292 [01:58<04:43, 18023.87it/s][A
 67%|███████████████████████████████████████████████████████▏                           | 10137312/15229292 [01:58<03:30, 24211.90it/s][A
 67%|███████████████████████████████████████████████████████▎                           | 10150222/15229292 [01:58<02:39, 31859.99it/s][A
 67%|███████████████████████████████████████████████████████▍                           | 10162761/15229292 [01:58<02:04, 40683.41it/s][A
 67%|███████████████████████████████████████████████████████▍                           | 10175421/15229292 [01:58<01:39, 50887.00it/s][A
 67%|███████████████████████████████████████████████████████▌                           | 10191572/15229292 [01:58<01:15, 66932.75it/s][A
 67%|█████████████████████

 72%|███████████████████████████████████████████████████████████▍                      | 11034277/15229292 [02:04<00:26, 156614.09it/s][A
 73%|███████████████████████████████████████████████████████████▌                      | 11052971/15229292 [02:05<00:25, 165222.54it/s][A
 73%|███████████████████████████████████████████████████████████▌                      | 11069839/15229292 [02:05<00:25, 160376.06it/s][A
 73%|███████████████████████████████████████████████████████████▋                      | 11086140/15229292 [02:05<00:27, 151020.40it/s][A
 73%|███████████████████████████████████████████████████████████▊                      | 11101510/15229292 [02:05<00:27, 148424.45it/s][A
 73%|███████████████████████████████████████████████████████████▊                      | 11116533/15229292 [02:05<00:27, 146896.12it/s][A
 73%|███████████████████████████████████████████████████████████▉                      | 11131343/15229292 [02:05<00:28, 143075.82it/s][A
 73%|██████████████████████

 78%|████████████████████████████████████████████████████████████████▎                 | 11940804/15229292 [02:19<00:22, 143933.06it/s][A
 79%|████████████████████████████████████████████████████████████████▍                 | 11958759/15229292 [02:19<00:21, 153917.45it/s][A
 79%|████████████████████████████████████████████████████████████████▍                 | 11974513/15229292 [02:19<00:21, 150566.15it/s][A
 79%|████████████████████████████████████████████████████████████████▌                 | 11989830/15229292 [02:19<00:21, 147494.23it/s][A
 79%|████████████████████████████████████████████████████████████████▋                 | 12004764/15229292 [02:19<00:22, 143360.23it/s][A
 79%|████████████████████████████████████████████████████████████████▋                 | 12019238/15229292 [02:19<00:22, 141172.49it/s][A
 79%|████████████████████████████████████████████████████████████████▊                 | 12033448/15229292 [02:20<00:22, 140163.85it/s][A
 79%|██████████████████████

 84%|█████████████████████████████████████████████████████████████████████             | 12830672/15229292 [02:26<00:18, 128102.25it/s][A
 84%|█████████████████████████████████████████████████████████████████████▏            | 12844339/15229292 [02:26<00:18, 130475.38it/s][A
 84%|█████████████████████████████████████████████████████████████████████▏            | 12858505/15229292 [02:26<00:17, 133640.78it/s][A
 85%|█████████████████████████████████████████████████████████████████████▎            | 12872194/15229292 [02:26<00:17, 133332.86it/s][A
 85%|█████████████████████████████████████████████████████████████████████▍            | 12886759/15229292 [02:26<00:17, 136902.52it/s][A
 85%|█████████████████████████████████████████████████████████████████████▍            | 12900969/15229292 [02:26<00:16, 138422.53it/s][A
 85%|█████████████████████████████████████████████████████████████████████▌            | 12914935/15229292 [02:26<00:16, 138473.53it/s][A
 85%|██████████████████████

{'AUI': 'A27427942', 'CUI': 'C4294922', 'STR': 'retracted pars tensa bilateral with drainage bloody', 'SCUI': '299265|||MEDCIN', 'SOURCE': 'MEDCIN', 'PREF': ('P', 'PF'), 'LANG': 'ENG'}



 88%|█████████████████████████████████████████████████████████████████████████▍         | 13476038/15229292 [02:38<02:16, 12798.55it/s][A
 89%|█████████████████████████████████████████████████████████████████████████▌         | 13490368/15229292 [02:38<01:42, 16893.14it/s][A
 89%|█████████████████████████████████████████████████████████████████████████▌         | 13507142/15229292 [02:38<01:13, 23407.16it/s][A
 89%|█████████████████████████████████████████████████████████████████████████▋         | 13522461/15229292 [02:39<00:55, 30948.21it/s][A
 89%|█████████████████████████████████████████████████████████████████████████▊         | 13540408/15229292 [02:39<00:39, 42331.62it/s][A
 89%|█████████████████████████████████████████████████████████████████████████▉         | 13556293/15229292 [02:39<00:31, 53225.50it/s][A
 89%|█████████████████████████████████████████████████████████████████████████▉         | 13571686/15229292 [02:39<00:25, 64752.83it/s][A
 89%|█████████████████████

 94%|█████████████████████████████████████████████████████████████████████████████▏    | 14346622/15229292 [02:44<00:05, 166308.33it/s][A
 94%|█████████████████████████████████████████████████████████████████████████████▎    | 14364115/15229292 [02:44<00:05, 168867.83it/s][A
 94%|█████████████████████████████████████████████████████████████████████████████▍    | 14381029/15229292 [02:45<00:05, 165011.77it/s][A
 95%|█████████████████████████████████████████████████████████████████████████████▌    | 14397567/15229292 [02:45<00:05, 155836.55it/s][A
 95%|█████████████████████████████████████████████████████████████████████████████▌    | 14413269/15229292 [02:45<00:05, 147196.16it/s][A
 95%|█████████████████████████████████████████████████████████████████████████████▋    | 14428138/15229292 [02:45<00:05, 137353.95it/s][A
 95%|█████████████████████████████████████████████████████████████████████████████▊    | 14442058/15229292 [02:45<00:05, 134589.98it/s][A
 95%|██████████████████████

 99%|█████████████████████████████████████████████████████████████████████████████████▌| 15153075/15229292 [02:51<00:00, 123370.21it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████▋| 15165466/15229292 [02:51<00:00, 121929.99it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████▋| 15177689/15229292 [02:51<00:00, 120965.48it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████▊| 15191782/15229292 [02:51<00:00, 126628.12it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████▊| 15205343/15229292 [02:51<00:00, 129215.83it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 15229292/15229292 [02:51<00:00, 88638.89it/s][A


NameError: name 'umls2020AB_df' is not defined

In [None]:
str2cuis = {}

for aui in tqdm(aui2str.keys()):
    
    aui_string = aui2str[aui]
    
    cuis = str2cuis.get(aui_string, set())
    cuis.add(aui2cui[aui])
    str2cuis[aui_string] = cuis

In [None]:
str2cui_df = pd.DataFrame(str2cuis.items())

In [None]:
str2cui_df['num_cuis'] = [len(c) for c in str2cui_df[1]]

In [None]:
str2cui_df.sort_values('num_cuis',ascending=False)[:100]

In [18]:
sem_groups = []

for i,row in tqdm(str2cui_df.iterrows(), total=len(str2cui_df)):
    cuis = row[1]
    
    group_row = set()
    
    for cui in cuis:
        group = cui2sg[cui]
        group_row.add(group)
        
    sem_groups.append(group_row)


0it [00:00, ?it/s][A
1it [00:01,  1.12s/it][A
1397it [00:01, 1574.38it/s][A
2775it [00:01, 3267.13it/s][A
4157it [00:01, 5012.07it/s][A
5536it [00:01, 6689.11it/s][A
6798it [00:01, 7443.84it/s][A
7955it [00:01, 7977.90it/s][A
9307it [00:01, 9268.00it/s][A
10698it [00:01, 10426.02it/s][A
12097it [00:02, 11362.00it/s][A
13495it [00:02, 12073.35it/s][A
14870it [00:02, 12540.91it/s][A
16259it [00:02, 12925.71it/s][A
17646it [00:02, 13197.63it/s][A
19028it [00:02, 13378.26it/s][A
20430it [00:02, 13565.42it/s][A
21815it [00:02, 13649.12it/s][A
23202it [00:02, 13712.03it/s][A
24589it [00:02, 13756.54it/s][A
25973it [00:03, 13755.27it/s][A
27366it [00:03, 13806.65it/s][A
28762it [00:03, 13850.64it/s][A
30150it [00:03, 13747.82it/s][A
31532it [00:03, 13767.16it/s][A
32920it [00:03, 13800.34it/s][A
34306it [00:03, 13817.55it/s][A
35689it [00:03, 13795.39it/s][A
37073it [00:03, 13807.39it/s][A
38458it [00:03, 13818.54it/s][A
39853it [00:04, 13854.95it/s][A
41239it 

667119it [00:49, 13702.90it/s][A
668490it [00:49, 13701.12it/s][A
669864it [00:49, 13711.45it/s][A
671242it [00:50, 13731.27it/s][A
672616it [00:50, 13607.87it/s][A
673994it [00:50, 13657.08it/s][A
675375it [00:50, 13702.48it/s][A
676746it [00:50, 13696.75it/s][A
678116it [00:50, 13634.64it/s][A
679482it [00:50, 13639.99it/s][A
680847it [00:50, 13604.84it/s][A
682219it [00:50, 13638.17it/s][A
683583it [00:50, 13601.08it/s][A
684954it [00:51, 13631.41it/s][A
686334it [00:51, 13680.96it/s][A
687718it [00:51, 13726.87it/s][A
689091it [00:51, 13622.42it/s][A
690468it [00:51, 13665.63it/s][A
691841it [00:51, 13683.33it/s][A
693212it [00:51, 13690.00it/s][A
694582it [00:51, 13685.85it/s][A
695961it [00:51, 13715.21it/s][A
697338it [00:51, 13729.23it/s][A
698717it [00:52, 13746.74it/s][A
700092it [00:52, 13724.04it/s][A
701472it [00:52, 13745.83it/s][A
702847it [00:52, 13734.16it/s][A
704221it [00:52, 13724.14it/s][A
705594it [00:52, 13678.05it/s][A
706970it [00:5

1317318it [01:37, 13650.94it/s][A
1318684it [01:37, 13646.57it/s][A
1320064it [01:37, 13689.73it/s][A
1321434it [01:37, 13678.25it/s][A
1322803it [01:38, 13679.83it/s][A
1324173it [01:38, 13685.70it/s][A
1325553it [01:38, 13719.29it/s][A
1326925it [01:38, 13602.83it/s][A
1328294it [01:38, 13627.28it/s][A
1329674it [01:38, 13678.30it/s][A
1331042it [01:38, 13655.99it/s][A
1332412it [01:38, 13666.92it/s][A
1333785it [01:38, 13682.91it/s][A
1335154it [01:38, 13663.19it/s][A
1336527it [01:39, 13666.08it/s][A
1337907it [01:39, 13705.52it/s][A
1339283it [01:39, 13718.90it/s][A
1340655it [01:39, 13688.97it/s][A
1342024it [01:39, 13651.72it/s][A
1343390it [01:39, 13627.94it/s][A
1344761it [01:39, 13650.72it/s][A
1346141it [01:39, 13694.76it/s][A
1347511it [01:39, 13634.98it/s][A
1348883it [01:39, 13658.66it/s][A
1350257it [01:40, 13682.53it/s][A
1351626it [01:40, 13644.55it/s][A
1352991it [01:40, 13629.53it/s][A
1354360it [01:40, 13646.92it/s][A
1355727it [01:40, 13

1970968it [02:24, 14172.48it/s][A
1972388it [02:24, 14178.24it/s][A
1973806it [02:25, 14159.83it/s][A
1975224it [02:25, 14164.75it/s][A
1976661it [02:25, 14224.93it/s][A
1978084it [02:25, 14225.41it/s][A
1979513it [02:25, 14244.33it/s][A
1980938it [02:25, 14218.12it/s][A
1982360it [02:25, 14206.52it/s][A
1983786it [02:25, 14220.92it/s][A
1985209it [02:25, 14216.14it/s][A
1986631it [02:25, 14203.77it/s][A
1988060it [02:26, 14229.44it/s][A
1989483it [02:26, 14224.08it/s][A
1990906it [02:26, 14192.70it/s][A
1992326it [02:26, 14184.35it/s][A
1993757it [02:26, 14221.12it/s][A
1995186it [02:26, 14239.99it/s][A
1996611it [02:26, 14212.50it/s][A
1998039it [02:26, 14232.30it/s][A
1999463it [02:26, 14217.50it/s][A
2000887it [02:26, 14222.08it/s][A
2002310it [02:27, 14192.26it/s][A
2003730it [02:27, 14179.67it/s][A
2005154it [02:27, 14195.95it/s][A
2006574it [02:27, 14192.88it/s][A
2007994it [02:27, 14145.61it/s][A
2009420it [02:27, 14178.05it/s][A
2010846it [02:27, 14

2637435it [03:12, 14237.96it/s][A
2638864it [03:12, 14252.63it/s][A
2640295it [03:12, 14266.84it/s][A
2641726it [03:12, 14279.50it/s][A
2643154it [03:12, 14264.66it/s][A
2644581it [03:12, 14265.69it/s][A
2646008it [03:12, 14253.91it/s][A
2647434it [03:12, 14252.55it/s][A
2648860it [03:12, 14197.58it/s][A
2650292it [03:12, 14233.27it/s][A
2651728it [03:13, 14267.83it/s][A
2653155it [03:13, 14221.82it/s][A
2654595it [03:13, 14273.60it/s][A
2656023it [03:13, 14273.89it/s][A
2657456it [03:13, 14288.54it/s][A
2658885it [03:13, 14265.61it/s][A
2660319it [03:13, 14287.38it/s][A
2661748it [03:13, 14283.48it/s][A
2663178it [03:13, 14286.42it/s][A
2664607it [03:13, 14217.24it/s][A
2666051it [03:14, 14283.32it/s][A
2667481it [03:14, 14287.82it/s][A
2668921it [03:14, 14319.24it/s][A
2670353it [03:14, 14184.84it/s][A
2671784it [03:14, 14220.07it/s][A
2673231it [03:14, 14293.55it/s][A
2674661it [03:14, 14282.95it/s][A
2676090it [03:14, 14259.68it/s][A
2677517it [03:14, 14

3305102it [03:59, 14217.99it/s][A
3306539it [03:59, 14260.79it/s][A
3307966it [03:59, 14256.56it/s][A
3309396it [03:59, 14266.78it/s][A
3310829it [03:59, 14284.88it/s][A
3312258it [03:59, 14140.65it/s][A
3313685it [03:59, 14177.02it/s][A
3315120it [03:59, 14226.92it/s][A
3316543it [04:00, 14212.78it/s][A
3317988it [04:00, 14283.13it/s][A
3319417it [04:00, 14186.33it/s][A
3320847it [04:00, 14220.03it/s][A
3322276it [04:00, 14238.38it/s][A
3323706it [04:00, 14255.08it/s][A
3325132it [04:00, 14187.95it/s][A
3326552it [04:00, 14190.91it/s][A
3327983it [04:00, 14224.96it/s][A
3329411it [04:00, 14239.38it/s][A
3330835it [04:01, 14178.72it/s][A
3332276it [04:01, 14246.20it/s][A
3333701it [04:01, 14233.18it/s][A
3335127it [04:01, 14239.82it/s][A
3336552it [04:01, 14197.22it/s][A
3337972it [04:01, 14142.92it/s][A
3339387it [04:01, 11107.06it/s][A
3340810it [04:01, 11889.99it/s][A
3342209it [04:01, 12442.12it/s][A
3343643it [04:02, 12959.15it/s][A
3345063it [04:02, 13

3974656it [04:46, 14112.55it/s][A
3976096it [04:46, 14195.84it/s][A
3977545it [04:46, 14280.74it/s][A
3978974it [04:46, 14278.74it/s][A
3980403it [04:47, 14076.50it/s][A
3981836it [04:47, 14149.40it/s][A
3983266it [04:47, 14193.72it/s][A
3984695it [04:47, 14221.09it/s][A
3986127it [04:47, 14247.87it/s][A
3987553it [04:47, 14165.06it/s][A
3988972it [04:47, 14171.30it/s][A
3990395it [04:47, 14186.15it/s][A
3991826it [04:47, 14220.95it/s][A
3993249it [04:47, 14175.49it/s][A
3994687it [04:48, 14234.47it/s][A
3996111it [04:48, 14226.90it/s][A
3997534it [04:48, 14195.78it/s][A
3998959it [04:48, 14210.58it/s][A
4000400it [04:48, 14269.50it/s][A
4001839it [04:48, 14303.92it/s][A
4003270it [04:48, 14235.67it/s][A
4004694it [04:48, 14227.37it/s][A
4006128it [04:48, 14258.72it/s][A
4007561it [04:48, 14277.83it/s][A
4008989it [04:49, 14214.61it/s][A
4010418it [04:49, 14236.76it/s][A
4011842it [04:49, 14235.82it/s][A
4013267it [04:49, 14238.33it/s][A
4014691it [04:49, 14

4643020it [06:56, 14424.01it/s][A
4644463it [06:56, 14405.66it/s][A
4645920it [06:56, 14452.97it/s][A
4647373it [06:57, 14475.27it/s][A
4648828it [06:57, 14496.22it/s][A
4650287it [06:57, 14521.92it/s][A
4651743it [06:57, 14530.93it/s][A
4653197it [06:57, 14519.05it/s][A
4654649it [06:57, 14357.94it/s][A
4656092it [06:57, 14378.18it/s][A
4657551it [06:57, 14440.64it/s][A
4659001it [06:57, 14456.76it/s][A
4660447it [06:57, 14456.68it/s][A
4661893it [06:58, 14344.01it/s][A
4663348it [06:58, 14403.14it/s][A
4664811it [06:58, 14470.41it/s][A
4666261it [06:58, 14477.33it/s][A
4667709it [06:58, 14357.02it/s][A
4669159it [06:58, 14397.23it/s][A
4670619it [06:58, 14456.24it/s][A
4672072it [06:58, 14477.82it/s][A
4673526it [06:58, 14495.50it/s][A
4674976it [06:59, 14368.18it/s][A
4676431it [06:59, 14420.90it/s][A
4677880it [06:59, 14440.28it/s][A
4679325it [06:59, 14376.86it/s][A
4680784it [06:59, 14438.38it/s][A
4682228it [06:59, 14437.01it/s][A
4683681it [06:59, 14

5318634it [07:43, 14324.08it/s][A
5320077it [07:43, 14353.45it/s][A
5321522it [07:44, 14379.49it/s][A
5322961it [07:44, 14306.24it/s][A
5324396it [07:44, 14318.10it/s][A
5325851it [07:44, 14385.20it/s][A
5327302it [07:44, 14420.58it/s][A
5328745it [07:44, 14392.37it/s][A
5330196it [07:44, 14425.64it/s][A
5331647it [07:44, 14450.64it/s][A
5333093it [07:44, 14377.37it/s][A
5334532it [07:44, 14379.95it/s][A
5335982it [07:45, 14413.73it/s][A
5337426it [07:45, 14421.27it/s][A
5338869it [07:45, 14396.28it/s][A
5340309it [07:45, 14384.01it/s][A
5341750it [07:45, 14391.12it/s][A
5343190it [07:45, 14373.73it/s][A
5344628it [07:45, 14366.82it/s][A
5346065it [07:45, 14365.68it/s][A
5347504it [07:45, 14370.08it/s][A
5348942it [07:45, 14332.99it/s][A
5350376it [07:46, 14306.19it/s][A
5351829it [07:46, 14372.08it/s][A
5353279it [07:46, 14409.48it/s][A
5354720it [07:46, 14405.13it/s][A
5356161it [07:46, 14396.62it/s][A
5357601it [07:46, 14376.81it/s][A
5359039it [07:46, 14

5968252it [08:31, 13856.88it/s][A
5969638it [08:31, 13853.50it/s][A
5971024it [08:31, 13801.46it/s][A
5972406it [08:31, 13804.69it/s][A
5973790it [08:31, 13814.02it/s][A
5975173it [08:31, 13817.86it/s][A
5976555it [08:31, 13782.10it/s][A
5977942it [08:31, 13806.72it/s][A
5979323it [08:31, 13768.39it/s][A
5980700it [08:31, 13735.06it/s][A
5982097it [08:32, 13802.25it/s][A
5983486it [08:32, 13827.48it/s][A
5984877it [08:32, 13850.95it/s][A
5986263it [08:32, 13747.02it/s][A
5987638it [08:32, 13740.27it/s][A
5989019it [08:32, 13760.61it/s][A
5990403it [08:32, 13782.35it/s][A
5991782it [08:32, 13767.12it/s][A
5993160it [08:32, 13770.41it/s][A
5994538it [08:32, 13768.60it/s][A
5995927it [08:33, 13802.52it/s][A
5997308it [08:33, 13785.98it/s][A
5998688it [08:33, 13789.43it/s][A
6000067it [08:33, 13767.34it/s][A
6001448it [08:33, 13779.90it/s][A
6002827it [08:33, 13720.40it/s][A
6004202it [08:33, 13727.72it/s][A
6005591it [08:33, 13775.96it/s][A
6006972it [08:33, 13

6636106it [09:18, 14353.77it/s][A
6637542it [09:18, 14291.42it/s][A
6638980it [09:18, 14316.62it/s][A
6640412it [09:18, 14312.74it/s][A
6641845it [09:18, 14315.42it/s][A
6643277it [09:18, 14273.44it/s][A
6644705it [09:18, 14273.35it/s][A
6646148it [09:18, 14319.97it/s][A
6647581it [09:18, 14289.43it/s][A
6649010it [09:19, 14123.73it/s][A
6650436it [09:19, 14162.35it/s][A
6651884it [09:19, 14255.12it/s][A
6653322it [09:19, 14291.84it/s][A
6654756it [09:19, 14303.98it/s][A
6656187it [09:19, 14272.51it/s][A
6657621it [09:19, 14290.85it/s][A
6659057it [09:19, 14310.29it/s][A
6660492it [09:19, 14321.31it/s][A
6661931it [09:20, 14341.55it/s][A
6663366it [09:20, 14333.30it/s][A
6664800it [09:20, 14332.17it/s][A
6666234it [09:20, 14176.42it/s][A
6667656it [09:20, 14186.63it/s][A
6669090it [09:20, 14229.33it/s][A
6670524it [09:20, 14261.23it/s][A
6671956it [09:20, 14277.60it/s][A
6673384it [09:20, 14248.00it/s][A
6674811it [09:20, 14251.63it/s][A
6676237it [09:21, 14

7298277it [10:05, 13655.96it/s][A
7299660it [10:05, 13707.56it/s][A
7301047it [10:05, 13755.61it/s][A
7302423it [10:05, 13703.43it/s][A
7303806it [10:05, 13739.25it/s][A
7305203it [10:05, 13805.56it/s][A
7306584it [10:06, 13802.83it/s][A
7307965it [10:06, 13659.08it/s][A
7309354it [10:06, 13725.73it/s][A
7310741it [10:06, 13766.61it/s][A
7312125it [10:06, 13788.20it/s][A
7313510it [10:06, 13805.42it/s][A
7314891it [10:06, 13673.54it/s][A
7316277it [10:06, 13727.16it/s][A
7317662it [10:06, 13762.66it/s][A
7319057it [10:06, 13815.65it/s][A
7320446it [10:07, 13836.44it/s][A
7321838it [10:07, 13839.65it/s][A
7323226it [10:07, 13851.45it/s][A
7324612it [10:07, 13843.62it/s][A
7325998it [10:07, 13846.54it/s][A
7327390it [10:07, 13865.83it/s][A
7328777it [10:07, 13851.62it/s][A
7330163it [10:07, 13835.44it/s][A
7331547it [10:07, 13540.51it/s][A
7332919it [10:07, 13592.38it/s][A
7334306it [10:08, 13673.88it/s][A
7335691it [10:08, 13725.15it/s][A
7337079it [10:08, 13

In [19]:
str2cui_df['sem_groups'] = ['|||'.join([str(s) for s in np.sort(list(gs),kind='stable')]) for gs in tqdm(sem_groups)]


  0%|                                                                                                      | 0/7931549 [00:00<?, ?it/s][A
  0%|▏                                                                                     | 15037/7931549 [00:00<00:52, 150357.96it/s][A
  0%|▎                                                                                     | 30244/7931549 [00:00<00:52, 151358.72it/s][A
  1%|▍                                                                                     | 45589/7931549 [00:00<00:51, 152307.54it/s][A
  1%|▋                                                                                     | 60966/7931549 [00:00<00:51, 152881.05it/s][A
  1%|▊                                                                                     | 76255/7931549 [00:00<00:51, 152404.39it/s][A
  1%|▉                                                                                     | 91947/7931549 [00:00<00:50, 153930.04it/s][A
  1%|█▏                   

 23%|███████████████████▎                                                                | 1823223/7931549 [00:11<00:43, 139068.93it/s][A
 23%|███████████████████▍                                                                | 1838380/7931549 [00:11<00:42, 142586.83it/s][A
 23%|███████████████████▋                                                                | 1853489/7931549 [00:12<00:41, 145023.82it/s][A
 24%|███████████████████▊                                                                | 1868404/7931549 [00:12<00:42, 144314.28it/s][A
 24%|███████████████████▉                                                                | 1884174/7931549 [00:12<00:40, 148201.38it/s][A
 24%|████████████████████                                                                | 1899618/7931549 [00:12<00:40, 150030.18it/s][A
 24%|████████████████████▎                                                               | 1914936/7931549 [00:12<00:39, 150958.43it/s][A
 24%|████████████████████▍ 

 45%|██████████████████████████████████████▏                                             | 3608743/7931549 [00:23<00:28, 152144.55it/s][A
 46%|██████████████████████████████████████▍                                             | 3624224/7931549 [00:23<00:28, 152924.81it/s][A
 46%|██████████████████████████████████████▌                                             | 3640003/7931549 [00:23<00:27, 154361.16it/s][A
 46%|██████████████████████████████████████▋                                             | 3655874/7931549 [00:23<00:27, 155650.50it/s][A
 46%|██████████████████████████████████████▉                                             | 3671449/7931549 [00:23<00:27, 154974.55it/s][A
 46%|███████████████████████████████████████                                             | 3686954/7931549 [00:24<00:27, 154336.35it/s][A
 47%|███████████████████████████████████████▏                                            | 3702393/7931549 [00:24<00:27, 154289.00it/s][A
 47%|██████████████████████

 68%|█████████████████████████████████████████████████████████▏                          | 5395329/7931549 [00:35<00:16, 152158.71it/s][A
 68%|█████████████████████████████████████████████████████████▎                          | 5410761/7931549 [00:35<00:16, 152797.76it/s][A
 68%|█████████████████████████████████████████████████████████▍                          | 5426046/7931549 [00:35<00:16, 152544.32it/s][A
 69%|█████████████████████████████████████████████████████████▋                          | 5441305/7931549 [00:35<00:16, 151521.34it/s][A
 69%|█████████████████████████████████████████████████████████▊                          | 5456461/7931549 [00:35<00:16, 149858.35it/s][A
 69%|█████████████████████████████████████████████████████████▉                          | 5471907/7931549 [00:35<00:16, 151215.94it/s][A
 69%|██████████████████████████████████████████████████████████                          | 5487146/7931549 [00:36<00:16, 151560.81it/s][A
 69%|██████████████████████

 90%|███████████████████████████████████████████████████████████████████████████▉        | 7170522/7931549 [00:47<00:05, 152197.44it/s][A
 91%|████████████████████████████████████████████████████████████████████████████        | 7185743/7931549 [00:47<00:04, 150190.38it/s][A
 91%|████████████████████████████████████████████████████████████████████████████▎       | 7201183/7931549 [00:47<00:04, 151312.84it/s][A
 91%|████████████████████████████████████████████████████████████████████████████▍       | 7216584/7931549 [00:47<00:04, 152110.65it/s][A
 91%|████████████████████████████████████████████████████████████████████████████▌       | 7232050/7931549 [00:47<00:04, 152868.30it/s][A
 91%|████████████████████████████████████████████████████████████████████████████▊       | 7247626/7931549 [00:47<00:04, 153728.97it/s][A
 92%|████████████████████████████████████████████████████████████████████████████▉       | 7263002/7931549 [00:47<00:04, 153615.52it/s][A
 92%|██████████████████████

In [20]:
ambiguous_strings = str2cui_df[str2cui_df.num_cuis > 1]

In [60]:
ambiguous_strings.groupby('sem_groups').count().sort_values(0,ascending=False)[:50]

Unnamed: 0_level_0,0,1,num_cuis
sem_groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gene or Genome,10452,10452,10452
Biologically Active Substance|||Gene or Genome,3885,3885,3885
Enzyme|||Gene or Genome,2312,2312,2312
Disease or Syndrome|||Gene or Genome,2295,2295,2295
Clinical Attribute|||Intellectual Product,1818,1818,1818
Neoplastic Process,1603,1603,1603
Clinical Drug,1343,1343,1343
Disease or Syndrome,1325,1325,1325
Clinical Attribute,1253,1253,1253
Clinical Attribute|||Finding,1162,1162,1162


In [22]:
str2cui_df.groupby('num_cuis').count().sort_values(0,ascending=False)

Unnamed: 0_level_0,0,1,sem_groups
num_cuis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,7865805,7865805,7865805
2,54123,54123,54123
3,8063,8063,8063
4,1951,1951,1951
5,716,716,716
6,350,350,350
7,187,187,187
8,120,120,120
9,63,63,63
10,43,43,43


In [23]:
source_syn_candidates = []
source_syn_plus = []

for aui in tqdm(umls2020AB_df.auis):
    scui = aui2scui[aui]
    
    if scui.split('|||')[0] != '':       
        #Only get terms with source synonymy
        source_syns = list(set(scui2auis[scui]))
        if len(source_syns) > 500:
            break
            
        #For each source synonym, get all its 2020AA defined synonyms and add them to the candidate list
        all_syns = copy.deepcopy(source_syns)
        for source_syn_aui in source_syns:
            if source_syn_aui in aui2cui:
                AA_syns = cui2aui[aui2cui[source_syn_aui]]
                all_syns.extend(AA_syns)
    else:
        source_syns = []
        all_syns = []
        
    source_syn_candidates.append(source_syns)
    source_syn_plus.append(all_syns)
    
umls2020AB_df['source_syns'] = source_syn_candidates
umls2020AB_df['source_syns_plus'] = source_syn_plus


  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  2%|█▍                                                                                       | 7060/430135 [00:00<00:05, 70582.40it/s][A
  4%|███▎                                                                                    | 16388/430135 [00:00<00:04, 83927.31it/s][A
  6%|█████▎                                                                                  | 26261/430135 [00:00<00:04, 90677.37it/s][A
  8%|███████▍                                                                                | 36337/430135 [00:00<00:04, 94652.04it/s][A
 11%|█████████▎                                                                              | 45803/430135 [00:00<00:04, 93432.78it/s][A
 13%|███████████▎                                                                            | 55150/430135 [00:00<00:04, 91754.65it/s][A
 15%|█████████████▎       

## Evaluating kNN Models + Source Synonymy 

In [24]:
aui_columns = umls2020AB_df.filter(regex='.*_auis').columns

In [25]:
aui_columns

Index(['sapbert_2000-NN_auis'], dtype='object')

In [26]:
query_synonym_auis = list(umls2020AB_df['2020AA_synonyms'])
source_syns = umls2020AB_df['source_syns']

for aui_col in aui_columns:
    print(aui_col)
    aui_name = aui_col.split('_auis')[0]
    nearest_neighbors_auis = umls2020AB_df[aui_col]

    #Calculating Recall @ 1,5,10,50,100
    recall_array = []

    for true_syn, top100, source in tqdm(zip(query_synonym_auis, nearest_neighbors_auis, source_syns)):

        true_syn = set(true_syn)

        source = copy.deepcopy(list(set(source)))

        if source is not None:
            source_syn_num = len(source)        
            source.extend(top100)
        else:
            source = top100        
            source_syn_num = 0

        if len(true_syn) > 0:
            recalls = []

            for n in [0,1,5,10,50,100,200,500,1000,2000]:

                topn = set(source[:n+source_syn_num])
                true_pos = topn.intersection(true_syn)

                recalls.append(len(true_pos)/len(true_syn))

            recall_array.append(recalls)
        else:
            recalls = []

            recall_array.append(recalls)

    umls2020AB_df['{}_source_syn_recall'.format(aui_name)] = recall_array

sapbert_2000-NN_auis



0it [00:00, ?it/s][A
585it [00:00, 5837.06it/s][A
1169it [00:00, 5819.33it/s][A
1841it [00:00, 6223.27it/s][A
2545it [00:00, 6543.60it/s][A
3200it [00:00, 6414.60it/s][A
3842it [00:00, 6224.71it/s][A
4522it [00:00, 6402.64it/s][A
5164it [00:00, 6379.48it/s][A
5803it [00:00, 6294.74it/s][A
6434it [00:01, 6099.33it/s][A
7130it [00:01, 6348.30it/s][A
7767it [00:01, 6352.69it/s][A
8404it [00:01, 6282.96it/s][A
9034it [00:01, 6158.94it/s][A
9677it [00:01, 6235.28it/s][A
10302it [00:01, 6238.58it/s][A
10927it [00:01, 6003.49it/s][A
11856it [00:01, 6952.81it/s][A
12557it [00:02, 5724.15it/s][A
13293it [00:02, 6139.66it/s][A
13942it [00:02, 6099.06it/s][A
14576it [00:02, 5520.99it/s][A
15164it [00:02, 5611.41it/s][A
15744it [00:02, 5655.84it/s][A
16324it [00:02, 5238.20it/s][A
16882it [00:02, 5329.24it/s][A
17426it [00:02, 5280.63it/s][A
17962it [00:03, 5026.85it/s][A
18593it [00:03, 5376.83it/s][A
19139it [00:03, 5329.72it/s][A
19678it [00:03, 5190.81it/s][A
2

392255it [01:01, 5383.45it/s][A
392874it [01:01, 4797.01it/s][A
393417it [01:01, 4126.60it/s][A
393884it [01:02, 3702.66it/s][A
394293it [01:02, 3454.76it/s][A
394664it [01:02, 3315.79it/s][A
395011it [01:02, 3261.69it/s][A
395347it [01:02, 3140.40it/s][A
395667it [01:02, 3069.33it/s][A
395977it [01:02, 2972.95it/s][A
396276it [01:02, 2956.14it/s][A
396591it [01:03, 3006.44it/s][A
396905it [01:03, 3041.79it/s][A
397211it [01:03, 3037.45it/s][A
397529it [01:03, 3076.69it/s][A
397838it [01:03, 3071.94it/s][A
398158it [01:03, 3107.51it/s][A
398481it [01:03, 3142.21it/s][A
398796it [01:03, 3105.60it/s][A
399353it [01:03, 3829.52it/s][A
399738it [01:03, 3586.34it/s][A
400126it [01:04, 3668.76it/s][A
400497it [01:04, 3464.55it/s][A
400848it [01:04, 3284.74it/s][A
401181it [01:04, 3179.67it/s][A
401502it [01:04, 3144.20it/s][A
401819it [01:04, 3036.67it/s][A
402125it [01:04, 2907.86it/s][A
402427it [01:04, 2937.47it/s][A
402730it [01:04, 2961.34it/s][A
403028it [

In [27]:
query_synonym_auis = list(umls2020AB_df['2020AA_synonyms'])
source_syns = umls2020AB_df['source_syns_plus']

for aui_col in aui_columns:
    print(aui_col)

    aui_name = aui_col.split('_auis')[0]
    nearest_neighbors_auis = umls2020AB_df[aui_col]

    #Calculating Recall @ 1,5,10,50,100
    recall_array = []

    for true_syn, top100, source in tqdm(zip(query_synonym_auis, nearest_neighbors_auis, source_syns),total=len(query_synonym_auis)):

        true_syn = set(true_syn)

        source = copy.deepcopy(list(set(source)))

        if source is not None:
            source_syn_num = len(source)        
            source.extend(top100)
        else:
            source = top100        
            source_syn_num = 0

        if len(true_syn) > 0:
            recalls = []

            for n in [0,1,5,10,50,100,200,500,1000,2000]:

                topn = set(source[:n+source_syn_num])
                true_pos = topn.intersection(true_syn)

                recalls.append(len(true_pos)/len(true_syn))

            recall_array.append(recalls)
        else:
            recalls = []

            recall_array.append(recalls)

    umls2020AB_df['{}_source_syn_plus_recall'.format(aui_name)] = recall_array

sapbert_2000-NN_auis



  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  0%|                                                                                           | 583/430135 [00:00<01:13, 5824.19it/s][A
  0%|▏                                                                                         | 1166/430135 [00:00<01:16, 5622.83it/s][A
  0%|▍                                                                                         | 1855/430135 [00:00<01:09, 6189.92it/s][A
  1%|▌                                                                                         | 2493/430135 [00:00<01:08, 6259.12it/s][A
  1%|▋                                                                                         | 3135/430135 [00:00<01:07, 6316.10it/s][A
  1%|▊                                                                                         | 3768/430135 [00:00<01:09, 6106.72it/s][A
  1%|▉                    

 19%|████████████████▊                                                                        | 81054/430135 [00:13<01:10, 4926.78it/s][A
 19%|████████████████▉                                                                        | 81572/430135 [00:14<01:14, 4668.39it/s][A
 19%|████████████████▉                                                                        | 82053/430135 [00:14<01:18, 4435.07it/s][A
 19%|█████████████████                                                                        | 82504/430135 [00:14<01:18, 4431.28it/s][A
 19%|█████████████████▏                                                                       | 82952/430135 [00:14<01:20, 4300.08it/s][A
 19%|█████████████████▎                                                                       | 83392/430135 [00:14<01:20, 4322.43it/s][A
 19%|█████████████████▎                                                                       | 83848/430135 [00:14<01:18, 4383.56it/s][A
 20%|█████████████████▋    

 43%|█████████████████████████████████████▍                                                  | 183121/430135 [00:29<01:02, 3937.51it/s][A
 43%|█████████████████████████████████████▌                                                  | 183523/430135 [00:29<01:02, 3930.99it/s][A
 43%|█████████████████████████████████████▋                                                  | 183922/430135 [00:29<01:07, 3624.27it/s][A
 43%|█████████████████████████████████████▋                                                  | 184292/430135 [00:29<01:09, 3520.65it/s][A
 43%|█████████████████████████████████████▊                                                  | 184649/430135 [00:29<01:10, 3501.21it/s][A
 43%|█████████████████████████████████████▊                                                  | 185043/430135 [00:29<01:07, 3621.67it/s][A
 43%|█████████████████████████████████████▉                                                  | 185580/430135 [00:29<00:59, 4111.67it/s][A
 43%|██████████████████████

 66%|██████████████████████████████████████████████████████████▎                             | 285257/430135 [00:43<00:24, 5910.29it/s][A
 66%|██████████████████████████████████████████████████████████▍                             | 285868/430135 [00:44<00:25, 5614.03it/s][A
 67%|██████████████████████████████████████████████████████████▌                             | 286443/430135 [00:44<00:25, 5560.98it/s][A
 67%|██████████████████████████████████████████████████████████▋                             | 287008/430135 [00:44<00:26, 5336.53it/s][A
 67%|██████████████████████████████████████████████████████████▊                             | 287547/430135 [00:44<00:27, 5273.78it/s][A
 67%|██████████████████████████████████████████████████████████▉                             | 288152/430135 [00:44<00:25, 5482.35it/s][A
 67%|███████████████████████████████████████████████████████████                             | 288705/430135 [00:44<00:25, 5471.09it/s][A
 67%|██████████████████████

 85%|███████████████████████████████████████████████████████████████████████████             | 366941/430135 [00:57<00:12, 4919.18it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▏            | 367449/430135 [00:57<00:12, 4880.07it/s][A
 86%|███████████████████████████████████████████████████████████████████████████▎            | 367948/430135 [00:57<00:13, 4676.55it/s][A
 86%|███████████████████████████████████████████████████████████████████████████▎            | 368424/430135 [00:58<00:13, 4479.74it/s][A
 86%|███████████████████████████████████████████████████████████████████████████▍            | 368878/430135 [00:58<00:14, 4364.00it/s][A
 86%|███████████████████████████████████████████████████████████████████████████▌            | 369318/430135 [00:58<00:13, 4351.31it/s][A
 86%|███████████████████████████████████████████████████████████████████████████▋            | 369756/430135 [00:58<00:14, 4312.37it/s][A
 86%|██████████████████████

 99%|███████████████████████████████████████████████████████████████████████████████████████ | 425610/430135 [01:11<00:01, 2703.06it/s][A
 99%|███████████████████████████████████████████████████████████████████████████████████████▏| 425889/430135 [01:11<00:01, 2725.24it/s][A
 99%|███████████████████████████████████████████████████████████████████████████████████████▏| 426173/430135 [01:12<00:01, 2754.48it/s][A
 99%|███████████████████████████████████████████████████████████████████████████████████████▏| 426467/430135 [01:12<00:01, 2805.61it/s][A
 99%|███████████████████████████████████████████████████████████████████████████████████████▎| 426757/430135 [01:12<00:01, 2830.29it/s][A
 99%|███████████████████████████████████████████████████████████████████████████████████████▎| 427041/430135 [01:12<00:01, 2760.73it/s][A
 99%|███████████████████████████████████████████████████████████████████████████████████████▍| 427319/430135 [01:12<00:01, 2764.10it/s][A
 99%|██████████████████████

In [28]:
recall_df = []
names = []

for recall_col in umls2020AB_df.filter(regex='.*recall.*').columns:
    names.append(recall_col)
    recall_array = list(umls2020AB_df[recall_col].values)
    recall_df.append(pd.DataFrame(recall_array).agg(['mean']))
    

recall_df = pd.concat(recall_df)
recall_df['model'] = names

recall_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,model
mean,0.2,0.44,0.53,0.71,0.76,0.81,0.86,0.88,0.89,,sapbert_2000-NN_recall
mean,0.35,0.46,0.63,0.7,0.83,0.86,0.9,0.93,0.94,0.95,sapbert_2000-NN_source_syn_recall
mean,0.83,0.85,0.89,0.9,0.94,0.95,0.97,0.99,0.99,0.99,sapbert_2000-NN_source_syn_plus_recall


## RW-UVA Using CUIs

In [29]:
query_synonym_auis = list(umls2020AB_df['2020AA_synonyms'])
umls2020AB_df['2020AA_synonyms_cuis'] = [[aui2cui[aui] for aui in true_syn] for true_syn in tqdm(query_synonym_auis)]


  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  3%|██▌                                                                                    | 12490/430135 [00:00<00:03, 124873.62it/s][A
  6%|█████▌                                                                                 | 27219/430135 [00:00<00:02, 138029.24it/s][A
 10%|████████▎                                                                              | 41022/430135 [00:00<00:02, 137473.29it/s][A
 16%|█████████████▊                                                                         | 68285/430135 [00:00<00:01, 190691.09it/s][A
 25%|█████████████████████▊                                                                | 108966/430135 [00:00<00:01, 268490.50it/s][A
 36%|███████████████████████████████▎                                                      | 156351/430135 [00:00<00:00, 338229.80it/s][A
 47%|█████████████████████

In [30]:
source_syns = umls2020AB_df['source_syns']

source_syn_cuis = []

for source_syn_row in tqdm(source_syns):
    
    source_syn_row_2020AA = []
    
    for source_syn_aui in source_syn_row:
        if source_syn_aui in aui2cui:
            source_syn_row_2020AA.append(aui2cui[source_syn_aui])
    
    source_syn_cuis.append(source_syn_row_2020AA)
    
umls2020AB_df['source_syns_cuis'] = source_syn_cuis


  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  8%|███████▎                                                                               | 35966/430135 [00:00<00:01, 359639.76it/s][A
 17%|██████████████▋                                                                        | 72630/430135 [00:00<00:00, 363743.38it/s][A
 25%|█████████████████████▊                                                                | 109005/430135 [00:00<00:00, 346723.00it/s][A
 33%|████████████████████████████▋                                                         | 143768/430135 [00:00<00:00, 340126.31it/s][A
 42%|███████████████████████████████████▊                                                  | 178924/430135 [00:00<00:00, 344107.85it/s][A
 50%|██████████████████████████████████████████▋                                           | 213384/430135 [00:00<00:00, 329558.09it/s][A
 58%|█████████████████████

In [31]:
for aui_col in aui_columns:
    aui_name = aui_col.split('_auis')[0]
    nearest_neighbors_auis = umls2020AB_df[aui_col]
    umls2020AB_df['{}_cuis'.format(aui_name)] = [[aui2cui[aui] for aui in pred_syn] for pred_syn in tqdm(nearest_neighbors_auis)]


  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  0%|                                                                                           | 144/430135 [00:00<04:59, 1436.22it/s][A
  0%|                                                                                           | 307/430135 [00:00<04:37, 1546.41it/s][A
  0%|                                                                                           | 469/430135 [00:00<04:32, 1577.60it/s][A
  0%|▏                                                                                          | 659/430135 [00:00<04:12, 1700.55it/s][A
  0%|▏                                                                                          | 833/430135 [00:00<04:10, 1714.04it/s][A
  0%|▏                                                                                         | 1005/430135 [00:00<04:40, 1527.62it/s][A
  0%|▏                    

  3%|██▋                                                                                       | 12941/430135 [00:13<07:38, 908.95it/s][A
  3%|██▋                                                                                       | 13036/430135 [00:13<10:03, 690.98it/s][A
  3%|██▋                                                                                       | 13124/430135 [00:13<09:29, 732.66it/s][A
  3%|██▊                                                                                       | 13222/430135 [00:13<08:46, 792.47it/s][A
  3%|██▊                                                                                       | 13310/430135 [00:13<08:32, 813.02it/s][A
  3%|██▊                                                                                       | 13397/430135 [00:14<09:42, 715.63it/s][A
  3%|██▊                                                                                       | 13486/430135 [00:14<09:10, 757.37it/s][A
  3%|██▊                   

  6%|█████▎                                                                                    | 25183/430135 [00:26<07:26, 906.98it/s][A
  6%|█████▎                                                                                    | 25281/430135 [00:26<07:17, 925.21it/s][A
  6%|█████▎                                                                                    | 25376/430135 [00:27<07:26, 905.91it/s][A
  6%|█████▎                                                                                    | 25478/430135 [00:27<07:11, 937.31it/s][A
  6%|█████▎                                                                                   | 25603/430135 [00:27<06:34, 1026.35it/s][A
  6%|█████▎                                                                                   | 25714/430135 [00:27<06:26, 1046.91it/s][A
  6%|█████▎                                                                                   | 25821/430135 [00:27<06:24, 1051.38it/s][A
  6%|█████▍                

  9%|████████                                                                                  | 38371/430135 [00:41<08:51, 736.77it/s][A
  9%|████████                                                                                  | 38459/430135 [00:41<08:27, 772.34it/s][A
  9%|████████                                                                                  | 38542/430135 [00:41<08:18, 786.00it/s][A
  9%|████████                                                                                  | 38666/430135 [00:41<07:11, 907.56it/s][A
  9%|████████                                                                                  | 38760/430135 [00:41<07:25, 878.61it/s][A
  9%|████████▏                                                                                 | 38856/430135 [00:41<07:14, 901.23it/s][A
  9%|████████▏                                                                                 | 38954/430135 [00:41<07:04, 922.27it/s][A
  9%|████████▏             

 12%|██████████▎                                                                              | 49893/430135 [00:55<05:39, 1119.30it/s][A
 12%|██████████▎                                                                              | 50084/430135 [00:55<04:43, 1341.05it/s][A
 12%|██████████▍                                                                              | 50222/430135 [00:55<05:25, 1167.10it/s][A
 12%|██████████▍                                                                              | 50346/430135 [00:55<06:13, 1015.56it/s][A
 12%|██████████▌                                                                               | 50455/430135 [00:55<06:26, 981.54it/s][A
 12%|██████████▌                                                                               | 50559/430135 [00:55<06:33, 965.55it/s][A
 12%|██████████▍                                                                              | 50687/430135 [00:55<06:04, 1042.38it/s][A
 12%|██████████▋           

 14%|█████████████                                                                             | 62197/430135 [01:09<08:01, 764.34it/s][A
 14%|█████████████                                                                             | 62280/430135 [01:09<07:54, 775.55it/s][A
 14%|█████████████                                                                             | 62363/430135 [01:09<07:47, 786.25it/s][A
 15%|█████████████                                                                             | 62445/430135 [01:09<07:42, 794.74it/s][A
 15%|█████████████                                                                             | 62527/430135 [01:09<07:41, 796.05it/s][A
 15%|█████████████                                                                             | 62609/430135 [01:09<07:47, 786.61it/s][A
 15%|█████████████                                                                             | 62689/430135 [01:10<08:54, 688.02it/s][A
 15%|█████████████▏        

 17%|███████████████▍                                                                          | 73526/430135 [01:23<06:34, 904.47it/s][A
 17%|███████████████▍                                                                          | 73619/430135 [01:23<06:53, 861.64it/s][A
 17%|███████████████▍                                                                          | 73707/430135 [01:23<06:52, 863.43it/s][A
 17%|███████████████▍                                                                          | 73795/430135 [01:23<08:00, 741.66it/s][A
 17%|███████████████▍                                                                          | 73881/430135 [01:24<07:41, 771.32it/s][A
 17%|███████████████▍                                                                          | 73971/430135 [01:24<07:22, 805.24it/s][A
 17%|███████████████▍                                                                          | 74057/430135 [01:24<07:14, 819.10it/s][A
 17%|███████████████▌      

 20%|█████████████████▌                                                                        | 84211/430135 [01:37<07:47, 740.68it/s][A
 20%|█████████████████▋                                                                        | 84298/430135 [01:37<07:27, 773.51it/s][A
 20%|█████████████████▋                                                                        | 84392/430135 [01:37<07:03, 816.75it/s][A
 20%|█████████████████▋                                                                        | 84477/430135 [01:37<07:07, 808.92it/s][A
 20%|█████████████████▋                                                                        | 84568/430135 [01:37<06:53, 834.89it/s][A
 20%|█████████████████▋                                                                        | 84661/430135 [01:37<06:41, 859.53it/s][A
 20%|█████████████████▋                                                                        | 84752/430135 [01:37<06:35, 873.41it/s][A
 20%|█████████████████▊    

 22%|███████████████████▉                                                                      | 95102/430135 [01:51<08:11, 681.56it/s][A
 22%|███████████████████▉                                                                      | 95182/430135 [01:51<07:54, 705.60it/s][A
 22%|███████████████████▉                                                                      | 95262/430135 [01:51<09:05, 614.44it/s][A
 22%|███████████████████▉                                                                      | 95332/430135 [01:51<08:47, 634.94it/s][A
 22%|███████████████████▉                                                                      | 95410/430135 [01:51<08:18, 671.39it/s][A
 22%|███████████████████▉                                                                      | 95502/430135 [01:52<07:33, 737.22it/s][A
 22%|████████████████████                                                                      | 95597/430135 [01:52<07:00, 795.09it/s][A
 22%|████████████████████  

 25%|█████████████████████▊                                                                   | 105632/430135 [02:05<06:58, 775.40it/s][A
 25%|█████████████████████▊                                                                   | 105711/430135 [02:05<08:35, 629.36it/s][A
 25%|█████████████████████▉                                                                   | 105789/430135 [02:05<08:07, 664.83it/s][A
 25%|█████████████████████▉                                                                   | 105860/430135 [02:05<08:05, 668.20it/s][A
 25%|█████████████████████▉                                                                   | 105930/430135 [02:05<08:11, 659.26it/s][A
 25%|█████████████████████▉                                                                   | 106001/430135 [02:05<08:02, 672.28it/s][A
 25%|█████████████████████▉                                                                   | 106084/430135 [02:05<07:32, 715.96it/s][A
 25%|█████████████████████▉

 27%|███████████████████████▊                                                                 | 115300/430135 [02:18<05:34, 942.20it/s][A
 27%|███████████████████████▉                                                                 | 115396/430135 [02:19<06:19, 828.61it/s][A
 27%|███████████████████████▉                                                                 | 115484/430135 [02:19<06:14, 839.81it/s][A
 27%|███████████████████████▉                                                                 | 115571/430135 [02:19<06:30, 805.90it/s][A
 27%|███████████████████████▉                                                                 | 115670/430135 [02:19<06:07, 855.30it/s][A
 27%|███████████████████████▉                                                                 | 115759/430135 [02:19<06:03, 864.02it/s][A
 27%|███████████████████████▉                                                                 | 115849/430135 [02:19<05:59, 873.56it/s][A
 27%|██████████████████████

 29%|██████████████████████████                                                               | 125774/430135 [02:32<06:43, 755.03it/s][A
 29%|██████████████████████████                                                               | 125851/430135 [02:32<07:56, 638.02it/s][A
 29%|██████████████████████████                                                               | 125926/430135 [02:32<07:37, 664.69it/s][A
 29%|██████████████████████████                                                               | 126009/430135 [02:32<07:10, 706.01it/s][A
 29%|██████████████████████████                                                               | 126083/430135 [02:32<07:07, 710.99it/s][A
 29%|██████████████████████████                                                               | 126157/430135 [02:33<07:12, 703.55it/s][A
 29%|██████████████████████████                                                               | 126237/430135 [02:33<06:56, 730.22it/s][A
 29%|██████████████████████

 31%|███████████████████████████▊                                                             | 134709/430135 [02:46<06:19, 777.60it/s][A
 31%|███████████████████████████▉                                                             | 134788/430135 [02:46<06:18, 779.53it/s][A
 31%|███████████████████████████▉                                                             | 134867/430135 [02:46<08:06, 606.80it/s][A
 31%|███████████████████████████▉                                                             | 134934/430135 [02:46<08:10, 601.52it/s][A
 31%|███████████████████████████▉                                                             | 134999/430135 [02:46<08:11, 600.47it/s][A
 31%|███████████████████████████▉                                                             | 135063/430135 [02:46<08:06, 607.07it/s][A
 31%|███████████████████████████▉                                                             | 135135/430135 [02:46<07:43, 635.90it/s][A
 31%|██████████████████████

 34%|█████████████████████████████▉                                                           | 144753/430135 [03:00<11:40, 407.26it/s][A
 34%|█████████████████████████████▉                                                           | 144839/430135 [03:00<09:46, 486.09it/s][A
 34%|█████████████████████████████▉                                                           | 144915/430135 [03:00<08:47, 540.68it/s][A
 34%|██████████████████████████████                                                           | 145004/430135 [03:00<07:41, 618.06it/s][A
 34%|██████████████████████████████                                                           | 145084/430135 [03:00<07:22, 643.51it/s][A
 34%|██████████████████████████████                                                           | 145161/430135 [03:00<07:11, 660.64it/s][A
 34%|██████████████████████████████                                                           | 145236/430135 [03:00<08:11, 579.66it/s][A
 34%|██████████████████████

 36%|███████████████████████████████▉                                                         | 154441/430135 [03:14<06:35, 697.86it/s][A
 36%|███████████████████████████████▉                                                         | 154522/430135 [03:14<06:19, 725.97it/s][A
 36%|███████████████████████████████▉                                                         | 154624/430135 [03:14<05:42, 804.05it/s][A
 36%|████████████████████████████████                                                         | 154717/430135 [03:14<05:28, 838.36it/s][A
 36%|████████████████████████████████                                                         | 154804/430135 [03:14<05:34, 824.23it/s][A
 36%|████████████████████████████████                                                         | 154889/430135 [03:14<05:50, 785.18it/s][A
 36%|████████████████████████████████                                                         | 154969/430135 [03:14<07:17, 628.39it/s][A
 36%|██████████████████████

 38%|██████████████████████████████████                                                       | 164706/430135 [03:27<06:35, 671.97it/s][A
 38%|██████████████████████████████████                                                       | 164776/430135 [03:28<06:32, 676.48it/s][A
 38%|██████████████████████████████████                                                       | 164847/430135 [03:28<06:28, 682.79it/s][A
 38%|██████████████████████████████████                                                       | 164918/430135 [03:28<06:24, 689.51it/s][A
 38%|██████████████████████████████████▏                                                      | 164999/430135 [03:28<06:07, 721.27it/s][A
 38%|██████████████████████████████████▏                                                      | 165077/430135 [03:28<06:00, 735.72it/s][A
 38%|██████████████████████████████████▏                                                      | 165165/430135 [03:28<05:40, 777.32it/s][A
 38%|██████████████████████

 40%|████████████████████████████████████                                                     | 174142/430135 [03:41<05:26, 784.60it/s][A
 41%|████████████████████████████████████                                                     | 174222/430135 [03:41<05:25, 786.48it/s][A
 41%|████████████████████████████████████                                                     | 174311/430135 [03:41<06:00, 710.49it/s][A
 41%|████████████████████████████████████                                                     | 174385/430135 [03:42<06:31, 653.69it/s][A
 41%|████████████████████████████████████                                                     | 174453/430135 [03:42<06:33, 650.17it/s][A
 41%|████████████████████████████████████                                                     | 174588/430135 [03:42<05:06, 833.45it/s][A
 41%|████████████████████████████████████▏                                                    | 174707/430135 [03:42<04:34, 929.18it/s][A
 41%|██████████████████████

 43%|██████████████████████████████████████▏                                                  | 184701/430135 [03:55<04:10, 978.17it/s][A
 43%|██████████████████████████████████████▏                                                  | 184801/430135 [03:55<05:15, 777.79it/s][A
 43%|██████████████████████████████████████▎                                                  | 184924/430135 [03:56<04:36, 888.10it/s][A
 43%|██████████████████████████████████████▎                                                  | 185031/430135 [03:56<04:22, 932.49it/s][A
 43%|██████████████████████████████████████▎                                                  | 185131/430135 [03:56<04:42, 866.26it/s][A
 43%|██████████████████████████████████████▎                                                  | 185227/430135 [03:56<04:35, 889.43it/s][A
 43%|██████████████████████████████████████▎                                                  | 185345/430135 [03:56<04:13, 965.07it/s][A
 43%|██████████████████████

 45%|████████████████████████████████████████                                                 | 193915/430135 [04:09<05:42, 690.63it/s][A
 45%|████████████████████████████████████████▏                                                | 194003/430135 [04:09<05:17, 743.08it/s][A
 45%|████████████████████████████████████████▏                                                | 194080/430135 [04:09<05:15, 748.08it/s][A
 45%|████████████████████████████████████████▏                                                | 194159/430135 [04:09<05:11, 758.75it/s][A
 45%|████████████████████████████████████████▏                                                | 194236/430135 [04:09<05:26, 722.05it/s][A
 45%|████████████████████████████████████████▏                                                | 194310/430135 [04:09<05:45, 683.25it/s][A
 45%|████████████████████████████████████████▏                                                | 194380/430135 [04:10<05:48, 675.74it/s][A
 45%|██████████████████████

 47%|█████████████████████████████████████████▉                                               | 202699/430135 [04:23<04:59, 759.25it/s][A
 47%|█████████████████████████████████████████▉                                               | 202776/430135 [04:24<15:28, 244.87it/s][A
 47%|█████████████████████████████████████████▉                                               | 202847/430135 [04:24<12:39, 299.19it/s][A
 47%|█████████████████████████████████████████▉                                               | 202912/430135 [04:24<10:50, 349.12it/s][A
 47%|█████████████████████████████████████████▉                                               | 202985/430135 [04:24<09:10, 412.99it/s][A
 47%|██████████████████████████████████████████                                               | 203061/430135 [04:24<07:52, 480.60it/s][A
 47%|██████████████████████████████████████████                                               | 203143/430135 [04:24<06:50, 553.58it/s][A
 47%|██████████████████████

 49%|███████████████████████████████████████████▊                                             | 211857/430135 [04:37<05:58, 608.43it/s][A
 49%|███████████████████████████████████████████▊                                             | 211942/430135 [04:37<05:24, 672.07it/s][A
 49%|███████████████████████████████████████████▊                                             | 212031/430135 [04:38<04:58, 730.25it/s][A
 49%|███████████████████████████████████████████▉                                             | 212109/430135 [04:38<04:53, 743.40it/s][A
 49%|███████████████████████████████████████████▉                                             | 212189/430135 [04:38<04:46, 759.52it/s][A
 49%|███████████████████████████████████████████▉                                             | 212286/430135 [04:38<04:26, 818.71it/s][A
 49%|███████████████████████████████████████████▉                                             | 212370/430135 [04:38<04:27, 813.19it/s][A
 49%|██████████████████████

 51%|█████████████████████████████████████████████▋                                           | 220694/430135 [04:51<05:22, 650.21it/s][A
 51%|█████████████████████████████████████████████▋                                           | 220760/430135 [04:51<05:23, 646.67it/s][A
 51%|█████████████████████████████████████████████▋                                           | 220825/430135 [04:51<06:26, 541.89it/s][A
 51%|█████████████████████████████████████████████▋                                           | 220890/430135 [04:51<06:07, 569.11it/s][A
 51%|█████████████████████████████████████████████▋                                           | 220956/430135 [04:51<05:52, 593.40it/s][A
 51%|█████████████████████████████████████████████▋                                           | 221023/430135 [04:51<05:41, 613.07it/s][A
 51%|█████████████████████████████████████████████▋                                           | 221089/430135 [04:51<05:34, 624.57it/s][A
 51%|██████████████████████

 53%|███████████████████████████████████████████████▍                                         | 229131/430135 [05:04<06:08, 545.12it/s][A
 53%|███████████████████████████████████████████████▍                                         | 229249/430135 [05:04<04:42, 710.69it/s][A
 53%|███████████████████████████████████████████████▍                                         | 229374/430135 [05:05<03:53, 858.01it/s][A
 53%|███████████████████████████████████████████████▍                                         | 229464/430135 [05:05<03:59, 838.75it/s][A
 53%|███████████████████████████████████████████████▍                                         | 229551/430135 [05:05<04:08, 806.69it/s][A
 53%|███████████████████████████████████████████████▌                                         | 229645/430135 [05:05<03:58, 842.06it/s][A
 53%|███████████████████████████████████████████████▌                                         | 229731/430135 [05:05<04:03, 822.60it/s][A
 53%|██████████████████████

 56%|████████████████████████████████████████████████▉                                       | 239336/430135 [05:18<01:48, 1759.91it/s][A
 56%|█████████████████████████████████████████████████                                       | 239515/430135 [05:18<01:53, 1674.13it/s][A
 56%|█████████████████████████████████████████████████                                       | 239685/430135 [05:19<01:57, 1621.13it/s][A
 56%|█████████████████████████████████████████████████                                       | 239849/430135 [05:19<02:03, 1536.32it/s][A
 56%|█████████████████████████████████████████████████                                       | 240005/430135 [05:19<02:25, 1304.49it/s][A
 56%|█████████████████████████████████████████████████▏                                      | 240142/430135 [05:19<02:32, 1249.27it/s][A
 56%|█████████████████████████████████████████████████▏                                      | 240272/430135 [05:19<02:40, 1185.12it/s][A
 56%|██████████████████████

 58%|███████████████████████████████████████████████████▊                                     | 250680/430135 [05:32<04:24, 679.25it/s][A
 58%|███████████████████████████████████████████████████▉                                     | 250758/430135 [05:33<04:14, 703.68it/s][A
 58%|███████████████████████████████████████████████████▉                                     | 250840/430135 [05:33<04:04, 732.78it/s][A
 58%|███████████████████████████████████████████████████▉                                     | 250920/430135 [05:33<03:59, 748.27it/s][A
 58%|███████████████████████████████████████████████████▉                                     | 251006/430135 [05:33<03:49, 779.03it/s][A
 58%|███████████████████████████████████████████████████▉                                     | 251086/430135 [05:33<03:58, 749.24it/s][A
 58%|███████████████████████████████████████████████████▉                                     | 251163/430135 [05:33<05:24, 551.15it/s][A
 58%|██████████████████████

 61%|█████████████████████████████████████████████████████▊                                   | 260252/430135 [05:46<04:12, 671.61it/s][A
 61%|█████████████████████████████████████████████████████▊                                   | 260322/430135 [05:46<04:27, 635.63it/s][A
 61%|█████████████████████████████████████████████████████▉                                   | 260388/430135 [05:47<05:27, 518.55it/s][A
 61%|█████████████████████████████████████████████████████▉                                   | 260453/430135 [05:47<05:09, 547.91it/s][A
 61%|█████████████████████████████████████████████████████▉                                   | 260519/430135 [05:47<04:54, 575.58it/s][A
 61%|█████████████████████████████████████████████████████▉                                   | 260583/430135 [05:47<04:46, 591.74it/s][A
 61%|█████████████████████████████████████████████████████▉                                   | 260646/430135 [05:47<04:42, 599.86it/s][A
 61%|██████████████████████

 63%|███████████████████████████████████████████████████████▊                                 | 269801/430135 [06:01<03:55, 680.47it/s][A
 63%|███████████████████████████████████████████████████████▊                                 | 269898/430135 [06:01<03:46, 706.23it/s][A
 63%|███████████████████████████████████████████████████████▊                                 | 269989/430135 [06:01<03:37, 737.77it/s][A
 63%|███████████████████████████████████████████████████████▉                                 | 270078/430135 [06:01<03:59, 668.88it/s][A
 63%|███████████████████████████████████████████████████████▉                                 | 270157/430135 [06:01<03:52, 687.98it/s][A
 63%|███████████████████████████████████████████████████████▉                                 | 270235/430135 [06:01<03:45, 708.70it/s][A
 63%|███████████████████████████████████████████████████████▉                                 | 270318/430135 [06:02<03:36, 739.02it/s][A
 63%|██████████████████████

 65%|█████████████████████████████████████████████████████████▊                               | 279652/430135 [06:15<03:37, 690.90it/s][A
 65%|█████████████████████████████████████████████████████████▉                               | 279722/430135 [06:15<03:38, 689.76it/s][A
 65%|█████████████████████████████████████████████████████████▉                               | 279792/430135 [06:15<04:34, 548.08it/s][A
 65%|█████████████████████████████████████████████████████████▉                               | 279854/430135 [06:15<04:26, 563.65it/s][A
 65%|█████████████████████████████████████████████████████████▉                               | 279919/430135 [06:15<04:16, 585.04it/s][A
 65%|█████████████████████████████████████████████████████████▉                               | 280012/430135 [06:15<03:41, 677.79it/s][A
 65%|█████████████████████████████████████████████████████████▉                               | 280093/430135 [06:15<03:30, 711.76it/s][A
 65%|██████████████████████

 67%|███████████████████████████████████████████████████████████▋                             | 288685/430135 [06:28<02:49, 833.32it/s][A
 67%|███████████████████████████████████████████████████████████▊                             | 288773/430135 [06:28<03:16, 721.17it/s][A
 67%|███████████████████████████████████████████████████████████▊                             | 288854/430135 [06:29<03:10, 741.31it/s][A
 67%|███████████████████████████████████████████████████████████▊                             | 288944/430135 [06:29<03:00, 780.64it/s][A
 67%|███████████████████████████████████████████████████████████▊                             | 289048/430135 [06:29<02:45, 852.34it/s][A
 67%|███████████████████████████████████████████████████████████▊                             | 289150/430135 [06:29<02:36, 898.02it/s][A
 67%|███████████████████████████████████████████████████████████▊                             | 289247/430135 [06:29<02:33, 917.47it/s][A
 67%|██████████████████████

 70%|██████████████████████████████████████████████████████████████                           | 299937/430135 [06:42<02:39, 814.19it/s][A
 70%|██████████████████████████████████████████████████████████████                           | 300044/430135 [06:42<02:27, 880.55it/s][A
 70%|██████████████████████████████████████████████████████████████                           | 300141/430135 [06:42<02:23, 903.39it/s][A
 70%|██████████████████████████████████████████████████████████████                           | 300243/430135 [06:42<02:18, 934.82it/s][A
 70%|██████████████████████████████████████████████████████████████▏                          | 300339/430135 [06:42<02:22, 912.96it/s][A
 70%|██████████████████████████████████████████████████████████████▏                          | 300435/430135 [06:42<02:20, 925.57it/s][A
 70%|██████████████████████████████████████████████████████████████▏                          | 300534/430135 [06:43<02:17, 943.05it/s][A
 70%|██████████████████████

 72%|████████████████████████████████████████████████████████████████▏                        | 310127/430135 [06:56<02:59, 668.07it/s][A
 72%|████████████████████████████████████████████████████████████████▏                        | 310196/430135 [06:56<02:58, 673.13it/s][A
 72%|████████████████████████████████████████████████████████████████▏                        | 310264/430135 [06:56<03:02, 658.23it/s][A
 72%|████████████████████████████████████████████████████████████████▏                        | 310331/430135 [06:56<03:38, 548.65it/s][A
 72%|████████████████████████████████████████████████████████████████▏                        | 310394/430135 [06:56<03:31, 567.41it/s][A
 72%|████████████████████████████████████████████████████████████████▏                        | 310454/430135 [06:56<03:29, 570.81it/s][A
 72%|████████████████████████████████████████████████████████████████▎                        | 310520/430135 [06:56<03:21, 594.06it/s][A
 72%|██████████████████████

 74%|██████████████████████████████████████████████████████████████████▏                      | 319751/430135 [07:10<02:56, 624.73it/s][A
 74%|██████████████████████████████████████████████████████████████████▏                      | 319841/430135 [07:10<02:43, 674.74it/s][A
 74%|██████████████████████████████████████████████████████████████████▏                      | 319953/430135 [07:10<02:21, 778.21it/s][A
 74%|██████████████████████████████████████████████████████████████████▏                      | 320049/430135 [07:10<02:28, 741.05it/s][A
 74%|██████████████████████████████████████████████████████████████████▏                      | 320136/430135 [07:10<02:26, 752.56it/s][A
 74%|██████████████████████████████████████████████████████████████████▎                      | 320221/430135 [07:11<02:27, 743.42it/s][A
 74%|██████████████████████████████████████████████████████████████████▎                      | 320302/430135 [07:11<02:31, 723.55it/s][A
 74%|██████████████████████

 77%|███████████████████████████████████████████████████████████████████▎                    | 329205/430135 [07:24<01:32, 1094.57it/s][A
 77%|███████████████████████████████████████████████████████████████████▍                    | 329343/430135 [07:24<01:25, 1173.70it/s][A
 77%|███████████████████████████████████████████████████████████████████▍                    | 329469/430135 [07:24<01:24, 1196.80it/s][A
 77%|███████████████████████████████████████████████████████████████████▍                    | 329607/430135 [07:24<01:20, 1248.00it/s][A
 77%|███████████████████████████████████████████████████████████████████▍                    | 329734/430135 [07:24<01:27, 1148.68it/s][A
 77%|███████████████████████████████████████████████████████████████████▍                    | 329852/430135 [07:25<01:29, 1120.85it/s][A
 77%|███████████████████████████████████████████████████████████████████▌                    | 329967/430135 [07:25<01:28, 1128.35it/s][A
 77%|██████████████████████

 80%|█████████████████████████████████████████████████████████████████████▉                  | 342040/430135 [07:38<01:27, 1002.79it/s][A
 80%|██████████████████████████████████████████████████████████████████████▊                  | 342142/430135 [07:38<01:32, 951.48it/s][A
 80%|██████████████████████████████████████████████████████████████████████▊                  | 342239/430135 [07:38<01:47, 819.12it/s][A
 80%|██████████████████████████████████████████████████████████████████████▊                  | 342343/430135 [07:38<01:40, 875.47it/s][A
 80%|██████████████████████████████████████████████████████████████████████▊                  | 342441/430135 [07:38<01:37, 902.59it/s][A
 80%|██████████████████████████████████████████████████████████████████████▉                  | 342543/430135 [07:38<01:34, 931.02it/s][A
 80%|██████████████████████████████████████████████████████████████████████▉                  | 342655/430135 [07:38<01:28, 983.01it/s][A
 80%|██████████████████████

 82%|█████████████████████████████████████████████████████████████████████████▏               | 353715/430135 [07:52<01:21, 940.18it/s][A
 82%|████████████████████████████████████████████████████████████████████████▍               | 353844/430135 [07:52<01:13, 1037.85it/s][A
 82%|█████████████████████████████████████████████████████████████████████████▏               | 353953/430135 [07:52<01:22, 922.62it/s][A
 82%|█████████████████████████████████████████████████████████████████████████▎               | 354051/430135 [07:52<01:27, 872.01it/s][A
 82%|█████████████████████████████████████████████████████████████████████████▎               | 354143/430135 [07:53<01:30, 836.94it/s][A
 82%|█████████████████████████████████████████████████████████████████████████▎               | 354230/430135 [07:53<01:30, 837.98it/s][A
 82%|█████████████████████████████████████████████████████████████████████████▎               | 354336/430135 [07:53<01:24, 897.47it/s][A
 82%|██████████████████████

 85%|███████████████████████████████████████████████████████████████████████████▋             | 365775/430135 [08:06<01:21, 792.40it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▋             | 365859/430135 [08:06<01:20, 796.70it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▋             | 365954/430135 [08:06<01:17, 831.63it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▋             | 366041/430135 [08:06<01:17, 823.76it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▊             | 366126/430135 [08:07<01:18, 816.22it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▊             | 366209/430135 [08:07<01:19, 804.80it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▊             | 366291/430135 [08:07<01:20, 794.02it/s][A
 85%|██████████████████████

 87%|█████████████████████████████████████████████████████████████████████████████▌           | 375141/430135 [08:20<01:12, 753.36it/s][A
 87%|█████████████████████████████████████████████████████████████████████████████▋           | 375219/430135 [08:20<01:14, 732.31it/s][A
 87%|█████████████████████████████████████████████████████████████████████████████▋           | 375294/430135 [08:20<01:15, 730.24it/s][A
 87%|█████████████████████████████████████████████████████████████████████████████▋           | 375369/430135 [08:20<01:15, 721.50it/s][A
 87%|█████████████████████████████████████████████████████████████████████████████▋           | 375442/430135 [08:20<01:29, 611.06it/s][A
 87%|█████████████████████████████████████████████████████████████████████████████▋           | 375518/430135 [08:20<01:24, 647.71it/s][A
 87%|█████████████████████████████████████████████████████████████████████████████▋           | 375602/430135 [08:20<01:18, 698.57it/s][A
 87%|██████████████████████

 89%|███████████████████████████████████████████████████████████████████████████████▌         | 384775/430135 [08:33<01:08, 665.37it/s][A
 89%|███████████████████████████████████████████████████████████████████████████████▋         | 384853/430135 [08:34<01:05, 694.30it/s][A
 89%|███████████████████████████████████████████████████████████████████████████████▋         | 384929/430135 [08:34<01:03, 710.60it/s][A
 90%|███████████████████████████████████████████████████████████████████████████████▋         | 385011/430135 [08:34<01:01, 739.23it/s][A
 90%|███████████████████████████████████████████████████████████████████████████████▋         | 385088/430135 [08:34<01:00, 742.95it/s][A
 90%|███████████████████████████████████████████████████████████████████████████████▋         | 385164/430135 [08:34<01:10, 641.78it/s][A
 90%|███████████████████████████████████████████████████████████████████████████████▋         | 385245/430135 [08:34<01:05, 682.98it/s][A
 90%|██████████████████████

 92%|█████████████████████████████████████████████████████████████████████████████████▋       | 395044/430135 [08:48<00:47, 739.60it/s][A
 92%|█████████████████████████████████████████████████████████████████████████████████▊       | 395133/430135 [08:48<00:44, 779.46it/s][A
 92%|█████████████████████████████████████████████████████████████████████████████████▊       | 395217/430135 [08:48<00:44, 783.64it/s][A
 92%|█████████████████████████████████████████████████████████████████████████████████▊       | 395317/430135 [08:48<00:41, 842.53it/s][A
 92%|█████████████████████████████████████████████████████████████████████████████████▊       | 395406/430135 [08:48<00:40, 852.52it/s][A
 92%|█████████████████████████████████████████████████████████████████████████████████▊       | 395494/430135 [08:48<00:42, 815.04it/s][A
 92%|█████████████████████████████████████████████████████████████████████████████████▊       | 395578/430135 [08:48<00:51, 676.96it/s][A
 92%|██████████████████████

 94%|████████████████████████████████████████████████████████████████████████████████████     | 406003/430135 [09:01<00:29, 816.97it/s][A
 94%|████████████████████████████████████████████████████████████████████████████████████     | 406137/430135 [09:02<00:25, 946.58it/s][A
 94%|███████████████████████████████████████████████████████████████████████████████████     | 406258/430135 [09:02<00:23, 1014.38it/s][A
 94%|████████████████████████████████████████████████████████████████████████████████████     | 406365/430135 [09:02<00:24, 961.95it/s][A
 94%|████████████████████████████████████████████████████████████████████████████████████     | 406468/430135 [09:02<00:24, 979.83it/s][A
 95%|████████████████████████████████████████████████████████████████████████████████████     | 406570/430135 [09:02<00:26, 892.55it/s][A
 95%|████████████████████████████████████████████████████████████████████████████████████▏    | 406663/430135 [09:02<00:30, 763.82it/s][A
 95%|██████████████████████

 97%|██████████████████████████████████████████████████████████████████████████████████████   | 416147/430135 [09:15<00:19, 711.49it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████   | 416220/430135 [09:15<00:19, 708.98it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████▏  | 416304/430135 [09:15<00:18, 745.56it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████▏  | 416380/430135 [09:15<00:23, 593.30it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████▏  | 416456/430135 [09:16<00:21, 632.91it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████▏  | 416533/430135 [09:16<00:20, 668.24it/s][A
 97%|██████████████████████████████████████████████████████████████████████████████████████▏  | 416610/430135 [09:16<00:19, 693.73it/s][A
 97%|██████████████████████

 99%|████████████████████████████████████████████████████████████████████████████████████████▏| 426033/430135 [09:29<00:05, 691.78it/s][A
 99%|████████████████████████████████████████████████████████████████████████████████████████▏| 426118/430135 [09:29<00:05, 731.92it/s][A
 99%|████████████████████████████████████████████████████████████████████████████████████████▏| 426196/430135 [09:29<00:05, 743.24it/s][A
 99%|████████████████████████████████████████████████████████████████████████████████████████▏| 426273/430135 [09:29<00:05, 745.92it/s][A
 99%|████████████████████████████████████████████████████████████████████████████████████████▏| 426355/430135 [09:29<00:04, 765.36it/s][A
 99%|████████████████████████████████████████████████████████████████████████████████████████▏| 426434/430135 [09:29<00:04, 769.63it/s][A
 99%|████████████████████████████████████████████████████████████████████████████████████████▎| 426519/430135 [09:29<00:04, 791.72it/s][A
 99%|██████████████████████

In [32]:
cui_columns = umls2020AB_df.filter(regex='.*2000.*_cuis').columns

In [33]:
cui_columns

Index(['sapbert_2000-NN_cuis'], dtype='object')

In [34]:
query_synonym_cuis = list(umls2020AB_df['2020AA_synonyms_cuis'])

for cui_col in cui_columns:
    print(cui_col)
    cui_name = cui_col.split('_cuis')[0]
    nearest_neighbors_cuis = umls2020AB_df[cui_col]

    #Calculating Recall @ 1,5,10,50,100
    recall_array = []

    for true_syn, top100 in tqdm(zip(query_synonym_cuis, nearest_neighbors_cuis), total=len(query_synonym_cuis)):

        true_syn = set(true_syn)

        if len(true_syn) > 0:
            recalls = []

            for n in [1,5,10,50,100,200,500,1000,2000]:

                topn = set(top100[:n])
                true_pos = topn.intersection(true_syn)

                recalls.append(len(true_pos)/len(true_syn))

            recall_array.append(recalls)
        else:
            recalls = []

            recall_array.append(recalls)

    umls2020AB_df['{}_cui_recall'.format(cui_name)] = recall_array

sapbert_2000-NN_cuis



  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  0%|▏                                                                                          | 639/430135 [00:00<01:07, 6388.19it/s][A
  0%|▎                                                                                         | 1278/430135 [00:00<01:08, 6293.71it/s][A
  1%|▍                                                                                         | 2154/430135 [00:00<00:57, 7408.58it/s][A
  1%|▌                                                                                         | 2922/430135 [00:00<00:56, 7514.12it/s][A
  1%|▊                                                                                         | 3675/430135 [00:00<00:59, 7216.65it/s][A
  1%|▉                                                                                         | 4439/430135 [00:00<00:57, 7349.18it/s][A
  1%|█                    

 29%|█████████████████████████▊                                                              | 126375/430135 [00:17<00:54, 5525.94it/s][A
 30%|██████████████████████████                                                              | 127463/430135 [00:17<00:45, 6713.91it/s][A
 30%|██████████████████████████▎                                                             | 128443/430135 [00:17<00:40, 7457.32it/s][A
 30%|██████████████████████████▍                                                             | 129254/430135 [00:17<00:40, 7428.19it/s][A
 30%|██████████████████████████▋                                                             | 130381/430135 [00:17<00:35, 8431.07it/s][A
 31%|██████████████████████████▉                                                             | 131510/430135 [00:17<00:32, 9208.54it/s][A
 31%|███████████████████████████                                                             | 132471/430135 [00:17<00:36, 8087.22it/s][A
 31%|██████████████████████

 71%|██████████████████████████████████████████████████████████████▌                         | 305908/430135 [00:37<00:23, 5344.84it/s][A
 71%|██████████████████████████████████████████████████████████████▋                         | 306490/430135 [00:37<00:22, 5472.91it/s][A
 72%|██████████████████████████████████████████████████████████████▉                         | 307840/430135 [00:37<00:15, 7759.25it/s][A
 73%|███████████████████████████████████████████████████████████████▌                       | 314283/430135 [00:38<00:04, 24152.20it/s][A
 74%|████████████████████████████████████████████████████████████████▏                      | 317524/430135 [00:38<00:04, 26537.67it/s][A
 74%|████████████████████████████████████████████████████████████████▊                      | 320234/430135 [00:38<00:06, 16113.63it/s][A
 75%|█████████████████████████████████████████████████████████████████▏                     | 322383/430135 [00:38<00:10, 10698.70it/s][A
 75%|██████████████████████

 93%|█████████████████████████████████████████████████████████████████████████████████▊      | 400136/430135 [00:53<00:08, 3577.76it/s][A
 93%|█████████████████████████████████████████████████████████████████████████████████▉      | 400499/430135 [00:53<00:08, 3441.86it/s][A
 93%|██████████████████████████████████████████████████████████████████████████████████      | 400847/430135 [00:53<00:08, 3365.03it/s][A
 93%|██████████████████████████████████████████████████████████████████████████████████      | 401187/430135 [00:53<00:08, 3239.30it/s][A
 93%|██████████████████████████████████████████████████████████████████████████████████▏     | 401514/430135 [00:53<00:08, 3188.28it/s][A
 93%|██████████████████████████████████████████████████████████████████████████████████▏     | 401835/430135 [00:53<00:08, 3172.28it/s][A
 93%|██████████████████████████████████████████████████████████████████████████████████▎     | 402154/430135 [00:53<00:08, 3111.46it/s][A
 94%|██████████████████████

In [35]:
query_synonym_cuis = list(umls2020AB_df['2020AA_synonyms_cuis'])
source_syns = umls2020AB_df['source_syns_cuis']

for cui_col in cui_columns:
    print(cui_col)
    cui_name = cui_col.split('_cuis')[0]
    nearest_neighbors_cuis = umls2020AB_df[cui_col]
    
    #Calculating Recall @ 1,5,10,50,100
    recall_array = []

    for true_syn, top100, source in tqdm(zip(query_synonym_cuis, nearest_neighbors_cuis, source_syns), total=len(query_synonym_cuis)):

        true_syn = set(true_syn)
        source = copy.deepcopy(list(set(source)))

        if len(true_syn) > 0:
            recalls = []

            if source is not None:
                source_syn_num = len(source)        
                source.extend(top100)
            else:
                source = top100
                source_syn_num = 0

            for n in [0,1,5,10,50,100,200,500,1000,2000]:
                topn = set(source[:n+source_syn_num])
                true_pos = topn.intersection(true_syn)

                recalls.append(len(true_pos)/len(true_syn))

            recall_array.append(recalls)
        else:
            recalls = []

            recall_array.append(recalls)

    umls2020AB_df['{}_source_syn_cui_recall'.format(cui_name)] = recall_array

sapbert_2000-NN_cuis



  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  0%|                                                                                           | 577/430135 [00:00<01:14, 5766.22it/s][A
  0%|▏                                                                                         | 1154/430135 [00:00<01:16, 5571.64it/s][A
  0%|▍                                                                                         | 1841/430135 [00:00<01:09, 6149.71it/s][A
  1%|▌                                                                                         | 2488/430135 [00:00<01:08, 6269.22it/s][A
  1%|▋                                                                                         | 3140/430135 [00:00<01:07, 6355.85it/s][A
  1%|▊                                                                                         | 3777/430135 [00:00<01:08, 6254.41it/s][A
  1%|▉                    

 28%|████████████████████████                                                               | 119081/430135 [00:16<00:25, 12392.07it/s][A
 28%|████████████████████████▎                                                              | 120353/430135 [00:17<00:29, 10476.09it/s][A
 28%|████████████████████████▊                                                               | 121473/430135 [00:17<00:42, 7283.16it/s][A
 28%|█████████████████████████                                                               | 122377/430135 [00:17<00:52, 5885.48it/s][A
 29%|█████████████████████████▏                                                              | 123119/430135 [00:17<00:54, 5601.12it/s][A
 29%|█████████████████████████▎                                                              | 123782/430135 [00:17<00:53, 5732.84it/s][A
 29%|█████████████████████████▍                                                              | 124434/430135 [00:17<00:52, 5823.34it/s][A
 29%|██████████████████████

 65%|█████████████████████████████████████████████████████████▏                              | 279822/430135 [00:37<00:16, 9049.44it/s][A
 66%|█████████████████████████████████████████████████████████                              | 282099/430135 [00:37<00:12, 11819.69it/s][A
 66%|█████████████████████████████████████████████████████████▉                              | 283491/430135 [00:37<00:15, 9316.93it/s][A
 66%|██████████████████████████████████████████████████████████▏                             | 284638/430135 [00:38<00:18, 7771.77it/s][A
 66%|██████████████████████████████████████████████████████████▍                             | 285591/430135 [00:38<00:20, 7186.86it/s][A
 67%|██████████████████████████████████████████████████████████▌                             | 286427/430135 [00:38<00:21, 6808.19it/s][A
 67%|██████████████████████████████████████████████████████████▊                             | 287183/430135 [00:38<00:22, 6266.25it/s][A
 67%|██████████████████████

 87%|████████████████████████████████████████████████████████████████████████████▌           | 374003/430135 [00:52<00:09, 5705.44it/s][A
 87%|████████████████████████████████████████████████████████████████████████████▋           | 374596/430135 [00:53<00:09, 5767.51it/s][A
 87%|████████████████████████████████████████████████████████████████████████████▊           | 375177/430135 [00:53<00:09, 5691.98it/s][A
 87%|████████████████████████████████████████████████████████████████████████████▉           | 375808/430135 [00:53<00:09, 5872.59it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████           | 376540/430135 [00:53<00:08, 6294.40it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████▏          | 377180/430135 [00:53<00:08, 6325.20it/s][A
 88%|█████████████████████████████████████████████████████████████████████████████▎          | 377815/430135 [00:53<00:09, 5712.18it/s][A
 88%|██████████████████████

In [36]:
umls2020AB_df['number_source_syn_cuis'] = [len(set(c)) for c in umls2020AB_df.source_syns_cuis] 
umls2020AB_df['number_source_syn_auis'] = [len(set(c)) for c in umls2020AB_df.source_syns] 
umls2020AB_df['number_source_syn_plus_auis'] = [len(set(c)) for c in umls2020AB_df.source_syns_plus] 

In [37]:
umls2020AB_df.describe()

Unnamed: 0,0,num_syms,number_source_syn_cuis,number_source_syn_auis,number_source_syn_plus_auis
count,430135.0,430135.0,430135.0,430135.0,430135.0
mean,11.46,7.5,0.89,2.9,5.23
std,9.86,35.94,0.34,3.84,10.92
min,3.0,0.0,0.0,0.0,0.0
25%,7.0,0.0,1.0,1.0,1.0
50%,10.0,0.0,1.0,1.0,2.0
75%,13.0,4.0,1.0,4.0,6.0
max,1104.0,550.0,7.0,65.0,2714.0


In [38]:
recall_df = []
names = []

for recall_col in umls2020AB_df.filter(regex='.*recall.*').columns:
    names.append(recall_col)
    recall_array = list(umls2020AB_df[recall_col].values)
    recall_df.append(pd.DataFrame(recall_array).agg(['mean']))

recall_df = pd.concat(recall_df)
recall_df['model'] = names

recall_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,model
mean,0.2,0.44,0.53,0.71,0.76,0.81,0.86,0.88,0.89,,sapbert_2000-NN_recall
mean,0.35,0.46,0.63,0.7,0.83,0.86,0.9,0.93,0.94,0.95,sapbert_2000-NN_source_syn_recall
mean,0.83,0.85,0.89,0.9,0.94,0.95,0.97,0.99,0.99,0.99,sapbert_2000-NN_source_syn_plus_recall
mean,0.72,0.83,0.86,0.91,0.92,0.93,0.94,0.95,0.95,,sapbert_2000-NN_cui_recall
mean,0.83,0.96,0.98,0.99,0.99,0.99,1.0,1.0,1.0,1.0,sapbert_2000-NN_source_syn_cui_recall


In [39]:
cui_columns

Index(['sapbert_2000-NN_cuis'], dtype='object')

In [40]:
umls2020AB_df['2020AA_synonyms_cuis']

8768549                                                   []
8769337                                                   []
8949912                                                   []
8763774                                                   []
8764953                                                   []
                                 ...                        
8529453    [C0042679, C0042679, C0042679, C0042679, C0042...
8529270    [C0041984, C0041984, C0041984, C0041984, C0041...
8529351    [C0042232, C0042232, C0042232, C0042232, C0042...
8601787    [C1425082, C1425082, C1425082, C1425082, C1425...
8940025                                                   []
Name: 2020AA_synonyms_cuis, Length: 430135, dtype: object

In [41]:
umls2020AB_df.source_syns_cuis

8768549                      []
8769337                      []
8949912                      []
8763774                      []
8764953                      []
                   ...         
8529453    [C0042679, C0042679]
8529270              [C0041984]
8529351    [C0042232, C0042232]
8601787                      []
8940025                      []
Name: source_syns_cuis, Length: 430135, dtype: object

In [42]:
umls2020AB_df['sapbert_2000-NN_cuis']

8768549    [C4058663, C5140346, C3818349, C3818349, C3818...
8769337    [C5207314, C5135790, C5136582, C4689308, C5136...
8949912    [C4046595, C4530008, C4291206, C4551034, C5228...
8763774    [C4519986, C3555441, C4056238, C4689534, C4056...
8764953    [C4735039, C4291270, C4519638, C3665116, C4522...
                                 ...                        
8529453    [C0042679, C0042679, C0042679, C0042679, C0042...
8529270    [C0041984, C0041984, C0041984, C0041984, C0041...
8529351    [C0042232, C1519910, C0042232, C0042232, C0042...
8601787    [C2827459, C3810343, C3538738, C4014722, C1425...
8940025    [C1555605, C2677109, C0018553, C1705310, C4684...
Name: sapbert_2000-NN_cuis, Length: 430135, dtype: object

In [43]:
#Getting Errors and Fine Tuning Dataset
all_candidates = []
query_synonym_cuis = list(umls2020AB_df['2020AA_synonyms_cuis'])
source_syns = umls2020AB_df['source_syns_cuis']

for cui_col in cui_columns:
    print(cui_col)
    cui_name = cui_col.split('_cuis')[0]
    nearest_neighbors_cuis = umls2020AB_df[cui_col]
    
    pair_dataset = []
    pair_labels = []
    errors = []

    for true_syn, topn, source in tqdm(zip(query_synonym_cuis, nearest_neighbors_cuis, source_syns), total=len(query_synonym_cuis)):

        if source is not None:
            source_syn_num = len(source)        
        else:
            source_syn_num = 0

        true_syn = set(true_syn)
        source_plus_top = copy.deepcopy(list(set(source)))
        source_plus_top.extend(topn)        
        
        all_candidates.append((true_syn, source_plus_top, [c in true_syn for c in source_plus_top]))
        
        missing_syns = []
        for n in [0,1,5,10,50,100,200,500,1000,2000]:
            topn = set(source_plus_top[:n+source_syn_num])
            errorsn = true_syn.difference(topn)
            
            missing_syns.append(errorsn)
            
        errors.append(missing_syns)

sapbert_2000-NN_cuis



  0%|                                                                                                       | 0/430135 [00:00<?, ?it/s][A
  0%|                                                                                           | 141/430135 [00:00<05:06, 1403.33it/s][A
  0%|                                                                                           | 329/430135 [00:00<04:15, 1681.66it/s][A
  0%|                                                                                           | 498/430135 [00:00<05:11, 1381.27it/s][A
  0%|▏                                                                                          | 683/430135 [00:00<04:38, 1542.57it/s][A
  0%|▏                                                                                          | 871/430135 [00:00<04:19, 1653.13it/s][A
  0%|▏                                                                                         | 1041/430135 [00:00<04:37, 1548.40it/s][A
  0%|▎                    

  4%|███▊                                                                                     | 18190/430135 [00:13<04:41, 1462.60it/s][A
  4%|███▊                                                                                     | 18341/430135 [00:13<05:10, 1324.68it/s][A
  4%|███▊                                                                                     | 18487/430135 [00:13<05:03, 1358.50it/s][A
  4%|███▊                                                                                     | 18658/430135 [00:13<04:43, 1451.49it/s][A
  4%|███▉                                                                                     | 18820/430135 [00:13<04:34, 1497.78it/s][A
  4%|███▉                                                                                     | 18973/430135 [00:13<04:53, 1400.68it/s][A
  4%|███▉                                                                                     | 19139/430135 [00:13<04:39, 1469.49it/s][A
  4%|███▉                  

  9%|███████▋                                                                                 | 37275/430135 [00:26<04:06, 1591.12it/s][A
  9%|███████▋                                                                                 | 37436/430135 [00:26<04:36, 1418.21it/s][A
  9%|███████▊                                                                                 | 37590/430135 [00:26<04:30, 1449.52it/s][A
  9%|███████▊                                                                                 | 37739/430135 [00:26<04:31, 1444.63it/s][A
  9%|███████▊                                                                                 | 37886/430135 [00:26<05:03, 1294.41it/s][A
  9%|███████▊                                                                                 | 38039/430135 [00:26<04:49, 1355.85it/s][A
  9%|███████▉                                                                                 | 38198/430135 [00:26<04:36, 1419.17it/s][A
  9%|███████▉              

 13%|███████████▎                                                                            | 55588/430135 [02:50<13:02:58,  7.97it/s][A
 13%|███████████▌                                                                             | 55749/430135 [02:50<9:02:57, 11.49it/s][A
 13%|███████████▌                                                                             | 55907/430135 [02:51<6:20:55, 16.37it/s][A
 13%|███████████▌                                                                             | 56114/430135 [02:51<4:04:29, 25.50it/s][A
 13%|███████████▋                                                                             | 56325/430135 [02:51<2:41:13, 38.64it/s][A
 13%|███████████▋                                                                             | 56501/430135 [02:51<1:55:22, 53.97it/s][A
 13%|███████████▋                                                                             | 56697/430135 [02:51<1:19:51, 77.94it/s][A
 13%|███████████▉          

 17%|███████████████▎                                                                         | 74307/430135 [03:04<04:45, 1246.08it/s][A
 17%|███████████████▍                                                                         | 74463/430135 [03:04<04:27, 1329.00it/s][A
 17%|███████████████▍                                                                         | 74623/430135 [03:04<04:13, 1401.19it/s][A
 17%|███████████████▍                                                                         | 74776/430135 [03:04<04:07, 1436.21it/s][A
 17%|███████████████▌                                                                         | 74923/430135 [03:04<04:34, 1295.21it/s][A
 17%|███████████████▌                                                                         | 75068/430135 [03:04<04:25, 1336.54it/s][A
 17%|███████████████▌                                                                         | 75213/430135 [03:04<04:19, 1365.92it/s][A
 18%|███████████████▌      

 22%|███████████████████▏                                                                     | 92529/430135 [03:17<03:50, 1462.40it/s][A
 22%|███████████████████▏                                                                     | 92698/430135 [03:17<03:41, 1524.27it/s][A
 22%|███████████████████▏                                                                     | 92854/430135 [03:17<04:14, 1327.57it/s][A
 22%|███████████████████▏                                                                     | 93007/430135 [03:17<04:04, 1380.04it/s][A
 22%|███████████████████▎                                                                     | 93168/430135 [03:17<03:53, 1441.40it/s][A
 22%|███████████████████▎                                                                     | 93317/430135 [03:17<04:27, 1259.30it/s][A
 22%|███████████████████▎                                                                     | 93475/430135 [03:17<04:11, 1339.90it/s][A
 22%|███████████████████▎  

 25%|██████████████████████▎                                                                 | 109053/430135 [03:30<04:49, 1109.65it/s][A
 25%|██████████████████████▎                                                                 | 109188/430135 [03:30<04:33, 1173.50it/s][A
 25%|██████████████████████▎                                                                 | 109320/430135 [03:30<04:24, 1213.44it/s][A
 25%|██████████████████████▍                                                                 | 109467/430135 [03:30<04:09, 1285.38it/s][A
 25%|██████████████████████▍                                                                 | 109598/430135 [03:30<04:39, 1148.19it/s][A
 26%|██████████████████████▍                                                                 | 109737/430135 [03:31<04:24, 1211.05it/s][A
 26%|██████████████████████▍                                                                 | 109886/430135 [03:31<04:08, 1286.95it/s][A
 26%|██████████████████████

 29%|█████████████████████████▊                                                              | 125897/430135 [03:43<04:18, 1177.65it/s][A
 29%|█████████████████████████▊                                                              | 126042/430135 [03:43<04:04, 1246.22it/s][A
 29%|█████████████████████████▊                                                              | 126180/430135 [03:43<03:57, 1282.18it/s][A
 29%|█████████████████████████▊                                                              | 126315/430135 [03:44<04:29, 1128.98it/s][A
 29%|█████████████████████████▊                                                              | 126455/430135 [03:44<04:13, 1198.13it/s][A
 29%|█████████████████████████▉                                                              | 126601/430135 [03:44<03:59, 1267.23it/s][A
 29%|█████████████████████████▉                                                              | 126739/430135 [03:44<03:53, 1298.02it/s][A
 29%|██████████████████████

 33%|█████████████████████████████▏                                                          | 142854/430135 [03:56<03:34, 1342.15it/s][A
 33%|█████████████████████████████▎                                                          | 142998/430135 [03:56<03:29, 1369.39it/s][A
 33%|█████████████████████████████▎                                                          | 143138/430135 [03:57<03:57, 1206.08it/s][A
 33%|█████████████████████████████▎                                                          | 143294/430135 [03:57<03:40, 1298.99it/s][A
 33%|█████████████████████████████▎                                                          | 143452/430135 [03:57<03:28, 1375.05it/s][A
 33%|█████████████████████████████▍                                                          | 143594/430135 [03:57<03:49, 1249.35it/s][A
 33%|█████████████████████████████▍                                                          | 143749/430135 [03:57<03:35, 1328.00it/s][A
 33%|██████████████████████

 37%|████████████████████████████████▌                                                       | 159180/430135 [04:10<03:28, 1301.11it/s][A
 37%|████████████████████████████████▌                                                       | 159315/430135 [04:10<03:47, 1189.35it/s][A
 37%|████████████████████████████████▌                                                       | 159444/430135 [04:10<03:42, 1214.10it/s][A
 37%|████████████████████████████████▋                                                       | 159583/430135 [04:10<03:34, 1261.20it/s][A
 37%|████████████████████████████████▋                                                       | 159723/430135 [04:10<03:28, 1297.66it/s][A
 37%|████████████████████████████████▋                                                       | 159856/430135 [04:10<03:54, 1151.78it/s][A
 37%|████████████████████████████████▋                                                       | 159999/430135 [04:10<03:40, 1223.13it/s][A
 37%|██████████████████████

 41%|███████████████████████████████████▉                                                    | 175475/430135 [04:23<03:52, 1093.73it/s][A
 41%|███████████████████████████████████▉                                                    | 175632/430135 [04:23<03:30, 1208.83it/s][A
 41%|███████████████████████████████████▉                                                    | 175771/430135 [04:23<03:22, 1253.97it/s][A
 41%|███████████████████████████████████▉                                                    | 175921/430135 [04:23<03:12, 1319.07it/s][A
 41%|████████████████████████████████████                                                    | 176060/430135 [04:23<03:32, 1196.47it/s][A
 41%|████████████████████████████████████                                                    | 176192/430135 [04:23<03:26, 1227.77it/s][A
 41%|████████████████████████████████████                                                    | 176366/430135 [04:24<03:05, 1366.71it/s][A
 41%|██████████████████████

 45%|███████████████████████████████████████▎                                                | 192020/430135 [04:36<03:14, 1224.84it/s][A
 45%|███████████████████████████████████████▎                                                | 192159/430135 [04:36<03:07, 1268.91it/s][A
 45%|███████████████████████████████████████▎                                                | 192289/430135 [04:36<03:31, 1122.50it/s][A
 45%|███████████████████████████████████████▎                                                | 192407/430135 [04:36<03:30, 1130.35it/s][A
 45%|███████████████████████████████████████▍                                                | 192539/430135 [04:37<03:21, 1180.07it/s][A
 45%|███████████████████████████████████████▍                                                | 192667/430135 [04:37<03:17, 1201.54it/s][A
 45%|███████████████████████████████████████▍                                                | 192790/430135 [04:37<03:54, 1012.26it/s][A
 45%|██████████████████████

 48%|██████████████████████████████████████████▍                                             | 207339/430135 [04:50<03:20, 1111.51it/s][A
 48%|██████████████████████████████████████████▍                                             | 207481/430135 [04:50<03:07, 1186.20it/s][A
 48%|██████████████████████████████████████████▍                                             | 207620/430135 [04:50<03:00, 1230.79it/s][A
 48%|██████████████████████████████████████████▌                                             | 207769/430135 [04:50<02:50, 1302.24it/s][A
 48%|██████████████████████████████████████████▌                                             | 207903/430135 [04:50<03:15, 1137.17it/s][A
 48%|██████████████████████████████████████████▌                                             | 208041/430135 [04:50<03:05, 1198.77it/s][A
 48%|██████████████████████████████████████████▌                                             | 208180/430135 [04:50<02:57, 1250.37it/s][A
 48%|██████████████████████

 52%|██████████████████████████████████████████████▏                                          | 223069/430135 [05:03<04:06, 839.12it/s][A
 52%|██████████████████████████████████████████████▏                                          | 223217/430135 [05:03<03:29, 985.70it/s][A
 52%|█████████████████████████████████████████████▋                                          | 223362/430135 [05:03<03:08, 1097.80it/s][A
 52%|█████████████████████████████████████████████▋                                          | 223501/430135 [05:03<03:19, 1033.21it/s][A
 52%|█████████████████████████████████████████████▊                                          | 223652/430135 [05:03<02:59, 1150.01it/s][A
 52%|█████████████████████████████████████████████▊                                          | 223802/430135 [05:03<02:46, 1241.05it/s][A
 52%|█████████████████████████████████████████████▊                                          | 223942/430135 [05:04<02:40, 1283.96it/s][A
 52%|██████████████████████

 55%|████████████████████████████████████████████████▊                                       | 238359/430135 [05:16<02:20, 1360.83it/s][A
 55%|████████████████████████████████████████████████▊                                       | 238523/430135 [05:16<02:13, 1439.80it/s][A
 55%|████████████████████████████████████████████████▊                                       | 238670/430135 [05:16<02:21, 1349.69it/s][A
 56%|████████████████████████████████████████████████▊                                       | 238828/430135 [05:16<02:15, 1412.64it/s][A
 56%|████████████████████████████████████████████████▉                                       | 238993/430135 [05:17<02:09, 1477.93it/s][A
 56%|████████████████████████████████████████████████▉                                       | 239149/430135 [05:17<02:16, 1397.01it/s][A
 56%|████████████████████████████████████████████████▉                                       | 239312/430135 [05:17<02:10, 1461.04it/s][A
 56%|██████████████████████

 59%|████████████████████████████████████████████████████▏                                   | 254943/430135 [05:29<02:13, 1317.09it/s][A
 59%|████████████████████████████████████████████████████▏                                   | 255079/430135 [05:29<02:13, 1312.60it/s][A
 59%|████████████████████████████████████████████████████▏                                   | 255231/430135 [05:30<02:07, 1369.18it/s][A
 59%|████████████████████████████████████████████████████▏                                   | 255371/430135 [05:30<02:26, 1190.41it/s][A
 59%|████████████████████████████████████████████████████▎                                   | 255509/430135 [05:30<02:21, 1237.93it/s][A
 59%|████████████████████████████████████████████████████▎                                   | 255647/430135 [05:30<02:16, 1274.83it/s][A
 59%|████████████████████████████████████████████████████▎                                   | 255790/430135 [05:30<02:12, 1316.37it/s][A
 59%|██████████████████████

 63%|████████████████████████████████████████████████████████                                 | 270791/430135 [05:43<03:32, 751.55it/s][A
 63%|████████████████████████████████████████████████████████                                 | 270920/430135 [05:43<03:05, 860.13it/s][A
 63%|████████████████████████████████████████████████████████                                 | 271026/430135 [05:43<03:09, 840.15it/s][A
 63%|████████████████████████████████████████████████████████                                 | 271150/430135 [05:43<02:51, 928.16it/s][A
 63%|███████████████████████████████████████████████████████▌                                | 271287/430135 [05:43<02:33, 1036.26it/s][A
 63%|███████████████████████████████████████████████████████▌                                | 271429/430135 [05:43<02:19, 1134.31it/s][A
 63%|███████████████████████████████████████████████████████▌                                | 271569/430135 [05:43<02:31, 1047.77it/s][A
 63%|██████████████████████

 66%|██████████████████████████████████████████████████████████▍                             | 285758/430135 [05:56<01:52, 1280.37it/s][A
 66%|██████████████████████████████████████████████████████████▍                             | 285901/430135 [05:56<01:49, 1321.80it/s][A
 67%|██████████████████████████████████████████████████████████▌                             | 286044/430135 [05:56<01:46, 1350.63it/s][A
 67%|██████████████████████████████████████████████████████████▌                             | 286181/430135 [05:56<02:00, 1195.75it/s][A
 67%|██████████████████████████████████████████████████████████▌                             | 286324/430135 [05:56<01:54, 1257.21it/s][A
 67%|██████████████████████████████████████████████████████████▌                             | 286466/430135 [05:56<01:50, 1300.71it/s][A
 67%|██████████████████████████████████████████████████████████▋                             | 286600/430135 [05:56<01:49, 1306.46it/s][A
 67%|██████████████████████

 70%|█████████████████████████████████████████████████████████████▉                          | 302545/430135 [06:09<01:43, 1233.71it/s][A
 70%|█████████████████████████████████████████████████████████████▉                          | 302695/430135 [06:09<01:37, 1304.72it/s][A
 70%|█████████████████████████████████████████████████████████████▉                          | 302843/430135 [06:09<01:34, 1352.43it/s][A
 70%|█████████████████████████████████████████████████████████████▉                          | 302982/430135 [06:09<01:46, 1195.33it/s][A
 70%|██████████████████████████████████████████████████████████████                          | 303117/430135 [06:09<01:42, 1234.55it/s][A
 71%|██████████████████████████████████████████████████████████████                          | 303259/430135 [06:10<01:38, 1283.98it/s][A
 71%|██████████████████████████████████████████████████████████████                          | 303402/430135 [06:10<01:35, 1322.03it/s][A
 71%|██████████████████████

 74%|█████████████████████████████████████████████████████████████████                       | 318126/430135 [06:22<01:27, 1275.94it/s][A
 74%|█████████████████████████████████████████████████████████████████                       | 318261/430135 [06:22<01:26, 1295.18it/s][A
 74%|█████████████████████████████████████████████████████████████████▏                      | 318393/430135 [06:22<01:38, 1139.64it/s][A
 74%|█████████████████████████████████████████████████████████████████▉                       | 318512/430135 [06:23<02:15, 825.49it/s][A
 74%|█████████████████████████████████████████████████████████████████▉                       | 318610/430135 [06:23<02:20, 791.85it/s][A
 74%|█████████████████████████████████████████████████████████████████▉                       | 318733/430135 [06:23<02:05, 886.76it/s][A
 74%|█████████████████████████████████████████████████████████████████▏                      | 318878/430135 [06:23<01:49, 1019.78it/s][A
 74%|██████████████████████

 78%|████████████████████████████████████████████████████████████████████▍                   | 334516/430135 [06:35<01:09, 1366.20it/s][A
 78%|████████████████████████████████████████████████████████████████████▍                   | 334657/430135 [06:35<01:09, 1377.18it/s][A
 78%|████████████████████████████████████████████████████████████████████▍                   | 334797/430135 [06:36<01:19, 1194.68it/s][A
 78%|████████████████████████████████████████████████████████████████████▌                   | 334947/430135 [06:36<01:14, 1273.62it/s][A
 78%|████████████████████████████████████████████████████████████████████▌                   | 335095/430135 [06:36<01:11, 1329.09it/s][A
 78%|████████████████████████████████████████████████████████████████████▌                   | 335244/430135 [06:36<01:09, 1367.27it/s][A
 78%|████████████████████████████████████████████████████████████████████▌                   | 335384/430135 [06:36<01:17, 1217.47it/s][A
 78%|██████████████████████

 82%|███████████████████████████████████████████████████████████████████████▉                | 351321/430135 [06:49<00:59, 1316.59it/s][A
 82%|███████████████████████████████████████████████████████████████████████▉                | 351456/430135 [06:49<00:59, 1314.65it/s][A
 82%|███████████████████████████████████████████████████████████████████████▉                | 351590/430135 [06:49<01:08, 1149.75it/s][A
 82%|███████████████████████████████████████████████████████████████████████▉                | 351712/430135 [06:49<01:07, 1167.46it/s][A
 82%|███████████████████████████████████████████████████████████████████████▉                | 351850/430135 [06:49<01:03, 1223.96it/s][A
 82%|████████████████████████████████████████████████████████████████████████                | 351985/430135 [06:49<01:02, 1258.57it/s][A
 82%|████████████████████████████████████████████████████████████████████████                | 352114/430135 [06:49<01:11, 1090.01it/s][A
 82%|██████████████████████

 85%|███████████████████████████████████████████████████████████████████████████             | 367197/430135 [07:02<00:55, 1124.89it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▏            | 367336/430135 [07:02<00:53, 1182.32it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▏            | 367470/430135 [07:02<00:51, 1223.02it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▏            | 367606/430135 [07:02<00:49, 1259.53it/s][A
 85%|███████████████████████████████████████████████████████████████████████████▏            | 367735/430135 [07:02<00:56, 1111.88it/s][A
 86%|███████████████████████████████████████████████████████████████████████████▎            | 367873/430135 [07:02<00:52, 1179.71it/s][A
 86%|███████████████████████████████████████████████████████████████████████████▎            | 368003/430135 [07:02<00:52, 1175.90it/s][A
 86%|██████████████████████

 89%|██████████████████████████████████████████████████████████████████████████████▌         | 383702/430135 [07:15<00:35, 1323.58it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▌         | 383844/430135 [07:15<00:34, 1350.31it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▌         | 383983/430135 [07:15<00:39, 1177.28it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▌         | 384129/430135 [07:15<00:36, 1249.79it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▌         | 384281/430135 [07:16<00:34, 1322.35it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▋         | 384429/430135 [07:16<00:33, 1365.53it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▋         | 384569/430135 [07:16<00:37, 1219.53it/s][A
 89%|██████████████████████

 93%|█████████████████████████████████████████████████████████████████████████████████▊      | 399998/430135 [07:28<00:23, 1281.85it/s][A
 93%|█████████████████████████████████████████████████████████████████████████████████▊      | 400130/430135 [07:29<00:26, 1122.31it/s][A
 93%|█████████████████████████████████████████████████████████████████████████████████▉      | 400278/430135 [07:29<00:24, 1215.37it/s][A
 93%|█████████████████████████████████████████████████████████████████████████████████▉      | 400435/430135 [07:29<00:22, 1311.58it/s][A
 93%|█████████████████████████████████████████████████████████████████████████████████▉      | 400575/430135 [07:29<00:22, 1335.49it/s][A
 93%|█████████████████████████████████████████████████████████████████████████████████▉      | 400712/430135 [07:29<00:25, 1174.50it/s][A
 93%|██████████████████████████████████████████████████████████████████████████████████      | 400841/430135 [07:29<00:24, 1203.66it/s][A
 93%|██████████████████████

 97%|█████████████████████████████████████████████████████████████████████████████████████▎  | 416841/430135 [07:42<00:09, 1340.46it/s][A
 97%|█████████████████████████████████████████████████████████████████████████████████████▎  | 416979/430135 [07:42<00:11, 1174.03it/s][A
 97%|█████████████████████████████████████████████████████████████████████████████████████▎  | 417127/430135 [07:42<00:10, 1250.32it/s][A
 97%|█████████████████████████████████████████████████████████████████████████████████████▎  | 417276/430135 [07:42<00:09, 1314.58it/s][A
 97%|█████████████████████████████████████████████████████████████████████████████████████▍  | 417425/430135 [07:42<00:09, 1361.58it/s][A
 97%|█████████████████████████████████████████████████████████████████████████████████████▍  | 417565/430135 [07:42<00:10, 1190.78it/s][A
 97%|█████████████████████████████████████████████████████████████████████████████████████▍  | 417699/430135 [07:42<00:10, 1203.01it/s][A
 97%|██████████████████████

In [44]:
pickle.dump(errors, open('/data/Bodenreider_UMLS_DL/Interns/Bernal/sapbert_source_cui_errors.p','wb'))

KeyboardInterrupt: 

In [37]:
pickle.dump(all_candidates, open('/data/Bodenreider_UMLS_DL/Interns/Bernal/sapbert_candidates_cui_dataset.p','wb'))

In [41]:
pickle.dump(umls2020AB_df.cuis.values, open('/data/Bodenreider_UMLS_DL/Interns/Bernal/umls2020AB_cuis.p','wb'))

In [45]:
all_candidates

KeyboardInterrupt: 

In [None]:
nearest_neighbors_auis = umls2020AB_df['lexlm_2000-NN_auis']

nearest_neighbors_strings = []

for nn_auis in tqdm(nearest_neighbors_auis):
    nn_strings = [aui2str[aui] for aui in nn_auis]
    
    nearest_neighbors_strings.append(nn_strings)
    
umls2020AB_df['lexlm_2000-NN_strings'] = nearest_neighbors_strings

In [46]:
recall_cols = umls2020AB_df.filter(regex='.*source_syn_cui_recall').columns

In [47]:
for recall_col in recall_cols:
    atn_recall = []
    for i,row in tqdm(umls2020AB_df.iterrows()):
        recalls = row[recall_col]

        if len(recalls) > 0:
            atn_recall.append(recalls)
        else:
            atn_recall.append([None for i in [0,1,5,10,50,100,200,500,1000,2000]])
        
    recall_col_name = recall_col.split('_recall')[0]
    
    for index,n in tqdm(enumerate([0,1,5,10,50,100,200,500,1000,2000])): 
        umls2020AB_df['R@{}_{}'.format(n,recall_col_name)] = np.array(atn_recall)[:,index] 


0it [00:00, ?it/s][A
1it [00:01,  1.51s/it][A
1472it [00:01, 1269.39it/s][A
2944it [00:01, 2744.85it/s][A
4433it [00:01, 4390.08it/s][A
5925it [00:01, 6097.12it/s][A
7423it [00:02, 7767.36it/s][A
8936it [00:02, 9325.58it/s][A
10422it [00:02, 10609.99it/s][A
11923it [00:02, 11705.81it/s][A
13438it [00:02, 12609.10it/s][A
14933it [00:02, 13246.00it/s][A
16447it [00:02, 13775.71it/s][A
17942it [00:02, 14095.83it/s][A
19436it [00:02, 14324.56it/s][A
20950it [00:02, 14562.89it/s][A
22449it [00:03, 14666.17it/s][A
23964it [00:03, 14809.01it/s][A
25467it [00:03, 14873.55it/s][A
26974it [00:03, 14930.38it/s][A
28494it [00:03, 15008.55it/s][A
30003it [00:03, 14973.00it/s][A
31511it [00:03, 15004.29it/s][A
33023it [00:03, 15037.33it/s][A
34530it [00:03, 15001.91it/s][A
36060it [00:03, 15089.34it/s][A
37576it [00:04, 15082.91it/s][A
39086it [00:04, 15001.58it/s][A
40602it [00:04, 15048.06it/s][A
42108it [00:04, 15032.09it/s][A
43624it [00:04, 15069.52it/s][A
45132i

In [48]:
umls2020AB_df.columns

Index(['0', 'strings', 'auis', '2020AA_synonyms', 'synonym_strings',
       'num_syms', 'sapbert_2000-NN_strings', 'sapbert_2000-NN_auis',
       'sapbert_2000-NN_dist', 'sapbert_2000-NN_recall', 'cuis', 'sem_types',
       'sem_groups', 'source_syns', 'source_syns_plus',
       'sapbert_2000-NN_source_syn_recall',
       'sapbert_2000-NN_source_syn_plus_recall', '2020AA_synonyms_cuis',
       'source_syns_cuis', 'sapbert_2000-NN_cuis',
       'sapbert_2000-NN_cui_recall', 'sapbert_2000-NN_source_syn_cui_recall',
       'number_source_syn_cuis', 'number_source_syn_auis',
       'number_source_syn_plus_auis', 'R@0_sapbert_2000-NN_source_syn_cui',
       'R@1_sapbert_2000-NN_source_syn_cui',
       'R@5_sapbert_2000-NN_source_syn_cui',
       'R@10_sapbert_2000-NN_source_syn_cui',
       'R@50_sapbert_2000-NN_source_syn_cui',
       'R@100_sapbert_2000-NN_source_syn_cui',
       'R@200_sapbert_2000-NN_source_syn_cui',
       'R@500_sapbert_2000-NN_source_syn_cui',
       'R@1000_sapbert_

In [49]:
umls2020AB_df['sources'] = [aui2scui[aui].split('|||')[1] for aui in umls2020AB_df.auis]

In [50]:
umls2020AB_df.groupby('sources').count().sort_values('0',ascending=False)

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@0_sapbert_2000-NN_source_syn_cui,R@1_sapbert_2000-NN_source_syn_cui,R@5_sapbert_2000-NN_source_syn_cui,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui
sources,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCBI,160042,160042,160042,160042,160042,160042,160042,160042,160042,160042,...,14288,14288,14288,14288,14288,14288,14288,14288,14288,14288
RXNORM,68695,68695,68695,68695,68695,68695,68695,68695,68695,68695,...,35930,35930,35930,35930,35930,35930,35930,35930,35930,35930
HGNC,54587,54586,54587,54587,54587,54587,54587,54587,54587,54587,...,50868,50868,50868,50868,50868,50868,50868,50868,50868,50868
MTHSPL,23293,23293,23293,23293,23293,23293,23293,23293,23293,23293,...,16502,16502,16502,16502,16502,16502,16502,16502,16502,16502
MEDCIN,22783,22783,22783,22783,22783,22783,22783,22783,22783,22783,...,9168,9168,9168,9168,9168,9168,9168,9168,9168,9168
MSH,15738,15738,15738,15738,15738,15738,15738,15738,15738,15738,...,10282,10282,10282,10282,10282,10282,10282,10282,10282,10282
SNOMEDCT_US,14597,14597,14597,14597,14597,14597,14597,14597,14597,14597,...,4627,4627,4627,4627,4627,4627,4627,4627,4627,4627
NCI,13758,13758,13758,13758,13758,13758,13758,13758,13758,13758,...,3531,3531,3531,3531,3531,3531,3531,3531,3531,3531
LNC,11300,11300,11300,11300,11300,11300,11300,11300,11300,11300,...,1931,1931,1931,1931,1931,1931,1931,1931,1931,1931
OMIM,7201,7201,7201,7201,7201,7201,7201,7201,7201,7201,...,2982,2982,2982,2982,2982,2982,2982,2982,2982,2982


In [51]:
len(umls2020AB_df[(umls2020AB_df.number_source_syn_cuis == 0) & (umls2020AB_df.num_syms > 0)]), len(umls2020AB_df[umls2020AB_df['R@0_sapbert_2000-NN_source_syn_cui'] == 0.0])

(28413, 28413)

In [52]:
at_0_errors = umls2020AB_df[umls2020AB_df['R@0_sapbert_2000-NN_source_syn_cui'] == 0.0]
count_df = at_0_errors.groupby('sem_groups').count()[['0']]
count_df['Group Perc'] = count_df['0']/len(at_0_errors)
display(count_df)

Unnamed: 0_level_0,0,Group Perc
sem_groups,Unnamed: 1_level_1,Unnamed: 2_level_1
Activities & Behaviors,14,0.0
Anatomy,116,0.0
Chemicals & Drugs,20903,0.74
Concepts & Ideas,170,0.01
Devices,421,0.01
Disorders,4220,0.15
Genes & Molecular Sequences,1382,0.05
Geographic Areas,3,0.0
Living Beings,238,0.01
Objects,113,0.0


In [53]:
at_0_errors.groupby('sources').count().sort_values('0',ascending=False)

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@0_sapbert_2000-NN_source_syn_cui,R@1_sapbert_2000-NN_source_syn_cui,R@5_sapbert_2000-NN_source_syn_cui,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui
sources,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MTHSPL,16502,16502,16502,16502,16502,16502,16502,16502,16502,16502,...,16502,16502,16502,16502,16502,16502,16502,16502,16502,16502
OMIM,2982,2982,2982,2982,2982,2982,2982,2982,2982,2982,...,2982,2982,2982,2982,2982,2982,2982,2982,2982,2982
MTH,2512,2512,2512,2512,2512,2512,2512,2512,2512,2512,...,2512,2512,2512,2512,2512,2512,2512,2512,2512,2512
GS,2036,2036,2036,2036,2036,2036,2036,2036,2036,2036,...,2036,2036,2036,2036,2036,2036,2036,2036,2036,2036
HPO,994,994,994,994,994,994,994,994,994,994,...,994,994,994,994,994,994,994,994,994,994
MDR,746,746,746,746,746,746,746,746,746,746,...,746,746,746,746,746,746,746,746,746,746
MMX,632,632,632,632,632,632,632,632,632,632,...,632,632,632,632,632,632,632,632,632,632
MMSL,575,575,575,575,575,575,575,575,575,575,...,575,575,575,575,575,575,575,575,575,575
GO,489,489,489,489,489,489,489,489,489,489,...,489,489,489,489,489,489,489,489,489,489
NDDF,459,459,459,459,459,459,459,459,459,459,...,459,459,459,459,459,459,459,459,459,459


In [54]:
umls2020AB_df.filter(regex='.*auis').columns

Index(['auis', 'sapbert_2000-NN_auis', 'number_source_syn_auis',
       'number_source_syn_plus_auis'],
      dtype='object')

In [55]:
pairs_per_row = []

for candidates, sapbert_aui_nn, string, aui, cui in tqdm(zip(all_candidates,umls2020AB_df['sapbert_2000-NN_auis'], umls2020AB_df.strings, umls2020AB_df.auis, umls2020AB_df.cuis)):
    
    true_syn, source_plus_top, labels = candidates
    
    preferred_cui_string = cui2preferred[cui]
    
    num_source = 2000 - len(source_plus_top)
    
    topk = []
    topk_labs = []
    topk_auis = []
    
    i = 0
    for c, l in zip(source_plus_top, labels):
        if i >= num_source:
            topk_auis.append(sapbert_aui_nn[i-num_source])
        else:
            topk_auis.append('')

        if c not in topk:
            topk.append(c)
            topk_labs.append(l)
        
        if len(topk) == 10:
            break
            
        i += 1
        
    
    pref_strings = [cui2preferred[c] for c in topk]
    
    pairs_per_row.append((string, aui, preferred_cui_string, pref_strings, cui, topk, topk_labs, topk_auis))


0it [00:00, ?it/s][A
2940it [00:00, 29101.75it/s][A
5851it [00:00, 25458.05it/s][A
8426it [00:00, 23754.92it/s][A
10820it [00:00, 21989.28it/s][A
13036it [00:00, 21381.17it/s][A
15182it [00:00, 20718.57it/s][A
17257it [00:00, 19622.03it/s][A
19225it [00:00, 17842.56it/s][A
21031it [00:01, 17612.83it/s][A
22948it [00:01, 18041.56it/s][A
24766it [00:01, 16854.47it/s][A
26551it [00:01, 17126.14it/s][A
28280it [00:01, 15374.81it/s][A
29855it [00:01, 14507.85it/s][A
31496it [00:01, 14953.94it/s][A
33339it [00:01, 15895.33it/s][A
35169it [00:01, 16559.60it/s][A
36850it [00:02, 14766.01it/s][A
38753it [00:02, 15898.14it/s][A
40393it [00:02, 15029.99it/s][A
42043it [00:02, 15424.76it/s][A
43621it [00:02, 15521.02it/s][A
45794it [00:02, 17282.57it/s][A
47549it [00:02, 15942.48it/s][A
49180it [00:02, 15934.56it/s][A
51419it [00:02, 17743.08it/s][A
53986it [00:03, 20008.34it/s][A
56020it [00:03, 18057.38it/s][A
57882it [00:03, 17164.30it/s][A
59956it [00:03, 18114.2

In [56]:
umls2020AB_df['top10_preferred_strings'] = pairs_per_row 

In [64]:
num_cuis = []

for string in umls2020AB_df.strings:
    cuis = str2cuis[str(string)]
    num_cuis.append(len(cuis))

umls2020AB_df['num_cuis_per_query_string'] = num_cuis

In [69]:
umls2020AB_df.groupby('num_cuis_per_query_string').count()/len(umls2020AB_df)

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@5_sapbert_2000-NN_source_syn_cui,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui,sources,top10_preferred_strings
num_cuis_per_query_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.97,0.97,0.97,0.97,0.97,0.97,0.97,0.97,0.97,0.97,...,0.37,0.37,0.37,0.37,0.37,0.37,0.37,0.37,0.97,0.97
2,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
3,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
at_0_errors = umls2020AB_df[umls2020AB_df['R@50_sapbert_2000-NN_source_syn_cui'] == 0.0]

In [71]:
at_0_errors.groupby('num_cuis_per_query_string').count()/len(at_0_errors)

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@5_sapbert_2000-NN_source_syn_cui,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui,sources,top10_preferred_strings
num_cuis_per_query_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,...,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9
2,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,...,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06
3,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
4,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
for i,g in at_0_errors.groupby('sem_groups'):
    print('Semantic Group: {}'.format(i))
    print('=' *80)
    print('=' *80)
    print('\n')

    sample_df = g.sample(min(5,len(g)), random_state = np.random.RandomState(42))
    for sample in sample_df.top10_preferred_strings:

        print('Query: {} ||| {}'.format(sample[0], aui2scui[sample[1]].split('|||')[1]))
        print('Correct CUI: {} ||| {}'.format(sample[2], sample[4]))
        print()
        for i,can in enumerate(zip(sample[3],sample[5],sample[6], sample[7])):
            can_s, can_cui, lab, aui = can
            print('Candidate {}: {} ||| {} ||| {} ||| {}'.format(i,can_s, can_cui, aui2str.get(aui, ''), lab))
        print('\n\n' + '='*80 + '\n\n')

Semantic Group: Activities & Behaviors


Query: Friendly behavior ||| OMIM
Correct CUI: Friendly behavior ||| C0679182

Candidate 0: Notable friendly behavior ||| C2749647 ||| Notable friendly behavior ||| False
Candidate 1: Friendly ||| C2700214 ||| Friendly ||| False
Candidate 2: Friendly personality ||| C2749816 ||| Friendly ||| False
Candidate 3: Over-friendly behavior ||| C4747897 ||| Friendly personality ||| False
Candidate 4: Friendly, sociable personality ||| C1844566 ||| Over-friendly behavior ||| False
Candidate 5: Medical Team was Friendly and Kind ||| C4553857 ||| Friendly, sociable personality ||| False
Candidate 6: Office Staff was Friendly and Kind ||| C4553856 ||| Were friendly and kind ||| False
Candidate 7: Overly friendly personality ||| C4746757 ||| Were friendly and kind ||| False
Candidate 8: Friendly behavior ||| C0679182 ||| Overly friendly personality ||| True
Candidate 9: Prosocial Behavior ||| C0871164 ||| friendliness ||| False




Semantic Group: Anatomy




In [72]:
preferred_term_df = pd.DataFrame(cui2preferred.items())

In [74]:
preferred_term_df.groupby(1).count().sort_values(0,ascending=False)

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
"""Call"" - postponed",1
Retroviral Vector,1
"Retroverted and incarcerated gravid uterus, antepartum",1
"Retroverted and incarcerated gravid uterus, delivered, with mention of postpartum complication",1
"Retroverted and incarcerated gravid uterus, postpartum",1
...,...
Goodyera bilamellata,1
Goodyera bomiensis,1
Goodyera brachyceras,1
Goodyera brachystegia,1


In [77]:
umls2020AB_df[umls2020AB_df['cuis'] == 'C1612374']

Unnamed: 0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui,sources,top10_preferred_strings,num_cuis_per_query_string
8617538,34,ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClean...,A32277078,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [SannyTiz...,"[A31248802, A29977087, A29969821, A29969820, A...","[24.19786, 24.682053, 24.682053, 25.122131, 25...","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,(ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClea...,1
8617658,34,ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClean...,A32274751,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [SannyTiz...,"[A31248802, A29977087, A29969821, A29969820, A...","[24.19786, 24.682053, 24.682053, 25.122131, 25...","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,(ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClea...,1
8617480,34,ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClean...,A32278789,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [SannyTiz...,"[A31248802, A29977087, A29969821, A29969820, A...","[24.19786, 24.682053, 24.682053, 25.122131, 25...","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,(ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClea...,1
8617822,33,ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClean...,A32337391,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [Tork Han...,"[A31463247, A19300419, A31248802, A28620165, A...","[19.899017, 20.056366, 20.17865, 20.598282, 20...","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,(ALCOHOL 70 mL in 100 mL TOPICAL GEL [SaniClea...,1
8617852,32,ALCOHOL 2649.787 mL in 3785.41 mL CUTANEOUS GE...,A32331709,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 331.1 mL in 473 mL TOPICAL GEL [Antim...,"[A31559354, A30929839, A18538111, A31561003, A...","[57.674576, 58.09012, 58.592194, 58.893677, 58...","[0.003424657534246575, 0.010273972602739725, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,(ALCOHOL 2649.787 mL in 3785.41 mL CUTANEOUS G...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616001,10,ALCOHOL 70 mL in 100 mL TOPICAL GEL,A32088239,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [ONE STEP...,"[A31562137, A24304804, A30280240, A31201506, A...","[2.827118, 3.7700806, 3.7700806, 3.7700806, 6....","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,"(ALCOHOL 70 mL in 100 mL TOPICAL GEL, A3208823...",2
8616002,10,ALCOHOL 70 mL in 100 mL TOPICAL GEL,A32088970,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [ONE STEP...,"[A31562137, A24304804, A30280240, A31201506, A...","[2.827118, 3.7700806, 3.7700806, 3.7700806, 6....","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,"(ALCOHOL 70 mL in 100 mL TOPICAL GEL, A3208897...",2
8615960,10,ALCOHOL 70 mL in 100 mL TOPICAL GEL,A31746729,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [ONE STEP...,"[A31562137, A24304804, A30280240, A31201506, A...","[2.827118, 3.7700806, 3.7700806, 3.7700806, 6....","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,"(ALCOHOL 70 mL in 100 mL TOPICAL GEL, A3174672...",2
8615959,10,ALCOHOL 70 mL in 100 mL TOPICAL GEL,A31729425,"[A28245165, A28678830, A20065575, A20049154, A...",[Ethanol 70% Topical application Gel/Jelly [EP...,292,[ALCOHOL 70 mL in 100 mL TOPICAL GEL [ONE STEP...,"[A31562137, A24304804, A30280240, A31201506, A...","[2.827118, 3.7700806, 3.7700806, 3.7700806, 6....","[0.003424657534246575, 0.017123287671232876, 0...",...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,MTHSPL,"(ALCOHOL 70 mL in 100 mL TOPICAL GEL, A3172942...",2


In [84]:
pickle.dump(umls2020AB_df, open('/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_SAPBERT_Source_Info.p','wb'))

In [82]:
#Stratified Split using CUIs to avoid data leakage to test set

umls2020AB_cui_num_syms_df = umls2020AB_df[['cuis','num_syms']].drop_duplicates()

In [85]:
umls2020AB_cui_num_syms_df['no_syms'] = [n == 0 for n in umls2020AB_cui_num_syms_df.num_syms]

In [96]:
training = []
validation = []
testing = []

val = 0.10
test = 0.20

for i,g in umls2020AB_cui_num_syms_df.groupby('no_syms'):
    
    perm_g = g.sample(len(g),random_state=np.random.RandomState(42)).cuis.values
    
    training.extend(perm_g[:len(g) - int(len(g)*(val + test))])
    validation.extend(perm_g[len(g) - int(len(g)*(val + test)):len(g) - int(len(g)*(test))])
    testing.extend(perm_g[len(g) - int(len(g)*test):])
    
    assert(training[-1] != validation[0])
    assert(validation[-1] != testing[0])        

In [98]:
len(training), len(validation), len(testing)

(204012, 29144, 58288)

In [100]:
training = set(training)
validation = set(validation)
testing = set(testing)

In [101]:
split = []

for cui in umls2020AB_df.cuis:
     
    if cui in training:
        split.append('train')
    elif cui in validation:
        split.append('val')
    elif cui in testing:
        split.append('test')

In [102]:
umls2020AB_df['split'] = split

In [108]:
umls2020AB_df.groupby('split').count()

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui,sources,top10_preferred_strings,num_cuis_per_query_string
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
test,84912,84912,84912,84912,84912,84912,84912,84912,84912,84912,...,32700,32700,32700,32700,32700,32700,32700,84912,84912,84912
train,303361,303359,303361,303361,303361,303361,303361,303361,303361,303361,...,121327,121327,121327,121327,121327,121327,121327,303361,303361,303361
val,41862,41862,41862,41862,41862,41862,41862,41862,41862,41862,...,16050,16050,16050,16050,16050,16050,16050,41862,41862,41862


In [107]:
dedup_df = []

for i,g in tqdm(umls2020AB_df.groupby(['strings','cuis'])):

    for j, row in g.iterrows():
        dedup_df.append(row)
        break

dedup_df = pd.DataFrame(dedup_df)


  0%|                                                                                                       | 0/414998 [00:00<?, ?it/s][A
  0%|                                                                                           | 1/414998 [00:04<472:40:25,  4.10s/it][A
  0%|                                                                                            | 386/414998 [00:04<53:12, 129.85it/s][A
  0%|▏                                                                                           | 757/414998 [00:04<23:28, 294.12it/s][A
  0%|▎                                                                                          | 1173/414998 [00:04<12:58, 531.88it/s][A
  0%|▎                                                                                          | 1557/414998 [00:04<08:37, 798.96it/s][A
  0%|▍                                                                                         | 1942/414998 [00:04<06:10, 1114.46it/s][A
  1%|▌                    

 11%|█████████▍                                                                               | 44282/414998 [00:15<01:37, 3807.04it/s][A
 11%|█████████▌                                                                               | 44685/414998 [00:15<01:35, 3871.11it/s][A
 11%|█████████▋                                                                               | 45096/414998 [00:15<01:33, 3940.08it/s][A
 11%|█████████▊                                                                               | 45509/414998 [00:16<01:32, 3996.14it/s][A
 11%|█████████▊                                                                               | 45911/414998 [00:16<01:32, 4002.89it/s][A
 11%|█████████▉                                                                               | 46312/414998 [00:16<01:33, 3948.45it/s][A
 11%|██████████                                                                               | 46708/414998 [00:16<01:34, 3908.35it/s][A
 11%|██████████            

 22%|███████████████████▌                                                                     | 91502/414998 [03:39<01:20, 4003.81it/s][A
 22%|███████████████████▋                                                                     | 91905/414998 [03:39<01:20, 4009.17it/s][A
 22%|███████████████████▊                                                                     | 92306/414998 [03:39<01:20, 3998.09it/s][A
 22%|███████████████████▉                                                                     | 92709/414998 [03:39<01:20, 4007.22it/s][A
 22%|███████████████████▉                                                                     | 93113/414998 [03:39<01:20, 4003.99it/s][A
 23%|████████████████████                                                                     | 93518/414998 [03:40<01:20, 4015.40it/s][A
 23%|████████████████████▏                                                                    | 93935/414998 [03:40<01:19, 4060.01it/s][A
 23%|████████████████████▏ 

 33%|█████████████████████████████▎                                                          | 138411/414998 [03:51<01:10, 3910.35it/s][A
 33%|█████████████████████████████▍                                                          | 138803/414998 [03:51<01:10, 3911.07it/s][A
 34%|█████████████████████████████▌                                                          | 139195/414998 [03:51<01:10, 3894.96it/s][A
 34%|█████████████████████████████▌                                                          | 139587/414998 [03:51<01:10, 3901.96it/s][A
 34%|█████████████████████████████▋                                                          | 139978/414998 [03:51<01:10, 3903.26it/s][A
 34%|█████████████████████████████▊                                                          | 140369/414998 [03:51<01:11, 3867.23it/s][A
 34%|█████████████████████████████▊                                                          | 140759/414998 [03:51<01:10, 3874.50it/s][A
 34%|██████████████████████

 44%|███████████████████████████████████████                                                 | 184144/414998 [04:03<00:58, 3916.71it/s][A
 44%|███████████████████████████████████████▏                                                | 184540/414998 [04:03<00:58, 3928.75it/s][A
 45%|███████████████████████████████████████▏                                                | 184939/414998 [04:03<00:58, 3946.24it/s][A
 45%|███████████████████████████████████████▎                                                | 185334/414998 [04:03<00:58, 3938.13it/s][A
 45%|███████████████████████████████████████▍                                                | 185733/414998 [04:03<00:58, 3951.69it/s][A
 45%|███████████████████████████████████████▍                                                | 186129/414998 [04:03<00:58, 3917.71it/s][A
 45%|███████████████████████████████████████▌                                                | 186524/414998 [04:03<00:58, 3925.34it/s][A
 45%|██████████████████████

 55%|████████████████████████████████████████████████▊                                       | 230111/414998 [04:14<00:47, 3882.35it/s][A
 56%|████████████████████████████████████████████████▉                                       | 230501/414998 [04:14<00:47, 3887.38it/s][A
 56%|████████████████████████████████████████████████▉                                       | 230890/414998 [04:15<00:47, 3849.40it/s][A
 56%|█████████████████████████████████████████████████                                       | 231284/414998 [04:15<00:47, 3875.47it/s][A
 56%|█████████████████████████████████████████████████▏                                      | 231674/414998 [04:15<00:47, 3880.95it/s][A
 56%|█████████████████████████████████████████████████▏                                      | 232063/414998 [04:15<00:47, 3854.57it/s][A
 56%|█████████████████████████████████████████████████▎                                      | 232455/414998 [04:15<00:47, 3872.77it/s][A
 56%|██████████████████████

 66%|██████████████████████████████████████████████████████████▌                             | 275945/414998 [04:26<00:35, 3972.94it/s][A
 67%|██████████████████████████████████████████████████████████▌                             | 276343/414998 [04:26<00:35, 3910.91it/s][A
 67%|██████████████████████████████████████████████████████████▋                             | 276739/414998 [04:26<00:35, 3925.35it/s][A
 67%|██████████████████████████████████████████████████████████▊                             | 277132/414998 [04:26<00:35, 3922.12it/s][A
 67%|██████████████████████████████████████████████████████████▊                             | 277525/414998 [04:27<00:35, 3883.57it/s][A
 67%|██████████████████████████████████████████████████████████▉                             | 277914/414998 [04:27<00:35, 3879.44it/s][A
 67%|███████████████████████████████████████████████████████████                             | 278304/414998 [04:27<00:35, 3882.79it/s][A
 67%|██████████████████████

 78%|████████████████████████████████████████████████████████████████████▌                   | 323069/414998 [04:38<00:22, 4061.62it/s][A
 78%|████████████████████████████████████████████████████████████████████▌                   | 323479/414998 [04:38<00:22, 4070.46it/s][A
 78%|████████████████████████████████████████████████████████████████████▋                   | 323888/414998 [04:38<00:22, 4026.33it/s][A
 78%|████████████████████████████████████████████████████████████████████▊                   | 324299/414998 [04:38<00:22, 4048.55it/s][A
 78%|████████████████████████████████████████████████████████████████████▊                   | 324711/414998 [04:38<00:22, 4067.55it/s][A
 78%|████████████████████████████████████████████████████████████████████▉                   | 325122/414998 [04:39<00:22, 4078.03it/s][A
 78%|█████████████████████████████████████████████████████████████████████                   | 325535/414998 [04:39<00:21, 4091.64it/s][A
 79%|██████████████████████

 89%|██████████████████████████████████████████████████████████████████████████████▎         | 369217/414998 [04:50<00:11, 3909.24it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▍         | 369609/414998 [04:50<00:11, 3847.06it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▍         | 370015/414998 [04:50<00:11, 3907.87it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▌         | 370419/414998 [04:50<00:11, 3945.27it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▋         | 370819/414998 [04:50<00:11, 3960.53it/s][A
 89%|██████████████████████████████████████████████████████████████████████████████▋         | 371216/414998 [04:50<00:11, 3952.18it/s][A
 90%|██████████████████████████████████████████████████████████████████████████████▊         | 371612/414998 [04:50<00:11, 3873.71it/s][A
 90%|██████████████████████

In [109]:
dedup_df.groupby('split').count()

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui,sources,top10_preferred_strings,num_cuis_per_query_string
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
test,82291,82291,82291,82291,82291,82291,82291,82291,82291,82291,...,31589,31589,31589,31589,31589,31589,31589,82291,82291,82291
train,291896,291896,291896,291896,291896,291896,291896,291896,291896,291896,...,114830,114830,114830,114830,114830,114830,114830,291896,291896,291896
val,40809,40809,40809,40809,40809,40809,40809,40809,40809,40809,...,15627,15627,15627,15627,15627,15627,15627,40809,40809,40809


In [110]:
dedup_df.groupby('split').count()/len(dedup_df)

Unnamed: 0_level_0,0,strings,auis,2020AA_synonyms,synonym_strings,num_syms,sapbert_2000-NN_strings,sapbert_2000-NN_auis,sapbert_2000-NN_dist,sapbert_2000-NN_recall,...,R@10_sapbert_2000-NN_source_syn_cui,R@50_sapbert_2000-NN_source_syn_cui,R@100_sapbert_2000-NN_source_syn_cui,R@200_sapbert_2000-NN_source_syn_cui,R@500_sapbert_2000-NN_source_syn_cui,R@1000_sapbert_2000-NN_source_syn_cui,R@2000_sapbert_2000-NN_source_syn_cui,sources,top10_preferred_strings,num_cuis_per_query_string
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
test,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.2,0.2,0.2
train,0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.7,...,0.28,0.28,0.28,0.28,0.28,0.28,0.28,0.7,0.7,0.7
val,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.1,0.1,0.1


In [111]:
dedup_df.columns

Index(['0', 'strings', 'auis', '2020AA_synonyms', 'synonym_strings',
       'num_syms', 'sapbert_2000-NN_strings', 'sapbert_2000-NN_auis',
       'sapbert_2000-NN_dist', 'sapbert_2000-NN_recall', 'cuis', 'sem_types',
       'sem_groups', 'source_syns', 'source_syns_plus',
       'sapbert_2000-NN_source_syn_recall',
       'sapbert_2000-NN_source_syn_plus_recall', '2020AA_synonyms_cuis',
       'source_syns_cuis', 'sapbert_2000-NN_cuis',
       'sapbert_2000-NN_cui_recall', 'sapbert_2000-NN_source_syn_cui_recall',
       'number_source_syn_cuis', 'number_source_syn_auis',
       'number_source_syn_plus_auis', 'R@0_sapbert_2000-NN_source_syn_cui',
       'R@1_sapbert_2000-NN_source_syn_cui',
       'R@5_sapbert_2000-NN_source_syn_cui',
       'R@10_sapbert_2000-NN_source_syn_cui',
       'R@50_sapbert_2000-NN_source_syn_cui',
       'R@100_sapbert_2000-NN_source_syn_cui',
       'R@200_sapbert_2000-NN_source_syn_cui',
       'R@500_sapbert_2000-NN_source_syn_cui',
       'R@1000_sapbert_

In [119]:
dedup_df_simple = dedup_df[['auis','strings','cuis', '2020AA_synonyms', 'source_syns', 'source_syns_plus','sapbert_2000-NN_auis', 'sapbert_2000-NN_cuis', 'source_syns_cuis','split']]

In [115]:
pickle.dump(dedup_df_simple, open('/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_SAPBERT_Source_Info_Official_Split_Basic.p','wb'))

In [122]:
aui_splits = {}
cui_splits = {}

k = 100

for i, row in tqdm(dedup_df_simple.iterrows(), total=len(dedup_df_simple)):
    
    split = row['split']
    
    aui = row['auis']
    cui = row['cuis']
    syns = row['2020AA_synonyms']
    
    aui_samples = aui_splits.get(split, [])
    cui_samples = cui_splits.get(split, [])
    
    for aui_cand in row['source_syns_plus']:
        aui_sample = (aui, aui_cand, 1)
        cui_sample = (aui, cui, 1)
        
        aui_samples.append(aui_sample)
        cui_samples.append(cui_sample)
    
    for aui_cand, cui_cand in zip(row['sapbert_2000-NN_auis'][:k], row['sapbert_2000-NN_cuis'][:k]):
        if cui == cui_cand in syns:
            label = 1
        else:
            label = 0
            
        aui_sample = (aui, aui_cand, label)
        cui_sample = (aui, cui_cand, label)
        
        aui_samples.append(aui_sample)
        cui_samples.append(cui_sample)
        
    aui_splits[split] = aui_samples
    cui_splits[split] = cui_samples


  0%|                                                                                                       | 0/414996 [00:00<?, ?it/s][A
  0%|                                                                                            | 15/414996 [00:00<2:01:29, 56.93it/s][A
  0%|                                                                                           | 499/414996 [00:00<03:58, 1738.40it/s][A
  0%|▏                                                                                          | 946/414996 [00:00<02:36, 2650.34it/s][A
  0%|▎                                                                                         | 1363/414996 [00:00<02:11, 3139.69it/s][A
  0%|▍                                                                                         | 1784/414996 [00:00<01:58, 3475.72it/s][A
  1%|▍                                                                                         | 2180/414996 [00:00<01:54, 3616.33it/s][A
  1%|▌                    

 11%|█████████▌                                                                               | 44677/414996 [00:12<01:36, 3849.97it/s][A
 11%|█████████▋                                                                               | 45084/414996 [00:12<01:34, 3914.73it/s][A
 11%|█████████▊                                                                               | 45476/414996 [00:12<01:39, 3718.25it/s][A
 11%|█████████▊                                                                               | 45884/414996 [00:13<01:36, 3821.53it/s][A
 11%|█████████▉                                                                               | 46295/414996 [00:13<01:34, 3902.96it/s][A
 11%|██████████                                                                               | 46701/414996 [00:13<01:33, 3946.47it/s][A
 11%|██████████                                                                               | 47140/414996 [00:13<01:30, 4076.17it/s][A
 11%|██████████▏           

 22%|███████████████████▋                                                                     | 91553/414996 [00:24<01:26, 3752.78it/s][A
 22%|███████████████████▋                                                                     | 91976/414996 [00:25<01:23, 3890.31it/s][A
 22%|███████████████████▊                                                                     | 92373/414996 [00:25<01:22, 3911.63it/s][A
 22%|███████████████████▉                                                                     | 92781/414996 [00:25<01:21, 3960.74it/s][A
 22%|███████████████████▉                                                                     | 93179/414996 [00:25<01:22, 3912.63it/s][A
 23%|████████████████████                                                                     | 93613/414996 [00:25<01:19, 4037.49it/s][A
 23%|████████████████████▏                                                                    | 94059/414996 [00:25<01:17, 4160.35it/s][A
 23%|████████████████████▎ 

 33%|█████████████████████████████▍                                                          | 138585/414996 [00:37<01:28, 3123.23it/s][A
 33%|█████████████████████████████▍                                                          | 138941/414996 [00:37<01:25, 3237.37it/s][A
 34%|█████████████████████████████▌                                                          | 139379/414996 [00:37<01:17, 3548.26it/s][A
 34%|█████████████████████████████▋                                                          | 139757/414996 [00:37<01:16, 3578.44it/s][A
 34%|█████████████████████████████▋                                                          | 140132/414996 [00:37<01:16, 3583.25it/s][A
 34%|█████████████████████████████▊                                                          | 140518/414996 [00:37<01:14, 3661.73it/s][A
 34%|█████████████████████████████▉                                                          | 140940/414996 [00:38<01:11, 3822.80it/s][A
 34%|██████████████████████

 45%|███████████████████████████████████████▎                                                | 185469/414996 [00:49<01:00, 3808.37it/s][A
 45%|███████████████████████████████████████▍                                                | 185854/414996 [00:49<01:00, 3793.71it/s][A
 45%|███████████████████████████████████████▍                                                | 186236/414996 [00:49<01:03, 3628.57it/s][A
 45%|███████████████████████████████████████▌                                                | 186602/414996 [00:49<01:04, 3549.90it/s][A
 45%|███████████████████████████████████████▋                                                | 186959/414996 [00:49<01:04, 3542.98it/s][A
 45%|███████████████████████████████████████▋                                                | 187316/414996 [00:50<01:04, 3548.14it/s][A
 45%|███████████████████████████████████████▊                                                | 187672/414996 [00:50<01:04, 3524.63it/s][A
 45%|██████████████████████

 56%|█████████████████████████████████████████████████▎                                      | 232436/414996 [01:01<00:45, 4042.72it/s][A
 56%|█████████████████████████████████████████████████▎                                      | 232842/414996 [01:01<00:45, 3996.52it/s][A
 56%|█████████████████████████████████████████████████▍                                      | 233243/414996 [01:02<00:46, 3883.91it/s][A
 56%|█████████████████████████████████████████████████▌                                      | 233640/414996 [01:02<00:46, 3906.98it/s][A
 56%|█████████████████████████████████████████████████▋                                      | 234059/414996 [01:02<00:45, 3987.56it/s][A
 57%|█████████████████████████████████████████████████▋                                      | 234473/414996 [01:02<00:44, 4029.68it/s][A
 57%|█████████████████████████████████████████████████▊                                      | 234896/414996 [01:02<00:44, 4087.67it/s][A
 57%|██████████████████████

 68%|███████████████████████████████████████████████████████████▍                            | 280357/414996 [01:13<00:32, 4141.39it/s][A
 68%|███████████████████████████████████████████████████████████▌                            | 280772/414996 [01:13<00:32, 4106.50it/s][A
 68%|███████████████████████████████████████████████████████████▋                            | 281208/414996 [01:14<00:31, 4181.52it/s][A
 68%|███████████████████████████████████████████████████████████▋                            | 281627/414996 [01:14<00:32, 4163.42it/s][A
 68%|███████████████████████████████████████████████████████████▊                            | 282044/414996 [01:14<00:31, 4161.66it/s][A
 68%|███████████████████████████████████████████████████████████▉                            | 282474/414996 [01:14<00:31, 4201.75it/s][A
 68%|███████████████████████████████████████████████████████████▉                            | 282910/414996 [01:14<00:31, 4246.50it/s][A
 68%|██████████████████████

 79%|█████████████████████████████████████████████████████████████████████▎                  | 327129/414996 [01:26<00:21, 4054.97it/s][A
 79%|█████████████████████████████████████████████████████████████████████▍                  | 327536/414996 [01:26<00:21, 3995.66it/s][A
 79%|█████████████████████████████████████████████████████████████████████▌                  | 327937/414996 [01:26<00:22, 3930.43it/s][A
 79%|█████████████████████████████████████████████████████████████████████▌                  | 328331/414996 [01:26<00:22, 3920.72it/s][A
 79%|█████████████████████████████████████████████████████████████████████▋                  | 328728/414996 [01:26<00:21, 3931.58it/s][A
 79%|█████████████████████████████████████████████████████████████████████▊                  | 329122/414996 [01:26<00:22, 3873.89it/s][A
 79%|█████████████████████████████████████████████████████████████████████▉                  | 329537/414996 [01:26<00:21, 3953.10it/s][A
 80%|██████████████████████

 90%|███████████████████████████████████████████████████████████████████████████████▎        | 374236/414996 [01:38<00:10, 3881.75it/s][A
 90%|███████████████████████████████████████████████████████████████████████████████▍        | 374700/414996 [01:38<00:09, 4099.40it/s][A
 90%|███████████████████████████████████████████████████████████████████████████████▌        | 375162/414996 [01:38<00:09, 4250.15it/s][A
 91%|███████████████████████████████████████████████████████████████████████████████▋        | 375626/414996 [01:38<00:09, 4362.83it/s][A
 91%|███████████████████████████████████████████████████████████████████████████████▋        | 376090/414996 [01:38<00:08, 4444.43it/s][A
 91%|███████████████████████████████████████████████████████████████████████████████▊        | 376555/414996 [01:38<00:08, 4504.75it/s][A
 91%|███████████████████████████████████████████████████████████████████████████████▉        | 377014/414996 [01:38<00:08, 4527.90it/s][A
 91%|██████████████████████

In [124]:
pickle.dump(aui_splits,open('/data/Bodenreider_UMLS_DL/Interns/Bernal/aui_pairwise_data_splits.{}.p'.format(k),'wb'))
pickle.dump(cui_splits,open('/data/Bodenreider_UMLS_DL/Interns/Bernal/cui_pairwise_data_splits.{}.p'.format(k),'wb'))

In [None]:
train = pd.read_csv(train_path, sep='\t', quoting=3)


In [112]:
pickle.dump(dedup_df, open('/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_SAPBERT_Source_Info_Official_Split.p','wb'))

In [81]:
len(umls2020AB_df), len(umls2020AB_df[['strings']].drop_duplicates())

(430135, 413578)

In [None]:
# sapbert_errors = pd.read_csv('/data/Bodenreider_UMLS_DL/Interns/Bernal/sapbert_errors_with_pubmed_candidates.csv')

In [None]:
# eval(sapbert_errors[pd.notna(sapbert_errors['pubmed_candidates'])].loc[430111,'pubmed_candidates'])

# pubmed_candidates = []

# for cs in tqdm(sapbert_errors.pubmed_candidates):
#     cs_auis = []
    
#     if pd.notna(cs):
#         cs = eval(cs)
        
#         cs = [c[0] for c in cs]
        
#         for c in cs:
#             cs_auis.extend(str2aui.get(c,[]))
            
#     pubmed_candidates.append(cs_auis)

In [None]:
# umls2020AB_df['pubmed_candidates'] = sapbert_errors['pubmed_candidates']

# #Calculating PubMed Candidate Recall @ 1,5,10,50,100
# recall_array = []
# # closest_dist_true = []
# # closest_dist_false = []

# query_synonym = list(umls2020AB_df['2020AA_synonyms'])
# nearest_neighbors = umls2020AB_df['sapbert_2000-NN_auis']

# for true_syn, top, pubmed in tqdm(zip(query_synonym, nearest_neighbors, pubmed_candidates)):
    
#     true_syn = set(true_syn)
    
#     if len(true_syn) > 0:
#         recalls = []

#         topn = set(top)
#         topn = topn.union(set(pubmed))
        
#         true_pos = topn.intersection(true_syn)

#         recalls.append(len(true_pos)/len(true_syn))

#         recall_array.append(recalls)
#     else:
#         recalls = []

#         recall_array.append(recalls)
        
# umls2020AB_df['pubmed_candidates_recall'] = recall_array

In [None]:
umls2020AB_df.columns

In [None]:
pd.set_option('max_colwidth',500)

In [None]:
string_cols = umls2020AB_df.filter(regex='.*NN_strings').columns

In [None]:
string_cols

In [None]:
for col in string_cols:
        
    errors = []
    
    for i, row in tqdm(umls2020AB_df.iterrows(),total=len(umls2020AB_df)):
        syns = set(row['synonym_strings'])
        if len(syns) > 0:      
            pred_syns = set(row[col])
            missed_syns = syns.difference(pred_syns)
            errors.append('|||'.join([str(s) for s in missed_syns]))
        else:
            errors.append([])
            
    umls2020AB_df[col.split('_2000')[0]+'_errors'] = errors

In [None]:
# umls2020AB_df[['strings','auis','2020AA_synonyms','synonym_strings','sapbert_errors']].to_csv('/data/Bodenreider_UMLS_DL/Interns/Bernal/umls_2020AB_SAPBERT_errors.csv')

In [None]:
# umls2020AB_df[(umls2020AB_df['R@2000_sapbert'] == 1.0) & (umls2020AB_df['R@2000_ubert_mlm'] < 1.0) & (umls2020AB_df['0'] < 15)][['0','strings','num_syms','ubert_mlm_errors','R@2000_sapbert','R@2000_ubert_mlm']]


In [None]:
# f = open('/data/Bodenreider_UMLS_DL/Interns/Bernal/UMLS2020AB_Full_NN_DataFrame_UpToSAPBERT_UBERT.p','rb')

In [None]:
# umls2020AB_df = pickle.load(f)

In [None]:
without_R100_errors = umls2020AB_df[umls2020AB_df['R@100_sapbert'] != 0]
len(without_R100_errors)

In [None]:
for recall_col in without_R100_errors.filter(regex='.*recall.*').columns:
    print(recall_col)
    recall_array = list(without_R100_errors[recall_col].values)
    display(pd.DataFrame(recall_array).describe())

In [None]:
for n in [1,5,10,50,100,200,500,1000,2000]:
    recall_at_n_cols = umls2020AB_df.filter(regex='R@{}_.*'.format(n)).columns
    umls2020AB_df['R@{}_oracle'.format(n)] = umls2020AB_df[recall_at_n_cols].max(axis=1)

In [None]:
100*umls2020AB_df.filter(regex='R@.*_oracle').describe()

In [None]:
nn_aui_cols = umls2020AB_df.filter(regex='.*NN_auis').columns

In [None]:
import itertools

subsets = []

for L in range(2, len(nn_aui_cols)+1):
    for subset in itertools.combinations(nn_aui_cols, L):
        subsets.append(subset)

In [None]:
for ss_i, nn_aui_cols_subset in enumerate(subsets):
    recall_array = []

    for i, row in tqdm(umls2020AB_df.iterrows(),total=len(umls2020AB_df)):

        true_syn = row['2020AA_synonyms']
        true_syn = set(true_syn)

        if len(true_syn) > 0:
            recalls = []

            for n in [1,5,10,50,100,200,500,1000,2000]:

                topn = []

                for nn_auis in nn_aui_cols_subset:
                    model_topn = row[nn_auis][:n]
                    topn.extend(model_topn)

                topn = set(topn)
                true_pos = topn.intersection(true_syn)

                recalls.append(len(true_pos)/len(true_syn))

            recall_array.append(recalls)
        else:
            recalls = []

            recall_array.append(recalls)

    umls2020AB_df['ensemble_comb_{}-NN_recall'.format(ss_i)] = recall_array

In [None]:
[(i,subset) for i, subset in enumerate(subsets)]

In [None]:
indices = []
dfs = []
i = 0

for recall_col in umls2020AB_df.filter(regex='.*recall.*').columns:
    print(recall_col)
    if 'ensemble' in recall_col:
        indices.append('||'.join(subsets[i]))
        i+=1
    else:
        indices.append(recall_col)
        
    recall_array = list(umls2020AB_df[recall_col].values)
    dfs.append(pd.DataFrame(recall_array).agg(['mean']))

In [None]:
ensemble = pd.concat(dfs)
ensemble.index = indices

In [None]:
ensemble.sort_values(4,ascending=False)

In [None]:
sem_group_analysis = []
sgs = []
num_auis = []

for sg, g in umls2020AB_df.groupby('sem_groups'):
    sem_group_analysis.append(g.filter(regex='R@.*').mean())
    sgs.append(sg)
    num_auis.append(len(g))
    
sem_group_analysis = pd.DataFrame(sem_group_analysis)    
sem_group_analysis['SemGroups'] = sgs
sem_group_analysis['NumAuis'] = num_auis

In [None]:
sem_group_analysis.filter(regex='SemGroups|NumAuis|.*@100_.*').set_index(['SemGroups','NumAuis'])

In [None]:
validation_df = []

for sg, g in umls2020AB_df.groupby('sem_groups'):
    validation_df.append(g.sample(int(len(g)*0.2), random_state=np.random.RandomState(42)))

validation_df = pd.concat(validation_df)

In [None]:
pd.set_option('max_colwidth',500)

In [None]:
sapbert_correct_ubert_errors = validation_df[(validation_df['R@2000_sapbert'] == 1.0) & (validation_df['R@2000_sapbert_ubert_1392096'] < 1.0)]


In [None]:
display_sample = sapbert_correct_ubert_errors[:100]

In [None]:
validation_df

In [None]:
ubert_correct = validation_df[(validation_df['num_syms'] > 3) & (validation_df['R@2000_sapbert_ubert_1392096'] > 0.5)]
display_sample = ubert_correct[:100]

# display_sample['ubert_mlm_candidate_list_end'] = [[(s,d) for s,d in zip(ss[-10:],ds[-10:])] for ss, ds in zip(display_sample['ubert_mlm_2000-NN_strings'], display_sample['ubert_mlm_2000-NN_dist'])]
display_sample['pubmedbert_candidate_list_start'] = [[(s,d) for s,d in zip(ss[:50],ds[:50])] for ss, ds in zip(display_sample['pubmedbert_2000-NN_strings'], display_sample['pubmedbert_2000-NN_dist'])]
display_sample['pubmedbert_candidate_list_end'] = [[(s,d) for s,d in zip(ss[-50:],ds[-50:])] for ss, ds in zip(display_sample['pubmedbert_2000-NN_strings'], display_sample['pubmedbert_2000-NN_dist'])]

display_sample['sapbert_ubert_candidate_list_start'] = [[(s,d) for s,d in zip(ss[:50],ds[:50])] for ss, ds in zip(display_sample['sapbert_ubert_1392096_2000-NN_strings'], display_sample['sapbert_ubert_1392096_2000-NN_dist'])]
display_sample['sapbert_ubert_candidate_list_end'] = [[(s,d) for s,d in zip(ss[-50:],ds[-50:])] for ss, ds in zip(display_sample['sapbert_ubert_1392096_2000-NN_strings'], display_sample['sapbert_ubert_1392096_2000-NN_dist'])]

In [None]:
for col in display_sample.filter(regex='.*list_.*').columns:
    
    new_col = []
    
    for row in display_sample[col]:
        
        new_row = []
        
        for elem in row:
            if elem not in new_row:
                new_row.append(elem)
            if len(new_row) > 10:
                break
                
        new_col.append(new_row)
        
    display_sample[col] = new_col

In [None]:
display_sample.filter(regex='^strings$|.*list_.*|.*sapbert_ubert.*errors')

In [None]:
display_sample = sapbert_correct_ubert_errors[:100]

# display_sample['ubert_mlm_candidate_list_end'] = [[(s,d) for s,d in zip(ss[-10:],ds[-10:])] for ss, ds in zip(display_sample['ubert_mlm_2000-NN_strings'], display_sample['ubert_mlm_2000-NN_dist'])]
display_sample['sapbert_ubert_candidate_list_start'] = [[(s,d) for s,d in zip(ss[:10],ds[:10])] for ss, ds in zip(display_sample['sapbert_ubert_1392096_2000-NN_strings'], display_sample['sapbert_ubert_1392096_2000-NN_dist'])]
display_sample['sapbert_ubert_candidate_list_end'] = [[(s,d) for s,d in zip(ss[-10:],ds[-10:])] for ss, ds in zip(display_sample['sapbert_ubert_1392096_2000-NN_strings'], display_sample['sapbert_ubert_1392096_2000-NN_dist'])]

display_sample.filter(regex='^strings$|.*sapbert_ubert.*list_.*|.*sapbert_ubert.*errors')

In [None]:
for recall_col in validation_df.filter(regex='.*recall.*').columns:
    print(recall_col)
    recall_array = list(umls2020AB_df[recall_col].values)
    display(pd.DataFrame(recall_array).describe())

In [None]:
pd.set_option('max_colwidth',2000)
pd.set_option('max_rows',2000)

In [None]:
umls2020AB_df['top10'] = [' || '.join([s + '   ' + str(d) + '   '+ str(a) for s,d,a in zip(ss[:10],ds[:10],auis[:10])]) for ss,ds,auis in zip(umls2020AB_df['lexlm_2000-NN_strings'], umls2020AB_df['lexlm_2000-NN_dist'], umls2020AB_df['lexlm_2000-NN_auis'])]

In [None]:
umls2020AB_df.filter(regex='^auis$|^strings$|top10|num_syms')

In [None]:
sanity_check = validation_df.sample(200, random_state=np.random.RandomState(42))

In [None]:
sanity_check['top10'] = [' || '.join([s + '   ' + str(d) + '   '+ str(a) for s,d,a in zip(ss[:10],ds[:10],auis[:10])]) for ss,ds,auis in zip(sanity_check['lexlm_2000-NN_strings'], sanity_check['lexlm_2000-NN_dist'], sanity_check['lexlm_2000-NN_auis'])]
sanity_check.filter(regex='^auis$|^strings$|top10|num_syms')[:10]

In [None]:
sanity_check['top10'] = [' || '.join([s + '   ' + str(d) + '   '+ str(a) for s,d,a in zip(ss[:10],ds[:10],auis[:10])]) for ss,ds,auis in zip(sanity_check['ubert_mlm_2000-NN_strings'], sanity_check['ubert_mlm_2000-NN_dist'], sanity_check['ubert_mlm_2000-NN_auis'])]
sanity_check.filter(regex='^auis$|^strings$|top10|num_syms')

In [None]:
sanity_check['top10'] = [' || '.join([s + '   ' + str(d) + '   '+ str(a) for s,d,a in zip(ss[:10],ds[:10],auis[:10])]) for ss,ds,auis in zip(sanity_check['sapbert_400-NN_strings'], sanity_check['sapbert_400-NN_dist'], sanity_check['sapbert_400-NN_auis'])]
sanity_check.filter(regex='^auis$|^strings$|top10|num_syms')

In [None]:
sanity_check['top10'] = [' || '.join([s + '   ' + str(d) for s,d in zip(ss[:10],ds[:10])]) for ss,ds in zip(sanity_check['pubmedbert_2000-NN_strings'], sanity_check['pubmedbert_2000-NN_dist'])]
sanity_check.filter(regex='^auis$|^strings$|top10|num_syms')

In [None]:
sanity_check['top10'] = [' || '.join([s + '   ' + str(d) for s,d in zip(ss[:10],ds[:10])]) for ss,ds in zip(sanity_check['krissbert_2000-NN_strings'], sanity_check['krissbert_2000-NN_dist'])]
sanity_check.filter(regex='^auis$|^strings$|top10|num_syms')

In [None]:
validation_df['top10'] = [s[:10] for s in validation_df['sapbert_400-NN_strings']]
validation_df['top10_auis'] = [s[:10] for s in validation_df['sapbert_400-NN_auis']]
validation_df['top10_dist'] = [s[:10] for s in validation_df['sapbert_400-NN_dist']]
validation_df['top100'] = [s[:100] for s in validation_df['sapbert_400-NN_strings']]
validation_df['top100_dist'] = [s[:100] for s in validation_df['sapbert_400-NN_dist']]

In [None]:
non_sym_validation = validation_df[validation_df['num_syms'] == 0]
non_sym_samples = []

for i,g in non_sym_validation.groupby('sem_groups'):
    print(i)
    if len(g) > 10:
        non_sym_sample = g.sample(10, random_state=np.random.RandomState(42))
        non_sym_sample = non_sym_sample.filter(regex='^auis$|^strings$|top10$|top10_')
        non_sym_sample['SemGroup'] = i
        non_sym_samples.append(non_sym_sample)

In [None]:
pd.concat(non_sym_samples)

In [None]:
errors = validation_df[validation_df['R@100_sapbert'] == 0.0]

In [None]:
errors.columns

In [None]:
errors.groupby('sem_groups').count()

In [None]:
error_samples = []

for i,g in errors.groupby('sem_groups'):
    if len(g) > 80:
        print(i)
        error_sample = g[['auis','strings','2020AA_synonyms','synonym_strings','num_syms','top10','top10_auis']].sample(10, random_state=np.random.RandomState(42))
        error_sample['SemGroup'] = i
        error_samples.append(error_sample)

In [None]:
pd.concat(error_samples)

In [None]:
distance_at_100 = []

for i,row in validation_df.iterrows():
    distances = row['sapbert_400-NN_dist']
    
    distance_at_100.append(distances[100])    
    
validation_df['sb_dist@100'] = distance_at_100

distance_100_threshold = validation_df[validation_df['num_syms'] > 0]['sb_dist@100'].mean()

In [None]:
#Calculating Recall & # of False Positives @ Distance Threshold in Validation Set

nearest_neighbors_auis = umls2020AB_df['sapbert_400-NN_auis']
nearest_neighbors_dist = umls2020AB_df['sapbert_400-NN_dist']

recall_array = []
fps = []

for true_syn, top_auis, top_dist in tqdm(zip(query_synonym_auis, nearest_neighbors_auis, nearest_neighbors_dist)):
    
    if len(true_syn) > 0:
        true_syn = set(true_syn)

        n = len(np.where(top_dist < distance_100_threshold)[0])

        topn = set(top_auis[:n])
        true_pos = topn.intersection(true_syn)

        recall_array.append(len(true_pos)/len(true_syn))
        fps.append(n-len(true_pos))
    else:
        recalls = None

        recall_array.append(recalls)

        n = len(np.where(top_dist < distance_100_threshold)[0])
        fps.append(n)

umls2020AB_df['sb_fps'] = fps
umls2020AB_df[umls2020AB_df['num_syms'] == 0].sb_fps.mean(),umls2020AB_df[umls2020AB_df['num_syms'] > 0].sb_fps.mean()

In [None]:
pd.DataFrame(recall_array).describe()

In [None]:
distance_at_100 = []

for i,row in validation_df.iterrows():
    distances = row['lexlm_2000-NN_dist']
    
    distance_at_100.append(distances[100])    
    
validation_df['lm_dist@100'] = distance_at_100

distance_100_threshold = validation_df[validation_df['num_syms'] > 0]['lm_dist@100'].mean()

In [None]:
#Calculating Recall & # of False Positives @ Distance Threshold in Validation Set

nearest_neighbors_auis = umls2020AB_df['lexlm_2000-NN_auis_x']
nearest_neighbors_dist = umls2020AB_df['lexlm_2000-NN_dist']

recall_array = []
fps = []

for true_syn, top_auis, top_dist in tqdm(zip(query_synonym_auis, nearest_neighbors_auis, nearest_neighbors_dist)):
    
    if len(true_syn) > 0:
        true_syn = set(true_syn)

        n = len(np.where(top_dist < distance_100_threshold)[0])

        topn = set(top_auis[:n])
        true_pos = topn.intersection(true_syn)

        recall_array.append(len(true_pos)/len(true_syn))
        fps.append(n-len(true_pos))
    else:
        recalls = None

        recall_array.append(recalls)

        n = len(np.where(top_dist < distance_100_threshold)[0])
        fps.append(n)

umls2020AB_df['lm_fps'] = fps
umls2020AB_df[umls2020AB_df['num_syms'] == 0].lm_fps.mean(),umls2020AB_df[umls2020AB_df['num_syms'] > 0].lm_fps.mean()

In [None]:
validation_df.columns

In [None]:
bins = []
large_bin = []

for i,g in umls2020AB_df.groupby('num_syms'):
    if i < 10:
        bins.append((i, g['R@100_SB'].mean()))
    else:
        large_bin.append(g)
        
bins.append(('10+',pd.concat(large_bin)['R@100_SB'].mean()))

In [None]:
pd.DataFrame(bins)

In [None]:
bins = []
large_bin = []

for i,g in umls2020AB_df.groupby('num_syms'):
    if i < 10:
        bins.append((i, g['R@100_LM'].mean()))
    else:
        large_bin.append(g)
        
bins.append(('10+',pd.concat(large_bin)['R@100_LM'].mean()))

In [None]:
pd.DataFrame(bins)