In [1]:
from tqdm.auto import tqdm
import itertools
import random

## load MRCONSO.RFF (and some basic preprocessing)

In [2]:
with open("/home/amfierens/Documents/Researcher/Text corpora/UMLS_metathesaurus/MRCONSO.RRF", "r") as f:
    lines = f.readlines()
print (len(lines))
print(lines[0])

17213334
C0000005|ENG|P|L0000005|PF|S0007492|Y|A26634265||M0019694|D012711|MSH|PEP|D012711|(131)I-Macroaggregated Albumin|0|N|256|



### use only English names

In [10]:
cleaned = []
count = 0
for l in tqdm(lines):
    lst = l.rstrip("\n").split("|")
    cui, lang, synonym = lst[0], lst[1], lst[14]
    if lang != "ENG": continue # comment this out if you need all languages
    row = cui+"||"+synonym.lower()
    cleaned.append(row)
print (len(cleaned))

  0%|          | 0/17213334 [00:00<?, ?it/s]

11919321


### remove duplicates

In [11]:
print (len(cleaned))
cleaned = list(set(cleaned)) 
print (len(cleaned))

11919321
9817186


In [12]:
cleaned[:3]

['C3797002||astragalus aureus willd.',
 'C0556734||physiological mobilization of the ankle',
 'C1084015||blennechis punctatus']

## add tradeneames (optional) 

Regard drug tradenames/brandnames from the relation file as synonym relations. This slightly boosts SapBERT's performance on some biomedical entity linking datasets (e.g. COMETA). MRREL.RRF can be extracted from the full UMLS release file: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsarchives04.html#2020AA.

In [7]:
# load MRCONSO.RFF
with open("2020AA/MRREL.RRF", "r") as f:
    lines = f.readlines()
print (len(lines))

FileNotFoundError: [Errno 2] No such file or directory: '2020AA/MRREL.RRF'

In [33]:
umls_dict = {} # constrauct cui to list of name dict
for line in tqdm(cleaned_do_dup):
    cui, name = line.split("||")
    if cui in umls_dict:
        umls_dict[cui].append(name)
    else:
        umls_dict[cui] = [name]

  0%|          | 0/9255769 [00:02<?, ?it/s]

In [13]:
tradename_mappings = {}
for l in tqdm(lines):
    if "has_tradename" in l or "tradename_of" in l:
        cells =l.split("|")
        head, tail = cells[0], cells[4]
        try: # if in CUI
            sfs = umls_dict[tail]
            tradename_mappings[head] = sfs
        except:
            continue
print (len(tradename_mappings))

  0%|          | 0/17213334 [00:00<?, ?it/s]

0


In [14]:
# add tradenames
print (len(cleaned))
for cui,synonyms in tradename_mappings.items():
    for s in synonyms:
        row = cui+"||"+ s.lower()
        cleaned.append(row)
print (len(cleaned))

9817186
9817186


### remove duplications, again

In [15]:
print (len(cleaned))
cleaned_do_dup = list(set(cleaned))
print (len(list(set(cleaned_do_dup))))

9817186
9817186


## positive pairs generation

In [16]:
umls_dict = {} # constrauct cui to list of name dict, again
for line in tqdm(cleaned_do_dup):
    cui, name = line.split("||")
    if cui in umls_dict:
        umls_dict[cui].append(name)
    else:
        umls_dict[cui] = [name]

  0%|          | 0/9817186 [00:00<?, ?it/s]

### generate!

In [17]:
def gen_pairs(input_list):
    return list(itertools.combinations(input_list, r=2))

In [18]:
gen_pairs([1,2,3]) # test

[(1, 2), (1, 3), (2, 3)]

In [19]:
pos_pairs = []
for k,v in tqdm(umls_dict.items()):
    pairs = gen_pairs(v)
    if len(pairs)>50: # if >50 pairs, then trim to 50 pairs
        pairs = random.sample(pairs, 50)
    for p in pairs:
        line = str(k) + "||" + p[0] + "||" + p[1]
        pos_pairs.append(line)

  0%|          | 0/4661936 [00:00<?, ?it/s]

In [20]:
print (len(pos_pairs))

11777512


In [21]:
pos_pairs[:3]

['C3797002||astragalus aureus willd.||astragalus aureus',
 'C0556734||physiological mobilization of the ankle||physiological mobilisatn-ankle',
 'C0556734||physiological mobilization of the ankle||physiological mobilization of the ankle (procedure)']

### save the pairwise positive training file

In [24]:
with open('/home/amfierens/Documents/Researcher/Text corpora/UMLS_metathesaurus/training_file_umls2020aa_en_uncased_no_dup_pairwise_pair_th50.txt', 'w') as f:
    for line in pos_pairs:
        f.write("%s\n" % line)