In [25]:
from tqdm.auto import tqdm
import itertools
import random

## load MRCONSO.RFF (and some basic preprocessing)

In [27]:
with open("2020AA/MRCONSO.RRF", "r") as f:
    lines = f.readlines()
print (len(lines))

14770662


### use only English names

In [28]:
cleaned = []
count = 0
for l in tqdm(lines):
    lst = l.rstrip("\n").split("|")
    cui, lang, synonym = lst[0], lst[1], lst[14]
    if lang != "ENG": continue # comment this out if you need all languages
    row = cui+"||"+synonym.lower()
    cleaned.append(row)
print (len(cleaned))

  0%|          | 0/14770662 [00:00<?, ?it/s]

10277246


### remove duplicates

In [29]:
print (len(cleaned))
cleaned = list(set(cleaned)) 
print (len(cleaned))

10277246
8740644


In [30]:
cleaned[:3]

['C3275247||protein alo17',
 'C1822631||scarna7 gene',
 'C2820056||babesia cf. divergens']

## add tradeneames (optional) 

Regard drug tradenames/brandnames from the relation file as synonym relations. This slightly boosts SapBERT's performance on some biomedical entity linking datasets (e.g. COMETA). MRREL.RRF can be extracted from the full UMLS release file: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsarchives04.html#2020AA.

In [32]:
# load MRCONSO.RFF
with open("2020AA/MRREL.RRF", "r") as f:
    lines = f.readlines()
print (len(lines))

83759392


In [33]:
umls_dict = {} # constrauct cui to list of name dict
for line in tqdm(cleaned_do_dup):
    cui, name = line.split("||")
    if cui in umls_dict:
        umls_dict[cui].append(name)
    else:
        umls_dict[cui] = [name]

  0%|          | 0/9255769 [00:02<?, ?it/s]

In [34]:
tradename_mappings = {}
for l in tqdm(lines):
    if "has_tradename" in l or "tradename_of" in l:
        cells =l.split("|")
        head, tail = cells[0], cells[4]
        try: # if in CUI
            sfs = umls_dict[tail]
            tradename_mappings[head] = sfs
        except:
            continue
print (len(tradename_mappings))

  0%|          | 0/83759392 [00:00<?, ?it/s]

133027


In [35]:
# add tradenames
print (len(cleaned))
for cui,synonyms in tradename_mappings.items():
    for s in synonyms:
        row = cui+"||"+ s.lower()
        cleaned.append(row)
print (len(cleaned))

8740644
9744537


### remove duplications, again

In [37]:
print (len(cleaned))
cleaned_do_dup = list(set(cleaned))
print (len(list(set(cleaned_do_dup))))

9744537
9741836


## positive pairs generation

In [38]:
umls_dict = {} # constrauct cui to list of name dict, again
for line in tqdm(cleaned_do_dup):
    cui, name = line.split("||")
    if cui in umls_dict:
        umls_dict[cui].append(name)
    else:
        umls_dict[cui] = [name]

  0%|          | 0/9741836 [00:00<?, ?it/s]

### generate!

In [39]:
def gen_pairs(input_list):
    return list(itertools.combinations(input_list, r=2))

In [40]:
gen_pairs([1,2,3]) # test

[(1, 2), (1, 3), (2, 3)]

In [41]:
pos_pairs = []
for k,v in tqdm(umls_dict.items()):
    pairs = gen_pairs(v)
    if len(pairs)>50: # if >50 pairs, then trim to 50 pairs
        pairs = random.sample(pairs, 50)
    for p in pairs:
        line = str(k) + "||" + p[0] + "||" + p[1]
        pos_pairs.append(line)

  0%|          | 0/4231008 [00:00<?, ?it/s]

In [42]:
print (len(pos_pairs))

11800791


In [43]:
pos_pairs[:3]

['C3275247||protein alo17||alk lymphoma oligomerization partner on chromosome 17',
 'C3275247||protein alo17||rnf213',
 'C3275247||protein alo17||ring finger protein 213']

### save the pairwise positive training file

In [44]:
with open('./training_file_umls2020aa_en_uncased_no_dup_pairwise_pair_th50.txt', 'w') as f:
    for line in pos_pairs:
        f.write("%s\n" % line)