In [1]:
mr_data = {
    "mrrel":{
        "cols":['CUI1', 'AUI1', 'STYPE1', 'REL', 'CUI2', 'AUI2', 'STYPE2', 'RELA', 'RUI', 'SRUI', 'SAB', 'SL', 'RG', 'DIR', 'SUPPRESS', 'CVF'],
        "not-required-cols": ['STYPE1', 'SL', 'RG', 'DIR', 'SUPPRESS', 'CVF'],
        "path": "../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRREL.RRF"
    },
    "mrconso":{
        "cols":['CUI', 'LAT', 'TS', 'LUI', 'STT', 'SUI', 'ISPREF', 'AUI', 'SAUI', 'SCUI', 'SDUI', 'SAB', 
                'TTY', 'CODE', 'STR', 'SRL', 'SUPPRESS', 'CVF'],
        "not-required-cols": ['TS', 'LUI', 'STT', 'SUI', 'ISPREF','TTY', 'SCUI', 'SDUI', 'CODE', 'SRL', 'SUPPRESS', 'CVF'],
        "path": "../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRCONSO.RRF"
    },
    "mrsty":{
        "cols":['CUI', 'TUI', 'STN', 'STY','ATUI', 'CVF'],
        "not-required-cols": ['CVF'],
        "path": "../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRSTY.RRF"
    },
    "mrsat":{
        "cols":['CUI', 'LUI', 'SUI', 'METAUI', 'STYPE', 'CODE','ATUI', 'SATUI', 'ATN', 'SAB', 'ATV', 'SUPPRESS','CVF'],
        "not-required-cols": ['SUPPRESS', 'CVF'],
        "path": "../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRSAT.RRF"
    }
    

}

# Defnitions

In [2]:
from tqdm.notebook import tqdm
import pandas as pd
from collections import defaultdict
from pprint import pprint
import pandas as pd
import json
import os

def load_print_columns(path, columns_name, not_required_cols, demo):
    df_data_dict = []
    false_line = 0
    with open(path, 'r') as file_obj:
        for index_line, line in tqdm(enumerate(file_obj)):
            requireds, line_status = {}, True
            
            for index_column, line_item in enumerate(line.split("|")[:-1]):
                if columns_name[index_column] not in not_required_cols:
                    requireds[columns_name[index_column]] = line_item
                    # if line_item == "":
                    #     line_status = False
            if line_status:    
                if demo:
                    df_data_dict.append(list(requireds.values()))
                    if index_line == 10:
                        df = pd.DataFrame(df_data_dict, columns=[column_name 
                                                                 for column_name in columns_name 
                                                                 if column_name not in not_required_cols])
                        display(df.head(4))
                        return df

                else:
                    yield requireds
            else:
                false_line += 1
    print("False lines:", false_line)

def get_generator(data):
    dataset = load_print_columns(mr_data[data]['path'], 
                             mr_data[data]['cols'], 
                             mr_data[data]['not-required-cols'], 
                             demo=False)
    return dataset

import matplotlib.pyplot as plt


def make_bar_plot(X, Y, figsize, title):
    fig = plt.figure(figsize=figsize)
    ax = fig.add_axes([0,0,1,1])
    ax.bar(X, Y)
    ax.set_title(title)
    plt.xticks(rotation=90)
    plt.show()

# 1. First Glance on MRREL, MRCONSO, and MRSTY files

In [5]:
for data in mr_data.keys():
    print(f"WORKING‌‌ ON‌‌:: {data.upper()}")
    
    print(*load_print_columns(mr_data[data]['path'], 
                              mr_data[data]['cols'], 
                              mr_data[data]['not-required-cols'], 
                              demo=True))
    
    print(next(load_print_columns(mr_data[data]['path'], 
                                  mr_data[data]['cols'], 
                                  mr_data[data]['not-required-cols'], 
                                  demo=False)))
    
    print("-"*100)

WORKING‌‌ ON‌‌:: MRREL
../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRREL.RRF


0it [00:00, ?it/s]

False lines: 0

../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRREL.RRF


0it [00:00, ?it/s]

False lines: 0


StopIteration: 

# 2. create UMLS_feb_skiped_bad_lines.tsv

In [6]:
sabs_to_consider = ['NCI', 'SNOMEDCT_US', 'MEDCIN']

In [7]:
print("Getting SAB and STRs from MRCONSO-ENG...")
sab_names = {}
for conso in tqdm(get_generator(data = 'MRCONSO'.lower())):
    if conso['LAT'] == 'ENG' and conso['SAB'] in sabs_to_consider:
        sab_names[conso['CUI']+"-"+conso['AUI']] = f"{conso['SAB']}@{conso['STR']}"

Getting SAB and STRs from MRCONSO-ENG...


0it [00:00, ?it/s]

../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRCONSO.RRF


0it [00:00, ?it/s]

False lines: 0


In [8]:
list(sab_names.keys())[0], sab_names[list(sab_names.keys())[0]]

IndexError: list index out of range

In [9]:

print("Getting CONSO-REL rels and ents based on intersections....")
conso_rel_intersection_rels_ent = defaultdict(int)
dataset = get_generator(data = 'MRREL'.lower())
sep = "\t"
for sample in tqdm(dataset):
    found = {}
    head, tail = False, False
    if sab_names.get(sample['CUI1']+"-"+sample['AUI1'], "NA") != "NA":
        found['head'] = f"{sep}".join(sab_names[sample['CUI1']+"-"+sample['AUI1']].split("@"))
        head = True
    if sab_names.get(sample['CUI2']+"-"+sample['AUI2'], "NA") != "NA":
        found['tail'] = f"{sep}".join(sab_names[sample['CUI2']+"-"+sample['AUI2']].split("@"))
        tail = True 
    if head and tail:
        # file.write(f"{sample['CUI1']}{sep}{sample['CUI2']}{sep}{sample['RELA']}{sep}{found['head']}{sep}{found['tail']}\n")
        conso_rel_intersection_rels_ent[f"{sample['CUI1']}{sep}{sample['AUI1']}{sep}{sample['RUI']}{sep}{sample['CUI2']}{sep}{sample['AUI2']}{sep}{sample['RELA']}{sep}{found['head']}{sep}{found['tail']}"] += 1

Getting CONSO-REL rels and ents based on intersections....


0it [00:00, ?it/s]

../../datasets/TaskA/UMLS/raw/umls-2022AB-metathesaurus-full/2022AB/META/MRREL.RRF


0it [00:00, ?it/s]

False lines: 0


In [10]:
print(f"Number of unique samples that avaliable in both MRCONSO-ENG and MRREL:{len(conso_rel_intersection_rels_ent)}")
print(f"Number of apperances of samples in both MRCONSO-ENG and MRREL:{sum(conso_rel_intersection_rels_ent.values())}")

with open("../../datasets/TaskA/UMLS/processed-3/UMLS_feb.tsv", "a") as file:
    for line in tqdm(conso_rel_intersection_rels_ent.keys()):
        file.write(str(line)+"\n")

Number of unique samples that avaliable in both MRCONSO-ENG and MRREL:0
Number of apperances of samples in both MRCONSO-ENG and MRREL:0


FileNotFoundError: [Errno 2] No such file or directory: '../../datasets/TaskA/UMLS/processed-3/UMLS_feb.tsv'

In [11]:
df = pd.read_csv("../../datasets/TaskA/UMLS/processed-3/UMLS_feb.tsv", sep="\t", on_bad_lines='skip',
                 names=["CUI1", "AUI1", "RUI", "CUI2", "AUI2", "RELA", "SAB-CUI1", "STR-CUI1", "SAB-CUI2", "STR-CUI2"])

print(f"Previous shape of UMLS initial dataset was: 11455094")
print(f"New size of UMLS is: {df.shape[0]}")
print(f"Bad lines that skiped:‌ {11455094-df.shape[0]}")

FileNotFoundError: [Errno 2] No such file or directory: '../../datasets/TaskA/UMLS/processed-3/UMLS_feb.tsv'

In [None]:
df.to_csv("../../datasets/TaskA/UMLS/processed-3/UMLS_feb_skiped_bad_lines.tsv", index=False)

# 3. Generate Ontos

In [3]:
# 'ATUI': 'AT17648347',
#  'CUI': 'C0000005',
#  'STN': 'A1.4.1.2.1.7',
#  'STY': 'Amino Acid, Peptide, or Protein',
#  'TUI': 'T116'}

df = pd.read_csv("../../datasets/TaskA/UMLS/processed-3/UMLS_feb_skiped_bad_lines.tsv")

In [5]:
df.head(5)

Unnamed: 0,CUI1,AUI1,RUI,CUI2,AUI2,RELA,SAB-CUI1,STR-CUI1,SAB-CUI2,STR-CUI2
0,C0000039,A22817493,R20277339,C0031610,A2941532,inverse_isa,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,SNOMEDCT_US,Phosphatidic acid
1,C0000039,A22817493,R13958460,C0523614,A3120482,has_component,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,SNOMEDCT_US,Dipalmitoylphosphatidylcholine measurement
2,C0000039,A22817493,R141817308,C0523614,A3120482,has_measured_component,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,SNOMEDCT_US,Dipalmitoylphosphatidylcholine measurement
3,C0000039,A22817493,R142021020,C0216971,A3008866,same_as,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,SNOMEDCT_US,Colfosceril palmitate
4,C0000052,A27769867,R141668442,C0019495,A3493340,inverse_isa,SNOMEDCT_US,"1,4-alpha-Glucan branching enzyme",SNOMEDCT_US,Hexosyltransferase


## 3.1 NCI

In [41]:
# ['NCI', 'SNOMEDCT_US', 'MEDCIN']
sab_to_consider = 'NCI'

sab_rel_df = df[df['SAB-CUI1'].isin([sab_to_consider]) & df['SAB-CUI2'].isin([sab_to_consider])].reset_index()
sab_ents_list = sab_rel_df['CUI1'].tolist() + sab_rel_df['CUI2'].tolist()
sab_ents_dict = {sab:"OK" for sab in list(set(sab_ents_list))}

entity_type_dict = {}

for mrsty in tqdm(get_generator(data = 'MRSTY'.lower())):
    if sab_ents_dict.get(mrsty['CUI'], "NA") != "NA":
        if mrsty['CUI'] in entity_type_dict.get(mrsty['CUI'], "NA") != "NA":
            entity_type_dict[mrsty['CUI']][0].append(mrsty['TUI'])
            entity_type_dict[mrsty['CUI']][1].append(mrsty['STN'])
            entity_type_dict[mrsty['CUI']][2].append(mrsty['STY'])
        else:
            entity_type_dict[mrsty['CUI']] = [[mrsty['TUI']], [mrsty['STN']], [mrsty['STY']]]

print(f"\nOut of {len(sab_ents_dict)} Unique CUIs for '{sab_to_consider}' we found Entity types for {len(entity_type_dict)}'s")

print("\nA few samples from unique_cuis:", list(sab_ents_dict.keys())[:5])

print(f"\nExamples: {list(sab_ents_dict.keys())[10]}: {entity_type_dict[list(sab_ents_dict.keys())[10]]}")


stn2str, tui2str = {}, {}
tui2stn, tuis, stns = {}, [], []

for cui, items in entity_type_dict.items():
    tui, stn, string = items[0], items[1], items[2]
    for item1, item2, item3 in zip(tui, stn, string):
        if stn2str.get(item2, "NA") != "NA":
            if stn2str.get(item2) != item3:
                print(f"conflict-stn2str: {item1}, {item2}, {item3} and ==> {stn2str[item2]}")
        else:
            stn2str[item2] = item3
            
        if tui2str.get(item1, "NA") != "NA":
            if tui2str.get(item1) != item3:
                print(f"conflict-tui2str: {item1}, {item2}, {item3} and ==> {tui2str[item1]}")
        else:
            tui2str[item1] = item3 
            
        if tui2stn.get(item1, "NA") != "NA":
            if tui2stn.get(item1) != item2:
                print(f"conflict-tui2stn: {item1}, {item2}, {item3} and ==> {tui2stn[item1]}")
        else:
            tui2stn[item1] = item2 
        
        
        
        tuis.append(item1)
        stns.append(item2)

assert len(stn2str) == len(tui2str)
assert len(tui2str) == len(tui2stn)
assert len(tui2str) == len(set(tuis))
assert len(set(tuis)) == len(set(stns))

final_json = {
    "SAB": sab_to_consider,
    "STN2STR": stn2str,
    "TUI2STR": tui2str,
    "TUIs": list(set(tuis)),
    "STNs": list(set(stns)),
    "TUI2STN": tui2stn
}

with open(sab_to_consider+"_hierarchy.json", "w", encoding="utf-8") as outfile:
     json.dump(final_json, outfile, indent=4, ensure_ascii=False)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

False lines: 0

Out of 172538 Unique CUIs for 'NCI' we found Entity types for 172538's

A few samples from unique_cuis: ['C0796615', 'C0298456', 'C4527089', 'C0458924', 'C0855054']

Examples: C0728966: [['T121'], ['A1.4.1.1.1'], ['Pharmacologic Substance']]


## 3.2 MEDCIN

In [9]:
# ['NCI', 'SNOMEDCT_US', 'MEDCIN']
sab_to_consider = 'MEDCIN'

sab_rel_df = df[df['SAB-CUI1'].isin([sab_to_consider]) & df['SAB-CUI2'].isin([sab_to_consider])].reset_index()
sab_ents_list = sab_rel_df['CUI1'].tolist() + sab_rel_df['CUI2'].tolist()
sab_ents_dict = {sab:"OK" for sab in list(set(sab_ents_list))}

entity_type_dict = {}

for mrsty in tqdm(get_generator(data = 'MRSTY'.lower())):
    if sab_ents_dict.get(mrsty['CUI'], "NA") != "NA":
        if mrsty['CUI'] in entity_type_dict.get(mrsty['CUI'], "NA") != "NA":
            entity_type_dict[mrsty['CUI']][0].append(mrsty['TUI'])
            entity_type_dict[mrsty['CUI']][1].append(mrsty['STN'])
            entity_type_dict[mrsty['CUI']][2].append(mrsty['STY'])
        else:
            entity_type_dict[mrsty['CUI']] = [[mrsty['TUI']], [mrsty['STN']], [mrsty['STY']]]

print(f"\nOut of {len(sab_ents_dict)} Unique CUIs for '{sab_to_consider}' we found Entity types for {len(entity_type_dict)}'s")

print("\nA few samples from unique_cuis:", list(sab_ents_dict.keys())[:5])

print(f"\nExamples: {list(sab_ents_dict.keys())[10]}: {entity_type_dict[list(sab_ents_dict.keys())[10]]}")


stn2str, tui2str = {}, {}
tui2stn, tuis, stns = {}, [], []

for cui, items in entity_type_dict.items():
    tui, stn, string = items[0], items[1], items[2]
    for item1, item2, item3 in zip(tui, stn, string):
        if stn2str.get(item2, "NA") != "NA":
            if stn2str.get(item2) != item3:
                print(f"conflict-stn2str: {item1}, {item2}, {item3} and ==> {stn2str[item2]}")
        else:
            stn2str[item2] = item3
            
        if tui2str.get(item1, "NA") != "NA":
            if tui2str.get(item1) != item3:
                print(f"conflict-tui2str: {item1}, {item2}, {item3} and ==> {tui2str[item1]}")
        else:
            tui2str[item1] = item3 
            
        if tui2stn.get(item1, "NA") != "NA":
            if tui2stn.get(item1) != item2:
                print(f"conflict-tui2stn: {item1}, {item2}, {item3} and ==> {tui2stn[item1]}")
        else:
            tui2stn[item1] = item2 
        
        
        
        tuis.append(item1)
        stns.append(item2)

assert len(stn2str) == len(tui2str)
assert len(tui2str) == len(tui2stn)
assert len(tui2str) == len(set(tuis))
assert len(set(tuis)) == len(set(stns))

final_json = {
    "SAB": sab_to_consider,
    "STN2STR": stn2str,
    "TUI2STR": tui2str,
    "TUIs": list(set(tuis)),
    "STNs": list(set(stns)),
    "TUI2STN": tui2stn
}

with open(sab_to_consider+"_hierarchy.json", "w", encoding="utf-8") as outfile:
     json.dump(final_json, outfile, indent=4, ensure_ascii=False)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

False lines: 0

Out of 377587 Unique CUIs for 'MEDCIN' we found Entity types for 377587's

A few samples from unique_cuis: ['C2087929', 'C2139321', 'C2182110', 'C2012112', 'C2077859']

Examples: C2123362: [['T034'], ['A2.2.1'], ['Laboratory or Test Result']]


## 3.3 SNOMEDCT_US

In [10]:
# ['NCI', 'SNOMEDCT_US', 'MEDCIN']
sab_to_consider = 'SNOMEDCT_US'

sab_rel_df = df[df['SAB-CUI1'].isin([sab_to_consider]) & df['SAB-CUI2'].isin([sab_to_consider])].reset_index()
sab_ents_list = sab_rel_df['CUI1'].tolist() + sab_rel_df['CUI2'].tolist()
sab_ents_dict = {sab:"OK" for sab in list(set(sab_ents_list))}

entity_type_dict = {}

for mrsty in tqdm(get_generator(data = 'MRSTY'.lower())):
    if sab_ents_dict.get(mrsty['CUI'], "NA") != "NA":
        if mrsty['CUI'] in entity_type_dict.get(mrsty['CUI'], "NA") != "NA":
            entity_type_dict[mrsty['CUI']][0].append(mrsty['TUI'])
            entity_type_dict[mrsty['CUI']][1].append(mrsty['STN'])
            entity_type_dict[mrsty['CUI']][2].append(mrsty['STY'])
        else:
            entity_type_dict[mrsty['CUI']] = [[mrsty['TUI']], [mrsty['STN']], [mrsty['STY']]]

print(f"\nOut of {len(sab_ents_dict)} Unique CUIs for '{sab_to_consider}' we found Entity types for {len(entity_type_dict)}'s")

print("\nA few samples from unique_cuis:", list(sab_ents_dict.keys())[:5])

print(f"\nExamples: {list(sab_ents_dict.keys())[10]}: {entity_type_dict[list(sab_ents_dict.keys())[10]]}")


stn2str, tui2str = {}, {}
tui2stn, tuis, stns = {}, [], []

for cui, items in entity_type_dict.items():
    tui, stn, string = items[0], items[1], items[2]
    for item1, item2, item3 in zip(tui, stn, string):
        if stn2str.get(item2, "NA") != "NA":
            if stn2str.get(item2) != item3:
                print(f"conflict-stn2str: {item1}, {item2}, {item3} and ==> {stn2str[item2]}")
        else:
            stn2str[item2] = item3
            
        if tui2str.get(item1, "NA") != "NA":
            if tui2str.get(item1) != item3:
                print(f"conflict-tui2str: {item1}, {item2}, {item3} and ==> {tui2str[item1]}")
        else:
            tui2str[item1] = item3 
            
        if tui2stn.get(item1, "NA") != "NA":
            if tui2stn.get(item1) != item2:
                print(f"conflict-tui2stn: {item1}, {item2}, {item3} and ==> {tui2stn[item1]}")
        else:
            tui2stn[item1] = item2 
        
        
        
        tuis.append(item1)
        stns.append(item2)

assert len(stn2str) == len(tui2str)
assert len(tui2str) == len(tui2stn)
assert len(tui2str) == len(set(tuis))
assert len(set(tuis)) == len(set(stns))

final_json = {
    "SAB": sab_to_consider,
    "STN2STR": stn2str,
    "TUI2STR": tui2str,
    "TUIs": list(set(tuis)),
    "STNs": list(set(stns)),
    "TUI2STN": tui2stn
}

with open(sab_to_consider+"_hierarchy.json", "w", encoding="utf-8") as outfile:
     json.dump(final_json, outfile, indent=4, ensure_ascii=False)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

False lines: 0

Out of 417172 Unique CUIs for 'SNOMEDCT_US' we found Entity types for 417172's

A few samples from unique_cuis: ['C2711921', 'C0228062', 'C0230393', 'C1276816', 'C3204456']

Examples: C2960408: [['T060'], ['B1.3.1.2'], ['Diagnostic Procedure']]
