### Load concepts and create mappings

In [1]:
# Load MRCONSO.RRF into memory, one list element per row of the file.
# NOTE(review): BASE_UMLS_DIR is a machine-specific absolute path; point it at
# your own UMLS release before running.
BASE_UMLS_DIR = "/home/zm324/workspace/data/kgs/umls/2020AB/"
MRCONSO = BASE_UMLS_DIR + "META/MRCONSO.RRF"  # path to the MRCONSO.RRF file
# Decode explicitly as UTF-8 so the result does not depend on the locale's
# default encoding (UMLS RRF files are distributed as UTF-8).
with open(MRCONSO, "r", encoding="utf-8") as f:
    lines = f.readlines()

In [2]:
from tqdm.auto import tqdm

# Lookup tables built from the pipe-delimited rows held in `lines`:
#   atom_string_map: field 7 (atom id)    -> field 14 (string)
#   cuid_string_map: field 0 (concept id) -> field 14 (string)
atom_string_map = {}
cuid_string_map = {}

for row in tqdm(lines):
    fields = row.split("|")
    concept_id, atom_id, text = fields[0], fields[7], fields[14]
    atom_string_map[atom_id] = text
    # Later rows overwrite earlier ones, so each concept id ends up with the
    # string of its last row in the file.
    # NOTE(review): that last row is not necessarily the preferred (or English)
    # name for the concept -- confirm this is intended.
    cuid_string_map[concept_id] = text
len(atom_string_map)

HBox(children=(FloatProgress(value=0.0, max=15938386.0), HTML(value='')))




15938386

### Read relations using additional labels

In [3]:
# Load MRREL.RRF (the relations file) from the same UMLS release directory.
MRREL = BASE_UMLS_DIR + "META/MRREL.RRF"
# NOTE(review): in the cells visible in this notebook, `lines` is not read
# again after this point (the relations are re-read with pandas), so this
# full in-memory copy looks redundant -- confirm before removing it.
# Decode explicitly as UTF-8 so the result does not depend on the locale's
# default encoding.
with open(MRREL, "r", encoding="utf-8") as f:
    lines = f.readlines()

In [None]:
import pandas as pd

# Column names for the relations table, written as one "|"-separated string;
# the surrounding whitespace is stripped from each name below.
header_str = "CUI1 | AUI1 | STYPE1 | REL | CUI2 | AUI2 | STYPE2 | RELA | RUI | SRUI | SAB | SL | RG | DIR | SUPPRESS |CVF"
headers = list(map(str.strip, header_str.split("|")))

# Positional column index -> column name.
col_map = dict(enumerate(headers))

# The file has no header row, so pandas labels the columns 0..N-1; rename the
# ones that have an entry in col_map and leave any others untouched.
# NOTE(review): RRF rows usually end with a trailing "|", which would yield one
# extra, empty column that keeps its integer label -- confirm.
MRREL_pd = pd.read_csv(MRREL, sep="|", header=None).rename(columns=col_map)

In [None]:
# Keep only the relations whose source (SAB) is SNOMEDCT_US.
MRREL_pd_Snomed = MRREL_pd[MRREL_pd["SAB"] == "SNOMEDCT_US"]
# Of those, keep only rows whose REL is "RO" (the general relations).
# Take an explicit copy: downstream code adds columns to this frame, and
# writing into a filter-of-a-filter triggers pandas' SettingWithCopyWarning.
MRREL_pd_Snomed_RO = MRREL_pd_Snomed[MRREL_pd_Snomed["REL"] == "RO"].copy()

In [None]:
# Attach a display string to both endpoints of every relation and replace the
# underscores in the relation label with spaces.
# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
MRREL_pd_Snomed_RO = MRREL_pd_Snomed_RO.assign(
    # Strict lookups: a CUI that is absent from cuid_string_map raises KeyError.
    STR1=MRREL_pd_Snomed_RO["CUI1"].map(lambda cuid: cuid_string_map[cuid]),
    STR2=MRREL_pd_Snomed_RO["CUI2"].map(lambda cuid: cuid_string_map[cuid]),
    # The .str accessor leaves a missing RELA as NaN, where calling
    # str.replace on the raw value would raise AttributeError on a float.
    # NOTE(review): confirm whether rows with an empty RELA occur in this
    # subset and whether they should be dropped instead.
    RELA=MRREL_pd_Snomed_RO["RELA"].str.replace("_", " ", regex=False),
)
# (head string, relation label, tail string) view of the graph.
kg_pd_snomed_ro = MRREL_pd_Snomed_RO[["STR1", "RELA", "STR2"]]

### Create new Ids for entities and relations

In [None]:
# Entity vocabulary: every distinct string that occurs as a head (STR1) OR a
# tail (STR2). Building it from STR1 alone makes the eid2 lookup below raise
# KeyError for any entity that only ever appears as a tail.
# STR1 values are listed first, so entities that occur as a head keep the ids
# they would have had with a head-only vocabulary; tail-only entities are
# appended after them.
entity_list = pd.concat(
    [kg_pd_snomed_ro["STR1"], kg_pd_snomed_ro["STR2"]], ignore_index=True
).unique()
id2entity = dict(enumerate(entity_list))
entity2id = {name: idx for idx, name in id2entity.items()}

# Relation vocabulary: distinct relation labels in order of first appearance.
rel_list = kg_pd_snomed_ro["RELA"].unique()
id2rel = dict(enumerate(rel_list))
rel2id = {name: idx for idx, name in id2rel.items()}

# Add the integer ids next to the strings. `assign` returns a new frame rather
# than writing into a filtered slice (avoids SettingWithCopyWarning).
# Ids are keyed by string, so two rows with the same string share one id.
MRREL_pd_Snomed_RO = MRREL_pd_Snomed_RO.assign(
    eid1=MRREL_pd_Snomed_RO["STR1"].map(lambda ent_str: entity2id[ent_str]),
    eid2=MRREL_pd_Snomed_RO["STR2"].map(lambda ent_str: entity2id[ent_str]),
    rid=MRREL_pd_Snomed_RO["RELA"].map(lambda rel_str: rel2id[rel_str]),
)

### Save to files

In [None]:
OUT_DIR = "./"  # OUTPUT DIR; must end with a path separator (file names are appended directly)

# Each file starts with a line holding its record count, followed by one
# tab-separated record per line.
# The files are written as UTF-8 explicitly: the entity names come from UMLS
# text and may contain non-ASCII characters, which would fail (or be encoded
# differently) under a non-UTF-8 locale default.

# entity2id.txt: "<entity string>\t<entity id>"
with open(OUT_DIR + "entity2id.txt", "w", encoding="utf-8") as f:
    f.write(f"{len(entity2id)}\n")
    for k, v in entity2id.items():
        f.write(f"{k}\t{v}\n")

# relation2id.txt: "<relation label>\t<relation id>"
with open(OUT_DIR + "relation2id.txt", "w", encoding="utf-8") as f:
    f.write(f"{len(rel2id)}\n")
    for k, v in rel2id.items():
        f.write(f"{k}\t{v}\n")

# train2id.txt: "<head id>\t<tail id>\t<relation id>"
with open(OUT_DIR + "train2id.txt", "w", encoding="utf-8") as f:
    f.write(f"{len(MRREL_pd_Snomed_RO.index)}\n")
    # Iterate the three id columns directly; iterrows() would build a Series
    # for every row just to read three values from it.
    triples = zip(
        MRREL_pd_Snomed_RO["eid1"],
        MRREL_pd_Snomed_RO["eid2"],
        MRREL_pd_Snomed_RO["rid"],
    )
    for head_id, tail_id, rel_id in triples:
        f.write(f"{head_id}\t{tail_id}\t{rel_id}\n")