### Load triples from the raw files

In [1]:
import pandas as pd
from tqdm.auto import tqdm

umls_folder = "/home/zm324/workspace/data/kgs/umls/2021AA"
random_state = 2021
sampled_num = 1000

### Load the final selected relation list

In [2]:
selected_rels = []
with open("final_19_relation.txt") as f:
    lines = f.readlines()
    for l in tqdm(lines):
        selected_rels.append(l.strip())
selected_rels

  0%|          | 0/19 [00:00<?, ?it/s]

['disease_has_normal_tissue_origin',
 'disease_has_normal_cell_origin',
 'disease_may_have_molecular_abnormality',
 'disease_has_associated_anatomic_site',
 'gene_product_has_associated_anatomy',
 'gene_product_has_biochemical_function',
 'may_prevent',
 'disease_may_have_associated_disease',
 'gene_associated_with_disease',
 'disease_mapped_to_gene',
 'disease_has_abnormal_cell',
 'gene_product_plays_role_in_biological_process',
 'occurs_after',
 'has_physiologic_effect',
 'may_treat',
 'gene_product_encoded_by_gene',
 'gene_encodes_gene_product',
 'associated_morphology_of',
 'disease_may_have_finding']

In [3]:
# load MRCONSO.RFF
atom_string_map = {}
cuid_string_map = {}
with open(f"{umls_folder}/META/MRCONSO.RRF", "r") as f:
    lines = f.readlines()

    for l in tqdm(lines):
        cells = l.split("|")
        cuid = cells[0]
        aid = cells[7]
        string = cells[14]
        atom_string_map[aid] = string
        cuid_string_map[cuid] = string
print("concepts num:", len(lines))

  0%|          | 0/16132274 [00:00<?, ?it/s]

concepts num: 16132274


## Read relations using additional labels

In [4]:
concept_file = f"{umls_folder}/META/MRREL.RRF"
col_names = [
    i.strip()
    for i in "CUI1 | AUI1 | STYPE1 | REL | CUI2 | AUI2 | STYPE2 | RELA | RUI | SRUI | SAB | SL | RG | DIR | SUPPRESS |CVF".split(
        "|"
    )
] + [""]
rel_df = pd.read_csv(
    concept_file, sep="|", header=None, names=col_names, low_memory=False
)

In [5]:
rel_df.groupby(["REL"]).size().sort_values(ascending=False).to_csv(
    "MRREL.REL.SIZE", sep="|", header=None
)
rel_df.groupby(["RELA"]).size().sort_values(ascending=False).to_csv(
    "MRREL.RELA.SIZE", sep="|", header=None
)
rel_df.groupby(["SAB"]).size().sort_values(ascending=False).to_csv(
    "MRREL.SAB.SIZE", sep="|", header=None
)

In [6]:
rel_df.groupby(["REL"])["RELA"].nunique().sort_values(ascending=False).to_csv(
    "MRREL.REL.RELA.SIZE", sep="|", header=None
)

In [7]:
rel_df = rel_df[rel_df.RELA.isin(selected_rels)]
rel_df = rel_df[rel_df.SAB.isin(["SNOMEDCT_US", "NCI", "MED-RT"])]
print(rel_df.RELA.nunique(), len(rel_df.index))

19 558730


In [8]:
rel_df["STR1"] = rel_df.apply(
    lambda x: atom_string_map[x.AUI1]
    if x.AUI1 in atom_string_map
    else cuid_string_map[x.CUI1],
    axis=1,
)
rel_df["STR2"] = rel_df.apply(
    lambda x: atom_string_map[x.AUI2]
    if x.AUI2 in atom_string_map
    else cuid_string_map[x.CUI2],
    axis=1,
)

In [9]:
final_df = rel_df.rename(
    columns={
        "STR2": "head_name",
        "CUI2": "head_cui",
        "RELA": "rel",
        "STR1": "tail_name",
        "CUI1": "tail_cui",
    }
)[["head_name", "head_cui", "rel", "tail_name", "tail_cui"]]
final_df.head()

Unnamed: 0,head_name,head_cui,rel,tail_name,tail_cui
20853,Swelling,C0038999,associated_morphology_of,Swollen abdomen,C0000731
20854,Swelling,C0038999,associated_morphology_of,Swollen abdomen,C0000731
20855,Swelling,C0038999,associated_morphology_of,Swollen abdomen,C0000731
21701,Mass,C0577559,associated_morphology_of,Abdominal mass,C0000734
21702,Mass,C0577559,associated_morphology_of,Abdominal mass,C0000734


In [10]:
def map_names_2_cuis(item):
    tail_names = item["tail_names"]
    tail_names_list = item["tail_names_list"]
    tail_cuis_list = item["tail_cuis_list"]
    name2cui = {}
    for name, cui in zip(tail_names_list, tail_cuis_list):
        name2cui[name] = cui
    tail_cuis = []
    for name in tail_names:
        tail_cuis.append(name2cui[name])
    return tail_cuis


unique_query_with = (
    final_df.groupby(["head_cui", "head_name", "rel"])["tail_cui"]
    .apply(list)
    .rename("tail_cuis_list")
    .reset_index()
)
unique_query_with.loc[:, "tail_names_list"] = (
    final_df.groupby(["head_cui", "head_name", "rel"])["tail_name"]
    .apply(list)
    .rename("tail_names_list")
    .reset_index()["tail_names_list"]
)
unique_query_with.loc[:, "tail_names"] = (
    final_df.groupby(["head_cui", "head_name", "rel"])["tail_name"]
    .unique()
    .rename("tail_names")
    .reset_index()["tail_names"]
)
unique_query_with.loc[:, "tail_cuis"] = unique_query_with.apply(
    map_names_2_cuis, axis=1
)
# unique_query_with.loc[:,"num_tail_cuis"] = unique_query_with["tail_cuis"].apply(len)
unique_query_with.loc[:, "tail_cuis_len"] = unique_query_with["tail_cuis"].apply(len)
unique_query_with.loc[:, "tail_names_len"] = unique_query_with["tail_names"].apply(len)

In [11]:
unique_query_with.loc[:, "tail_names"] = unique_query_with["tail_names"].apply(
    lambda x: " || ".join(x)
)
unique_query_with.loc[:, "tail_cuis"] = unique_query_with["tail_cuis"].apply(
    lambda x: " || ".join(x)
)

In [12]:
from rouge import Rouge

rouge = Rouge()


def get_rouge(item):
    hypothesis = item["head_name"]
    references = item["tail_names"].split("||")
    scores = 0
    for reference in references:
        score = rouge.get_scores(hypothesis.lower(), reference.lower())
        scores += score[0]["rouge-l"]["f"]
    return scores / len(references)


def get_avg_match(item):
    hypothesis = item["head_name"]
    references = item["tail_names"].split("||")
    score = 0
    for reference in references:
        if reference.lower() in hypothesis.lower():
            score += 1
    return score / len(references)


unique_query_with["avg_match"] = unique_query_with.apply(get_avg_match, axis=1)
unique_query_with["avg_rouge_l"] = unique_query_with.apply(get_rouge, axis=1)

In [13]:
unique_query_with = unique_query_with[unique_query_with["tail_names_len"] <= 10]
unique_query_with.groupby("rel").size()

rel
associated_morphology_of                          1943
disease_has_abnormal_cell                        11942
disease_has_associated_anatomic_site             14322
disease_has_normal_cell_origin                   10199
disease_has_normal_tissue_origin                 10916
disease_mapped_to_gene                            2856
disease_may_have_associated_disease               1309
disease_may_have_finding                          5138
disease_may_have_molecular_abnormality            3096
gene_associated_with_disease                      1360
gene_encodes_gene_product                         5367
gene_product_encoded_by_gene                      6187
gene_product_has_associated_anatomy               3695
gene_product_has_biochemical_function             3402
gene_product_plays_role_in_biological_process     5913
has_physiologic_effect                            4819
may_prevent                                       1747
may_treat                                         3864
occurs

In [14]:
unique_query_with.groupby(["rel"])["avg_match"].mean()

rel
associated_morphology_of                         0.128277
disease_has_abnormal_cell                        0.000436
disease_has_associated_anatomic_site             0.091761
disease_has_normal_cell_origin                   0.036840
disease_has_normal_tissue_origin                 0.011427
disease_mapped_to_gene                           0.000350
disease_may_have_associated_disease              0.003565
disease_may_have_finding                         0.000000
disease_may_have_molecular_abnormality           0.000000
gene_associated_with_disease                     0.000000
gene_encodes_gene_product                        0.003109
gene_product_encoded_by_gene                     0.001587
gene_product_has_associated_anatomy              0.009120
gene_product_has_biochemical_function            0.126984
gene_product_plays_role_in_biological_process    0.006566
has_physiologic_effect                           0.000000
may_prevent                                      0.064682
may_treat 

In [17]:
save_dir = "./by_relation_1k/"
import os

os.makedirs(save_dir, exist_ok=True)
for source in selected_rels:
    source_df = unique_query_with[unique_query_with["rel"] == source]
    print(source, len(source_df.index))
    sample_df = source_df.sample(sampled_num, random_state=random_state)
    sample_df.to_csv(f"{save_dir}{source}_{sampled_num}.csv", index=None)
    sample_df = sample_df[sample_df["avg_match"] < 0.1]
    sample_df = sample_df[sample_df["avg_rouge_l"] < 0.1]
    hard_sampled_num = len(sample_df.index)
    print("hard-", source, hard_sampled_num)
    sample_df.to_csv(f"{save_dir}{source}_{hard_sampled_num}_hard.csv", index=None)

disease_has_normal_tissue_origin 10916
hard- disease_has_normal_tissue_origin 842
disease_has_normal_cell_origin 10199
hard- disease_has_normal_cell_origin 812
disease_may_have_molecular_abnormality 3096
hard- disease_may_have_molecular_abnormality 989
disease_has_associated_anatomic_site 14322
hard- disease_has_associated_anatomic_site 726
gene_product_has_associated_anatomy 3695
hard- gene_product_has_associated_anatomy 956
gene_product_has_biochemical_function 3402
hard- gene_product_has_biochemical_function 491
may_prevent 1747
hard- may_prevent 861
disease_may_have_associated_disease 1309
hard- disease_may_have_associated_disease 842
gene_associated_with_disease 1360
hard- gene_associated_with_disease 997
disease_mapped_to_gene 2856
hard- disease_mapped_to_gene 984
disease_has_abnormal_cell 11942
hard- disease_has_abnormal_cell 778
gene_product_plays_role_in_biological_process 5913
hard- gene_product_plays_role_in_biological_process 888
occurs_after 5831
hard- occurs_after 623
has