In [None]:
# this notebook will take several hours to run
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
from itertools import chain
import networkx as nx
import os
import sys
sys.path.append('../')

from fact_check.constants import UMLS_DIR, DATA_DIR

# Directory holding the SemMedDB CSV files read below; the processed output is written here too.
SEMMEDDB_DIR = DATA_DIR / 'semmeddb'

# Allow up to 200 rows when a frame is displayed.
pd.options.display.max_rows = 200

In [None]:
# Column names for the PREDICATION table after its last three raw columns are removed.
predication_columns = [
    'PREDICATION_ID', 'SENTENCE_ID', 'PMID', 'PREDICATE',
    'SUBJECT_CUI', 'SUBJECT_NAME', 'SUBJECT_SEMTYPE', 'SUBJECT_NOVELTY',
    'OBJECT_CUI', 'OBJECT_NAME', 'OBJECT_SEMTYPE', 'OBJECT_NOVELTY',
]

# Column names for the PREDICATION_AUX table.
predication_aux_columns = [
    'PREDICATION_AUX_ID', 'PREDICATION_ID',
    'SUBJECT_TEXT', 'SUBJECT_DIST', 'SUBJECT_MAXDIST',
    'SUBJECT_START_INDEX', 'SUBJECT_END_INDEX', 'SUBJECT_SCORE',
    'INDICATOR_TYPE', 'PREDICATE_START_INDEX', 'PREDICATE_END_INDEX',
    'OBJECT_TEXT', 'OBJECT_DIST', 'OBJECT_MAXDIST',
    'OBJECT_START_INDEX', 'OBJECT_END_INDEX', 'OBJECT_SCORE',
    'CURR_TIMESTAMP',
]

# Main predication table: the file has no header row; raw columns 12-14 are not used.
g1 = pd.read_csv(
    SEMMEDDB_DIR / 'semmedVER43_2023_R_PREDICATION.csv',
    header=None,
    encoding='ISO-8859-1',
)
g1 = g1.drop(columns=[12, 13, 14])
g1.columns = predication_columns

# Auxiliary table: malformed lines are skipped rather than aborting the load.
g2 = pd.read_csv(
    SEMMEDDB_DIR / 'semmedVER43_2023_R_PREDICATION_AUX.csv',
    header=None,
    on_bad_lines='skip',
)
g2.columns = predication_aux_columns

In [None]:
# Join the auxiliary fields onto each predication; the inner join keeps only
# predications that appear in both tables.
df = pd.merge(g1, g2, how='inner', on='PREDICATION_ID')

In [None]:
# One boolean mask per exclusion rule; a row is dropped as soon as any rule fires.
# Rows whose CUI contains '|' are deliberately NOT removed here: they are split
# into one row per concept in the pipe-handling cells further down.
removes = [
    # Scores that are present but not purely numeric. The comparison with False
    # (instead of `~`) is intentional: a missing score yields NaN from
    # .str.isnumeric(), NaN == False is False, and the row is therefore kept.
    df['OBJECT_SCORE'].str.isnumeric() == False,
    df['SUBJECT_SCORE'].str.isnumeric() == False,
    # Identifiers without a 'C' anywhere. na=False treats a missing identifier as
    # "no match" (so the row is removed) and keeps the mask boolean; without it a
    # single NaN turns the mask into an object Series and the `~` inversion
    # raises a TypeError.
    ~df['OBJECT_CUI'].str.contains('C', regex=False, na=False),  # will need to remove again after exploding pipes
    ~df['SUBJECT_CUI'].str.contains('C', regex=False, na=False),  # removing only at the end takes up too much memory
]

df = df[~np.logical_or.reduce(removes)]

In [None]:
# Keep a minimal set of columns for speed. Most notably this drops the confidence scores.
kept_columns = [
    'PREDICATION_ID', 'PREDICATE',
    'SUBJECT_CUI', 'SUBJECT_NAME', 'SUBJECT_SEMTYPE', 'SUBJECT_NOVELTY',
    'OBJECT_CUI', 'OBJECT_NAME', 'OBJECT_SEMTYPE', 'OBJECT_NOVELTY',
]
df = df.loc[:, kept_columns]

Deal with pipes in the subject CUI, the object CUI, or both: a pipe (`|`) means that a single row encodes multiple edges, so such rows are split into one row per edge.

The processing code is taken from https://github.com/mmayers12/semmed/blob/master/prepare/01-initial_data_clean.ipynb

In [None]:
# Flag rows whose subject and/or object CUI packs several concepts separated by '|'.
multi_start = df['SUBJECT_CUI'].str.contains('|', regex=False)
multi_end = df['OBJECT_CUI'].str.contains('|', regex=False)
has_pipe = multi_start | multi_end

# Rows that need splitting vs. rows that can be used as they are.
pipe_lines = df[has_pipe]
good_lines = df[~has_pipe]

# The same flags restricted to the piped rows, so that they share pipe_lines' index.
multi_start_subset = multi_start[has_pipe]
multi_end_subset = multi_end[has_pipe]

# Three disjoint cases: pipes on both sides, subject side only, object side only.
multi_both_subset = multi_start_subset & multi_end_subset
start_only_subset = multi_start_subset & ~multi_end_subset
end_only_subset = ~multi_start_subset & multi_end_subset

In [None]:
# pipes in subjects
subject_rows = pipe_lines.loc[start_only_subset]
start_id_split = subject_rows['SUBJECT_CUI'].str.split('|')
start_name_split = subject_rows['SUBJECT_NAME'].str.split('|')

# Number of subject concepts packed into each row.
start_lens = start_id_split.apply(len)

all_cols = list(pipe_lines.columns)
start_cols = [col for col in all_cols if col not in ('SUBJECT_CUI', 'SUBJECT_NAME')]

# Every column that is not being split has its value repeated once per subject
# concept, then the per-row lists are flattened into one long column.
new_starts = {}
for col in start_cols:
    repeated = subject_rows[col].apply(lambda value: [value]) * start_lens
    new_starts[col] = list(chain.from_iterable(repeated.values))
fixed_starts = pd.DataFrame(new_starts)

# The split IDs and names are flattened in the same row order, so they line up
# with the repeated columns. Note that, unlike the object cell, nothing here
# checks that ID and name were split into the same number of pieces.
fixed_starts['SUBJECT_CUI'] = list(chain.from_iterable(start_id_split.values))
fixed_starts['SUBJECT_NAME'] = list(chain.from_iterable(start_name_split.values))

# Restore the original column order.
fixed_starts = fixed_starts[all_cols]

In [None]:
# pipes in objects
object_rows = pipe_lines.loc[end_only_subset]
end_id_split = object_rows['OBJECT_CUI'].str.split('|')
end_name_split = object_rows['OBJECT_NAME'].str.split('|')

# Piece counts for the ID and for the name; they must agree for a row to be usable.
end_lens = end_id_split.apply(len)
end_lens1 = end_name_split.apply(len)
bad_lines = end_lens[end_lens != end_lens1].index

# bad lines where number of pipes are different across ID and name
pipe_lines = pipe_lines.drop(bad_lines)
end_only_subset = end_only_subset.drop(bad_lines)
multi_both_subset = multi_both_subset.drop(bad_lines)

# Recompute the splits now that the inconsistent rows are gone.
object_rows = pipe_lines.loc[end_only_subset]
end_id_split = object_rows['OBJECT_CUI'].str.split('|')
end_name_split = object_rows['OBJECT_NAME'].str.split('|')
end_lens = end_id_split.apply(len)

end_cols = [col for col in all_cols if col not in ('OBJECT_CUI', 'OBJECT_NAME')]

# Repeat each untouched column value once per object concept and flatten.
new_ends = {
    col: list(chain.from_iterable(
        (object_rows[col].apply(lambda value: [value]) * end_lens).values
    ))
    for col in end_cols
}

fixed_ends = pd.DataFrame(new_ends)
fixed_ends['OBJECT_CUI'] = list(chain.from_iterable(end_id_split.values))
fixed_ends['OBJECT_NAME'] = list(chain.from_iterable(end_name_split.values))

# Restore the original column order.
fixed_ends = fixed_ends[all_cols]

In [None]:
# pipes in both subject and object
# Expands every row that has pipes on BOTH sides into one row per
# (subject, object) combination.
# NOTE(review): unlike the object-only cell, nothing here checks that an ID and
# its name were split into the same number of pieces.
start_id_split = pipe_lines.loc[multi_both_subset, 'SUBJECT_CUI'].str.split('|')
start_name_split = pipe_lines.loc[multi_both_subset, 'SUBJECT_NAME'].str.split('|')
start_lens = start_id_split.apply(len)  # subject concepts per row

end_id_split = pipe_lines.loc[multi_both_subset, 'OBJECT_CUI'].str.split('|')
end_name_split = pipe_lines.loc[multi_both_subset, 'OBJECT_NAME'].str.split('|')
end_lens = end_id_split.apply(len)  # object concepts per row

# Element-wise list * int: each row's subject list is concatenated with itself
# once per object, e.g. [A, B] with 3 objects -> [A, B, A, B, A, B].
start_id_split = start_id_split * end_lens
start_name_split = start_name_split * end_lens

# Likewise each object list is repeated once per subject,
# e.g. [X, Y, Z] with 2 subjects -> [X, Y, Z, X, Y, Z].
end_id_split = end_id_split * start_lens
end_name_split = end_name_split * start_lens

sorting_df = pd.DataFrame()
sorting_df['ID'] = start_id_split
sorting_df['NAME'] = start_name_split

# Sorting the repeated subject IDs makes equal IDs contiguous
# ([A, A, A, B, B, B]); paired position by position with the cycling object
# list this yields every subject/object combination. The names are reordered
# by sorting the (ID, NAME) pairs, so each name stays with its ID.
sorted_start_id_split = sorting_df['ID'].apply(lambda x: sorted(x))
sorted_start_name_split = sorting_df.apply(lambda row: [x for y,x in sorted(zip(row['ID'], row['NAME']))], axis = 1)

# Columns that are not split on either side.
both_cols = all_cols[:]
both_cols.remove('SUBJECT_CUI')
both_cols.remove('SUBJECT_NAME')
both_cols.remove('OBJECT_CUI')
both_cols.remove('OBJECT_NAME')

# Repeat each remaining value once per (subject, object) combination and
# flatten the per-row lists into one long column.
new_both = dict()
for c in both_cols:
    tmp = pipe_lines.loc[multi_both_subset, c].apply(lambda x: [x]) * (start_lens * end_lens)
    new_both[c] = [x for x in chain(*tmp.values)]
    
fixed_both = pd.DataFrame(new_both)

# Flatten the expanded subject and object lists in the same row order.
fixed_both['SUBJECT_CUI'] = [x for x in chain(*sorted_start_id_split.values)]
fixed_both['SUBJECT_NAME'] = [x for x in chain(*sorted_start_name_split.values)]

fixed_both['OBJECT_CUI'] = [x for x in chain(*end_id_split.values)]
fixed_both['OBJECT_NAME'] = [x for x in chain(*end_name_split.values)]

# Restore the original column order.
fixed_both = fixed_both[all_cols]

In [None]:
# Recombine the untouched rows with the three sets of expanded rows under a
# fresh 0..n-1 index.
df = pd.concat(
    [good_lines, fixed_starts, fixed_ends, fixed_both],
    ignore_index=True,
)

Continuing on: now that the piped rows have been split, filter out non-CUI identifiers again.

In [None]:
# Second pass of the CUI filter: the piped rows have been split by now, so each
# row holds a single identifier per side and anything without a 'C' can go.
# na=False treats a missing identifier as "no match" (the row is removed) and
# keeps the masks boolean; without it a single NaN turns the mask into an
# object Series and the `~` inversion raises a TypeError.
removes = [
    ~df['OBJECT_CUI'].str.contains('C', regex=False, na=False),
    ~df['SUBJECT_CUI'].str.contains('C', regex=False, na=False),
]

df = df[~np.logical_or.reduce(removes)]

In [None]:
# Number of rows for each distinct (subject, object, predicate) triple.
triple_key = ['SUBJECT_CUI', 'OBJECT_CUI', 'PREDICATE']
num_refs = df.groupby(by=triple_key).size()

In [None]:
# Minimum number of rows a (subject, object, predicate) triple needs in order to
# be kept; the value is also embedded in the name of the output file.
cutoff = 10 

In [None]:
# Triples that reach the reference cutoff, with their count as column 'n_refs'.
triple_cols = ['SUBJECT_CUI', 'OBJECT_CUI', 'PREDICATE']
select_rows = num_refs[num_refs >= cutoff]
ref_counts = select_rows.reset_index(name='n_refs')

# Keep only rows of the selected triples, then collapse each triple to its first row.
df_sel = pd.merge(df, ref_counts, on=triple_cols).drop_duplicates(subset=triple_cols)

In [None]:
# Optional step: when a semantic-type lookup file is available, add a
# human-readable label next to each semantic-type abbreviation.
sem_types_path = SEMMEDDB_DIR / 'sem_types.csv'
if sem_types_path.is_file():
    sem_types_table = pd.read_csv(sem_types_path, sep='|')
    sem_types = dict(zip(sem_types_table['abbv'], sem_types_table['sem_type']))
    df_sel['SUBJECT_SEMTYPE_STR'] = df_sel['SUBJECT_SEMTYPE'].map(sem_types)
    df_sel['OBJECT_SEMTYPE_STR'] = df_sel['OBJECT_SEMTYPE'].map(sem_types)

In [None]:
# Semantic-type abbreviations to exclude; an edge is removed when either its
# subject or its object carries one of them.
# NOTE(review): presumably gene/protein/molecular-level types - confirm against sem_types.csv.
to_drop = ['aapp', 'gngm', 'celf', 'moft', 'genf']
subject_excluded = df_sel['SUBJECT_SEMTYPE'].isin(to_drop)
object_excluded = df_sel['OBJECT_SEMTYPE'].isin(to_drop)
df_sel = df_sel[~(object_excluded | subject_excluded)]

In [None]:
# Sanity check on the filtered frame; the trailing comment is the shape noted by
# the author, presumably from an earlier run.
df_sel.shape # (874459, 13)

## Add in SNOMED hierarchy

In [None]:
# UMLS Rich Release Format (.RRF) files are pipe-delimited text with no quoting
# convention: a '"' inside a field is ordinary data. With pandas' default quoting
# a field that starts with '"' is parsed as a quoted string, which strips the
# quote characters and, if the quote is never closed, can swallow the following
# delimiters and lines. Quoting is therefore switched off for both files.
RRF_QUOTING = 3  # same value as csv.QUOTE_NONE

# MRHIER columns used: 0 = CUI, 4 = source vocabulary, 5 = relationship label,
# 6 = path to the root as '.'-separated atom identifiers.
hier = pd.read_csv(UMLS_DIR/'MRHIER.RRF', sep = '|', header = None, quoting = RRF_QUOTING)
hier = hier[hier[4] == 'SNOMEDCT_US'].dropna(subset = [5])

# MRCONSO columns used: 0 = CUI, 7 = atom identifier, 11 = source vocabulary, 14 = string.
atom_cui_mapping = pd.read_csv(UMLS_DIR/'MRCONSO.RRF', sep = '|', header = None, quoting = RRF_QUOTING)
atom_cui_mapping = atom_cui_mapping[atom_cui_mapping[11] == "SNOMEDCT_US"][[0, 7, 14]]
atom_cui_mapping.columns = ['cui', 'atom', 'str_label']

# Split each path into its atoms. The vectorised .str.split treats the
# single-character separator literally and leaves a missing path as NaN, where
# a Python-level x.split('.') would raise on the float NaN.
hier['atom_list'] = hier[6].str.split('.')

# One row per (concept, atom on its path to the root).
exploded_hier = hier[[0, 'atom_list']].explode('atom_list')

# Translate every path atom to its CUI; atoms without a match (including a NaN
# left by a missing path) are dropped by the inner join.
merged_df = exploded_hier.rename(columns = {0: 'SUBJECT_CUI', 'atom_list': 'atom'}).merge(atom_cui_mapping, on = 'atom')

# Each concept ISA every concept found on its path to the root; keep one row per pair.
merged_df = merged_df.rename(columns = {'cui': 'OBJECT_CUI'}).drop_duplicates(subset = ['SUBJECT_CUI', 'OBJECT_CUI'])
merged_df['PREDICATE'] = 'ISA'
merged_df = merged_df[~(merged_df.OBJECT_CUI.isin(['C2720507', 'C0037088']))] # generic concepts

In [None]:
# Hand-picked ISA edges that are added on top of the hierarchy-derived ones.
add_rows = [
    ('C0428881', 'C0871470'),  # Non-invasive systolic arterial pressure
    ('C0428881', 'C1306620'),
    ('C0428884', 'C0428883'),  # Non-invasive diastolic arterial pressure
    ('C0428884', 'C1305849'),
]
add_rows_df = pd.DataFrame.from_records(add_rows, columns=['SUBJECT_CUI', 'OBJECT_CUI'])
add_rows_df['PREDICATE'] = 'ISA'

In [None]:
# Append the hand-picked edges and keep a single row per (subject, object) pair.
isa_edges = pd.concat([merged_df, add_rows_df], ignore_index=True)
merged_df = isa_edges.drop_duplicates(subset=['SUBJECT_CUI', 'OBJECT_CUI'])

In [None]:
# Look up a display string for every CUI. This is approximate: the label is the
# first non-missing string among the atoms of that concept.
labels_by_cui = atom_cui_mapping[['cui', 'str_label']].groupby('cui')['str_label']
name_mapping = labels_by_cui.first().to_dict()

In [None]:
# Give the ISA edges the same columns as the SemMedDB rows: concept names,
# a novelty flag of 1 on both sides, and fresh PREDICATION_IDs that continue
# after the largest ID already present in df_sel.
first_new_id = df_sel['PREDICATION_ID'].max() + 1
subject_names = merged_df['SUBJECT_CUI'].map(name_mapping)
object_names = merged_df['OBJECT_CUI'].map(name_mapping)
new_ids = np.arange(first_new_id, first_new_id + len(merged_df))

merged_df = merged_df.drop(columns=['atom', 'str_label']).assign(
    SUBJECT_NAME=subject_names,
    OBJECT_NAME=object_names,
    SUBJECT_NOVELTY=1,
    OBJECT_NOVELTY=1,
    PREDICATION_ID=new_ids,
)

In [None]:
# Stack the SemMedDB edges and the ISA edges; when a triple occurs more than
# once, the first occurrence (SemMedDB rows come first) is the one kept.
combined_edges = pd.concat([df_sel, merged_df], ignore_index=True)
df_final = combined_edges.drop_duplicates(subset=['SUBJECT_CUI', 'OBJECT_CUI', 'PREDICATE'])

In [None]:
# Persist the final edge table; the reference cutoff is part of the file name.
output_path = SEMMEDDB_DIR / f'semmeddb_processed_{cutoff}.csv'
df_final.reset_index(drop=True).to_csv(output_path, index=False)

In [None]:
# Show the final edge table; the row count in the trailing comment is the
# author's note, presumably from an earlier run.
df_final # 7229042 rows