 ## Building a SNOMED-CT MedCAT Concept Database
 - populating it with mapped ICD-10 and OPCS-4 information

In [12]:
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.prepare_cdb import PrepareCDB

In [13]:
import os
import pandas as pd
import pickle

## To create a new CDB

In [14]:
# Input locations for building a fresh CDB.
# The directory defaults to the original author's local folder but can be overridden with the
# MEDCAT_DATA_DIR environment variable, so the notebook can run elsewhere without editing this cell.
medcat_data_dir = os.environ.get('MEDCAT_DATA_DIR', '/Users/shek/Desktop/medcat')
# Base vocabulary file passed to `Vocab.load_dict` below.
vocab_dat = os.path.join(medcat_data_dir, 'base_vocabulary.dat')
# SNOMED concept CSV passed to `PrepareCDB.prepare_csvs` below.
cdb_csv = os.path.join(medcat_data_dir, 'snomed_cdb_csv_SNOMED-CT-full_UK_drug_ext_Release_20200228.csv')

In [15]:
# Read the SNOMED concept CSV into a pandas DataFrame (presumably for manual inspection).
# NOTE(review): this frame is not referenced again in the visible notebook — the CDB build below is
# given the CSV *path* — and `snomed_csv` is later rebound to a list of paths in the "extend" section.
snomed_csv = pd.read_csv(cdb_csv)

In [None]:
%%time
vocab = Vocab()
vocab.load_dict(vocab_dat)
prep = PrepareCDB(vocab=vocab)
csv_paths = [cdb_csv]
cdb = prep.prepare_csvs(csv_paths)

In [11]:
# Save the freshly built CDB to disk.
# NOTE(review): the file name carries the date 20191001 while the source CSV above is the
# 20200228 release — confirm the intended name.
cdb.save_dict('/Users/shek/Desktop/medcat/20191001_snomed_cdb.dat')

## To extend an existing CDB with a new SNOMED release

In [None]:
# Location of the previously built SNOMED CDB: directory and file name are kept separate
# and joined with os.path.join in the next cell.
path = r"/Users/shek/Desktop/medcat/SNOMED_UK/"
file = r"0.2.9+ SNOMED_CT_UK_wDRUGS_20200228.dat"

In [None]:
# Load the old CDB: create an empty CDB object, then fill it from the saved file.
# `load_dict` is called for its effect on `snomed_cdb`; its return value is not used.
snomed_cdb = CDB()
snomed_cdb.load_dict(os.path.join(path,file))

In [None]:
# Path of the new SNOMED release CSV whose concepts should be appended.
# Nothing is read here — only the path is assembled.
csv_path = r"/Users/shek/Documents/GitHub/SNOMED-CT_Analysis/"
csv_file = r"snomed_cdb_csv_SNOMED-CT-full_UK_drug_ext_Release_20200401.csv"
# A one-element list of paths, which is what `prepare_csvs` is given in the next cell.
# Note: this rebinds `snomed_csv`, which held a DataFrame in the "create a new CDB" section.
snomed_csv = [os.path.join(csv_path, csv_file)]

In [None]:
# Extend the existing CDB with the new release.
# NOTE(review): the result is bound to `cdb`, but the training section further down uses
# `snomed_cdb` — confirm `prepare_csvs` updates the pretrained CDB in place, otherwise training
# runs on the old concept set.
prep = PrepareCDB(pretrained_cdb=snomed_cdb) # pass the existing cdb here
cdb = prep.prepare_csvs(snomed_csv, only_new=True) # only add new concepts

In [None]:
# Save the new combined CDB next to the release CSV (in `csv_path`).
# NOTE(review): this file uses a `.cdb` extension and the CSV directory, whereas every other save in
# this notebook writes a `.dat` file — confirm this is intentional.
cdb.save_dict(os.path.join(csv_path, '0.2.9+ SNOMED_CT_UK_wDRUGS_20200401.cdb'))

In [None]:
# Sanity check: look up a concept that exists only in the latest release.
# The lookup result is the cell's displayed output; a missing CUI presumably raises a KeyError.
cdb.cui2original_names["S-1240751000000100"] # replace with cui only in latest edition

## Train Model on MIMIC-III Notes

In [None]:
# Build the annotator from the loaded CDB and the vocabulary.
# NOTE(review): `vocab` is only defined in the "create a new CDB" cell, so that cell must have been
# run in this kernel; `snomed_cdb` comes from the "extend" section.
cat = CAT(cdb=snomed_cdb, vocab=vocab)

In [32]:
# Read the MIMIC-III notes table and pull the free-text column out as a plain Python list.
df = pd.read_csv('~/tdy_ehr/mimic_tidy/data/mimic3/raw/notes_mimic3.csv')
all_notes = df["text"].tolist()

In [None]:
%%time
for text in all_notes:
    cat(text)

In [61]:
# Persist the CDB after the pass over the MIMIC-III notes (path is relative to the working directory).
snomed_cdb.save_dict('./medcat_models/snomed_cdb.dat')

In [53]:
# Switch the annotator's training flag off.
# NOTE(review): presumably this stops later `cat(...)` calls from updating the CDB — confirm against MedCAT.
cat.train = False

In [75]:
# Load the CUI -> ICD-10 and CUI -> OPCS-4 code mappings.
# Each file is opened in a `with` block so the handle is closed as soon as loading finishes
# (the bare `pickle.load(open(...))` form left it open until garbage collection).
# SECURITY: unpickling can execute arbitrary code — only load pickle files from a trusted source.
with open('medcat_models/icd10_mapping_condensed.pickle', 'rb') as mapping_file:
    cui2icd10_mappings = pickle.load(mapping_file)
with open('medcat_models/opcs_mapping_condensed.pickle', 'rb') as mapping_file:
    cui2opcs4_mappings = pickle.load(mapping_file)

In [109]:
# Load the code -> description lookups for ICD-10 and OPCS-4.
# Files are opened in `with` blocks so the handles are closed deterministically.
# SECURITY: unpickling can execute arbitrary code — only load pickle files from a trusted source.
with open('medcat_models/icd10_uk_codes.pickle', 'rb') as desc_file:
    icd10_descs = pickle.load(desc_file)
# Re-key the ICD-10 descriptions: keys longer than three characters get a '.' inserted after the
# third character; keys of three characters or fewer are kept unchanged.
icd10_descs = {
    (key[0:3] + '.' + key[3:]) if len(key) > 3 else key: val
    for key, val in icd10_descs.items()
}
with open('medcat_models/opcs_codes_desc.pickle', 'rb') as desc_file:
    opcs4_descs = pickle.load(desc_file)

In [136]:
%%time
for cui, icd10codes in cui2icd10_mappings.items():
    icd_codes = [{'chapter': icd_code, 'name': icd10_descs[icd_code]} for icd_code in icd10codes if icd_code in icd10_descs]
    if cui in snomed_cdb.cui2info:
        snomed_cdb.cui2info[cui]['icd10'] = icd_codes
for cui, opcs4codes in cui2opcs4_mappings.items():
    opcs_codes = [{'code': opcs_code, 'name': opcs4_descs[opcs_code]} for opcs_code in opcs4codes if opcs_code in opcs4_descs]
    if cui in snomed_cdb.cui2info:
        snomed_cdb.cui2info[cui]['opcs4'] = opcs_codes

CPU times: user 2.27 s, sys: 217 ms, total: 2.49 s
Wall time: 2.48 s


In [145]:
# Save the CDB now that the ICD-10 / OPCS-4 information has been written into `cui2info`.
snomed_cdb.save_dict('./medcat_models/0.2.8+_snomed_cdb_mimic.dat')

### Limit CDB to only include ICD-10 / OPCS-4 Concepts

In [147]:
# Collect every CUI that carries an 'icd10' or an 'opcs4' entry in its info dict.
# `.get(cui, {})` is used instead of direct indexing so a concept that has names but no
# `cui2info` entry is simply skipped rather than raising a KeyError.
cuis_to_keep = [
    cui
    for cui in snomed_cdb.cui2names
    if any(key in snomed_cdb.cui2info.get(cui, {}) for key in ('icd10', 'opcs4'))
]

In [150]:
%%time
# Restrict the CDB to the CUIs selected above. The return value is not used and the same
# `snomed_cdb` object is saved afterwards, so the call is relied on to modify the CDB in place
# (the log output of this cell reports concepts being removed).
snomed_cdb.filter_by_cui(cuis_to_keep)

FYI - with large CDBs this can take a long time.
Gathering CUIs 
Cleaning up CUI maps...
removed 10k concepts, 145809 to go...
removed 10k concepts, 135809 to go...
removed 10k concepts, 125809 to go...
removed 10k concepts, 115809 to go...
removed 10k concepts, 105809 to go...
removed 10k concepts, 95809 to go...
removed 10k concepts, 85809 to go...
removed 10k concepts, 75809 to go...
removed 10k concepts, 65809 to go...
removed 10k concepts, 55809 to go...
removed 10k concepts, 45809 to go...
removed 10k concepts, 35809 to go...
removed 10k concepts, 25809 to go...
removed 10k concepts, 15809 to go...
removed 10k concepts, 5809 to go...
Done CUI cleaning
Cleaning names...
Done all
CPU times: user 2.07 s, sys: 64.8 ms, total: 2.13 s
Wall time: 2.13 s


In [156]:
# Save the filtered CDB under its own file name, separate from the unfiltered save made earlier.
snomed_cdb.save_dict('./medcat_models/0.2.8+_snomed_cdb_mimic_icd10_opcs4_only.dat')