 ## Building a SNOMED-CT MedCAT Concept Database
 - enriched with mapped ICD-10 and OPCS-4 code information

In [None]:
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.prepare_cdb import PrepareCDB

In [72]:
import os
import pandas as pd
import pickle

In [47]:
# Input artefacts for the build; both files sit in the same models directory.
models_dir = '/home/ubuntu/medcat_models'
vocab_dat = models_dir + '/base_vocabulary.dat'  # vocabulary file passed to Vocab.load_dict
cdb_csv = models_dir + '/snomed_cdb_csv_SNOMED-CT-UK_Release_20191001.csv'  # concept CSV the CDB is built from

In [127]:
# Read the raw concept CSV into a DataFrame so its `cui` column can be
# compared against the CUIs held by the concept database.
snomed_csv = pd.read_csv(cdb_csv)

In [None]:
# Sanity check: CUIs present in the source CSV but absent from the CDB.
# NOTE: needs `snomed_cdb` to exist already, so on a fresh kernel this cell
# can only be run once the concept database has been built.
set(snomed_csv.cui.unique()).difference(snomed_cdb.cui2names.keys())

In [17]:
%%time
# Load the base vocabulary, then build the concept database from the SNOMED CSV.
vocab = Vocab()
vocab.load_dict(vocab_dat)
prep = PrepareCDB(vocab=vocab)
csv_paths = [cdb_csv]
# Bind the result to `snomed_cdb`: every other cell in this notebook (saving,
# CAT construction, ICD-10/OPCS-4 enrichment, filtering) refers to the
# database by that name, so binding it to anything else breaks Run All.
snomed_cdb = prep.prepare_csvs(csv_paths)

Done: 0
Done: 10000
Done: 20000
Done: 30000
Done: 40000
Done: 50000
Done: 60000
Done: 70000
Done: 80000
Done: 90000
Done: 100000
Done: 110000
Done: 120000
Done: 130000
Done: 140000
Done: 150000
Done: 160000
Done: 170000
Done: 180000
Done: 190000
Done: 200000
Done: 210000
Done: 220000
Done: 230000
Done: 240000
Done: 250000
Done: 260000
Done: 270000
Done: 280000
Done: 290000
Done: 300000
Done: 310000
Done: 320000
Done: 330000
Done: 340000
Done: 350000
Done: 360000
Done: 370000
Done: 380000
Done: 390000
Done: 400000
Done: 410000
Done: 420000
Done: 430000
Done: 440000
Done: 450000
Done: 460000
Done: 470000
Done: 480000
Done: 490000
Done: 500000
Done: 510000
Done: 520000
Done: 530000
Done: 540000
Done: 550000
Done: 560000
Done: 570000
Done: 580000
Done: 590000
Done: 600000
Done: 610000
Done: 620000
Done: 630000
Done: 640000
Done: 650000
Done: 660000
Done: 670000
Done: 680000
Done: 690000
Done: 700000
Done: 710000
Done: 720000
Done: 730000
Done: 740000
Done: 750000
Done: 760000
Done: 770000


In [28]:
# Persist the concept database to disk before the pass over the notes below.
snomed_cdb.save_dict('./medcat_models/snomed_cdb.dat')

In [49]:
# Combine the concept database and the vocabulary into a CAT object; `cat`
# is the callable applied to each clinical note further down.
cat = CAT(cdb=snomed_cdb, vocab=vocab)

## Train Model on MIMIC-III Notes

In [32]:
# Load the MIMIC-III notes table and pull the free-text column out as a
# plain Python list, one entry per note.
df = pd.read_csv('~/tdy_ehr/mimic_tidy/data/mimic3/raw/notes_mimic3.csv')
all_notes = df['text'].tolist()

In [None]:
%%time
# Run every note through the annotator. The return value is discarded, so the
# loop exists for its effect on the model (presumably online training while
# cat.train is enabled — TODO confirm).
for note_text in all_notes:
    cat(note_text)

In [61]:
# Save again after the pass over the notes. This writes to the same path as
# the earlier save, so the previously saved file is replaced.
snomed_cdb.save_dict('./medcat_models/snomed_cdb.dat')

In [53]:
# Turn the annotator's train flag off now that the notes have been processed
# (presumably stops later cat() calls from updating the CDB — TODO confirm).
cat.train = False

In [75]:
# CUI -> list of mapped codes, one mapping for ICD-10 and one for OPCS-4.
# Files are opened in `with` blocks so the handles are closed deterministically.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('medcat_models/icd10_mapping_condensed.pickle', 'rb') as f:
    cui2icd10_mappings = pickle.load(f)
with open('medcat_models/opcs_mapping_condensed.pickle', 'rb') as f:
    cui2opcs4_mappings = pickle.load(f)

In [109]:
# Code -> description lookups for ICD-10 and OPCS-4.
# Files are opened in `with` blocks so the handles are closed deterministically.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('medcat_models/icd10_uk_codes.pickle', 'rb') as f:
    icd10_descs = pickle.load(f)
# Re-key the ICD-10 table: codes longer than three characters get a '.' after
# the third character (e.g. 'A001' -> 'A00.1'); shorter codes are kept as-is.
# Presumably this matches the dotted format used in the mapping file — TODO confirm.
icd10_descs = {
    (code[:3] + '.' + code[3:] if len(code) > 3 else code): desc
    for code, desc in icd10_descs.items()
}
with open('medcat_models/opcs_codes_desc.pickle', 'rb') as f:
    opcs4_descs = pickle.load(f)

In [136]:
%%time
# Attach ICD-10 entries to each concept's info dict. CUIs that the CDB does
# not know are skipped; codes without a description are left out of the list.
for cui, icd10codes in cui2icd10_mappings.items():
    if cui not in snomed_cdb.cui2info:
        continue
    snomed_cdb.cui2info[cui]['icd10'] = [
        {'chapter': code, 'name': icd10_descs[code]}
        for code in icd10codes
        if code in icd10_descs
    ]

# Same for OPCS-4; note the entries here use the key 'code', not 'chapter'.
for cui, opcs4codes in cui2opcs4_mappings.items():
    if cui not in snomed_cdb.cui2info:
        continue
    snomed_cdb.cui2info[cui]['opcs4'] = [
        {'code': code, 'name': opcs4_descs[code]}
        for code in opcs4codes
        if code in opcs4_descs
    ]

CPU times: user 2.27 s, sys: 217 ms, total: 2.49 s
Wall time: 2.48 s


In [145]:
# Save the CDB with the ICD-10 / OPCS-4 info attached under its own file
# name, so the earlier snomed_cdb.dat file is not overwritten.
snomed_cdb.save_dict('./medcat_models/0.2.8+_snomed_cdb_mimic.dat')

### Limit CDB to only include ICD-10 / OPCS-4 Concepts

In [147]:
# Select the CUIs whose info dict carries an 'icd10' or an 'opcs4' key.
# .get(cui, {}) is used instead of direct indexing so that a CUI which has
# names but no cui2info entry is dropped rather than raising KeyError.
# NOTE(review): this tests for the key only, so a concept whose mapped codes
# all lacked a description (empty list) is still kept — confirm that is intended.
cuis_to_keep = [
    cui
    for cui in snomed_cdb.cui2names.keys()
    if any(key in snomed_cdb.cui2info.get(cui, {}) for key in ('icd10', 'opcs4'))
]

In [150]:
%%time
# Reduce the CDB to the selected CUIs. The return value is not reassigned, so
# this relies on filter_by_cui modifying `snomed_cdb` in place; re-running the
# cells above is the only way to get the unfiltered database back in memory.
snomed_cdb.filter_by_cui(cuis_to_keep)

FYI - with large CDBs this can take a long time.
Gathering CUIs 
Cleaning up CUI maps...
removed 10k concepts, 145809 to go...
removed 10k concepts, 135809 to go...
removed 10k concepts, 125809 to go...
removed 10k concepts, 115809 to go...
removed 10k concepts, 105809 to go...
removed 10k concepts, 95809 to go...
removed 10k concepts, 85809 to go...
removed 10k concepts, 75809 to go...
removed 10k concepts, 65809 to go...
removed 10k concepts, 55809 to go...
removed 10k concepts, 45809 to go...
removed 10k concepts, 35809 to go...
removed 10k concepts, 25809 to go...
removed 10k concepts, 15809 to go...
removed 10k concepts, 5809 to go...
Done CUI cleaning
Cleaning names...
Done all
CPU times: user 2.07 s, sys: 64.8 ms, total: 2.13 s
Wall time: 2.13 s


In [156]:
# Save the filtered CDB under a separate file name; the unfiltered file
# written earlier is left as it is.
snomed_cdb.save_dict('./medcat_models/0.2.8+_snomed_cdb_mimic_icd10_opcs4_only.dat')