# Use the Embedded Topic Model (ETM) to detect common topics in transferable, untransferable, and undecidable codes

In [1]:
from embedded_topic_model.utils import preprocessing
from embedded_topic_model.utils import embedding
from embedded_topic_model.models.etm import ETM
import pandas as pd
import os

In [2]:
def get_topics(docs, ignore_words=None, num_topics=20, epochs=300, top_n_words=20):
    """
    Fit an Embedded Topic Model (ETM) on ``docs`` and summarise its topics.

    Word2vec embeddings are trained on the same documents and handed to ETM
    as fixed embeddings (``train_embeddings=False``).

    :param docs list[str]: raw text documents (in this notebook, the
        'long title' strings of the selected codes)
    :param ignore_words list[str] | None: words to be removed from the
        vocabulary. Each word is stripped from every document as a whole
        word, case-insensitively, before the vocabulary and the embeddings
        are built. Documents left empty by the stripping are dropped.
        ``None`` or an empty list leaves ``docs`` untouched.
    :param num_topics int: number of topics passed to ETM
    :param epochs int: number of training epochs passed to ETM
    :param top_n_words int: number of top words requested per topic; the
        same value is passed to the coherence and diversity metrics
    :returns: tuple ``(topics, topic_coherence, topic_diversity)`` exactly as
        returned by ``ETM.get_topics``, ``ETM.get_topic_coherence`` and
        ``ETM.get_topic_diversity``
    """
    # Empty strings would turn the pattern below into "match every word
    # boundary", so discard them up front.
    words_to_drop = [word for word in (ignore_words or []) if word]
    if words_to_drop:
        import re  # local import: keeps this cell self-contained

        ignore_pattern = re.compile(
            r"\b(?:" + "|".join(re.escape(word) for word in words_to_drop) + r")\b",
            flags=re.IGNORECASE,
        )
        stripped_docs = (ignore_pattern.sub(" ", doc) for doc in docs)
        docs = [doc for doc in stripped_docs if doc.strip()]

    # The third returned value (presumably the held-out split produced by
    # train_size) is not used here.
    vocabulary, train_dataset, _ = preprocessing.create_etm_datasets(
        docs,
        min_df=0.01,
        max_df=0.75,
        train_size=0.85,
    )

    embeddings_mapping = embedding.create_word2vec_embedding_from_dataset(docs)

    # Training an ETM instance
    etm_instance = ETM(
        vocabulary,
        embeddings=embeddings_mapping,  # You can pass here the path to a word2vec file or
                                        # a KeyedVectors instance
        num_topics=num_topics,
        epochs=epochs,
        debug_mode=True,  # emits progress output such as "Topics before training: ..."
        train_embeddings=False,  # Optional. If True, ETM will learn word embeddings jointly with
                                 # topic embeddings. By default, is False. If 'embeddings' argument
                                 # is being passed, this argument must not be True
    )

    etm_instance.fit(train_dataset)

    topics = etm_instance.get_topics(top_n_words)
    topic_coherence = etm_instance.get_topic_coherence(top_n_words)
    topic_diversity = etm_instance.get_topic_diversity(top_n_words)

    return topics, topic_coherence, topic_diversity
        

In [3]:
""" 
Read in codes of different types
"""

# Resolve the MIMIC output directory under the current user's home folder.
home_dir = os.path.expanduser("~")
output_dir = os.path.join(home_dir, "OTTEHR/outputs/mimic")
print(f"Will save outputs to {output_dir}")

# Load the selected-code summary; the first CSV column becomes the index.
summary_csv = os.path.join(output_dir, "selected_summary_mimic.csv")
mimic_df = pd.read_csv(summary_csv, header=0, index_col=0)

Will save outputs to /home/wanxinli/OTTEHR/outputs/mimic


In [4]:
# Partition the codes by their 'transferable' label
# (1 -> trans, -1 -> untrans, 0 -> undec) and collect each group's long titles.
transfer_label = mimic_df['transferable']

trans_codes = mimic_df[transfer_label == 1]
untrans_codes = mimic_df[transfer_label == -1]
undec_codes = mimic_df[transfer_label == 0]

trans_documents = trans_codes['long title'].tolist()
untrans_documents = untrans_codes['long title'].tolist()
undec_documents = undec_codes['long title'].tolist()

In [21]:
# Inspection cell: build the ETM inputs for the transferable titles with the
# same thresholds get_topics uses, then print the vocabulary and the counts.
split_options = {"min_df": 0.01, "max_df": 0.75, "train_size": 0.85}
vocabulary, train_dataset, _ = preprocessing.create_etm_datasets(
    trans_documents, **split_options
)

print(vocabulary)
print(train_dataset['counts'])

['specified', 'gestation', 'encephalopathy', 'deficiency', 'causing', 'hypertension', 'weeks', 'status', 'joint', 'episode', 'other', 'surgical', 'with', 'myocardial', 'disease', 'acid', 'reaction', 'later', 'not', 'misadventure', 'use', 'initial', 'conditions', 'procedure', 'venous', 'site', 'patient', 'steroids', 'fall', 'in', 'due', 'mention', 'procedures', 'or', 'at', 'as', 'time', 'operation', 'completed', 'essential', 'classified', 'cerebral', 'tobacco', 'and', 'chronic', 'of', 'sleep', 'blood', 'elsewhere', 'history', 'infection', 'infarction', 'without', 'effects', 'collapse', 'pulmonary', 'unspecified', 'iatrogenic', 'hemorrhage', 'anemia', 'abnormal', 'to', 'care', 'disorder', 'apnea', 'replacement']
[array([1]) array([1, 1, 1, 1])
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1]) array([1, 1])
 array([1]) array([1]) array([1]) array([1, 1, 1, 1, 1]) array([1])
 array([1, 1]) array([1, 1, 1, 1, 1]) array([1, 1, 1, 1]) array([1])
 array([1, 1]) array([1]) array([1])
 array([1

In [5]:
# Display the long titles of the codes labelled transferable (== 1).
trans_documents

['Methicillin susceptible Staphylococcus aureus in conditions classified elsewhere and of unspecified site',
 'Human immunodeficiency virus [HIV] disease',
 'Streptococcus infection in conditions classified elsewhere and of unspecified site, streptococcus, group D [Enterococcus]',
 'Candidiasis of mouth',
 'Unspecified viral hepatitis C without hepatic coma',
 'Unspecified protein-calorie malnutrition',
 'Unspecified acquired hypothyroidism',
 'Mixed acid-base balance disorder',
 'Dehydration',
 'Anxiety state, unspecified',
 'Dysthymic disorder',
 'Other and unspecified alcohol dependence, unspecified',
 'Pure hypercholesterolemia',
 'Hyposmolality and/or hyponatremia',
 'Acidosis',
 'Alkalosis',
 'Hypovolemia',
 'Hyperpotassemia',
 'Hypopotassemia',
 'Obesity, unspecified',
 'Iron deficiency anemia secondary to blood loss (chronic)',
 'Iron deficiency anemia, unspecified',
 'Diabetes with other specified manifestations, type II or unspecified type, not stated as uncontrolled',
 'Othe

In [6]:
# Sanity check: are the untransferable and transferable title lists identical?
untrans_documents == trans_documents

False

In [7]:
# Fit ETM on the transferable code titles.
(
    trans_topics,
    trans_topic_coherence,
    trans_topic_diversity,
) = get_topics(trans_documents)

Topics before training: [['misadventure', 'mention', 'later', 'patient', 'abnormal', 'at', 'time', 'without', 'causing', 'or'], ['abnormal', 'with', 'patient', 'or', 'causing', 'later', 'without', 'to', 'operation', 'acid'], ['abnormal', 'causing', 'patient', 'or', 'with', 'later', 'to', 'acid', 'operation', 'cerebral'], ['encephalopathy', 'alcohol', 'pulmonary', 'pneumonia', 'weeks', 'apnea', 'status', 'joint', 'hypertension', 'hemorrhage'], ['abnormal', 'later', 'as', 'or', 'cerebral', 'time', 'misadventure', 'initial', 'episode', 'patient'], ['abnormal', 'causing', 'cerebral', 'patient', 'with', 'to', 'acid', 'reaction', 'operation', 'due'], ['encephalopathy', 'alcohol', 'pulmonary', 'collapse', 'pneumonia', 'hypertension', 'replacement', 'essential', 'hemorrhage', 'sleep'], ['abnormal', 'later', 'patient', 'causing', 'misadventure', 'or', 'acid', 'with', 'mention', 'operation'], ['abnormal', 'patient', 'later', 'causing', 'to', 'or', 'cerebral', 'operation', 'misadventure', 'acid']

In [8]:
# Fit ETM on the untransferable code titles.
(
    untrans_topics,
    untrans_topic_coherence,
    untrans_topic_diversity,
) = get_topics(untrans_documents)

Topics before training: [['neoplasm', 'transplanted', 'history', 'cirrhosis', 'valve', 'delivered', 'renal', 'behavioral', 'hepatitis', 'bypass'], ['without', 'with', 'or', 'of', 'against', 'inoculation', 'stage', 'kidney', 'mention', 'vaccination'], ['tachypnea', 'sepsis', 'hydrocephalus', 'bypass', 'fibrillation', 'bradycardia', 'liver', 'septicemia', 'newborn', 'transplanted'], ['bypass', 'hydrocephalus', 'status', 'tachypnea', 'bradycardia', 'shock', 'fibrillation', 'transplanted', 'cirrhosis', 'newborn'], ['fibrillation', 'hydrocephalus', 'sepsis', 'tachypnea', 'shock', 'bradycardia', 'bypass', 'newborn', 'septicemia', 'status'], ['renal', 'for', 'necrosis', 'behavioral', 'shock', 'unspecified', 'disease', 'section', 'status', 'stage'], ['sepsis', 'fibrillation', 'shock', 'hydrocephalus', 'transplanted', 'newborn', 'bradycardia', 'bypass', 'history', 'status'], ['hydrocephalus', 'fibrillation', 'bypass', 'bradycardia', 'sepsis', 'newborn', 'status', 'septicemia', 'tachypnea', 'sho

In [9]:
# Fit ETM on the undecidable code titles.
(
    undec_topics,
    undec_topic_coherence,
    undec_topic_diversity,
) = get_topics(undec_documents)

Topics before training: [['mention', 'infants', 'without', 'misadventure', 'obstructive', 'primary', 'of', 'diabetes', 'neoplasm', 'systolic'], ['susceptible', 'diabetes', 'due', 'on', 'vascular', 'pneumonia', 'coagulation', 'alcohol', 'to', 'hemorrhage'], ['section', 'born', 'abnormal', 'causing', 'cesarean', 'implant', 'delivered', 'by', 'patient', 'or'], ['abnormal', 'causing', 'implant', 'patient', 'section', 'born', 'cesarean', 'or', 'as', 'by'], ['misadventure', 'time', 'later', 'at', 'of', 'without', 'mention', 'abnormal', 'procedure', 'mellitus'], ['cesarean', 'born', 'delivered', 'by', 'section', 'abnormal', 'causing', 'patient', 'implant', 'later'], ['primary', 'infants', 'diabetes', 'obstructive', 'systolic', 'diastolic', 'on', 'hemorrhage', 'secondary', 'failure'], ['primary', 'infants', 'diabetes', 'obstructive', 'diastolic', 'systolic', 'on', 'heart', 'hemorrhage', 'failure'], ['born', 'cesarean', 'abnormal', 'delivered', 'by', 'patient', 'implant', 'later', 'the', 'at'],

In [10]:
# Print the topics, coherence and diversity obtained for the transferable codes.
print(trans_topics, trans_topic_coherence, trans_topic_diversity)

[['abnormal', 'later', 'patient', 'or', 'causing', 'misadventure', 'without', 'cerebral', 'to', 'mention', 'operation', 'time', 'with', 'reaction', 'of', 'at', 'as', 'procedures', 'acid', 'other'], ['abnormal', 'later', 'patient', 'or', 'causing', 'cerebral', 'to', 'misadventure', 'without', 'operation', 'with', 'mention', 'reaction', 'time', 'at', 'as', 'acid', 'procedures', 'of', 'initial'], ['abnormal', 'later', 'patient', 'or', 'causing', 'misadventure', 'without', 'mention', 'operation', 'cerebral', 'time', 'to', 'at', 'with', 'reaction', 'of', 'as', 'procedures', 'acid', 'initial'], ['abnormal', 'later', 'patient', 'or', 'causing', 'misadventure', 'without', 'cerebral', 'to', 'operation', 'mention', 'with', 'time', 'reaction', 'at', 'of', 'as', 'procedures', 'acid', 'initial'], ['abnormal', 'later', 'patient', 'or', 'causing', 'misadventure', 'without', 'cerebral', 'operation', 'to', 'mention', 'time', 'with', 'of', 'reaction', 'at', 'as', 'procedures', 'acid', 'initial'], ['abno

In [11]:
# Print the topics, coherence and diversity obtained for the untransferable codes.
print(untrans_topics, untrans_topic_coherence, untrans_topic_diversity)

[['without', 'with', 'or', 'of', 'inoculation', 'kidney', 'mention', 'vaccination', 'stage', 'not', 'against', 'as', 'type', 'and', 'for', 'in', 'classified', 'by', 'uncontrolled', 'unspecified'], ['without', 'with', 'or', 'of', 'kidney', 'inoculation', 'mention', 'vaccination', 'stage', 'not', 'against', 'as', 'type', 'and', 'for', 'in', 'classified', 'by', 'uncontrolled', 'unspecified'], ['without', 'with', 'or', 'of', 'inoculation', 'kidney', 'mention', 'vaccination', 'stage', 'not', 'against', 'as', 'type', 'and', 'for', 'in', 'classified', 'by', 'uncontrolled', 'unspecified'], ['without', 'with', 'or', 'of', 'inoculation', 'kidney', 'mention', 'vaccination', 'stage', 'not', 'against', 'as', 'type', 'and', 'for', 'in', 'classified', 'by', 'uncontrolled', 'unspecified'], ['without', 'with', 'or', 'of', 'kidney', 'inoculation', 'mention', 'vaccination', 'stage', 'not', 'against', 'as', 'type', 'and', 'for', 'in', 'classified', 'by', 'uncontrolled', 'unspecified'], ['without', 'with',

In [12]:
# Print the topics, coherence and diversity obtained for the undecidable codes.
print(undec_topics, undec_topic_coherence, undec_topic_diversity)

[['of', 'misadventure', 'mention', 'without', 'later', 'time', 'at', 'abnormal', 'neoplasm', 'malignant', 'or', 'mellitus', 'patient', 'type', 'reaction', 'procedure', 'causing', 'implant', 'history', 'aneurysm'], ['of', 'mention', 'misadventure', 'without', 'later', 'time', 'at', 'neoplasm', 'malignant', 'abnormal', 'mellitus', 'type', 'or', 'reaction', 'patient', 'history', 'procedure', 'aneurysm', 'rupture', 'atherosclerosis'], ['of', 'later', 'misadventure', 'abnormal', 'time', 'mention', 'at', 'without', 'patient', 'or', 'causing', 'implant', 'procedure', 'as', 'reaction', 'mellitus', 'type', 'uncontrolled', 'neoplasm', 'the'], ['of', 'misadventure', 'later', 'mention', 'without', 'time', 'at', 'abnormal', 'patient', 'or', 'reaction', 'causing', 'neoplasm', 'mellitus', 'procedure', 'implant', 'type', 'malignant', 'as', 'uncontrolled'], ['of', 'mention', 'misadventure', 'without', 'later', 'time', 'at', 'neoplasm', 'malignant', 'abnormal', 'mellitus', 'type', 'or', 'reaction', 'pat