### Author: Boris Kundu (kunduboris@gmail.com, 470-746-3137) ###

### Problem Statement ###
Apply common Machine Learning techniques on relevant sections 
(chief complaint, history of present illness, and discharge diagnosis) 
of clinical data (clinical_notes\training_data\*.txt) to uncover 
common underlying factors for a given medical condition.

### 1. Setup libraries ###
1. Install packages
2. Import packages
3. Download other libraries (as needed)

In [24]:
# Install packages

#!pip install --upgrade pip
#!pip install pandas
#!pip install numpy

#!pip install nltk

#!pip install spacy

#!pip install scikit-learn
#!pip install pyLDAvis

# Import packages

import os
import string
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')

from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import pyLDAvis
import pyLDAvis.sklearn

### 2. Data loading ###
1. Define relevant sections
2. Parse given text files
3. Create raw dataset

In [25]:
# Section name -> (start marker, end marker). The markers are lower-case
# because the note text is lower-cased when it is read.
relevant_sections = dict(
    complaint=('chief complaint:', 'major surgical or invasive procedure:'),
    history=('history of present illness:', 'past medical history:'),
    diagnosis=('discharge diagnosis:', 'discharge condition:'),
)

In [26]:
# Read data from directory
def read_clinical_notes(data_directory):
    """Read every ``.txt`` file in a directory into a dict of lower-cased text.

    Parameters
    ----------
    data_directory : str
        Directory holding the clinical notes; a trailing separator is optional.

    Returns
    -------
    dict
        Maps each file name without its ``.txt`` extension to the lower-cased
        contents of that file. Entries are inserted in sorted file-name order.

    Prints the number of ``.txt`` files found as a side effect.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the record
    # order (and every frame built from it) identical on every run.
    text_files = sorted(fn for fn in os.listdir(data_directory) if fn.endswith('.txt'))
    clinical_records = {}
    for file_name in text_files:
        # os.path.join works with or without a trailing separator on the directory
        with open(os.path.join(data_directory, file_name), 'rt') as curr_file:
            clinical_records[file_name[:-len('.txt')]] = curr_file.read().lower()
    print(f'Total clinical records found: {len(text_files)}')
    return clinical_records

In [27]:
# Create raw dataset
def create_raw_dataset(records, relevant_sections):
    """Extract the relevant sections of every clinical record into a DataFrame.

    Parameters
    ----------
    records : dict
        Maps a record id (a string convertible with ``int``) to the note text.
    relevant_sections : dict
        Maps a section name to a ``(prefix, suffix)`` pair of marker strings;
        the section is the text found between the two markers.

    Returns
    -------
    pandas.DataFrame
        One row per record with an integer ``id`` column and one text column
        per section. A section whose start marker is absent is an empty
        string; a section whose end marker is absent runs to the end of the
        note.
    """
    record_list = []
    for record_id, text in records.items():
        rec = {'id': int(record_id)}
        for section, (prefix, suffix) in relevant_sections.items():
            start = text.find(prefix)
            if start == -1:
                # str.find returns -1 when the marker is absent; slicing from
                # -1 + len(prefix) would return unrelated text from the note.
                rec[section] = ''
                continue
            start += len(prefix)
            # Search for the end marker only after the start marker, so an
            # earlier occurrence of the suffix cannot truncate the section.
            end = text.find(suffix, start)
            rec[section] = text[start:] if end == -1 else text[start:end]
        record_list.append(rec)
    return pd.DataFrame(record_list)

In [28]:
# Get all clinical records: read every .txt note under 'training_data'
# (a path relative to the current working directory) into a dict keyed by
# file name without the extension, with the lower-cased text as value.
records = read_clinical_notes('training_data')

Total clinical records found: 303


In [29]:
# Build the raw frame, merge the three sections into one text column and
# keep only `id`, `complaint` and the merged `underlying_factors` column.
raw = create_raw_dataset(records, relevant_sections)
raw = raw.assign(
    underlying_factors = raw['complaint'] + ' ' + raw['history'] + ' ' + raw['diagnosis']
).drop(columns = ['history', 'diagnosis'])

In [30]:
# Check raw data 
raw.head()

Unnamed: 0,id,complaint,underlying_factors
0,100035,"\npost-cardiac arrest, asthma exacerbation\n\n","\npost-cardiac arrest, asthma exacerbation\n\n..."
1,100039,\nabdominal pain\n\n,\nabdominal pain\n\n \n38 yo f w/ h/o all in r...
2,100187,\nsob\n\n,\nsob\n\n \n64 yo woman w/ h/o recurrent pes s...
3,100229,"\nhypotension with elevated lactate, code seps...","\nhypotension with elevated lactate, code seps..."
4,100564,\nsvc thrombosis\n\n,\nsvc thrombosis\n\n \n43 yo male with hx of r...


In [31]:
# Check info
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  303 non-null    int64 
 1   complaint           303 non-null    object
 2   underlying_factors  303 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.2+ KB


### 3. Data pre-processing ###

1. Clean data (punctuation, whitespace, etc.)
2. Tokenize text
3. Remove stopwords
4. Lemmatize text
5. Filter unwanted Part of Speech

In [32]:
# Corpus-specific stop words (added on top of the spaCy and NLTK lists)
custom_stop_words = [
    'history', 'date', 'birth', 'sex', 'patient', 'hospital',
    'po', 'mg', 'yo', 'year', 'male', 'female',
    'hp', 'bp', 'rr',
]
# Part-of-speech tags to keep: nouns, verbs, adjectives and adverbs
allowed_tags = [
    'NN', 'NNP', 'NNPS', 'NNS',               # nouns
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verbs
    'JJ', 'JJR', 'JJS',                       # adjectives
    'RB', 'RBR', 'RBS',                       # adverbs
]

In [33]:
# Replace punctuation with spaces
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character replaced by a space."""
    space_for_punct = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(space_for_punct)

In [34]:
# Clean data
def clean(dataset):
    """Return a cleaned copy of ``dataset`` with its text columns normalised.

    Bracketed spans (brackets included) are replaced by a space across the
    whole frame. Then, for every column whose dtype is ``object``: new lines,
    punctuation, runs of digits and single word characters are replaced by a
    space, whitespace runs are collapsed to one space, and the result is
    stripped. The input frame itself is not modified.
    """
    text_columns = [name for name in dataset.columns if dataset[name].dtype == object]
    # DataFrame.replace returns a new frame, so the caller's frame stays as it was
    cleaned = dataset.replace(r'[\[].*?[\]]', ' ', regex = True)
    for name in text_columns:
        cleaned[name] = (
            cleaned[name]
            .str.replace('\n',' ')                       # new line characters
            .apply(remove_punctuation)                    # punctuation
            .str.replace(r'\d+', ' ', regex = True)       # numbers
            .str.replace(r'\b\w\b', ' ', regex = True)    # single characters
            .str.replace(r'\s+', ' ', regex = True)       # runs of whitespace
            .str.strip()                                  # leading/trailing whitespace
        )
    return cleaned

In [35]:
# Get all stop words
def get_stop_words(custom_stop_words):
    """Return the union of spaCy's stop words, NLTK's English stop words and ``custom_stop_words``."""
    nltk_english = set(stopwords.words('english'))
    return STOP_WORDS.union(nltk_english, set(custom_stop_words))

In [36]:
# Create doc/sentence from tokens/words/lemmas
def word2doc(words):
    """Join an iterable of words into a single space-separated string."""
    return ' '.join(words)

In [37]:
# Tokenize text
def tokenize_text(text):
    """Split ``text`` into NLTK word tokens and drop every token that is a stop word.

    The stop-word set is built from the notebook-level ``custom_stop_words``.
    """
    excluded = get_stop_words(custom_stop_words)
    kept = []
    for candidate in nltk.tokenize.word_tokenize(text):
        if candidate not in excluded:
            kept.append(candidate)
    return kept

In [38]:
# Keep only nouns, verbs, adjectives and adverbs, and lemmatize them
def lemmatize_pos_filtering(words):
    """Lemmatize ``words`` and keep only those whose POS tag is in ``allowed_tags``.

    Each word is tagged with ``nltk.pos_tag``, lemmatized with the WordNet
    lemmatizer using the matching part of speech, and dropped if the lemma is
    a stop word. Returns the list of surviving lemmas in input order.
    """
    excluded = get_stop_words(custom_stop_words)
    lemmatizer = WordNetLemmatizer()
    # Lower-cased first letter of the tag -> part-of-speech code for the lemmatizer
    wordnet_pos = {'j': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
    kept = []
    for word in words:
        # The word is tagged on its own, i.e. as a one-element sequence
        penn_tag = nltk.pos_tag([word])[0][1]
        if penn_tag not in allowed_tags:
            continue
        pos = wordnet_pos.get(penn_tag[0].lower())
        # No usable part of speech: keep the word unchanged
        lemma = lemmatizer.lemmatize(word, pos) if pos else word
        if lemma not in excluded:
            kept.append(lemma)
    return kept

In [39]:
# Prepare data
def prepare_data(df, input_feature, out_feature):
    """Add token, lemma and prepared-text columns derived from ``input_feature``.

    Three columns are written to ``df`` (which is modified and also returned):
    ``<out_feature>_Tokens`` (stop-word-filtered tokens),
    ``<out_feature>_Lemmas`` (POS-filtered lemmas) and ``<out_feature>``
    (the lemmas joined back into one string).
    """
    tokens_col = out_feature + '_Tokens'
    lemmas_col = out_feature + '_Lemmas'
    df[tokens_col] = df[input_feature].apply(tokenize_text)
    df[lemmas_col] = df[tokens_col].apply(lemmatize_pos_filtering)
    df[out_feature] = df[lemmas_col].apply(word2doc)
    return df

In [40]:
# Clean textual data: bracketed spans, new lines, punctuation, digits and
# single characters are blanked out and whitespace is collapsed in every
# text column. clean() builds a new frame, so `raw` keeps the original text.
df = clean(raw)

In [41]:
# Check for null values
df.isna().sum()

id                    0
complaint             0
underlying_factors    0
dtype: int64

In [42]:
# Prepare underlying factors: adds the prepared_corpus_Tokens,
# prepared_corpus_Lemmas and prepared_corpus columns to df.
df = prepare_data(df, 'underlying_factors', 'prepared_corpus')

In [43]:
# Check head
df.head().T

Unnamed: 0,0,1,2,3,4
id,100035,100039,100187,100229,100564
complaint,post cardiac arrest asthma exacerbation,abdominal pain,sob,hypotension with elevated lactate code sepsis,svc thrombosis
underlying_factors,post cardiac arrest asthma exacerbation mr is ...,abdominal pain yo all in remission cord transp...,sob yo woman recurrent pes filter gib while an...,hypotension with elevated lactate code sepsis ...,svc thrombosis yo male with hx of rectal ca dm...
prepared_corpus_Tokens,"[post, cardiac, arrest, asthma, exacerbation, ...","[abdominal, pain, remission, cord, transplant,...","[sob, woman, recurrent, pes, filter, gib, anti...","[hypotension, elevated, lactate, code, sepsis,...","[svc, thrombosis, hx, rectal, dmii, histoplasm..."
prepared_corpus_Lemmas,"[post, cardiac, arrest, asthma, exacerbation, ...","[abdominal, pain, remission, cord, transplant,...","[sob, woman, recurrent, pe, filter, gib, antic...","[hypotension, elevate, lactate, code, sepsis, ...","[svc, thrombosis, hx, rectal, dmii, histoplasm..."
prepared_corpus,post cardiac arrest asthma exacerbation mr old...,abdominal pain remission cord transplant anthr...,sob woman recurrent pe filter gib anticoagulat...,hypotension elevate lactate code sepsis yom pm...,svc thrombosis hx rectal dmii histoplasmosis p...


### 4. Modeling ###

1. Setup pipeline and parameters for LDA
2. Perform hyperparameter tuning for LDA
3. Use best parameters of model for topic modeling

In [44]:
# Define pipeline and parameters for LDA
pipeline_LDA = Pipeline([
                ('bow', CountVectorizer()),
                # LDA fitting is randomised; a fixed seed makes the grid search
                # scores and the resulting topics reproducible across runs.
                ('lda', LatentDirichletAllocation(random_state = 42))]
                )
# Grid searched by GridSearchCV; keys are '<step name>__<parameter>'
params_LDA = {
            'bow__max_df': [0.2,0.3,0.4],
            'bow__min_df': [10,15,20],
            'bow__ngram_range': [(2,2), (2,3), (3,3)],
            'lda__n_components': [3, 4, 5],
            'lda__learning_decay' : [0.6, 0.7, 0.8],
            'lda__max_iter': [10, 11, 12]
            }

In [45]:
# Perform hyperparameter tuning
def hyperparameter_tuning(pipeline, params, data, jobs = 4):
    """Grid-search ``pipeline`` over ``params`` on ``data`` and return the best estimator.

    Parameters
    ----------
    pipeline : estimator
        Passed to ``GridSearchCV`` as ``estimator``.
    params : dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    data : iterable
        Training input handed to ``fit``.
    jobs : int, default 4
        Passed to ``GridSearchCV`` as ``n_jobs``.

    Returns
    -------
    The ``best_estimator_`` of the fitted search. The best parameters and the
    best score are printed as a side effect.
    """
    search = GridSearchCV(estimator = pipeline, param_grid = params, n_jobs = jobs)
    search.fit(data)
    print("Best Params: ", search.best_params_)
    print("Best Score: ", search.best_score_)
    return search.best_estimator_

In [46]:
# Get best LDA model: grid-search pipeline_LDA over params_LDA on the
# prepared corpus with 4 parallel jobs; the result is the best fitted pipeline.
best_lda_model = hyperparameter_tuning(pipeline_LDA, params_LDA, df['prepared_corpus'], jobs = 4)

Best Params:  {'bow__max_df': 0.2, 'bow__min_df': 20, 'bow__ngram_range': (3, 3), 'lda__learning_decay': 0.8, 'lda__max_iter': 10, 'lda__n_components': 3}
Best Score:  -158.36655327781645


In [47]:
# Pull the fitted vectorizer and topic model out of the best pipeline
bow = best_lda_model.named_steps['bow']
lda = best_lda_model.named_steps['lda']
# Vectorize the prepared corpus, then get the LDA output for every document
count_vectors = bow.transform(df['prepared_corpus'])
lda_output = lda.transform(count_vectors)

In [48]:
# Visualize topics
# Create panel from the fitted LDA model, the document-term counts and the vectorizer.
# NOTE(review): sort_topics=False presumably keeps the model's own topic order so the
# panel numbering lines up with "Topic N" used below - confirm against the pyLDAvis docs.
lda_display = pyLDAvis.sklearn.prepare(lda, count_vectors, bow, sort_topics=False)
# Display panel
pyLDAvis.display(lda_display)

### 5. Predictions ###
1. Identify topics (underlying factors) for existing documents (conditions)
2. Identify top features (factors) in each topic
3. Predict topic (underlying factors) for a new text (condition)

In [49]:
# Get document topic matrix
def create_document_topic_matrix(topic_data, data):
    """Build a document x topic DataFrame with the main topic of each document.

    Parameters
    ----------
    topic_data : array-like of shape (n_documents, n_topics)
        Per-document topic scores.
    data : pandas.DataFrame
        Must have an ``id`` column with one entry per row of ``topic_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``"Doc <id>"`` with one ``"Topic <k>"`` column per topic
        (scores rounded to 4 decimals) and a 1-based ``"Main Topic"`` column.
    """
    topic_data = np.asarray(topic_data)
    # Take the topic count from the matrix itself instead of the notebook-level
    # `lda` object, so the function works for any model and in any cell order.
    top_names = ["Topic " + str(c + 1) for c in range(topic_data.shape[1])]
    doc_names = ["Doc " + str(rec) for rec in data['id'].values]
    df_doc_top = pd.DataFrame(np.round(topic_data, 4), columns = top_names, index = doc_names)
    # np.argmax returns the first maximum, so exact ties go to the lowest-numbered topic
    df_doc_top['Main Topic'] = np.argmax(df_doc_top.values, axis=1) + 1
    return df_doc_top

In [50]:
# Get topic feature (word) matrix
def create_topic_word_matrix(topic_model = lda, vector_model = bow):
    """Return a topic x feature DataFrame of the topic model's component weights.

    Parameters
    ----------
    topic_model : fitted topic model, default ``lda``
        Must expose ``components_`` and ``n_components``.
    vector_model : fitted vectorizer, default ``bow``
        Supplies the feature names used as column labels.

    Note: the defaults are bound to the ``lda`` / ``bow`` objects that exist
    when this cell is executed.

    Returns
    -------
    pandas.DataFrame
        Rows are ``"Topic <k>"``, columns are the vectorizer's feature names.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() and fall back only on older releases.
    if hasattr(vector_model, 'get_feature_names_out'):
        feature_names = vector_model.get_feature_names_out()
    else:
        feature_names = vector_model.get_feature_names()
    # Create data frame
    df_top_word = pd.DataFrame(topic_model.components_)
    # Set column and index
    df_top_word.columns = feature_names
    df_top_word.index = ["Topic " + str(c + 1) for c in range(topic_model.n_components)]
    return df_top_word

In [51]:
# Get top n words for each topic
def get_top_words(vector_model = bow, topic_model = lda, top_words = 20):
    """Return, for every topic, its ``top_words`` highest-weighted features.

    Parameters
    ----------
    vector_model : fitted vectorizer, default ``bow``
        Supplies the feature names.
    topic_model : fitted topic model, default ``lda``
        Must expose ``components_`` (one weight row per topic).
    top_words : int, default 20
        Maximum number of features returned per topic.

    Note: the defaults are bound to the ``bow`` / ``lda`` objects that exist
    when this cell is executed.

    Returns
    -------
    list of numpy.ndarray
        One array of feature names per topic, ordered by decreasing weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() and fall back only on older releases.
    if hasattr(vector_model, 'get_feature_names_out'):
        feature_names = vector_model.get_feature_names_out()
    else:
        feature_names = vector_model.get_feature_names()
    words = np.asarray(feature_names)
    topic_words = []
    for topic_weights in topic_model.components_:
        # Negate so argsort's ascending order yields the largest weights first
        top_words_pos = (-topic_weights).argsort()[:top_words]
        topic_words.append(words.take(top_words_pos))
    return topic_words

In [58]:
# Predict topic for given conditions
def predict_topic(list_of_conditions, topic_words, vector_model = bow, topic_model = lda):
    """Return the top words of the highest-scoring topic for each search condition.

    Parameters
    ----------
    list_of_conditions : list of str
        Free-text conditions to look up.
    topic_words : pandas.DataFrame
        One row per topic holding that topic's top words; rows are selected
        by position with ``iloc``.
    vector_model : fitted vectorizer, default ``bow``
    topic_model : fitted topic model, default ``lda``

    Returns
    -------
    pandas.DataFrame
        Indexed ``"Query <n>"`` with the prepared query text in
        ``'Search Condition'`` and the chosen topic's words in ``'Topic Words'``.
    """
    queries = pd.DataFrame(list_of_conditions)
    queries.columns = ['Search Condition']
    queries.index = ['Query ' + str(pos) for pos in range(1, len(list_of_conditions) + 1)]
    # Same cleaning and preparation as the training corpus
    queries = prepare_data(clean(queries), 'Search Condition', 'prepared_query')
    # Vectorize, then score every query against every topic
    topic_proba_scores = topic_model.transform(vector_model.transform(queries['prepared_query']))
    # NOTE(review): np.argmax returns the first maximum, so when all topic scores of a
    # query are equal the first topic is reported - presumably the case for a query
    # with no in-vocabulary n-grams; verify before interpreting such rows.
    matched_words = [
        topic_words.iloc[np.argmax(query_scores), :].values.tolist()
        for query_scores in topic_proba_scores
    ]
    df_topics = pd.DataFrame(list(zip(queries['prepared_query'].values, matched_words)))
    df_topics.columns = ['Search Condition','Topic Words']
    df_topics.index = queries.index
    return df_topics

In [53]:
# Get document topic data frame: one row per document ("Doc <id>"), one
# rounded score column per topic and the 1-based main topic.
document_topic = create_document_topic_matrix(lda_output, df)
# Check head
document_topic.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Main Topic
Doc 100035,0.3333,0.3333,0.3333,1
Doc 100039,0.1668,0.1669,0.6663,3
Doc 100187,0.3333,0.3333,0.3333,1
Doc 100229,0.0556,0.4963,0.4481,2
Doc 100564,0.3333,0.3333,0.3333,1


In [54]:
# Count how many documents have each topic as their main topic
topic_distribution = (
    document_topic['Main Topic']
    .value_counts()
    .rename_axis('Topic')
    .reset_index(name = 'Total Documents')
)
topic_distribution.head()

Unnamed: 0,Topic,Total Documents
0,1,177
1,3,70
2,2,56


In [55]:
# Check topic word matrix (topics as rows, features as columns)
df_top_word = create_topic_word_matrix(lda, bow)
# Transposed for display: features as rows; head(-5) shows all rows except the last five
df_top_word.T.head(-5)

Unnamed: 0,Topic 1,Topic 2,Topic 3
acute renal failure,40.277297,0.34307,0.379633
alter mental status,0.341335,37.319595,0.339071
chill night sweat,0.337334,0.335638,27.327028
coronary artery disease,0.344927,47.318616,0.336457
cough shortness breath,0.337897,0.338918,24.323185
denies chest pain,0.350815,0.375363,31.273822
denies fever chill,0.35121,0.344598,43.304192
dictate medquist job,57.315907,0.350455,0.333637
fever chill night,0.337275,0.335459,25.327265
nausea vomit diarrhea,0.340167,0.345132,34.314701


In [56]:
# Top 10 features of every topic
topic_words = get_top_words(bow, lda, 10)

# One row per topic, one column per rank
df_top_words = pd.DataFrame(topic_words)
df_top_words.columns = [f'Word {rank}' for rank in range(1, df_top_words.shape[1] + 1)]
df_top_words.index = [f'Topic {number}' for number in range(1, df_top_words.shape[0] + 1)]

# Check data
df_top_words.head()

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15
Topic 1,dictate medquist job,acute renal failure,denies fever chill,denies chest pain,pain nausea vomit,coronary artery disease,alter mental status,tenderness rhinorrhea congestion,sinus tenderness rhinorrhea,nausea vomit diarrhea,review system hpi,cough shortness breath,chill night sweat,fever chill night,vomit diarrhea constipation
Topic 2,coronary artery disease,alter mental status,pain nausea vomit,denies chest pain,review system hpi,dictate medquist job,nausea vomit diarrhea,denies fever chill,acute renal failure,cough shortness breath,chill night sweat,tenderness rhinorrhea congestion,sinus tenderness rhinorrhea,fever chill night,vomit diarrhea constipation
Topic 3,denies fever chill,nausea vomit diarrhea,denies chest pain,chill night sweat,fever chill night,vomit diarrhea constipation,cough shortness breath,review system hpi,sinus tenderness rhinorrhea,tenderness rhinorrhea congestion,pain nausea vomit,acute renal failure,alter mental status,coronary artery disease,dictate medquist job


In [59]:
# Define new search conditions
list_of_conditions = ['congestive heart failure', 'denies fever chills', 'frequent vomiting, diarrhea, and nausea', 'admission discharge service']
# Get topics for the new conditions (each is cleaned and prepared like the corpus)
df_condition_factors = predict_topic(list_of_conditions, df_top_words, bow, lda)
# Display topics
df_condition_factors.head()

Unnamed: 0,Search Condition,Topic Words
Query 1,congestive heart failure,"[dictate medquist job, acute renal failure, de..."
Query 2,denies fever chill,"[denies fever chill, nausea vomit diarrhea, de..."
Query 3,frequent vomit diarrhea nausea,"[dictate medquist job, acute renal failure, de..."
Query 4,admission discharge service,"[dictate medquist job, acute renal failure, de..."
