### Author: Boris Kundu (kunduboris@gmail.com, 470-746-3137) ###

### Problem Statement ###
Apply common Machine Learning techniques on relevant sections 
(chief complaint, history of present illness, and discharge diagnosis)
of clinical data (clinical_notes\training_data\*.txt) to uncover common 
underlying factors for a given medical condition.

### 1. Setup libraries ###
1. Install packages
2. Import packages
3. Download other libraries (as needed)

In [323]:
# Install packages

#!pip install --upgrade pip
#!pip install pandas
#!pip install numpy

#!pip install nltk

#!pip install spacy

#!pip install sklearn
#!pip install pyLDAvis

# Import packages

import os
import string
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')

from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import pyLDAvis
import pyLDAvis.sklearn

### 2. Data loading ###
1. Define relevant sections
2. Parse given text files
3. Create raw dataset

In [324]:
# Initialize sections
# Each entry maps a section name to a (start marker, end marker) pair.
# The markers are lower case because the notes are lower-cased when they are read.
relevant_sections = {
                    'complaint': ('chief complaint:','major surgical or invasive procedure:'),
                    'history': ('history of present illness:','past medical history:'),
                    'diagnosis': ('discharge diagnosis:','discharge condition:')
                    }

In [325]:
# Read data from directory
def read_clinical_notes(data_directory):
    """Load every ``*.txt`` clinical note found directly in a directory.

    Args:
        data_directory: Path of the directory holding the note files; a
            trailing ``/`` is optional.

    Returns:
        dict mapping each file name (without its ``.txt`` extension) to the
        lower-cased file contents, inserted in sorted file-name order.

    Raises:
        OSError: If the directory cannot be listed or a note cannot be read.
    """
    clinical_records = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and every data frame built from it) reproducible across runs.
    for file_name in sorted(os.listdir(data_directory)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(data_directory, file_name)
        # A sub-directory may also be named '*.txt'; opening it would fail.
        if not os.path.isfile(file_path):
            continue
        # NOTE(review): files are decoded with the platform default encoding,
        # as before - confirm the encoding of the notes.
        with open(file_path, 'rt') as curr_file:
            clinical_records[file_name[:-4]] = curr_file.read().lower()
    print(f'Total clinical records found: {len(clinical_records)}')
    return clinical_records

In [326]:
# Create raw dataset
def _extract_section(text, prefix, suffix):
    """Return the text between ``prefix`` and the next ``suffix`` after it."""
    start = text.find(prefix)
    if start == -1:
        # Section header absent: report an empty section instead of slicing
        # from a bogus offset derived from find()'s -1.
        return ''
    start += len(prefix)
    # Look for the end marker only after the section starts.
    end = text.find(suffix, start)
    # No end marker: the section runs to the end of the note.
    return text[start:] if end == -1 else text[start:end]


def create_raw_dataset(records,relevant_sections):
    """Extract the relevant sections of every clinical record into a data frame.

    Args:
        records: dict mapping a record id (a string convertible with ``int``)
            to the text of the note.
        relevant_sections: dict mapping a section name to a
            ``(start marker, end marker)`` pair of strings.

    Returns:
        pandas.DataFrame with one row per record: an integer ``id`` column
        followed by one text column per section. A section whose start marker
        is missing is an empty string; a section whose end marker is missing
        extends to the end of the note.

    Raises:
        ValueError: If a record id cannot be converted to ``int``.
    """
    record_list = []
    for record_id, text in records.items():
        rec = {'id': int(record_id)}
        for section, (prefix, suffix) in relevant_sections.items():
            rec[section] = _extract_section(text, prefix, suffix)
        record_list.append(rec)
    return pd.DataFrame(record_list)

In [327]:
# Get all clinical records
records = read_clinical_notes('training_data')

Total clinical records found: 303


In [328]:
# Build the raw data frame: one row per clinical record, one column per section
raw = create_raw_dataset(records,relevant_sections)
# Merge the three sections into a single space-separated text column
combined_text = raw['complaint'] + ' ' + raw['history'] + ' ' + raw['diagnosis']
# Keep the merged text and drop the sections that are no longer needed separately
raw = raw.assign(underlying_factors = combined_text).drop(columns = ['history', 'diagnosis'])

In [329]:
# Check raw data 
raw.head()

Unnamed: 0,id,complaint,underlying_factors
0,100035,"\npost-cardiac arrest, asthma exacerbation\n\n","\npost-cardiac arrest, asthma exacerbation\n\n..."
1,100039,\nabdominal pain\n\n,\nabdominal pain\n\n \n38 yo f w/ h/o all in r...
2,100187,\nsob\n\n,\nsob\n\n \n64 yo woman w/ h/o recurrent pes s...
3,100229,"\nhypotension with elevated lactate, code seps...","\nhypotension with elevated lactate, code seps..."
4,100564,\nsvc thrombosis\n\n,\nsvc thrombosis\n\n \n43 yo male with hx of r...


In [330]:
# Check info
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  303 non-null    int64 
 1   complaint           303 non-null    object
 2   underlying_factors  303 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.2+ KB


### 3. Data pre-processing ###

1. Clean data (punctuations, white spaces etc.)
2. Tokenize text
3. Remove stopwords
4. Lemmatize text
5. Filter unwanted Parts of Speech

In [331]:
# Corpus specific stop words
custom_stop_words = ['history','date','birth','sex','patient','hospital', 'po' ,'mg']
# POS filtering: keep NOUN, VERB, ADJECTIVE and ADVERB tags
# (pos_filtering compares these against the tags returned by nltk.pos_tag)
allowed_tags = ['NN','NNP','NNPS','NNS',
                'VB','VBD','VBG','VBN','VBP','VBZ',
                'JJ','JJR', 'JJS', 
                'RB', 'RBR', 'RBS']

In [332]:
# Removes punctuations
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character replaced by a space."""
    # One translation pass instead of one str.replace call per punctuation mark.
    to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    return text.translate(to_space)

In [333]:
# Clean data
def clean(dataset):
    """Normalise the text columns of a data frame.

    Every object-dtype column has bracketed spans, new lines, punctuation,
    digits and single-character words replaced by spaces, then runs of white
    space are collapsed and the ends are stripped.

    Args:
        dataset: pandas.DataFrame whose object-dtype columns hold text.

    Returns:
        A new pandas.DataFrame with the cleaned text; ``dataset`` itself is
        not modified because ``DataFrame.replace`` returns a copy.
    """
    text_columns = [name for name in dataset.columns if dataset[name].dtype == object]
    # Replace any text between [] including the brackets themselves
    cleaned = dataset.replace(r'[\[].*?[\]]', ' ', regex = True)
    for name in text_columns:
        # New line characters become spaces
        column = cleaned[name].str.replace('\n',' ')
        # Punctuation becomes spaces
        column = column.apply(remove_punctuation)
        # Numbers become spaces
        column = column.str.replace(r'\d+', ' ', regex = True)
        # Single characters left standing alone become spaces
        column = column.str.replace(r'\b\w\b', ' ', regex = True)
        # Collapse runs of white space, then trim both ends
        column = column.str.replace(r'\s+', ' ', regex = True)
        cleaned[name] = column.str.strip()
    return cleaned

In [334]:
# Get all stop words
def get_stop_words(custom_stop_words):
    """Return the set of spaCy, NLTK English and caller-supplied stop words.

    Args:
        custom_stop_words: iterable of additional words to treat as stop words.

    Returns:
        A new set combining ``STOP_WORDS``, ``stopwords.words('english')`` and
        ``custom_stop_words``.
    """
    return STOP_WORDS.union(stopwords.words('english'), custom_stop_words)

In [335]:
# Create doc/sentence from tokens/words/lemmas
def word2doc(words):
    """Join an iterable of word strings into one space-separated string."""
    return ' '.join(words)

In [336]:
# Tokenize text
def tokenize_text(text):
    """Split ``text`` into NLTK word tokens and drop the stop words.

    Args:
        text: string to tokenize.

    Returns:
        list of tokens, in order, that are not in the stop-word set built
        from the module-level ``custom_stop_words``.
    """
    excluded = get_stop_words(custom_stop_words)
    kept = []
    for candidate in nltk.tokenize.word_tokenize(text):
        if candidate not in excluded:
            kept.append(candidate)
    return kept

In [337]:
# Lemmatization
def lemmatize_text(words):
    """Lemmatize each word with WordNet and drop lemmas that are stop words.

    Args:
        words: iterable of word strings.

    Returns:
        list of lemmas, in order, that are not in the stop-word set built
        from the module-level ``custom_stop_words``.
    """
    excluded = get_stop_words(custom_stop_words)
    lemmatize = WordNetLemmatizer().lemmatize
    # Stop words are filtered after lemmatization, so a word whose lemma is a
    # stop word is removed as well.
    return [lemma for lemma in map(lemmatize, words) if lemma not in excluded]

In [338]:
# Remove stop words
def remove_stop_words(text):
    """Return the items of ``text`` that are not stop words, as a list.

    Args:
        text: iterable of word strings.
    """
    excluded = get_stop_words(custom_stop_words)
    return list(filter(lambda word: word not in excluded, text))

In [339]:
# Filter words that are not NOUN, VERB, ADJECTIVE, ADVERB
def pos_filtering(words):
    """Keep only the words whose part-of-speech tag is in ``allowed_tags``.

    Each word is tagged on its own, as a one-element list passed to
    ``nltk.pos_tag``.

    Args:
        words: iterable of word strings.

    Returns:
        list of the words, in order, whose tag is listed in ``allowed_tags``.
    """
    return [word for word in words if nltk.pos_tag([word])[0][1] in allowed_tags]

In [340]:
# Clean textual data
df = clean(raw)

In [341]:
# Check for null values
df.isna().sum()

id                    0
complaint             0
underlying_factors    0
dtype: int64

In [342]:
# Get tokens
df['tokens'] = df['underlying_factors'].apply(tokenize_text)

In [343]:
# Get tokenized text
df['tokenized_text'] = df['tokens'].apply(word2doc)

In [344]:
# Get lemmas
df['lemmas'] = df['tokens'].apply(lemmatize_text)

In [345]:
# Get lemmatized text
df['lemmatized_text'] = df['lemmas'].apply(word2doc)

In [346]:
# Get POS filtered words
df['pos_filtered_words'] = df['lemmas'].apply(pos_filtering)

In [347]:
# Get POS filtered text
df['pos_filtered_text'] = df['pos_filtered_words'].apply(word2doc)

In [348]:
# Check head
df.head().T

Unnamed: 0,0,1,2,3,4
id,100035,100039,100187,100229,100564
complaint,post cardiac arrest asthma exacerbation,abdominal pain,sob,hypotension with elevated lactate code sepsis,svc thrombosis
underlying_factors,post cardiac arrest asthma exacerbation mr is ...,abdominal pain yo all in remission cord transp...,sob yo woman recurrent pes filter gib while an...,hypotension with elevated lactate code sepsis ...,svc thrombosis yo male with hx of rectal ca dm...
tokens,"[post, cardiac, arrest, asthma, exacerbation, ...","[abdominal, pain, yo, remission, cord, transpl...","[sob, yo, woman, recurrent, pes, filter, gib, ...","[hypotension, elevated, lactate, code, sepsis,...","[svc, thrombosis, yo, male, hx, rectal, dmii, ..."
tokenized_text,post cardiac arrest asthma exacerbation mr yea...,abdominal pain yo remission cord transplant an...,sob yo woman recurrent pes filter gib anticoag...,hypotension elevated lactate code sepsis yom p...,svc thrombosis yo male hx rectal dmii histopla...
lemmas,"[post, cardiac, arrest, asthma, exacerbation, ...","[abdominal, pain, yo, remission, cord, transpl...","[sob, yo, woman, recurrent, pe, filter, gib, a...","[hypotension, elevated, lactate, code, sepsis,...","[svc, thrombosis, yo, male, hx, rectal, dmii, ..."
lemmatized_text,post cardiac arrest asthma exacerbation mr yea...,abdominal pain yo remission cord transplant an...,sob yo woman recurrent pe filter gib anticoagu...,hypotension elevated lactate code sepsis yom p...,svc thrombosis yo male hx rectal dmii histopla...
pos_filtered_words,"[post, cardiac, arrest, asthma, exacerbation, ...","[abdominal, pain, yo, remission, cord, transpl...","[sob, yo, woman, recurrent, pe, filter, gib, a...","[hypotension, elevated, lactate, code, sepsis,...","[svc, thrombosis, yo, male, hx, rectal, dmii, ..."
pos_filtered_text,post cardiac arrest asthma exacerbation mr yea...,abdominal pain yo remission cord transplant an...,sob yo woman recurrent pe filter gib anticoagu...,hypotension elevated lactate code sepsis yom p...,svc thrombosis yo male hx rectal dmii histopla...


### 4. Modeling ###

1. Setup pipeline and parameters for LDA
2. Perform hyperparameter tuning for LDA
3. Use best parameters of model for topic modeling

In [349]:
# Define pipeline and parameters for LDA
# Two steps: 'bow' turns text into n-gram counts, 'lda' fits topics on those counts
pipeline_LDA = Pipeline([
                ('bow', CountVectorizer()),
                ('lda', LatentDirichletAllocation())]
                )
# Candidate values for the grid search; each key is '<step name>__<parameter name>'
params_LDA = {
            'bow__max_df': (0.6, 0.7, 0.8),
            'bow__ngram_range': [(2,2), (2,3), (3,3)],
            'lda__n_components': [4, 5, 6],
            'lda__learning_decay' : [0.6, 0.7, 0.8],
            'lda__max_iter': [9, 10, 11]
            }

In [350]:
# Perform hyperparameter tuning
def hyperparameter_tuning(pipeline, params, data, jobs = 4):
    """Grid-search a pipeline and report the best configuration.

    Args:
        pipeline: estimator (here a scikit-learn ``Pipeline``) to tune.
        params: parameter grid passed to ``GridSearchCV`` as ``param_grid``.
        data: training samples passed to ``fit``; no targets are supplied.
        jobs: number of parallel jobs passed to ``GridSearchCV`` as ``n_jobs``.

    Returns:
        Tuple ``(best_estimator, model)``: the refitted best estimator and the
        fitted ``GridSearchCV`` object.

    Raises:
        Exception: Any error raised while fitting a candidate is propagated,
            because ``error_score='raise'`` is used.
    """
    # No explicit scoring is passed, so candidates are ranked with the
    # pipeline's own score method.
    # return_train_score must be a bool: the legacy 'warn' value is no longer
    # accepted by recent scikit-learn and, being truthy, made older releases
    # compute train scores that are never used here.
    model = GridSearchCV(estimator = pipeline, param_grid = params, n_jobs = jobs,
                         return_train_score = False, error_score = 'raise')
    # Do the Grid Search
    model.fit(data)
    # Best Model
    best_estimator = model.best_estimator_
    # Model Parameters
    print("Best Params: ", model.best_params_)
    # Score
    print("Best Score: ", model.best_score_)
    return (best_estimator, model)

In [351]:
# Get best LDA model
(best_lda_model, gs_model) = hyperparameter_tuning(pipeline_LDA, params_LDA, df['pos_filtered_text'], jobs = 4)

Best Params:  {'bow__max_df': 0.7, 'bow__ngram_range': (3, 3), 'lda__learning_decay': 0.6, 'lda__max_iter': 11, 'lda__n_components': 4}
Best Score:  -50309.104896431614


In [352]:
# Get vector model
bow = best_lda_model.named_steps['bow']
# Get vectorized data (the duplicated chained assignment was removed)
count_vectors = bow.transform(df['pos_filtered_text'])
# Get topic model
lda = best_lda_model.named_steps['lda']
# Get LDA output
lda_output = lda.transform(count_vectors)

In [353]:
# Visualize topics
# NOTE(review): newer pyLDAvis releases appear to have renamed the
# pyLDAvis.sklearn module to pyLDAvis.lda_model - confirm against the
# installed version before upgrading.
# Create panel
lda_display = pyLDAvis.sklearn.prepare(lda, count_vectors, bow, sort_topics=False)
# Display panel
pyLDAvis.display(lda_display)

### 5.	Predictions ###
1. Identify topics (underlying factors) for existing documents (conditions)
2. Identify top features (factors) in each topic
3. Predict topic (underlying factors) for a new text (condition)

In [354]:
# Get document topic matrix
def create_document_topic_matrix(topic_data, data):
    """Build a document-by-topic table with the dominant topic of each document.

    Args:
        topic_data: 2-D array-like of topic scores, one row per document and
            one column per topic.
        data: data frame with an ``id`` column, one row per document in the
            same order as ``topic_data``.

    Returns:
        pandas.DataFrame indexed by ``"Doc <id>"`` with one ``"Topic <k>"``
        column per topic (scores rounded to 4 decimals) and a ``"Main Topic"``
        column holding the 1-based number of the highest-scoring topic.
    """
    rounded = np.round(np.asarray(topic_data), 4)
    # Derive the topic count from the data itself rather than from the global
    # ``lda`` model, so the function works for any topic matrix.
    top_names = ["Topic " + str(c + 1) for c in range(rounded.shape[1])]
    # Create doc names
    doc_names = ["Doc " + str(rec) for rec in data['id'].values]
    # Create data frame
    df_doc_top = pd.DataFrame(rounded, columns = top_names, index = doc_names)
    # Main topic is picked on the rounded scores; +1 makes it 1-based like the column names
    df_doc_top['Main Topic'] = np.argmax(rounded, axis=1) + 1
    return df_doc_top

In [355]:
# Get topic feature(word) matrix
def create_topic_word_matrix(topic_model = lda, vector_model = bow):
    """Tabulate the weight of every vocabulary feature in every topic.

    Args:
        topic_model: fitted topic model exposing ``components_`` and
            ``n_components``. Defaults to the ``lda`` object that exists when
            this function is defined.
        vector_model: fitted vectorizer that can list its feature names.
            Defaults to the ``bow`` object that exists when this function is
            defined.

    Returns:
        pandas.DataFrame with one ``"Topic <k>"`` row per topic and one column
        per feature name, holding ``topic_model.components_``.
    """
    # scikit-learn 1.0 added get_feature_names_out and 1.2 removed
    # get_feature_names; prefer the new name and fall back to the old one.
    list_features = getattr(vector_model, 'get_feature_names_out', None)
    if list_features is None:
        list_features = vector_model.get_feature_names
    # Create data frame
    df_top_word = pd.DataFrame(topic_model.components_)
    # Set column and index
    df_top_word.columns = list_features()
    df_top_word.index = ["Topic " + str(c + 1) for c in range(topic_model.n_components)]
    return df_top_word

In [356]:
# Get top n words for each topic
def get_top_words(vector_model = bow, topic_model = lda, top_words = 20):
    """Return the highest-weighted features of every topic.

    Args:
        vector_model: fitted vectorizer that can list its feature names.
            Defaults to the ``bow`` object that exists when this function is
            defined.
        topic_model: fitted topic model exposing ``components_``. Defaults to
            the ``lda`` object that exists when this function is defined.
        top_words: number of features to return per topic.

    Returns:
        list with one numpy array per topic, each holding up to ``top_words``
        feature names ordered from highest to lowest weight.
    """
    # scikit-learn 1.0 added get_feature_names_out and 1.2 removed
    # get_feature_names; prefer the new name and fall back to the old one.
    list_features = getattr(vector_model, 'get_feature_names_out', None)
    if list_features is None:
        list_features = vector_model.get_feature_names
    words = np.array(list_features())
    topic_words = []
    for topic_weights in topic_model.components_:
        # Negate so that argsort's ascending order yields the largest weights first
        top_words_pos = (-topic_weights).argsort()[:top_words]
        topic_words.append(words.take(top_words_pos))
    return topic_words

In [357]:
# Predict topic for given condition
def predict_topic(list_of_conditions, topic_words, vector_model = bow, topic_model = lda):
    """Assign each free-text condition to its highest-scoring topic.

    The conditions go through the same steps this file applies to the
    training text (clean, tokenize, lemmatize, POS filter) before they are
    vectorized and transformed by the topic model.

    Args:
        list_of_conditions: list of condition strings to look up.
        topic_words: data frame with one row per topic holding that topic's
            top words; rows are selected by position.
        vector_model: fitted vectorizer used to transform the prepared text.
            Defaults to the ``bow`` object that exists when this function is
            defined.
        topic_model: fitted topic model used to score the vectors. Defaults
            to the ``lda`` object that exists when this function is defined.

    Returns:
        pandas.DataFrame indexed ``"Query <n>"`` with a ``"Search Condition"``
        column (the prepared query text) and a ``"Topic Words"`` column (the
        list of words of the highest-scoring topic).
    """
    # One row per query, labelled "Query 1", "Query 2", ...
    queries = pd.DataFrame(list_of_conditions)
    queries.columns = ['Search Condition']
    queries.index = ['Query ' + str(i+1) for i in range(len(list_of_conditions))]
    queries = clean(queries)
    # tokens -> lemmas -> POS filtered words -> text; only the final text is used below
    prepared_text = (queries['Search Condition']
                     .apply(tokenize_text)
                     .apply(lemmatize_text)
                     .apply(pos_filtering)
                     .apply(word2doc))
    # Vectorize the queries, then score them against every topic
    topic_proba_scores = topic_model.transform(vector_model.transform(prepared_text))
    # For every query keep the word list of its highest-scoring topic
    best_topic_words = [topic_words.iloc[np.argmax(scores), :].values.tolist()
                        for scores in topic_proba_scores]
    # Create data frame to return
    df_topics = pd.DataFrame(list(zip(prepared_text.values, best_topic_words)))
    df_topics.columns = ['Search Condition','Topic Words']
    df_topics.index = queries.index
    return df_topics

In [358]:
# Get document topic data frame
document_topic = create_document_topic_matrix(lda_output, df)
# Check head
document_topic.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Main Topic
Doc 100035,0.0018,0.0018,0.9945,0.0018,3
Doc 100039,0.9936,0.0021,0.0021,0.0021,1
Doc 100187,0.0012,0.0012,0.0012,0.9964,4
Doc 100229,0.001,0.001,0.001,0.997,4
Doc 100564,0.0029,0.9912,0.0029,0.0029,2


In [359]:
# Check topic distribution
topic_distribution = document_topic['Main Topic'].value_counts().reset_index(name = 'Total Documents')
topic_distribution.columns = ['Topic', 'Total Documents']
topic_distribution.head()

Unnamed: 0,Topic,Total Documents
0,2,88
1,1,79
2,4,71
3,3,65


In [360]:
# Check topic word matrix
df_top_word = create_topic_word_matrix(lda, bow)
df_top_word.T.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4
aaa cm av,0.250002,1.249995,0.250002,0.250002
aaa graft infected,0.250001,0.250001,1.249997,0.250001
aaa graft site,1.249986,0.250004,0.250005,0.250005
aaa hypercarbia secondary,0.250007,1.249979,0.250007,0.250007
aaa left renal,0.25,0.25,0.25,2.249999


In [361]:
# Get top words
topic_words = get_top_words(bow, lda, 15)        

# Create topic words data frame
df_top_words = pd.DataFrame(topic_words)
df_top_words.columns = ['Word '+ str(i+1) for i in range(df_top_words.shape[1])]
df_top_words.index = ['Topic '+ str(i+1) for i in range(df_top_words.shape[0])]

# Check data
df_top_words.head()

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15
Topic 1,congestive heart failure,altered mental status,acute renal failure,admission discharge service,unit admission discharge,dictated medquist job,hr bp rr,high dose il,red blood cell,packed red blood,left upper extremity,gastroesophageal reflux disease,pain nausea vomiting,upper gastrointestinal bleed,coronary artery disease
Topic 2,denies fever chill,coronary artery disease,chronic obstructive pulmonary,obstructive pulmonary disease,intensive care unit,chill night sweat,dictated medquist job,fever chill night,denies chest pain,year old male,review system hpi,nausea vomiting diarrhea,vomiting diarrhea constipation,tenderness rhinorrhea congestion,sinus tenderness rhinorrhea
Topic 3,tablet sig tablet,congestive heart failure,tablet delayed release,disp tablet refill,dictated medquist job,chronic obstructive pulmonary,obstructive pulmonary disease,tablet daily daily,sig tablet daily,ejection fraction percent,year old male,nausea vomiting diarrhea,urinary tract infection,year old woman,acute renal failure
Topic 4,admission discharge service,unit admission discharge,intensive care unit,dictated medquist job,coronary artery disease,mouth twice day,urinary tract infection,fingersticks unit fingersticks,unit fingersticks unit,hr bp rr,denies chest pain,day prior admission,discharge service addendum,discharge disposition extended,extended care facility


In [366]:
# Define new search conditions
list_of_conditions = ['congestive heart failure', 'denies fever chills', 'frequent vomiting, diarrhea, and nausea', 'admission discharge service']
# Get topics for new conditions
df_condition_factors = predict_topic(list_of_conditions, df_top_words, bow, lda)
# Display topics
df_condition_factors.head()

Unnamed: 0,Search Condition,Topic Words
Query 1,congestive heart failure,"[tablet sig tablet, congestive heart failure, ..."
Query 2,denies fever chill,"[denies fever chill, coronary artery disease, ..."
Query 3,frequent vomiting diarrhea nausea,"[congestive heart failure, altered mental stat..."
Query 4,admission discharge service,"[admission discharge service, unit admission d..."


### Thank you for your time !!! ###