## <font color = darkblue>  Latent Dirichlett Analysis (LDA)


In [11]:

import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import time

print(time.strftime("%H:%M:%S"))


# ----------------------------------
# load patient notes data
# ----------------------------------

df = pd.read_csv('/Users/p.mittal/Library/Mobile Documents/com~apple~CloudDocs/Roux/Courses/NLP/Assignments/Assignment 1 BOW/Subset_patient_notes.csv')

df.shape
print(time.strftime("%H:%M:%S"))


df.head()



17:32:52
17:32:52


Unnamed: 0,pn_num,case_num,pn_history
0,540,0,Patient is a 17 year old male who presents due...
1,1245,0,17 yo M patients comes to the office c/o palpi...
2,1848,0,Pt is a 17 yo M presenting with palpitations. ...
3,10603,1,20 yo F presents in ED with Right lower quadra...
4,10897,1,Ms. Powelton is a 20 yo F c/o abdominal pain.\...


## <font color = darkblue>  Text Preprocessing


In [12]:
# ----------------------------------
# Remove punctuation
# ----------------------------------

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

df['pn_history'] = df['pn_history'].apply(remove_punctuation)

# ----------------------------------
# Remove stop words
# ----------------------------------
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

df['pn_history'] = df['pn_history'].apply(remove_stop_words)

# ----------------------------------
# Lower case
# ----------------------------------
def lower_case(text):
    return text.lower()

df['pn_history'] = df['pn_history'].apply(lower_case)

# ----------------------------------
# Tokenization
# ----------------------------------
def tokenize(text):
    return text.split()

df['pn_history'] = df['pn_history'].apply(tokenize)

# ----------------------------------
# # Stemming
# ----------------------------------
# from nltk.stem import PorterStemmer

# stemmer = PorterStemmer()

# def stem(text):
#     return [stemmer.stem(word) for word in text]

# df['note'] = df['note'].apply(stem)

# # Lemmatization
# from nltk.stem import WordNetLemmatizer

# lemmatizer = WordNetLemmatizer()

# def lemmatize(text):

#     return [lemmatizer.lemmatize(word) for word in text]

# df['note'] = df['note'].apply(lemmatize)

# ----------------------------------
# Convert to string
# ----------------------------------
def to_string(text):
    return ' '.join(text)

df['pn_history'] = df['pn_history'].apply(to_string)

print(df.head())


vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['pn_history'])

X.shape


# convert X to a pandas dataframe
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X_df.head()

   pn_num  case_num                                         pn_history
0     540         0  patient 17 year old male presents due 23 month...
1    1245         0  17 yo m patients comes office co palpitations ...
2    1848         0  pt 17 yo m presenting palpitations over past t...
3   10603         1  20 yo f presents ed right lower quadrant pain ...
4   10897         1  ms powelton 20 yo f co abdominal pain abdomina...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/p.mittal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,04,051,051ppd,10,1015,1020,10y,11,12,1200,...,yesterday,yo,yof,yom,young,youngest,yperthyroidism,yr,yrs,zigzag
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,2,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [13]:
# ALtenatively Apply tfidfvectorizer instead
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_tfidf  = vectorizer.fit_transform(df['pn_history'])

X_tfidf.shape



(100, 1883)

In [14]:
# Expand stop words list by adding numbers from 1 - 100000, Also add words that have digits in them
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

stop_words2  = list(stop_words)

for i in range(1, 100000):
    stop_words2.append(str(i))

for word in vectorizer.get_feature_names_out():
    if any(char.isdigit() for char in word):
        stop_words2.append(word)


vectorizer = TfidfVectorizer(stop_words=stop_words2, max_features=1000)

X_tfidf2  = vectorizer.fit_transform(df['pn_history'])

X_tfidf2.shape


# convert X to a pandas dataframe
# X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# X_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/p.mittal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(100, 1000)

In [15]:
# Expand stop words list by adding numbers from 1 - 100000, Also add words that have digits in them
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

stop_words2  = list(stop_words)

for i in range(1, 100000):
    stop_words2.append(str(i))

for word in vectorizer.get_feature_names_out():
    if any(char.isdigit() for char in word):
        stop_words2.append(word)


vectorizer = CountVectorizer(stop_words=stop_words2)

X = vectorizer.fit_transform(df['pn_history'])

X.shape


# convert X to a pandas dataframe
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/p.mittal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,04,051,051ppd,10y,125mg,12x,12xday,12xmonth,1530min,17yearold,...,yesterday,yo,yof,yom,young,youngest,yperthyroidism,yr,yrs,zigzag
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [16]:
from sklearn.decomposition import LatentDirichletAllocation
# ----------------------------------
# Create topics from the DTM
# ----------------------------------


# Set the number of topics
num_topics = 10

# Create the LDA model
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# Fit the model to the term-document matrix
lda.fit(X)

# Get the topics and their corresponding word probabilities
topics = lda.components_

# Print the top words for each topic
for topic_idx, topic in enumerate(topics):
    top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-10:-1]]
    print(f"Topic {topic_idx+1}: {' '.join(top_words)}")



Topic 1: pain yrs denies pmh since motrin negative relief back
Topic 2: none denies hx meds episodes palpitations constant pain weeks
Topic 3: pain denies none years per recently nausea week diarrhea
Topic 4: weeks yo none years weight ros pain past ago
Topic 5: denies weeks pt mother asleep nervousness son feels falling
Topic 6: denies since pain yesterday ago weeks presents change sleep
Topic 7: pain none patient yo week weeks pmh per ago
Topic 8: pain ago chest morning change exercise trauma tried active
Topic 9: none ago denies periods days months changes years weeks
Topic 10: ago none pain chest denies palpitations use recent meds


In [17]:
# Identify the topic for each document
topic_assignments = lda.transform(X)

# Print the topic assignments for the first document
print(topic_assignments[0])


[0.00112378 0.00112373 0.0011238  0.00112373 0.00112373 0.00112367
 0.00112383 0.00112367 0.00112393 0.98988612]


In [18]:
# Create a column with the topic assignments
df['topic'] = topic_assignments.argmax(axis=1)

# Print the first few rows of the dataframe
print(df.head())


   pn_num  case_num                                         pn_history  topic
0     540         0  patient 17 year old male presents due 23 month...      9
1    1245         0  17 yo m patients comes office co palpitations ...      9
2    1848         0  pt 17 yo m presenting palpitations over past t...      9
3   10603         1  20 yo f presents ed right lower quadrant pain ...      2
4   10897         1  ms powelton 20 yo f co abdominal pain abdomina...      6


In [19]:
df['topic'].value_counts()[0:2]

8    26
6    19
Name: topic, dtype: int64

In [20]:
# Write code to include topic probabilities in the dataframe
# Create a dataframe with the topic probabilities
topic_probs = pd.DataFrame(topic_assignments, columns=[f"topic_{i}" for i in range(num_topics)])

# Concatenate the topic probabilities dataframe with the original dataframe
df = pd.concat([df, topic_probs], axis=1)

# Print the first few rows of the dataframe
print(df.head())


   pn_num  case_num                                         pn_history  topic  \
0     540         0  patient 17 year old male presents due 23 month...      9   
1    1245         0  17 yo m patients comes office co palpitations ...      9   
2    1848         0  pt 17 yo m presenting palpitations over past t...      9   
3   10603         1  20 yo f presents ed right lower quadrant pain ...      2   
4   10897         1  ms powelton 20 yo f co abdominal pain abdomina...      6   

    topic_0   topic_1   topic_2   topic_3   topic_4   topic_5   topic_6  \
0  0.001124  0.001124  0.001124  0.001124  0.001124  0.001124  0.001124   
1  0.001021  0.001021  0.001021  0.001021  0.001021  0.001021  0.001021   
2  0.000981  0.000980  0.000981  0.000981  0.000981  0.000980  0.000981   
3  0.001163  0.001163  0.989533  0.001163  0.001163  0.001163  0.001163   
4  0.001205  0.001205  0.001205  0.001205  0.001205  0.001205  0.507267   

    topic_7   topic_8   topic_9  
0  0.001124  0.001124  0.989

## <font color =darkblue> Correlated Topic Modeling (CTM)

In [23]:
from gensim.models import LdaModel

# Create a dictionary from the tokenized text
dictionary = gensim.corpora.Dictionary(df['pn_history'].apply(tokenize))

# Create a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in df['pn_history'].apply(tokenize)]

# # Train the CTM model
ctm_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)
# type(corpus)
corpus[0]


print(ctm_model.show_topic(0))
print(ctm_model.show_topic(1))
print(ctm_model.show_topic(2))
print(ctm_model.show_topic(4))

# Extract scores for each topic
topic_scores = ctm_model.get_document_topics(corpus)

# Add all the topics scores to the dataframe
for i in range(5):
    df[f'CTM_topic_{i+1}'] = [score for score in topic_scores]

df.shape




<IPython.core.display.Javascript object>

[('pain', 0.009291814), ('ago', 0.009101729), ('she', 0.0088106515), ('2', 0.008089366), ('no', 0.00804796), ('weeks', 0.007912603), ('none', 0.0067558917), ('denies', 0.0062724226), ('past', 0.0060825096), ('years', 0.005965885)]
[('none', 0.0143132415), ('pain', 0.013297848), ('ago', 0.012284461), ('2', 0.010037972), ('she', 0.00998282), ('weeks', 0.008909963), ('denies', 0.008081368), ('months', 0.0077617043), ('pmh', 0.007460067), ('no', 0.0072768335)]
[('pain', 0.020417651), ('she', 0.017842073), ('none', 0.014244257), ('denies', 0.012732814), ('no', 0.011942729), ('ago', 0.009944201), ('yo', 0.0071742786), ('2', 0.007144319), ('weeks', 0.0064724293), ('3', 0.0056549776)]
[('pain', 0.013665644), ('no', 0.012341363), ('she', 0.011489299), ('ago', 0.009771118), ('none', 0.009641998), ('2', 0.009535229), ('denies', 0.009341725), ('weeks', 0.0066018817), ('use', 0.006506514), ('nausea', 0.0060637426)]


(100, 19)

In [24]:
df.head()

Unnamed: 0,pn_num,case_num,pn_history,topic,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,CTM_topic_1,CTM_topic_2,CTM_topic_3,CTM_topic_4,CTM_topic_5
0,540,0,patient 17 year old male presents due 23 month...,9,0.001124,0.001124,0.001124,0.001124,0.001124,0.001124,0.001124,0.001124,0.001124,0.989886,"[(1, 0.5530196), (2, 0.44103643)]","[(1, 0.5492442), (2, 0.44481185)]","[(1, 0.5539463), (2, 0.44010967)]","[(1, 0.55023694), (2, 0.4438191)]","[(1, 0.5553565), (2, 0.43869945)]"
1,1245,0,17 yo m patients comes office co palpitations ...,9,0.001021,0.001021,0.001021,0.001021,0.001021,0.001021,0.001021,0.00102,0.158528,0.833307,"[(1, 0.93574804), (4, 0.058647506)]","[(1, 0.92730355), (4, 0.06709205)]","[(1, 0.93035775), (4, 0.064037845)]","[(1, 0.92205465), (4, 0.072340965)]","[(1, 0.92927194), (4, 0.06512364)]"
2,1848,0,pt 17 yo m presenting palpitations over past t...,9,0.000981,0.00098,0.000981,0.000981,0.000981,0.00098,0.000981,0.00098,0.000981,0.991175,"[(2, 0.97770005), (4, 0.017021244)]","[(2, 0.97132915), (4, 0.023392161)]","[(2, 0.9714798), (4, 0.023241589)]","[(2, 0.98557734)]","[(2, 0.97566473), (4, 0.019056587)]"
3,10603,1,20 yo f presents ed right lower quadrant pain ...,2,0.001163,0.001163,0.989533,0.001163,0.001163,0.001163,0.001163,0.001163,0.001163,0.001163,"[(2, 0.99178284)]","[(2, 0.9917852)]","[(2, 0.9917835)]","[(2, 0.99178535)]","[(2, 0.9917832)]"
4,10897,1,ms powelton 20 yo f co abdominal pain abdomina...,6,0.001205,0.001205,0.001205,0.001205,0.001205,0.001205,0.507267,0.001205,0.483094,0.001205,"[(2, 0.99166924)]","[(2, 0.991671)]","[(2, 0.9916698)]","[(2, 0.99167186)]","[(2, 0.99167204)]"
