## <font color = darkblue>  Latent Semantic Analysis (also known as Latent Semantic Indexing)


In [2]:

import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import time

print(time.strftime("%H:%M:%S"))


# ----------------------------------
# load patient notes data
# ----------------------------------

df = pd.read_csv('../Assignment1/patient_notes.csv')

df.shape
print(time.strftime("%H:%M:%S"))


df.head()



15:49:26
15:49:26


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


## <font color = darkblue>  Text Preprocessing


In [3]:
# ----------------------------------
# Remove punctuation
# ----------------------------------

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

df['pn_history'] = df['pn_history'].apply(remove_punctuation)

# ----------------------------------
# Remove stop words
# ----------------------------------
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

df['pn_history'] = df['pn_history'].apply(remove_stop_words)

# ----------------------------------
# Lower case
# ----------------------------------
def lower_case(text):
    return text.lower()

df['pn_history'] = df['pn_history'].apply(lower_case)

# ----------------------------------
# Tokenization
# ----------------------------------
def tokenize(text):
    return text.split()

df['pn_history'] = df['pn_history'].apply(tokenize)

# ----------------------------------
# # Stemming
# ----------------------------------
# from nltk.stem import PorterStemmer

# stemmer = PorterStemmer()

# def stem(text):
#     return [stemmer.stem(word) for word in text]

# df['note'] = df['note'].apply(stem)

# # Lemmatization
# from nltk.stem import WordNetLemmatizer

# lemmatizer = WordNetLemmatizer()

# def lemmatize(text):

#     return [lemmatizer.lemmatize(word) for word in text]

# df['note'] = df['note'].apply(lemmatize)

# ----------------------------------
# Convert to string
# ----------------------------------
def to_string(text):
    return ' '.join(text)

df['pn_history'] = df['pn_history'].apply(to_string)

print(df.head())


vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['pn_history'])

X.shape


# convert X to a pandas dataframe
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caoyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   pn_num  case_num                                         pn_history
0       0         0  17yearold male come student health clinic comp...
1       1         0  17 yo male recurrent palpitations past 3 mo la...
2       2         0  dillon cleveland 17 yo male patient significan...
3       3         0  17 yo co palpitation started 3 mos ago nothing...
4       4         0  17yo male pmh evaluation palpitations states l...


Unnamed: 0,00,000,0000,004am,00h,01,010,010510,011,0110,...,zeromonth,zexually,zigzag,ziminopril,zno,zolpidem,zone,zones,zopidem,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## <font color = darkblue>  Alternatively Apply tfidfvectorizer



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_tfidf  = vectorizer.fit_transform(df['pn_history'])

X_tfidf.shape



(42146, 64598)

In [5]:
# -------------------------------------------------------------------------------------------
# Expand stop words list by adding numbers from 1 - 100000, Also add words that have digits in them
# -------------------------------------------------------------------------------------------

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

stop_words2  = list(stop_words)

for i in range(1, 100000):
    stop_words2.append(str(i))

for word in vectorizer.get_feature_names_out():
    if any(char.isdigit() for char in word):
        stop_words2.append(word)


vectorizer = TfidfVectorizer(stop_words=stop_words2, max_features=1000)

X_tfidf2  = vectorizer.fit_transform(df['pn_history'])

X_tfidf2.shape


# convert X_tfidf2 to a pandas dataframe
X_tfidf2_df = pd.DataFrame(X_tfidf2.toarray(), columns=vectorizer.get_feature_names_out())

X_tfidf2_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caoyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,abd,abdomen,abdominal,able,abnormal,accident,accompanied,ache,aches,aching,...,worsens,worst,would,wt,year,years,yesterday,yo,yr,yrs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.23489,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083552,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.140955,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047798,0.0,0.0
3,0.0,0.0,0.106871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057753,0.166152,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## <font color = darkblue> Back to CountVectorizer

In [6]:
# Expand stop words list by adding numbers from 1 - 100000, Also add words that have digits in them
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

stop_words2  = list(stop_words)

for i in range(1, 100000):
    stop_words2.append(str(i))

for word in vectorizer.get_feature_names_out():
    if any(char.isdigit() for char in word):
        stop_words2.append(word)


vectorizer = CountVectorizer(stop_words=stop_words2)

X = vectorizer.fit_transform(df['pn_history'])

X.shape


# convert X to a pandas dataframe
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caoyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,00,000,0000,004am,00h,01,010,010510,011,0110,...,zeromonth,zexually,zigzag,ziminopril,zno,zolpidem,zone,zones,zopidem,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.decomposition import PCA

# Create the PCA model with 100 components
pca = PCA(n_components=100)

# Fit the model to the DTM
pca.fit(X.toarray())

# Calculate the cumulative explained variance ratio
explained_variance_ratio_cumulative = np.cumsum(pca.explained_variance_ratio_)

# Find the number of components that explain 90% of the variance
num_components = np.argmax(explained_variance_ratio_cumulative >= 0.9) + 1

# Reduce the dimensionality of the DTM to the selected number of components
X_reduced = pca.transform(X.toarray())[:, :num_components]

# Print the shape of the reduced DTM
print("Shape of the reduced DTM:", X_reduced.shape)

print(X_reduced.shape)

X_reduced

# Print the number of components that explain 90% of the variance
print("Number of components:", num_components)

# Create a DataFrame with the reduced DTM
X_reduced_df = pd.DataFrame(X_reduced, columns=[f"PC_{i+1}" for i in range(num_components)])


# Merge it with the orifinal df
df = pd.concat([df, X_reduced_df], axis=1)

df.head()

In [44]:
# ----------------------------------
# Latent Semantic Analysis
# ----------------------------------

# LSA
lsa = TruncatedSVD(n_components=90)
lsa.fit(X)

# ---------------------------
# Extracting LSA compontens
# ---------------------------
print(lsa.components_.shape)
print(lsa.components_)

explained_variance_ratio = lsa.explained_variance_ratio_
for i, ratio in enumerate(explained_variance_ratio):
    print(f"Component {i+1}: {ratio:.2%} variance explained")

explained_variance_ratio = lsa.explained_variance_ratio_
total_variance = 0
num_components = 0

for i, ratio in enumerate(explained_variance_ratio):
    total_variance += ratio
    num_components += 1
    if total_variance >= 0.90:  # Change the percentage as per your requirement
        break

print(f"Number of components that explain at least 90% variance: {num_components}")


(90, 1790)
[[ 0.00170971  0.00860116  0.00333325 ...  0.00351005  0.02855242
   0.0018558 ]
 [ 0.00153775  0.01691541  0.00938176 ... -0.00638481  0.00660773
   0.00051902]
 [ 0.0002745   0.00437036  0.00130679 ... -0.00287603 -0.02371675
  -0.00340923]
 ...
 [-0.02240225  0.0052338   0.00276324 ...  0.02223586  0.03875896
  -0.01826308]
 [-0.00978506 -0.01322255  0.01278545 ...  0.02236606 -0.03451825
  -0.02097498]
 [ 0.00794157  0.00379635 -0.00193329 ... -0.0049159   0.01448065
  -0.01911571]]
Component 1: 1.81% variance explained
Component 2: 8.04% variance explained
Component 3: 4.08% variance explained
Component 4: 3.98% variance explained
Component 5: 3.42% variance explained
Component 6: 2.66% variance explained
Component 7: 2.37% variance explained
Component 8: 2.21% variance explained
Component 9: 2.05% variance explained
Component 10: 1.97% variance explained
Component 11: 1.80% variance explained
Component 12: 1.72% variance explained
Component 13: 1.70% variance explained

In [45]:
# # ---------------------------
# # Extracting LSA compontens
# # ---------------------------

# print(lsa.components_.shape)
# print(lsa.components_)

# explained_variance_ratio = lsa.explained_variance_ratio_
# for i, ratio in enumerate(explained_variance_ratio):
#     print(f"Component {i+1}: {ratio:.2%} variance explained")

# explained_variance_ratio = lsa.explained_variance_ratio_
# total_variance = 0
# num_components = 0

# for i, ratio in enumerate(explained_variance_ratio):
#     total_variance += ratio
#     num_components += 1
#     if total_variance >= 0.90:  # Change the percentage as per your requirement
#         break

# print(f"Number of components that explain at least 90% variance: {num_components}")


In [46]:
# -------------------------------------------------------------------------------------------
# Describe the first 5 components of LSA by looking at the top 10 words in each component
# Get the feature names
# -------------------------------------------------------------------------------------------

feature_names = vectorizer.get_feature_names_out()
print(feature_names)

# Get the top 10 words for each component
for i, component in enumerate(lsa.components_[:5]):
    top_words = [feature_names[j] for j in component.argsort()[-10:]]
    print(f"Component {i+1}:", top_words)
    



['04' '051' '051ppd' ... 'yr' 'yrs' 'zigzag']
Component 1: ['meds', 'years', 'yo', 'months', 'pmh', 'weeks', 'ago', 'denies', 'none', 'pain']
Component 2: ['construction', 'food', 'patient', 'uncle', 'week', 'epigastric', 'tums', 'motrin', 'back', 'pain']
Component 3: ['normal', 'last', 'episodes', 'days', 'every', 'day', 'none', 'periods', 'ago', 'months']
Component 4: ['year', 'cancer', 'vaginal', 'htn', 'irregular', 'past', 'every', 'denies', 'days', 'periods']
Component 5: ['days', 'ibuprofen', 'use', 'periods', 'headache', 'since', 'patient', 'denies', 'yesterday', 'none']


In [47]:
# ------------------------------------
# Extract scores for each component
# ------------------------------------
component_scores = lsa.transform(X)
print(component_scores.shape)


# Add all the components scores to the dataframe
for i in range(num_components):
    df[f'LSA_{i+1}'] = component_scores[:, i]

df.head()

(100, 90)


Unnamed: 0,pn_num,case_num,pn_history,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,...,LSA_65,LSA_66,LSA_67,LSA_68,LSA_69,LSA_70,LSA_71,LSA_72,LSA_73,LSA_74
0,540,0,patient 17 year old male presents due 23 month...,-0.640445,0.816118,-1.539145,1.983328,0.223373,1.379197,0.728186,...,0.009001,0.570278,0.458625,0.1304,-0.225655,0.302108,-0.015225,-0.27193,-0.749545,1.363679
1,1245,0,17 yo m patients comes office co palpitations ...,-1.261102,0.837092,-1.406931,1.063434,0.538726,2.264147,-1.067496,...,-0.076688,-0.24143,0.680854,-0.570548,0.683425,0.211308,-0.435842,-0.651714,-0.213078,0.61741
2,1848,0,pt 17 yo m presenting palpitations over past t...,-1.173276,1.152738,-0.616357,1.531858,1.960402,0.765176,2.719037,...,-0.949345,-1.193617,-0.575358,-0.023955,-0.100563,-0.14171,0.485539,0.960647,0.100114,-1.282241
3,10603,1,20 yo f presents ed right lower quadrant pain ...,1.447289,1.770569,-1.013297,0.882138,-0.705942,1.155483,-0.522995,...,0.757571,0.241125,-0.980508,0.334099,-1.340436,-0.748753,-0.162576,-0.761103,-0.279417,-0.933587
4,10897,1,ms powelton 20 yo f co abdominal pain abdomina...,0.597881,1.458727,0.222601,2.373978,-1.228166,0.763608,-2.129697,...,0.868775,0.302114,0.779619,-0.598002,-0.297459,2.194918,0.223274,-0.570086,0.002412,-0.9142


In [49]:
list(df)
df.loc[:,['PC_1', 'LSA_1']]

Unnamed: 0,PC_1,LSA_1
0,-0.640445,5.050122
1,-1.261102,5.313515
2,-1.173276,5.596960
3,1.447289,5.660821
4,0.597881,6.139865
...,...,...
95,-0.418901,5.361001
96,2.073084,6.634255
97,-0.864343,2.640556
98,0.622461,4.600048


In [52]:
# ----------------------------------------------------------------------
# Testing if LSA and PCA create similar components
# Running a correlation matrix of first five components of PCA and LSA
# ----------------------------------------------------------------------


column_names = [f'PC_{i+1}' for i in range(5)] + [f'LSA_{i+1}' for i in range(5)]
df.loc[:, column_names].corr()


Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,LSA_1,LSA_2,LSA_3,LSA_4,LSA_5
PC_1,1.0,4.599583e-15,1.068756e-15,-7.948522e-16,2.178643e-16,0.534981,0.990092,0.085329,-0.014594,0.02669
PC_2,4.599583e-15,1.0,-3.300208e-16,-6.762344e-16,-2.437471e-16,0.644769,-0.124464,0.947152,-0.127,0.150809
PC_3,1.068756e-15,-3.300208e-16,1.0,9.123664e-16,-9.367883e-17,-0.030327,0.004976,0.142869,0.99,-0.017272
PC_4,-7.948522e-16,-6.762344e-16,9.123664e-16,1.0,3.6179760000000003e-17,0.139471,-0.020879,-0.177857,0.043569,0.981033
PC_5,2.178643e-16,-2.437471e-16,-9.367883e-17,3.6179760000000003e-17,1.0,0.106728,-0.014136,-0.065294,0.01333,-0.047283
LSA_1,0.5349814,0.644769,-0.03032653,0.1394708,0.1067282,1.0,0.414093,0.519738,-0.09277,0.19047
LSA_2,0.9900915,-0.1244643,0.004976105,-0.0208794,-0.01413566,0.414093,1.0,-0.016314,0.002912,-0.005978
LSA_3,0.08532938,0.9471523,0.1428687,-0.1778572,-0.06529445,0.519738,-0.016314,1.0,0.003655,-0.007504
LSA_4,-0.01459351,-0.1269996,0.9899999,0.04356931,0.01333003,-0.09277,0.002912,0.003655,1.0,0.001339
LSA_5,0.02669007,0.1508093,-0.0172724,0.9810328,-0.04728346,0.19047,-0.005978,-0.007504,0.001339,1.0
