### This notebook creates a structured DataFrame from the patient notes data

Importing all necessary libraries

In [168]:
# All necessary imports
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

from time import time
import numpy as np

import os
from sklearn.utils import resample
import re

from sklearn.decomposition import PCA
from sklearn.manifold import (
    TSNE,
    Isomap,
    LocallyLinearEmbedding,
)
import umap

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

import plotly.express as px

from sklearn import metrics
import warnings
warnings.simplefilter(action='ignore')


Reading in the data and taking a sample

NOTE: in the interest of time, I took a sample size of 10,000 notes. The output of the methods employed here will differ based on the sample size used, as that affects the data that the models see.

In [169]:

patient = pd.read_csv("patient_notes.csv")
print(f"Original data shape: {patient.shape}")
n = 10000

# Draw a reproducible simple random sample of distinct notes.
# resample() bootstraps (replace=True) by default, which would put duplicate
# notes into the sample; replace=False avoids that, and the fixed seed makes
# Restart & Run All return the same rows every time.
# min() guards against a file with fewer than n rows, which replace=False rejects.
patient = resample(
    patient,
    replace=False,
    n_samples=min(n, len(patient)),
    random_state=42,
)
print(f"Sample data shape: {patient.shape}")
patient.head()

Original data shape: (42146, 3)
Sample data shape: (10000, 3)


Unnamed: 0,pn_num,case_num,pn_history
12778,37895,3,Mr Hamilton is a 35 year old male complaining ...
3583,20555,2,44 year old f with 3 year history or unpredict...
29227,70540,7,35 yo F complains of irregular menstrual cycle...
35191,82503,8,67 yo F present with insomina for the last 3 w...
37385,90404,9,20-year-old female co headache x 1 day \r\n* s...


Cleaning html tags, emails, numbers, and punctuation from the patient notes.

In [170]:
# ---------------------------------------------------------------
# NOTE: the NLTK resources needed for preprocessing are downloaded in a later cell
# ---------------------------------------------------------------

# -------------------------
# Text Cleaning Function
# -------------------------
def clean_text(text):
    """
    Strip HTML tags, e-mail addresses, digits and punctuation from a note.

    Parameters:
    - text: The raw note text. A missing value (None or a float NaN, which is
      what pandas produces for an empty CSV cell) is treated as an empty note.

    Returns:
    - The cleaned text. Letter case and whitespace are left untouched; removed
      characters are deleted outright, so "20-year-old" becomes "yearold".
    """
    # Missing notes would make re.sub raise TypeError; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Removing HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Removing emails (must run before punctuation removal strips the "@")
    text = re.sub(r'\S+@\S+', '', text)

    # Removing numbers
    text = re.sub(r'\d+', '', text)

    # Removing punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    return text


# -------------------------
# Store a cleaned copy of every note next to the raw text
# -------------------------
patient["pn_history_clean"] = patient["pn_history"].map(clean_text)

patient.head()

Unnamed: 0,pn_num,case_num,pn_history,pn_history_clean
12778,37895,3,Mr Hamilton is a 35 year old male complaining ...,Mr Hamilton is a year old male complaining up...
3583,20555,2,44 year old f with 3 year history or unpredict...,year old f with year history or unpredictabl...
29227,70540,7,35 yo F complains of irregular menstrual cycle...,yo F complains of irregular menstrual cycles ...
35191,82503,8,67 yo F present with insomina for the last 3 w...,yo F present with insomina for the last week...
37385,90404,9,20-year-old female co headache x 1 day \r\n* s...,yearold female co headache x day \r\n strated...


Instantiating the stemmer and lemmatizer and downloading the stop words used

In [171]:
# ---------------------------------------------------------------------------------
# Tokenization, Lowercasing, Stop Words Removal, Stemming, and Lemmatization
# ---------------------------------------------------------------------------------

# Fetch the corpora/models used below (a no-op when they are already up to date)
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Shared, module-level tools reused for every note
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/btwitchell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/btwitchell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/btwitchell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Applying the stop word removal, stemming, and lemmatization to the dataset. Also tokenizing the documents by word.

In [172]:
def process_text(text):
    """
    Normalise a cleaned note into a space-separated string of lemmatized stems.

    The text is lowercased and tokenized with NLTK, English stop words are
    dropped, and every remaining token is Porter-stemmed and then passed
    through the WordNet lemmatizer.

    Parameters:
    - text: The input text to be processed.

    Returns:
    - processed_text: The surviving tokens, stemmed then lemmatized, joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip stop words before doing any stemming work
        if token in stop_words:
            continue
        # Stem first, then lemmatize the stem
        kept.append(lemmatizer.lemmatize(porter.stem(token)))

    processed_text = ' '.join(kept)
    return processed_text

# ---------------------------------------------------------------------------------
# Apply processing
# ---------------------------------------------------------------------------------
# Tokenize / stem / lemmatize every cleaned note into a new column
patient["pn_history_preprocessed"] = patient["pn_history_clean"].map(process_text)
patient

Unnamed: 0,pn_num,case_num,pn_history,pn_history_clean,pn_history_preprocessed
12778,37895,3,Mr Hamilton is a 35 year old male complaining ...,Mr Hamilton is a year old male complaining up...,mr hamilton year old male complain stomach pro...
3583,20555,2,44 year old f with 3 year history or unpredict...,year old f with year history or unpredictabl...,year old f year histori unpredict period rang ...
29227,70540,7,35 yo F complains of irregular menstrual cycle...,yo F complains of irregular menstrual cycles ...,yo f complain irregular menstrual cycl period ...
35191,82503,8,67 yo F present with insomina for the last 3 w...,yo F present with insomina for the last week...,yo f present insomina last week patient suffer...
37385,90404,9,20-year-old female co headache x 1 day \r\n* s...,yearold female co headache x day \r\n strated...,yearold femal co headach x day strate suddenli...
...,...,...,...,...,...
31562,72988,7,35 yo f comes to the office co of problem with...,yo f comes to the office co of problem with p...,yo f come offic co problem period irregularit ...
16275,41537,4,Ms. Moore is a 45 year old female who presents...,Ms Moore is a year old female who presents fo...,m moor year old femal present evalu feel nervo...
40432,93558,9,HPI: Patient is a 20 year old female complaini...,HPI Patient is a year old female complaining ...,hpi patient year old femal complain headach si...
14672,39819,3,"35 y/o M c/o ""stomach really bothering me"" \r\...",yo M co stomach really bothering me \r\nHPI A...,yo co stomach realli bother hpi start moago ch...


Representing the tokenized words as a TF-IDF matrix, which accounts for the frequency of tokens both locally within a document (increasing the importance of the token) as well as globally across the corpus (decreasing the importance of the token).

In [173]:

# ---------------------------------------------------------------------------------
# TF-IDF Vectorization
# ---------------------------------------------------------------------------------

# Unigram TF-IDF: drop terms found in more than half of the notes or in fewer than 5.
# (ngram_range could be varied here, e.g. (1,1), (2,2), (3,3).)
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    max_df=0.5,
    min_df=5,
)
t0 = time()
sparse_tfidf_matrix = vectorizer.fit_transform(patient["pn_history_preprocessed"])
print(f"vectorization done in {time() - t0:.3f} s")

n_rows, n_cols = sparse_tfidf_matrix.shape
print(f"n_samples: {n_rows}, n_features: {n_cols}")

# Fraction of stored (non-zero) entries in the matrix
nonzero_fraction = sparse_tfidf_matrix.nnz / np.prod(sparse_tfidf_matrix.shape)
print(f"sparseness: {nonzero_fraction:.3f} of the entries of the TFIDF matrix are non-zero.")

vectorization done in 0.401 s
n_samples: 10000, n_features: 3591
sparseness: 0.017 of the entries of the TFIDF matrix are non-zero.


Apply data reduction via Latent Semantic Analysis (LSA), which can be conceived of as PCA applied to sparse matrices.

Because the TF-IDF matrix is sparse, I determined (contrary to the assignment's instructions) that outlier detection could negatively affect the output of truncated Singular Value Decomposition (SVD). Of course, trying out various methods and comparing their results would be ideal, but in the interest of time and simplicity I omitted this step. Per [Scikit-Learn's guidance for clustering with text documents](https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py) I determined that normalization of the data would be appropriate only after first obtaining the truncated SVD matrix and keeping the first k columns that capture 90% of total variance.

In [174]:

# Fit a truncated SVD with as many components as the matrix allows, so the
# cumulative explained variance can be inspected over the full range.
n_components = min(sparse_tfidf_matrix.shape)
# The default solver is randomized; seed it so the LSA features (and everything
# derived from them) are reproducible. 42 matches the seed used elsewhere here.
svd = TruncatedSVD(n_components=n_components, random_state=42)

t0 = time()
# fit_transform fits and projects in a single pass; X_lsa is the projected
# corpus with one row per note and one column per SVD component.
X_lsa = svd.fit_transform(sparse_tfidf_matrix)
cumulative_variance = np.cumsum(svd.explained_variance_ratio_)

# Keep first k components that explain 90% of total variance.
# If the threshold is never reached, fall back to keeping every component
# instead of failing with an IndexError.
reached = np.where(cumulative_variance >= 0.90)[0]
k = reached[0] + 1 if reached.size else n_components
k_singular_values = X_lsa[:, :k]

print(f"LSA done in {time() - t0:.3f} s")
print(f"Number of components explaining 90% of variance: {k}")

# Scale each note's k-dimensional LSA vector to unit L2 norm (row-wise).
# The default copy=True leaves X_lsa itself untouched, since k_singular_values is a view of it.
X = Normalizer().fit_transform(k_singular_values)

# Identifier/label columns side by side with the normalized LSA features;
# the patient index is reset so both frames align on 0..n-1.
df = pd.concat(
    [
        patient[['pn_num','case_num','pn_history']].reset_index(drop=True),
        pd.DataFrame(X),
    ],
    axis=1,
)
df


LSA done in 54.120 s
Number of components explaining 90% of variance: 1404


Unnamed: 0,pn_num,case_num,pn_history,0,1,2,3,4,5,6,...,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403
0,37895,3,Mr Hamilton is a 35 year old male complaining ...,0.359222,0.343675,0.108686,0.103118,-0.055795,-0.047713,-0.004512,...,0.004821,-0.002703,0.014817,0.006085,0.001674,0.000387,-0.008683,-0.011946,-0.008967,-0.004861
1,20555,2,44 year old f with 3 year history or unpredict...,0.288994,-0.139732,0.137617,0.021303,0.041342,0.013760,0.015951,...,0.004470,-0.003090,0.007753,-0.015211,-0.003799,0.012597,-0.000897,-0.008394,-0.018405,-0.020047
2,70540,7,35 yo F complains of irregular menstrual cycle...,0.337212,-0.240513,0.359017,0.072237,0.069259,0.040155,-0.027472,...,0.004874,-0.001152,-0.004981,-0.008906,-0.004106,0.008707,-0.003222,0.008107,0.016686,-0.008986
3,82503,8,67 yo F present with insomina for the last 3 w...,0.205558,-0.102330,-0.176173,0.195617,-0.081420,-0.038823,0.012653,...,0.015851,0.008453,0.014499,-0.007084,-0.009744,0.019334,-0.002475,0.035429,0.023690,-0.010920
4,90404,9,20-year-old female co headache x 1 day \r\n* s...,0.272254,0.042140,-0.104269,-0.080158,0.271678,-0.052540,-0.079025,...,-0.008419,-0.011026,-0.005550,-0.010678,-0.026419,-0.009070,0.025168,-0.026367,0.018814,0.002551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,72988,7,35 yo f comes to the office co of problem with...,0.358864,-0.296778,0.412633,0.092737,0.068084,0.020012,0.022336,...,0.009990,0.002986,0.001396,0.015227,-0.012737,0.007229,0.012555,-0.001133,0.004790,-0.009789
9996,41537,4,Ms. Moore is a 45 year old female who presents...,0.288686,-0.052411,-0.151272,0.063106,-0.035046,0.317724,-0.031872,...,0.008227,-0.015428,0.021300,0.013940,0.010022,0.004411,-0.017486,-0.027004,-0.000958,-0.030432
9997,93558,9,HPI: Patient is a 20 year old female complaini...,0.346863,-0.004302,-0.105752,-0.058549,0.356642,-0.056624,-0.097660,...,0.002809,-0.005362,0.000881,-0.005994,0.002043,-0.002307,0.000333,0.004124,0.003317,0.006118
9998,39819,3,"35 y/o M c/o ""stomach really bothering me"" \r\...",0.295769,0.262856,0.064290,0.054147,-0.054798,-0.017688,-0.014978,...,0.007419,-0.004900,0.003524,-0.015975,0.008274,-0.000274,0.012374,0.008897,-0.003220,0.002562


## Manifold techniques

I used various manifold techniques to again perform dimension reduction, representing the data in 2, 3, and 4 dimensions. 

In [181]:

def embeddings_dict_n_components(n_components=2, n_neighbors=30, n_jobs=-1, random_state=42):
    """Returns a dictionary of unfitted embedding models with n components.

    Embeddings used are PCA, t-SNE, Isomap, standard LLE, Modified LLE,
    Hessian LLE, & UMAP.

    Parameters:
    - n_components: Number of output dimensions for every embedding.
    - n_neighbors: Neighborhood size for Isomap, the LLE variants and UMAP.
    - n_jobs: Parallelism passed to the scikit-learn estimators that accept it.
    - random_state: Seed passed to every estimator that accepts one, so that
      repeated runs produce the same embeddings.

    Returns:
    - embeddings: dict mapping "<name>_<n_components>_comp" to the model.
    """
    embeddings = {
        f"PCA_{n_components}_comp": PCA(
                                        n_components=n_components,
                                        random_state=random_state,
                                        ),
        # NOTE(review): method='exact' is slow on large samples; per the
        # scikit-learn docs 'barnes_hut' only supports n_components < 4.
        f"t-SNE_{n_components}_comp": TSNE(
                                        n_components=n_components,
                                        n_iter=500,
                                        n_iter_without_progress=150,
                                        random_state=random_state,
                                        method='exact',
                                        n_jobs=n_jobs
                                        ),
        f"Isomap_{n_components}_comp": Isomap(n_neighbors=n_neighbors,
                                             n_components=n_components,
                                             n_jobs=n_jobs),
        f"Stand-LLE_{n_components}_comp": LocallyLinearEmbedding(
                                                        n_neighbors=n_neighbors,
                                                        n_components=n_components,
                                                        method="standard",
                                                        random_state=random_state,
                                                        n_jobs=n_jobs
                                                        ),
        f"Mod-LLE_{n_components}_comp": LocallyLinearEmbedding(
                                            n_neighbors=n_neighbors,
                                            n_components=n_components,
                                            method="modified",
                                            random_state=random_state,
                                            n_jobs=n_jobs
                                            ),
        f"HessianLLE_{n_components}_comp": LocallyLinearEmbedding(
                                                        n_neighbors=n_neighbors,
                                                        n_components=n_components,
                                                        method="hessian",
                                                        eigen_solver='dense',
                                                        random_state=random_state,
                                                        n_jobs=n_jobs
                                                        ),
        # NOTE(review): seeding UMAP trades its parallelism for
        # reproducibility - confirm against the umap-learn docs.
        f"UMAP_{n_components}_comp": umap.UMAP(
                                                n_neighbors=n_neighbors,
                                                n_components=n_components,
                                                random_state=random_state,)
    }
    return embeddings

# One dictionary of models per target dimensionality: 2, 3, ..., dimensions_upper_bound
dimensions_upper_bound = 4
embeddings = [
    embeddings_dict_n_components(n_components=n_comp)
    for n_comp in range(2, dimensions_upper_bound + 1)
]



The 2- and 3-dimensional representations are then plotted to show how well each technique was able to separate the data. The original labels for the patient's case were used to color the datapoints, and the data for almost all the techniques are separated according to these labels.

It's important to note that the goal with these unsupervised methods is not necessarily to separate the data according to these labels, as other types of groupings could prove to provide useful information. However, it is also helpful to see that LSA combined with these various embeddings performed very well in separating the data according to the pre-defined cases.

In [182]:

def plot_embeddings(X, y, title, axes, dimensions=2, shorten_axes_labels=True):
    """Plot the embedding with either 2 or 3 dimensions.

    Parameters:
    - X: The embedding laid out component-first, i.e. X[0], X[1] (and X[2] for
      3-D plots) hold the coordinates along each plotted axis.
    - y: Labels used to color the points; cast to str before plotting.
    - title: Title of the figure.
    - axes: Axis labels, one per plotted dimension. The sequence is read only;
      the caller's object is never modified.
    - dimensions: Number of plotted dimensions, 2 or 3.
    - shorten_axes_labels: If True, only the last three characters of each
      axis label are shown (e.g. "c_1").

    Returns:
    - fig: The Plotly figure.

    Raises:
    - ValueError: If dimensions is neither 2 nor 3.
    """
    # Fail fast with a clear message rather than hitting an unbound `fig` below
    if dimensions not in (2, 3):
        raise ValueError(f"dimensions must be 2 or 3, got {dimensions!r}")

    y = y.astype(str)
    height = 400
    width = 500

    # Build the display labels on a copy so the caller's list is left intact
    if shorten_axes_labels:
        labels = [label[-3:] for label in axes]
    else:
        labels = list(axes)

    if dimensions == 2:
        fig = px.scatter(x=X[0], y=X[1], color=y, title=title)
        fig.update_layout(width=width,height=height,xaxis_title=labels[0],yaxis_title=labels[1])
    else:
        fig = px.scatter_3d(x=X[0], y=X[1], z=X[2],color=y, title=title)
        fig.update_layout(scene = dict(xaxis_title=labels[0],yaxis_title=labels[1],zaxis_title=labels[2]),
                          width=width,height=height)

    return fig


In [183]:
embeddings_dfs = {}
figs = []

# `embeddings` holds one {title: model} dictionary per target dimensionality
for models_by_title in embeddings:
    for title, model in models_by_title.items():

        print(f"Fitting {title}...",end='', flush=True)
        t0 = time()
        X_reduced = model.fit_transform(X)
        fit_time = f" - {time() - t0:.3f} sec."
        print("Done ✔️")

        # Derive the width from the result itself rather than from the
        # position of the model's dictionary inside `embeddings`.
        n_dims = X_reduced.shape[1]
        columns = [f"{title}_c_{component}" for component in range(1, n_dims + 1)]
        embedding_df = pd.DataFrame(X_reduced, columns=columns)
        embedding_df['case_num'] = df['case_num']

        # Only 2-D and 3-D embeddings can be drawn; plot_embeddings expects the
        # data component-first, hence the transpose. A copy of the column names
        # is passed so the labels used for the frame cannot be altered.
        if n_dims <= 3:
            figs.append(plot_embeddings(X_reduced.T, df['case_num'], title+fit_time, list(columns), n_dims))
        embeddings_dfs[title] = embedding_df

Fitting PCA_2_comp...Done ✔️
Fitting t-SNE_2_comp...Done ✔️
Fitting Isomap_2_comp...Done ✔️
Fitting Stand-LLE_2_comp...Done ✔️
Fitting Mod-LLE_2_comp...Done ✔️
Fitting HessianLLE_2_comp...Done ✔️
Fitting UMAP_2_comp...Done ✔️
Fitting PCA_3_comp...Done ✔️
Fitting t-SNE_3_comp...Done ✔️
Fitting Isomap_3_comp...Done ✔️
Fitting Stand-LLE_3_comp...Done ✔️
Fitting Mod-LLE_3_comp...Done ✔️
Fitting HessianLLE_3_comp...Done ✔️
Fitting UMAP_3_comp...Done ✔️
Fitting PCA_4_comp...Done ✔️
Fitting t-SNE_4_comp...

We note from the figures below that separation and density among the clusters varies according to the embedding employed and the number of dimensions.

UMAP in 3 dimensions and T-SNE in 2 dimensions perform exceptionally well (at least from simply looking at the plots).

In [None]:
# Render each collected 2-D / 3-D embedding plot in turn
for embedding_fig in figs:
    embedding_fig.show()

In [None]:
# print(f"Embedding DFs created:")
# [print(f"\t{key}") for key in embeddings_dfs.keys()]
# print()
# embeddings_dfs['PCA_3_comp']

## Clustering methods
I used **k-means** clustering to assess clustering performance. K-means is a centroid based method that partitions the data into k spherical clusters. 

I used 10 centroids (the same number as the number of cases). However, any number of centroids could be used, given that we're assuming that we don't actually have labeled data.

In the interest of time, I did not implement the following methods. However, these would have provided a better overall indication of clustering performance:
- **DBSCAN**: a density based method (grouping points that are closely packed together), better suited to arbitrarily shaped clusters with plenty of noise; it doesn't require a predefined number of clusters
- **Agglomerative clustering**: a bottom-up hierarchical partitioning of points that's tolerant to noise, though computationally expensive


In [None]:
# One centroid per case; the fixed seed keeps centroid initialisation repeatable
n_clusters, random_state = 10, 42
kmeans_model = KMeans(
    n_clusters=n_clusters,
    random_state=random_state,
)

## Clustering metrics
I referenced [this scikit-learn page](https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation) in determining how to measure clustering performance. There are a few different methods to measure performance, but I went with the **silhouette score.**

The silhouette score measures how well each sample in a cluster is separated from samples in other clusters. It takes into account both the cohesion (how close the samples are to each other within the same cluster) and the separation (how far the samples are from samples in other clusters). The silhouette score ranges from -1 to 1, where a higher value indicates better clustering. A score close to 1 suggests that the samples are well-clustered, while a score close to -1 indicates that the samples may have been assigned to the wrong clusters.

In [None]:
clusters = {}
# The loop variable is deliberately NOT called `df`: that name holds the LSA
# feature frame built earlier, and rebinding it here would silently change
# what earlier cells see if they are re-run.
for embedding_name, embedding_frame in embeddings_dfs.items():
    # extract only components (i.e. no labels); dropping the label column by
    # name does not depend on where it sits in the frame
    X_1 = embedding_frame.drop(columns='case_num')

    # fit clusters (the shared model is refit from scratch for each embedding)
    kmeans = kmeans_model.fit(X_1)

    # extract cluster labels
    kmeans_labels = kmeans.labels_

    # score immediately, before the next fit replaces labels_
    silhouette_score = metrics.silhouette_score(X_1, kmeans_labels, metric='euclidean')
    clusters[embedding_name] = {
        "silhouette_score":silhouette_score,
        }

In [None]:
# One row per embedding, best silhouette score first
score_table = pd.DataFrame(clusters).transpose()
score_table.columns = ['silhouette_score']
silhouette_scores = score_table.sort_values(by='silhouette_score', ascending=False)
silhouette_scores

Unnamed: 0,silhouette_score
UMAP_2_comp,0.691186
HessianLLE_4_comp,0.679967
HessianLLE_3_comp,0.672246
Mod-LLE_4_comp,0.667523
UMAP_3_comp,0.652429
Stand-LLE_3_comp,0.629521
HessianLLE_2_comp,0.611089
Mod-LLE_2_comp,0.606089
Stand-LLE_2_comp,0.603844
Stand-LLE_4_comp,0.601834


In graphing the silhouette scores, we note that the embedding with the highest performance is UMAP with 2 components (silhouette score of about 0.69), though all methods produced scores above 0, indicating that they all performed reasonably well on all dimension sizes. This suggests that after applying truncated SVD, the data were well suited to being separated into distinct clusters.

The number of components used, ranging from 2 to 4, didn't appear to affect clustering performance across all models, as there's a mix of all dimensions among the embeddings with the highest scores.

The 3-dimensional UMAP embedding is clearly a good method, as the clusters are distinct both in the plotted visual as well as in the silhouette metrics shown below.

In [None]:
# Bar chart of the silhouette score per embedding, in the table's sorted order
fig = px.bar(
    silhouette_scores,
    x=silhouette_scores.index,
    y='silhouette_score',
    labels={'silhouette_score': 'Silhouette Scores'},
)

fig.update_layout(
    title="Clustering Performance - Highest to Lowest",
    xaxis_title="Embedding",
    xaxis_tickangle=45,
)

fig.show()
