In [None]:
# Mount Google Drive; the pickled datasets read later live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading umap-le

## LDA with basic data preprocessing

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import gensim 
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import CoherenceModel
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Load the pre-cleaned notes from Drive.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle("/content/drive/MyDrive/mimic_cleaned_2000")

In [None]:
df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,service addendum radiologic studies radiologi...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,f service micu and then to medicine history o...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,service cardiothoracic allergies amlodipine a...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,service medicine allergies amlodipine attendi...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,m service medicine allergies patient recorded...


In [None]:
doc_notes_lda = df['TEXT'].to_list()

In [None]:
doc_notes_lda[0]

' service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious processtuberculosis this also moderatesized left pleural effusion head c head ct showed no intracranial hemorrhage or mass effect but old infarction consistent with past medical history abdominal c abdominal ct showed lesions of t and sacrum most likely secondary to osteoporosis these can be followed by repeat imaging as an outpatient md dictated by medquist '

## Cleaning TEXT using NLTK

In [None]:
def clean_text(lst):
    """Normalise note strings for topic modelling.

    Each item is lower-cased, stripped of punctuation, tokenised with NLTK's
    ``word_tokenize``, filtered (English stopwords and tokens of three
    characters or fewer are dropped) and lemmatised as a verb with the
    module-level ``lemmatizer``.

    Parameters
    ----------
    lst : iterable
        Documents to clean; every item is passed through ``str()`` first.

    Returns
    -------
    list of str
        One space-joined string of the kept lemmas per input document,
        in input order.
    """
    # Build the stopword set once instead of once per token.
    stopword = set(stopwords.words("english"))
    cleaned_text = []

    for text in lst:
        text = str(text).lower()
        # Strip punctuation but keep every kind of whitespace, so words that are
        # separated only by a newline or tab are not glued into one token.
        text = re.sub(r'[^\w\s]+', "", text)
        kept = [
            lemmatizer.lemmatize(word, pos='v')
            for word in word_tokenize(text)
            if word not in stopword and len(word) > 3
        ]
        cleaned_text.append(" ".join(kept))

    return cleaned_text
  
    
def make_biagram(data,tokens):
    """Apply a learned bigram (phrase) model to tokenised documents.

    Parameters
    ----------
    data : iterable
        Corpus used to learn the phrases. It is used as-is when it is already
        tokenised (an iterable of token lists); when it holds plain strings
        the phrases are learned from ``tokens`` instead.
    tokens : list of list of str
        Tokenised documents to transform.

    Returns
    -------
    list
        ``tokens`` with each document passed through the trained phraser.
    """
    data = list(data)
    # Phrases expects token lists. Given plain strings it walks them character
    # by character and never learns a word-level bigram, so train on the
    # tokenised documents in that case.
    if any(isinstance(doc, str) for doc in data):
        sentences = tokens
    else:
        sentences = data
    bigram = gensim.models.Phrases(sentences, min_count=5, threshold=100) # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in tokens]
 

cleaned_notes = clean_text(doc_notes_lda)

NameError: ignored

In [None]:
cleaned_notes[0]

'service addendum radiologic study radiologic study also include chest confirm cavitary lesions leave lung apex consistent infectious processtuberculosis also moderatesized leave pleural effusion head head show intracranial hemorrhage mass effect infarction consistent past medical history abdominal abdominal show lesions sacrum likely secondary osteoporosis follow repeat image outpatient dictate medquist'

## Topic modelling using LDA 

In [None]:
def topic_modeling(data, num_topics=12, passes=10, chunksize=10, random_state=42):
    """Fit an LDA model on cleaned documents and print its topics and coherence.

    Parameters
    ----------
    data : iterable of str
        Cleaned documents; each is tokenised with ``word_tokenize``.
    num_topics : int, default 12
        Number of LDA topics to fit.
    passes : int, default 10
        Number of passes through the corpus during training.
    chunksize : int, default 10
        Number of documents in each training chunk.
    random_state : int, default 42
        Seed passed to the LDA model.

    Returns
    -------
    None
        The topics and the c_v coherence score are printed, not returned.
    """
    # Tokenise, then merge bigrams with the notebook's make_biagram helper.
    tokens = [word_tokenize(text) for text in data]
    tokens = make_biagram(data=data, tokens=tokens)

    # Vocabulary and bag-of-words document-term matrix.
    dictionary = corpora.Dictionary(tokens)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokens]

    lda_model = gensim.models.LdaModel(
        doc_term_matrix,            # bag-of-words corpus
        num_topics=num_topics,
        id2word=dictionary,         # maps token ids back to words
        passes=passes,
        chunksize=chunksize,
        update_every=1,             # online training (0 would be batch learning)
        alpha='auto',               # learn the document-topic prior from the corpus
        per_word_topics=True,
        random_state=random_state,
    )

    cm = CoherenceModel(model=lda_model, corpus=doc_term_matrix, texts=tokens, coherence='c_v')
    coherence_lda = cm.get_coherence()

    # Show the top words of every fitted topic, then the coherence score.
    for idx, topic in lda_model.print_topics(num_topics=num_topics):
        print("Topic: {} \nWords: {}".format(idx, topic ))
        print("\n")
    print("Coherence score:",coherence_lda)

topic_modeling(cleaned_notes)

Topic: 0 
Words: 0.116*"fracture" + 0.040*"fall" + 0.036*"spine" + 0.026*"spinal" + 0.025*"vertebral" + 0.022*"cord" + 0.020*"signal" + 0.020*"reduce" + 0.019*"compression" + 0.019*"collar"


Topic: 1 
Words: 0.032*"leave" + 0.026*"right" + 0.015*"head" + 0.009*"follow" + 0.008*"intact" + 0.008*"mass" + 0.007*"bilaterally" + 0.007*"hemorrhage" + 0.007*"discharge" + 0.006*"weakness"


Topic: 2 
Words: 0.073*"patient" + 0.017*"history" + 0.014*"time" + 0.014*"discharge" + 0.012*"status" + 0.011*"patients" + 0.011*"rate" + 0.011*"pressure" + 0.010*"care" + 0.009*"hospital"


Topic: 3 
Words: 0.040*"inhalation" + 0.040*"albuterol" + 0.030*"breath" + 0.027*"puff" + 0.027*"cough" + 0.026*"prednisone" + 0.026*"copd" + 0.024*"lung" + 0.023*"pulmonary" + 0.023*"wheeze"


Topic: 4 
Words: 0.000*"hamate" + 0.000*"movment" + 0.000*"communited" + 0.000*"ctlso" + 0.000*"errands" + 0.000*"fracures" + 0.000*"toxicities" + 0.000*"styloid" + 0.000*"platform" + 0.000*"signd"


Topic: 5 
Words: 0.088*"tab

## Topic modelling using BERTopic

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer handed to BERTopic below: drops English stop words and counts
# 1- to 4-word n-grams.
vectorizer_model = CountVectorizer(ngram_range=(1, 4), stop_words="english")

In [None]:
# Fit BERTopic on the notes as loaded (not passed through clean_text),
# using the stop-word-removing vectorizer defined above.
model = BERTopic(
    language='english',
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = model.fit_transform(doc_notes_lda)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2023-04-23 00:26:27,407 - BERTopic - Transformed documents to Embeddings
2023-04-23 00:26:46,297 - BERTopic - Reduced dimensionality
2023-04-23 00:26:46,634 - BERTopic - Clustered reduced embeddings


In [None]:
freq = model.get_topic_info()
freq

Unnamed: 0,Topic,Count,Name
0,-1,830,-1_mg_patient_po_blood
1,0,173,0_mg_tablet_daily_po
2,1,137,1_tablet_mg_po_sig
3,2,107,2_patient_mg_blood_po
4,3,90,3_mg_tablet_daily_po
5,4,83,4_right_left_mg_patient
6,5,75,5_mg_tablet_daily_po
7,6,55,6_patient_coronary_artery_mg
8,7,54,7_mg_patient_po_tablet
9,8,47,8_right_left_mg_patient


In [None]:
model.visualize_barchart(top_n_topics=12)

In [None]:
model.visualize_topics()

In [None]:
# Query sentence to match against the fitted topics
new_review = "The patient needs help with transportation as they have no car."
num_of_topics = 5

# Topics closest to the query, with their similarity scores
similar_topics, similarity = model.find_topics(new_review, top_n=num_of_topics)

print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 5 similar topics are [20, 6, 11, 17, 22], and the similarities are [0.32 0.3  0.3  0.26 0.25]


In [None]:
# Walk the topics that were actually returned rather than indexing by the
# requested count, which raises IndexError if fewer topics come back.
for topic_id in similar_topics:
  print(f'The top keywords for topic {topic_id} are:')
  print(model.get_topic(topic_id))

The top keywords for topic 20 are:
[('patient', 0.019038962013316448), ('mg po', 0.010021392760706105), ('right', 0.00986214773807221), ('artery', 0.009662620182270222), ('mg', 0.00954520212818694), ('patients', 0.008717518478099496), ('po', 0.0082760366322519), ('coronary', 0.008226810325584663), ('day', 0.00800870406553924), ('graft', 0.006694676469375733)]
The top keywords for topic 6 are:
[('patient', 0.02029989848191447), ('coronary', 0.01319908688685837), ('artery', 0.012611822665058198), ('mg', 0.01188980951660459), ('day', 0.010541173150436012), ('coronary artery', 0.009930185367098773), ('catheterization', 0.009088447409219848), ('left', 0.009069532611270918), ('mg po', 0.00811004867790935), ('po', 0.007434632067548949)]
The top keywords for topic 11 are:
[('patient', 0.022718963810201675), ('mg po', 0.01904545513652299), ('mg', 0.01710121099378197), ('po', 0.015198543167793508), ('postoperative', 0.012480006263268891), ('day', 0.011864148669848645), ('qd', 0.01182104828773299

In [None]:
model.save("BERT_topic_vector")

## BERTopic with a custom UMAP model

In [None]:
from umap import UMAP

# Seeded UMAP reducer to plug into BERTopic
umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=100
)

# BERTopic with the custom reducer, fitted on the cleaned notes
topic_model = BERTopic(
    umap_model=umap_model,
    language="english",
    calculate_probabilities=True,
)
topics, probabilities = topic_model.fit_transform(cleaned_notes)

2023-04-23 00:51:11,029 - BERTopic - Transformed documents to Embeddings
2023-04-23 00:51:19,730 - BERTopic - Reduced dimensionality
2023-04-23 00:51:19,922 - BERTopic - Clustered reduced embeddings


In [None]:
topic_model.visualize_barchart(top_n_topics=12)

In [None]:
topic_model.visualize_topics()

In [None]:
# New data for the review
new_review = "The patient needs help with transportation as they have no car."
# Find topics
num_of_topics = 5
similar_topics, similarity = model.find_topics(new_review, top_n=num_of_topics); 
# Print results
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 5 similar topics are [20, 6, 11, 17, 22], and the similarities are [0.32 0.3  0.3  0.26 0.25]


In [None]:
for i in range(num_of_topics):
  print(f'The top keywords for topic {similar_topics[i]} are:')
  print(model.get_topic(similar_topics[i]))

The top keywords for topic 20 are:
[('patient', 0.019038962013316448), ('mg po', 0.010021392760706105), ('right', 0.00986214773807221), ('artery', 0.009662620182270222), ('mg', 0.00954520212818694), ('patients', 0.008717518478099496), ('po', 0.0082760366322519), ('coronary', 0.008226810325584663), ('day', 0.00800870406553924), ('graft', 0.006694676469375733)]
The top keywords for topic 6 are:
[('patient', 0.02029989848191447), ('coronary', 0.01319908688685837), ('artery', 0.012611822665058198), ('mg', 0.01188980951660459), ('day', 0.010541173150436012), ('coronary artery', 0.009930185367098773), ('catheterization', 0.009088447409219848), ('left', 0.009069532611270918), ('mg po', 0.00811004867790935), ('po', 0.007434632067548949)]
The top keywords for topic 11 are:
[('patient', 0.022718963810201675), ('mg po', 0.01904545513652299), ('mg', 0.01710121099378197), ('po', 0.015198543167793508), ('postoperative', 0.012480006263268891), ('day', 0.011864148669848645), ('qd', 0.01182104828773299

In [None]:
topic_model.save("BERT_topic_umap")

In [None]:
df_transport = pd.read_pickle("/content/drive/MyDrive/transport_notes_cleaned_2000")

In [None]:
df_transport.head()

Unnamed: 0,index,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,TRANSPORTATION
0,48,195,22180,162436.0,2134-03-09,,,Discharge summary,Report,,,service medicine allergies patient recorded a...,1
1,490,500,17977,198062.0,2159-11-10,,,Discharge summary,Report,,,f service medicine allergies losartan aspirin...,1
2,1581,1546,79061,127004.0,2153-07-28,,,Discharge summary,Report,,,m service medicine allergies patient recorded...,1
3,1673,1314,75733,151447.0,2132-05-23,,,Discharge summary,Report,,,m service medicine allergies ampicillin penic...,1
4,2086,1865,22208,143545.0,2179-03-22,,,Discharge summary,Report,,,m service medicine allergies benzodiazepines ...,1


In [None]:
# iloc's stop bound is exclusive, so :480 keeps rows 0-479, i.e. the 480 notes
# that the "480 text files" sections below refer to.
df_transport = df_transport.iloc[:480]

In [None]:
doc_notes = df_transport['TEXT'].to_list()

## LDA with 480 text files

In [None]:
def clean_text(lst):
    """Normalise note strings for topic modelling.

    Each item is lower-cased, stripped of punctuation, tokenised with NLTK's
    ``word_tokenize``, filtered (English stopwords and tokens of three
    characters or fewer are dropped) and lemmatised as a verb with the
    module-level ``lemmatizer``.

    Parameters
    ----------
    lst : iterable
        Documents to clean; every item is passed through ``str()`` first.

    Returns
    -------
    list of str
        One space-joined string of the kept lemmas per input document,
        in input order.
    """
    # Build the stopword set once instead of once per token.
    stopword = set(stopwords.words("english"))
    cleaned_text = []

    for text in lst:
        text = str(text).lower()
        # Strip punctuation but keep every kind of whitespace, so words that are
        # separated only by a newline or tab are not glued into one token.
        text = re.sub(r'[^\w\s]+', "", text)
        kept = [
            lemmatizer.lemmatize(word, pos='v')
            for word in word_tokenize(text)
            if word not in stopword and len(word) > 3
        ]
        cleaned_text.append(" ".join(kept))

    return cleaned_text
  
    
def make_biagram(data,tokens):
    """Apply a learned bigram (phrase) model to tokenised documents.

    Parameters
    ----------
    data : iterable
        Corpus used to learn the phrases. It is used as-is when it is already
        tokenised (an iterable of token lists); when it holds plain strings
        the phrases are learned from ``tokens`` instead.
    tokens : list of list of str
        Tokenised documents to transform.

    Returns
    -------
    list
        ``tokens`` with each document passed through the trained phraser.
    """
    data = list(data)
    # Phrases expects token lists. Given plain strings it walks them character
    # by character and never learns a word-level bigram, so train on the
    # tokenised documents in that case.
    if any(isinstance(doc, str) for doc in data):
        sentences = tokens
    else:
        sentences = data
    bigram = gensim.models.Phrases(sentences, min_count=5, threshold=100) # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in tokens]

In [None]:
cleaned_notes_1 = clean_text(doc_notes)

In [None]:
def topic_modeling(data, num_topics=12, passes=10, chunksize=10, random_state=42):
    """Fit an LDA model on cleaned documents and print its topics and coherence.

    Parameters
    ----------
    data : iterable of str
        Cleaned documents; each is tokenised with ``word_tokenize``.
    num_topics : int, default 12
        Number of LDA topics to fit.
    passes : int, default 10
        Number of passes through the corpus during training.
    chunksize : int, default 10
        Number of documents in each training chunk.
    random_state : int, default 42
        Seed passed to the LDA model.

    Returns
    -------
    None
        The topics and the c_v coherence score are printed, not returned.
    """
    # Tokenise, then merge bigrams with the notebook's make_biagram helper.
    tokens = [word_tokenize(text) for text in data]
    tokens = make_biagram(data=data, tokens=tokens)

    # Vocabulary and bag-of-words document-term matrix.
    dictionary = corpora.Dictionary(tokens)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokens]

    lda_model = gensim.models.LdaModel(
        doc_term_matrix,            # bag-of-words corpus
        num_topics=num_topics,
        id2word=dictionary,         # maps token ids back to words
        passes=passes,
        chunksize=chunksize,
        update_every=1,             # online training (0 would be batch learning)
        alpha='auto',               # learn the document-topic prior from the corpus
        per_word_topics=True,
        random_state=random_state,
    )

    cm = CoherenceModel(model=lda_model, corpus=doc_term_matrix, texts=tokens, coherence='c_v')
    coherence_lda = cm.get_coherence()

    # Show the top words of every fitted topic, then the coherence score.
    for idx, topic in lda_model.print_topics(num_topics=num_topics):
        print("Topic: {} \nWords: {}".format(idx, topic ))
        print("\n")
    print("Coherence score:",coherence_lda)

topic_modeling(cleaned_notes_1)

Topic: 0 
Words: 0.036*"mother" + 0.034*"infant" + 0.024*"transportation" + 0.018*"support" + 0.017*"family" + 0.017*"plan" + 0.016*"work" + 0.016*"social" + 0.016*"visit" + 0.015*"state"


Topic: 1 
Words: 0.015*"normal" + 0.011*"patient" + 0.011*"history" + 0.010*"chest" + 0.009*"closely" + 0.009*"interest" + 0.009*"negative" + 0.008*"blood" + 0.008*"daily" + 0.008*"leave"


Topic: 2 
Words: 0.099*"spit" + 0.057*"loop" + 0.046*"cpap" + 0.034*"encourage" + 0.030*"jaundice" + 0.018*"pillow" + 0.015*"problems" + 0.011*"brother" + 0.010*"rebound" + 0.010*"dress"


Topic: 3 
Words: 0.031*"drift" + 0.016*"extend" + 0.012*"safety" + 0.012*"intervention" + 0.011*"cord" + 0.009*"discussion" + 0.009*"unremarkable" + 0.009*"pattern" + 0.009*"impression" + 0.009*"long"


Topic: 4 
Words: 0.073*"understand" + 0.022*"secretions" + 0.019*"yellow" + 0.007*"thick" + 0.005*"role" + 0.004*"trach" + 0.003*"peep" + 0.002*"device" + 0.001*"multifocal" + 0.001*"copd"


Topic: 5 
Words: 0.005*"approach" + 0

## BERTopic modelling with 480 text files

In [None]:
# BERTopic fitted on the cleaned transportation notes, reusing the
# stop-word-removing vectorizer.
model_480 = BERTopic(
    language='english',
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = model_480.fit_transform(cleaned_notes_1)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

2023-04-23 01:01:41,706 - BERTopic - Transformed documents to Embeddings
2023-04-23 01:01:44,798 - BERTopic - Reduced dimensionality
2023-04-23 01:01:44,834 - BERTopic - Clustered reduced embeddings


In [None]:
model_480.visualize_barchart(top_n_topics=12)

In [None]:
model_480.visualize_topics()

In [None]:
# Query sentence to match against the topics of model_480
new_review = "The patient needs help with transportation as they have no car."
num_of_topics = 5

# Topics closest to the query, with their similarity scores
similar_topics, similarity = model_480.find_topics(new_review, top_n=num_of_topics)

print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 5 similar topics are [5, 0, 2, 1, 3], and the similarities are [0.31 0.23 0.21 0.21 0.2 ]


In [None]:
# Walk the topics that were actually returned rather than indexing by the
# requested count, which raises IndexError if fewer topics come back.
for topic_id in similar_topics:
  print(f'The top keywords for topic {topic_id} are:')
  print(model_480.get_topic(topic_id))

The top keywords for topic 5 are:
[('neonatology', 0.04105440500625986), ('neonatology attend', 0.03424970052150271), ('feed', 0.03357602605115394), ('transportation', 0.026613272511747044), ('attend', 0.025834709842008925), ('stool', 0.02564450356642722), ('attend weeks', 0.02488249432387608), ('neonatology attend weeks', 0.02488249432387608), ('stable', 0.024395688119004822), ('continue', 0.023184240386368057)]
The top keywords for topic 0 are:
[('mother', 0.03684280401395678), ('family', 0.022967075362640903), ('work', 0.022229658340846902), ('transportation', 0.01888877709922989), ('state', 0.01709890806088182), ('meet', 0.016450909799992235), ('social', 0.01573693533634609), ('social work', 0.01528084842612946), ('need', 0.014527504529420447), ('home', 0.014390117694534608)]
The top keywords for topic 2 are:
[('infant', 0.034827343584622444), ('feed', 0.02714307995015563), ('cont', 0.022109810364896388), ('monitor', 0.019035508709726916), ('continue', 0.01892886703117622), ('care'