# LDA and pyLDAvis with medical transcriptions and AWS SageMaker

* Gratefully indebted to my sources, from whom I have lifted quite a bit of the code in this notebook:
* https://github.com/priya-dwivedi/Deep-Learning/blob/master/topic_modeling/LDA_Newsgroup.ipynb
* https://github.com/XuanX111/Friends_text_generator/blob/master/Friends_LDAvis_Xuan_Qi.ipynb

In [1]:
! pip install gensim

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
! pip install pyLDAvis

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import models
from gensim.corpora import Dictionary, MmCorpus

In [4]:
'''
Loading nltk libraries
'''

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

from nltk.corpus import stopwords


In [5]:
# Fetch the NLTK data packages used by this notebook. 'stopwords' backs the
# stopwords.words('english') calls further down; 'wordnet' is presumably for
# WordNetLemmatizer, which is imported but not called in the visible cells.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(400)
# Show arrays with 3 decimal places and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

In [7]:
# accessing the SageMaker Python SDK
import boto3
import sagemaker
from sagemaker.amazon.common import numpy_to_record_serializer
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker import get_execution_role

In [8]:
from __future__ import print_function
from IPython.core.debugger import set_trace

import tempfile
import string
import funcy as fp
import sys
import logging
import os
import pickle
import re

In [9]:
# pyLDAvis 3.3 renamed its gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`. Try the new name first and fall back to the old
# one so the notebook works with whichever version pip installed.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [10]:
import logging
# Configure the root logger to write records of level INFO and above to the
# file 'gensim.log', one "time:level:message" line per record.
logging.basicConfig(filename='gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

## Getting the data

In [11]:
# Read the transcription samples and discard the 'Unnamed: 0' column.
raw_samples = pd.read_csv('mtsamples.csv')
df = raw_samples.drop(columns=['Unnamed: 0'])

# List the remaining columns, then preview the first rows.
print(df.columns)
df.head()

Index(['description', 'medical_specialty', 'sample_name', 'transcription',
       'keywords'],
      dtype='object')


Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [12]:
# NOTE: a cell only displays its last expression, so this first count of
# missing transcriptions is computed but never shown.
df['transcription'].isnull().sum()
# Keep only the rows that have a transcription; .copy() makes the result an
# independent frame.
df=df.dropna(subset=['transcription']).copy()
# Displayed result: the number of missing transcriptions after the drop.
df['transcription'].isnull().sum()

0

In [13]:
# Remove leading/trailing whitespace from the specialty labels.
df['medical_specialty'] = df['medical_specialty'].str.strip()

# Names of the three specialties with the most rows, most frequent first.
specialty_counts = df['medical_specialty'].value_counts()
spec_list = specialty_counts.head(3).index.tolist()
spec_list

['Surgery', 'Consult - History and Phy.', 'Cardiovascular / Pulmonary']

In [14]:
# Transcription text for each of the three most frequent specialties,
# selected with a boolean row mask and the 'transcription' column label.
surgery = df.loc[df['medical_specialty'] == spec_list[0], 'transcription']
consult = df.loc[df['medical_specialty'] == spec_list[1], 'transcription']
cardio = df.loc[df['medical_specialty'] == spec_list[2], 'transcription']

# A) Surgery

#### Step 1. Data Preprocessing
We will perform the following steps:

* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have 3 or fewer characters are removed.
* All stopwords are removed.
* Words are lemmatized - words in third person are changed to first person and verbs in past and future tenses are changed into present. (Note: the `preprocess` function in this notebook does not currently apply this step.)
* Words are stemmed - words are reduced to their root form. (Note: the `preprocess` function in this notebook does not currently apply this step.)

In [15]:
# Use the surgery transcriptions as the documents for this section.
specialty=surgery

In [16]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''

# Tokenize 
def preprocess(text):
    """Tokenize `text` and return the list of tokens kept for modelling.

    The text is split with gensim's ``simple_preprocess``. A token is kept
    only when it is longer than three characters and is not in gensim's
    ``STOPWORDS``; every kept token is lowercased.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    return [
        word.lower()
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopword_set
    ]

In [17]:
# Tokenize and filter every document of the selected specialty, then report
# how many documents were processed.
processed_docs = [preprocess(doc) for doc in specialty]
print(len(processed_docs))

1088


#### Step 2. Corpus and dictionary

In [18]:
# NLTK's English stopword list as a set.
# NOTE(review): `stop` is not referenced by any other visible cell; prep_corpus
# builds its own stopword set via nltk_stopwords().
stop = set(stopwords.words('english'))

In [19]:
# prepare the corpus

def nltk_stopwords():
    """Return the English word list of NLTK's stopwords corpus as a set."""
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words)

def prep_corpus(docs, additional_stopwords=frozenset(), no_below=3, no_above=0.5):
    """Build a gensim dictionary and a bag-of-words corpus from tokenized docs.

    Parameters
    ----------
    docs : list of list of str
        Tokenized documents. They are iterated twice (once to build the
        dictionary, once to build the corpus), so a one-shot iterator will
        not work.
    additional_stopwords : iterable of str, optional
        Extra words to remove on top of the NLTK English stopwords.
    no_below : int, optional
        Passed to ``Dictionary.filter_extremes``.
    no_above : float, optional
        Passed to ``Dictionary.filter_extremes``.

    Returns
    -------
    (dictionary, corpus)
        The filtered ``Dictionary`` and a list with one ``doc2bow`` result
        per document, in the order of `docs`.
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # Named `stopword_set` so the `stopwords` corpus imported from nltk.corpus
    # is not shadowed inside this function.
    stopword_set = nltk_stopwords().union(additional_stopwords)
    # Only stopwords that occur in the dictionary have an id; skip the others
    # instead of passing `None` ids on to filter_tokens.
    token2id = dictionary.token2id
    stopword_ids = [token2id[word] for word in stopword_set if word in token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus

In [20]:
# Build the dictionary and bag-of-words corpus using prep_corpus's defaults
# (no extra stopwords, no_below=3, no_above=0.5).
dictionary, corpus = prep_corpus(processed_docs)

Building dictionary...
Building corpus...


#### Step 3. Modeling & Visualization with pyLDAvis

In [21]:
# Dedicated random state (seed 15) handed to the model.
lda_rng = np.random.RandomState(15)

# Fit a 10-topic LDA model on the bag-of-words corpus built above.
lda_model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=50,
    iterations=5000,
    eval_every=10,
    random_state=lda_rng,
)

In [22]:
# Prepare the pyLDAvis data for the fitted model.
vis_data1 = gensimvis.prepare(lda_model, corpus, dictionary)
# Write inside a `with` block so the file is flushed and closed as soon as the
# HTML has been saved; previously the handle was opened and never closed.
with open('surgery.html', 'w') as surgery_lda:
    pyLDAvis.save_html(vis_data1, surgery_lda)

# B) Consultations

#### Step 1. Data Preprocessing

In [23]:
# Use the consultation transcriptions as the documents for this section.
specialty=consult

In [24]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''

# Tokenize 
def preprocess(text):
    """Tokenize `text` and return the list of tokens kept for modelling.

    The text is split with gensim's ``simple_preprocess``. A token is kept
    only when it is longer than three characters and is not in gensim's
    ``STOPWORDS``; every kept token is lowercased.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    return [
        word.lower()
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopword_set
    ]

In [25]:
# Tokenize and filter every document of the selected specialty, then report
# how many documents were processed.
processed_docs = [preprocess(doc) for doc in specialty]
print(len(processed_docs))

516


#### Step 2. Corpus and dictionary

In [26]:
# NLTK's English stopword list as a set.
# NOTE(review): `stop` is not referenced by any other visible cell; prep_corpus
# builds its own stopword set via nltk_stopwords().
stop = set(stopwords.words('english'))

In [27]:
# prepare the corpus

def nltk_stopwords():
    """Return the English word list of NLTK's stopwords corpus as a set."""
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words)

def prep_corpus(docs, additional_stopwords=frozenset(), no_below=3, no_above=0.5):
    """Build a gensim dictionary and a bag-of-words corpus from tokenized docs.

    Parameters
    ----------
    docs : list of list of str
        Tokenized documents. They are iterated twice (once to build the
        dictionary, once to build the corpus), so a one-shot iterator will
        not work.
    additional_stopwords : iterable of str, optional
        Extra words to remove on top of the NLTK English stopwords.
    no_below : int, optional
        Passed to ``Dictionary.filter_extremes``.
    no_above : float, optional
        Passed to ``Dictionary.filter_extremes``.

    Returns
    -------
    (dictionary, corpus)
        The filtered ``Dictionary`` and a list with one ``doc2bow`` result
        per document, in the order of `docs`.
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # Named `stopword_set` so the `stopwords` corpus imported from nltk.corpus
    # is not shadowed inside this function.
    stopword_set = nltk_stopwords().union(additional_stopwords)
    # Only stopwords that occur in the dictionary have an id; skip the others
    # instead of passing `None` ids on to filter_tokens.
    token2id = dictionary.token2id
    stopword_ids = [token2id[word] for word in stopword_set if word in token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus

In [28]:
# Build the dictionary and bag-of-words corpus using prep_corpus's defaults
# (no extra stopwords, no_below=3, no_above=0.5).
dictionary, corpus = prep_corpus(processed_docs)

Building dictionary...
Building corpus...


#### Step 3. Modeling & Visualization with pyLDAvis

In [29]:
# Dedicated random state (seed 15) handed to the model.
lda_rng = np.random.RandomState(15)

# Fit a 10-topic LDA model on the bag-of-words corpus built above.
lda_model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=50,
    iterations=5000,
    eval_every=10,
    random_state=lda_rng,
)

In [30]:
# Prepare the pyLDAvis data for the fitted model.
vis_data1 = gensimvis.prepare(lda_model, corpus, dictionary)
# Write inside a `with` block so the file is flushed and closed as soon as the
# HTML has been saved; previously the handle was opened and never closed.
with open('consult.html', 'w') as consult_lda:
    pyLDAvis.save_html(vis_data1, consult_lda)

# C) Cardio

#### Step 1. Data Preprocessing
We will perform the following steps:

* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have 3 or fewer characters are removed.
* All stopwords are removed.
* Words are lemmatized - words in third person are changed to first person and verbs in past and future tenses are changed into present. (Note: the `preprocess` function in this notebook does not currently apply this step.)
* Words are stemmed - words are reduced to their root form. (Note: the `preprocess` function in this notebook does not currently apply this step.)

In [31]:
# Use the cardiovascular / pulmonary transcriptions as the documents for this section.
specialty=cardio

In [32]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''

# Tokenize 
def preprocess(text):
    """Tokenize `text` and return the list of tokens kept for modelling.

    The text is split with gensim's ``simple_preprocess``. A token is kept
    only when it is longer than three characters and is not in gensim's
    ``STOPWORDS``; every kept token is lowercased.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    return [
        word.lower()
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopword_set
    ]

In [33]:
# Tokenize and filter every document of the selected specialty, then report
# how many documents were processed.
processed_docs = [preprocess(doc) for doc in specialty]
print(len(processed_docs))

371


#### Step 2. Corpus and dictionary

In [34]:
# NLTK's English stopword list as a set.
# NOTE(review): `stop` is not referenced by any other visible cell; prep_corpus
# builds its own stopword set via nltk_stopwords().
stop = set(stopwords.words('english'))

In [35]:
# prepare the corpus

def nltk_stopwords():
    """Return the English word list of NLTK's stopwords corpus as a set."""
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words)

def prep_corpus(docs, additional_stopwords=frozenset(), no_below=3, no_above=0.5):
    """Build a gensim dictionary and a bag-of-words corpus from tokenized docs.

    Parameters
    ----------
    docs : list of list of str
        Tokenized documents. They are iterated twice (once to build the
        dictionary, once to build the corpus), so a one-shot iterator will
        not work.
    additional_stopwords : iterable of str, optional
        Extra words to remove on top of the NLTK English stopwords.
    no_below : int, optional
        Passed to ``Dictionary.filter_extremes``.
    no_above : float, optional
        Passed to ``Dictionary.filter_extremes``.

    Returns
    -------
    (dictionary, corpus)
        The filtered ``Dictionary`` and a list with one ``doc2bow`` result
        per document, in the order of `docs`.
    """
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # Named `stopword_set` so the `stopwords` corpus imported from nltk.corpus
    # is not shadowed inside this function.
    stopword_set = nltk_stopwords().union(additional_stopwords)
    # Only stopwords that occur in the dictionary have an id; skip the others
    # instead of passing `None` ids on to filter_tokens.
    token2id = dictionary.token2id
    stopword_ids = [token2id[word] for word in stopword_set if word in token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus

In [36]:
# Build the dictionary and bag-of-words corpus using prep_corpus's defaults
# (no extra stopwords, no_below=3, no_above=0.5).
dictionary, corpus = prep_corpus(processed_docs)

Building dictionary...
Building corpus...


#### Step 3. Modeling & Visualization with pyLDAvis

In [37]:
# Dedicated random state (seed 15) handed to the model.
lda_rng = np.random.RandomState(15)

# Fit a 10-topic LDA model on the bag-of-words corpus built above.
lda_model = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=50,
    iterations=5000,
    eval_every=10,
    random_state=lda_rng,
)

In [38]:
# Prepare the pyLDAvis data for the fitted model.
vis_data1 = gensimvis.prepare(lda_model, corpus, dictionary)
# Write inside a `with` block so the file is flushed and closed as soon as the
# HTML has been saved; previously the handle was opened and never closed.
with open('cardio.html', 'w') as cardio_lda:
    pyLDAvis.save_html(vis_data1, cardio_lda)